diff --git a/codellama/c/dataflow_c_pretrained/all_results.json b/codellama/c/dataflow_c_pretrained/all_results.json index 83ca8ccccb78bbd05bfc39fae1f253bc31bdb395..90bfffe14242b2e03c5dd58e37596025330d4548 100644 --- a/codellama/c/dataflow_c_pretrained/all_results.json +++ b/codellama/c/dataflow_c_pretrained/all_results.json @@ -1,8 +1,8 @@ { - "epoch": 1.5076373735369968, - "total_flos": 1.4535297138363187e+18, - "train_loss": 0.11740684490454824, - "train_runtime": 39384.0084, - "train_samples_per_second": 0.772, - "train_steps_per_second": 0.012 + "epoch": 1.2058706862356208, + "total_flos": 1.216645538039931e+18, + "train_loss": 0.10745409297707834, + "train_runtime": 37043.3755, + "train_samples_per_second": 0.657, + "train_steps_per_second": 0.005 } \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/README.md b/codellama/c/dataflow_c_pretrained/checkpoint-190/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..08403128ecb652a98a4e935672da65aa91a5918d --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b3a4fe72788a9cd9ee04efc5c639103a8edf531 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de8d340f5057379260edf56d8c2bf090c3f6e213b999eafc222fced213416fe +size 1156480200 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/README.md b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..08403128ecb652a98a4e935672da65aa91a5918d --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b3a4fe72788a9cd9ee04efc5c639103a8edf531 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de8d340f5057379260edf56d8c2bf090c3f6e213b999eafc222fced213416fe +size 1156480200 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/added_tokens.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/added_tokens.json similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/added_tokens.json rename to codellama/c/dataflow_c_pretrained/checkpoint-190/added_tokens.json diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/optimizer.pt b/codellama/c/dataflow_c_pretrained/checkpoint-190/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e5e6cb3d4599de0f65b5f7f658148582f8cd45 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6951313c1b248ce4c836696dac190e0dab42809267e147c8304926cfd6019b36 +size 2003126962 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/rng_state.pth b/codellama/c/dataflow_c_pretrained/checkpoint-190/rng_state.pth similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/rng_state.pth rename to codellama/c/dataflow_c_pretrained/checkpoint-190/rng_state.pth diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/scheduler.pt b/codellama/c/dataflow_c_pretrained/checkpoint-190/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40ed602a6b8e648dee4651c812d55746c4306967 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4326f7a2418d4815e699b330bd26d5f5313efeeb51d248a1c8d3070a922c1ddd +size 1064 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/special_tokens_map.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/special_tokens_map.json similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/special_tokens_map.json rename to codellama/c/dataflow_c_pretrained/checkpoint-190/special_tokens_map.json diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer.model b/codellama/c/dataflow_c_pretrained/checkpoint-190/tokenizer.model similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer.model rename to codellama/c/dataflow_c_pretrained/checkpoint-190/tokenizer.model diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/tokenizer_config.json similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer_config.json rename to codellama/c/dataflow_c_pretrained/checkpoint-190/tokenizer_config.json diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/trainer_state.json b/codellama/c/dataflow_c_pretrained/checkpoint-190/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..17e9a31953cc892a4be9c09f9427e883c046de69 --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/trainer_state.json @@ -0,0 +1,299 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2058706862356208, + "eval_steps": 500, + "global_step": 190, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0317334391114637, + "grad_norm": 0.060546875, + "learning_rate": 0.0001, + "loss": 0.6421, + "step": 5 + }, + { + "epoch": 0.0634668782229274, + "grad_norm": 0.11572265625, + "learning_rate": 0.0001, + "loss": 0.5213, + "step": 10 + }, + { + "epoch": 0.09520031733439112, + "grad_norm": 0.08251953125, + "learning_rate": 0.0001, + "loss": 0.2925, + "step": 15 + }, + { + "epoch": 0.1269337564458548, + "grad_norm": 0.0634765625, + "learning_rate": 0.0001, + "loss": 0.1978, + "step": 20 + }, + { + "epoch": 0.15866719555731854, + "grad_norm": 0.08251953125, + "learning_rate": 0.0001, + "loss": 0.1538, + "step": 25 + }, + { + "epoch": 0.19040063466878224, + "grad_norm": 0.10888671875, + "learning_rate": 0.0001, + "loss": 0.106, + "step": 30 + }, + { + "epoch": 0.22213407378024594, + "grad_norm": 0.049560546875, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 35 + }, + { + "epoch": 0.2538675128917096, + "grad_norm": 0.310546875, + "learning_rate": 0.0001, + "loss": 0.1215, + "step": 40 + }, + { + "epoch": 0.28560095200317337, + "grad_norm": 0.06494140625, + "learning_rate": 0.0001, + "loss": 0.2476, + "step": 45 + }, + { + "epoch": 0.31733439111463707, + "grad_norm": 0.40234375, + "learning_rate": 0.0001, + "loss": 0.1073, + "step": 50 + }, + { + "epoch": 0.3490678302261008, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001, + "loss": 0.0863, + "step": 55 + }, + { + "epoch": 0.3808012693375645, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001, + "loss": 0.0671, + "step": 60 + }, + { + "epoch": 0.4125347084490282, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 65 + }, + { + "epoch": 0.4442681475604919, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 70 + }, + { + "epoch": 0.4760015866719556, + "grad_norm": 0.01275634765625, + "learning_rate": 0.0001, + "loss": 0.0125, + "step": 75 + }, + { + "epoch": 0.5077350257834192, + "grad_norm": 0.06787109375, + "learning_rate": 0.0001, + "loss": 0.1307, + "step": 80 + }, + { + "epoch": 0.539468464894883, + "grad_norm": 0.050048828125, + "learning_rate": 0.0001, + "loss": 0.171, + "step": 85 + }, + { + "epoch": 0.5712019040063467, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001, + "loss": 0.0818, + "step": 90 + }, + { + "epoch": 0.6029353431178104, + "grad_norm": 0.033203125, + "learning_rate": 0.0001, + "loss": 0.0658, + "step": 95 + }, + { + "epoch": 0.6346687822292741, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 100 + }, + { + "epoch": 0.6664022213407378, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 105 + }, + { + "epoch": 0.6981356604522015, + "grad_norm": 0.0181884765625, + "learning_rate": 0.0001, + "loss": 0.0187, + "step": 110 + }, + { + "epoch": 0.7298690995636652, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001, + "loss": 0.0095, + "step": 115 + }, + { + "epoch": 0.761602538675129, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001, + "loss": 0.1381, + "step": 120 + }, + { + "epoch": 0.7933359777865926, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001, + "loss": 0.1125, + "step": 125 + }, + { + "epoch": 0.8250694168980564, + "grad_norm": 0.032958984375, + "learning_rate": 0.0001, + "loss": 0.062, + "step": 130 + }, + { + "epoch": 0.85680285600952, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001, + "loss": 0.0526, + "step": 135 + }, + { + "epoch": 0.8885362951209838, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 140 + }, + { + "epoch": 0.9202697342324474, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001, + "loss": 0.027, + "step": 145 + }, + { + "epoch": 0.9520031733439112, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001, + "loss": 0.0115, + "step": 150 + }, + { + "epoch": 0.9837366124553748, + "grad_norm": 0.02099609375, + "learning_rate": 0.0001, + "loss": 0.005, + "step": 155 + }, + { + "epoch": 1.0154700515668384, + "grad_norm": 0.0703125, + "learning_rate": 0.0001, + "loss": 0.1291, + "step": 160 + }, + { + "epoch": 1.0472034906783023, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001, + "loss": 0.1033, + "step": 165 + }, + { + "epoch": 1.078936929789766, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001, + "loss": 0.0539, + "step": 170 + }, + { + "epoch": 1.1106703689012296, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 175 + }, + { + "epoch": 1.1424038080126935, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 180 + }, + { + "epoch": 1.1741372471241571, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001, + "loss": 0.0239, + "step": 185 + }, + { + "epoch": 1.2058706862356208, + "grad_norm": 0.015625, + "learning_rate": 0.0001, + "loss": 0.0095, + "step": 190 + } + ], + "logging_steps": 5, + "max_steps": 190, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.216645538039931e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-190/training_args.bin b/codellama/c/dataflow_c_pretrained/checkpoint-190/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4a2d49e40973cd8ce0bb68429be56f03108f0db --- /dev/null +++ b/codellama/c/dataflow_c_pretrained/checkpoint-190/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f41d6ab0b3b00576cfaae17b2c89c881cde6b3ddf94e79209bf6c926c2f26a +size 7416 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors deleted file mode 100644 index fbe0ddd208b18ae490cd620a13be9649f3a9137c..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e3bd7cb053c3e00ea48ed365eed4b65ae5e2d7d807e71ec5615d765dfba19de -size 1156480200 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors deleted file mode 100644 index fbe0ddd208b18ae490cd620a13be9649f3a9137c..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e3bd7cb053c3e00ea48ed365eed4b65ae5e2d7d807e71ec5615d765dfba19de -size 1156480200 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt b/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt deleted file mode 100644 index 967cc2106211d6a7f289e2d14d78e9c6b19d67fd..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb15df4736bb45403f2444791e0af6b8cb6ce098124d0e654e1c324cac779265 -size 2003127538 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt b/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt deleted file mode 100644 index 25f1fc9a568e1572c2b33dcfe44c060818d7894d..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1219a1788d5d094f428228d99e4982dc061bcd85dea2cf1e1ca0c7a969573be6 -size 1064 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json deleted file mode 100644 index f4c30481ff53bbb9fdf1f266fd6622c74499701c..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json +++ /dev/null @@ -1,698 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.5076373735369968, - "eval_steps": 500, - "global_step": 475, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.015869867089863123, - "grad_norm": 0.058837890625, - "learning_rate": 0.0001, - "loss": 0.769, - "step": 5 - }, - { - "epoch": 0.031739734179726246, - "grad_norm": 0.11572265625, - "learning_rate": 0.0001, - "loss": 0.615, - "step": 10 - }, - { - "epoch": 0.047609601269589366, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.3973, - "step": 15 - }, - { - "epoch": 0.06347946835945249, - "grad_norm": 0.07470703125, - "learning_rate": 0.0001, - "loss": 0.2804, - "step": 20 - }, - { - "epoch": 0.0793493354493156, - "grad_norm": 0.06884765625, - "learning_rate": 0.0001, - "loss": 0.2244, - "step": 25 - }, - { - "epoch": 0.09521920253917873, - "grad_norm": 0.10498046875, - "learning_rate": 0.0001, - "loss": 0.1925, - "step": 30 - }, - { - "epoch": 0.11108906962904186, - "grad_norm": 0.08056640625, - "learning_rate": 0.0001, - "loss": 0.1477, - "step": 35 - }, - { - "epoch": 0.12695893671890499, - "grad_norm": 0.0732421875, - "learning_rate": 0.0001, - "loss": 0.0969, - "step": 40 - }, - { - "epoch": 0.1428288038087681, - "grad_norm": 0.07568359375, - "learning_rate": 0.0001, - "loss": 0.0695, - "step": 45 - }, - { - "epoch": 0.1586986708986312, - "grad_norm": 0.125, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 50 - }, - { - "epoch": 0.17456853798849434, - "grad_norm": 0.0859375, - "learning_rate": 0.0001, - "loss": 0.4702, - "step": 55 - }, - { - "epoch": 0.19043840507835746, - "grad_norm": 0.06787109375, - "learning_rate": 0.0001, - "loss": 0.2393, - "step": 60 - }, - { - "epoch": 0.2063082721682206, - "grad_norm": 0.045166015625, - "learning_rate": 0.0001, - "loss": 0.1604, - "step": 65 - }, - { - "epoch": 0.22217813925808372, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.1499, - "step": 70 - }, - { - "epoch": 0.23804800634794684, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.123, - "step": 75 - }, - { - "epoch": 0.25391787343780997, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.1056, - "step": 80 - }, - { - "epoch": 0.26978774052767307, - "grad_norm": 0.049560546875, - "learning_rate": 0.0001, - "loss": 0.0801, - "step": 85 - }, - { - "epoch": 0.2856576076175362, - "grad_norm": 0.043212890625, - "learning_rate": 0.0001, - "loss": 0.0617, - "step": 90 - }, - { - "epoch": 0.3015274747073993, - "grad_norm": 0.037109375, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 95 - }, - { - "epoch": 0.3173973417972624, - "grad_norm": 0.028564453125, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 100 - }, - { - "epoch": 0.3332672088871256, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.3494, - "step": 105 - }, - { - "epoch": 0.3491370759769887, - "grad_norm": 0.07958984375, - "learning_rate": 0.0001, - "loss": 0.1779, - "step": 110 - }, - { - "epoch": 0.36500694306685183, - "grad_norm": 0.040283203125, - "learning_rate": 0.0001, - "loss": 0.1283, - "step": 115 - }, - { - "epoch": 0.38087681015671493, - "grad_norm": 0.038818359375, - "learning_rate": 0.0001, - "loss": 0.111, - "step": 120 - }, - { - "epoch": 0.3967466772465781, - "grad_norm": 0.048095703125, - "learning_rate": 0.0001, - "loss": 0.0945, - "step": 125 - }, - { - "epoch": 0.4126165443364412, - "grad_norm": 0.06103515625, - "learning_rate": 0.0001, - "loss": 0.0833, - "step": 130 - }, - { - "epoch": 0.4284864114263043, - "grad_norm": 0.05859375, - "learning_rate": 0.0001, - "loss": 0.0702, - "step": 135 - }, - { - "epoch": 0.44435627851616744, - "grad_norm": 0.060302734375, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 140 - }, - { - "epoch": 0.46022614560603053, - "grad_norm": 0.042724609375, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 145 - }, - { - "epoch": 0.4760960126958937, - "grad_norm": 0.048583984375, - "learning_rate": 0.0001, - "loss": 0.0225, - "step": 150 - }, - { - "epoch": 0.4919658797857568, - "grad_norm": 0.056396484375, - "learning_rate": 0.0001, - "loss": 0.3315, - "step": 155 - }, - { - "epoch": 0.5078357468756199, - "grad_norm": 0.0478515625, - "learning_rate": 0.0001, - "loss": 0.1585, - "step": 160 - }, - { - "epoch": 0.523705613965483, - "grad_norm": 0.07177734375, - "learning_rate": 0.0001, - "loss": 0.1173, - "step": 165 - }, - { - "epoch": 0.5395754810553461, - "grad_norm": 0.050537109375, - "learning_rate": 0.0001, - "loss": 0.1054, - "step": 170 - }, - { - "epoch": 0.5554453481452093, - "grad_norm": 0.052734375, - "learning_rate": 0.0001, - "loss": 0.0828, - "step": 175 - }, - { - "epoch": 0.5713152152350724, - "grad_norm": 0.05126953125, - "learning_rate": 0.0001, - "loss": 0.0778, - "step": 180 - }, - { - "epoch": 0.5871850823249355, - "grad_norm": 0.034423828125, - "learning_rate": 0.0001, - "loss": 0.0632, - "step": 185 - }, - { - "epoch": 0.6030549494147986, - "grad_norm": 0.038330078125, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 190 - }, - { - "epoch": 0.6189248165046618, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 195 - }, - { - "epoch": 0.6347946835945248, - "grad_norm": 0.08642578125, - "learning_rate": 0.0001, - "loss": 0.0195, - "step": 200 - }, - { - "epoch": 0.650664550684388, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.3038, - "step": 205 - }, - { - "epoch": 0.6665344177742512, - "grad_norm": 0.0556640625, - "learning_rate": 0.0001, - "loss": 0.1574, - "step": 210 - }, - { - "epoch": 0.6824042848641143, - "grad_norm": 0.054443359375, - "learning_rate": 0.0001, - "loss": 0.1049, - "step": 215 - }, - { - "epoch": 0.6982741519539774, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.0955, - "step": 220 - }, - { - "epoch": 0.7141440190438405, - "grad_norm": 0.046630859375, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 225 - }, - { - "epoch": 0.7300138861337037, - "grad_norm": 0.052978515625, - "learning_rate": 0.0001, - "loss": 0.0636, - "step": 230 - }, - { - "epoch": 0.7458837532235667, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.0584, - "step": 235 - }, - { - "epoch": 0.7617536203134299, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 240 - }, - { - "epoch": 0.777623487403293, - "grad_norm": 0.035400390625, - "learning_rate": 0.0001, - "loss": 0.0268, - "step": 245 - }, - { - "epoch": 0.7934933544931562, - "grad_norm": 0.03564453125, - "learning_rate": 0.0001, - "loss": 0.0197, - "step": 250 - }, - { - "epoch": 0.8093632215830192, - "grad_norm": 0.0673828125, - "learning_rate": 0.0001, - "loss": 0.264, - "step": 255 - }, - { - "epoch": 0.8252330886728824, - "grad_norm": 0.050048828125, - "learning_rate": 0.0001, - "loss": 0.1382, - "step": 260 - }, - { - "epoch": 0.8411029557627455, - "grad_norm": 0.053955078125, - "learning_rate": 0.0001, - "loss": 0.0959, - "step": 265 - }, - { - "epoch": 0.8569728228526086, - "grad_norm": 0.055908203125, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 270 - }, - { - "epoch": 0.8728426899424717, - "grad_norm": 0.05322265625, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 275 - }, - { - "epoch": 0.8887125570323349, - "grad_norm": 0.037109375, - "learning_rate": 0.0001, - "loss": 0.0627, - "step": 280 - }, - { - "epoch": 0.904582424122198, - "grad_norm": 0.035888671875, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 285 - }, - { - "epoch": 0.9204522912120611, - "grad_norm": 0.049072265625, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 290 - }, - { - "epoch": 0.9363221583019242, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0259, - "step": 295 - }, - { - "epoch": 0.9521920253917874, - "grad_norm": 0.02490234375, - "learning_rate": 0.0001, - "loss": 0.0168, - "step": 300 - }, - { - "epoch": 0.9680618924816504, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.1856, - "step": 305 - }, - { - "epoch": 0.9839317595715136, - "grad_norm": 0.09814453125, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 310 - }, - { - "epoch": 0.9998016266613767, - "grad_norm": 0.0380859375, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 315 - }, - { - "epoch": 1.0156714937512399, - "grad_norm": 0.07373046875, - "learning_rate": 0.0001, - "loss": 0.2891, - "step": 320 - }, - { - "epoch": 1.031541360841103, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.1519, - "step": 325 - }, - { - "epoch": 1.047411227930966, - "grad_norm": 0.048095703125, - "learning_rate": 0.0001, - "loss": 0.094, - "step": 330 - }, - { - "epoch": 1.0632810950208291, - "grad_norm": 0.051513671875, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 335 - }, - { - "epoch": 1.0791509621106923, - "grad_norm": 0.0517578125, - "learning_rate": 0.0001, - "loss": 0.0695, - "step": 340 - }, - { - "epoch": 1.0950208292005554, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.0586, - "step": 345 - }, - { - "epoch": 1.1108906962904186, - "grad_norm": 0.06201171875, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 350 - }, - { - "epoch": 1.1267605633802817, - "grad_norm": 0.0272216796875, - "learning_rate": 0.0001, - "loss": 0.0278, - "step": 355 - }, - { - "epoch": 1.142630430470145, - "grad_norm": 0.05419921875, - "learning_rate": 0.0001, - "loss": 0.0219, - "step": 360 - }, - { - "epoch": 1.1585002975600078, - "grad_norm": 0.07177734375, - "learning_rate": 0.0001, - "loss": 0.015, - "step": 365 - }, - { - "epoch": 1.174370164649871, - "grad_norm": 0.09521484375, - "learning_rate": 0.0001, - "loss": 0.2371, - "step": 370 - }, - { - "epoch": 1.1902400317397341, - "grad_norm": 0.060791015625, - "learning_rate": 0.0001, - "loss": 0.118, - "step": 375 - }, - { - "epoch": 1.2061098988295973, - "grad_norm": 0.059814453125, - "learning_rate": 0.0001, - "loss": 0.0904, - "step": 380 - }, - { - "epoch": 1.2219797659194604, - "grad_norm": 0.051513671875, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 385 - }, - { - "epoch": 1.2378496330093236, - "grad_norm": 0.05126953125, - "learning_rate": 0.0001, - "loss": 0.0618, - "step": 390 - }, - { - "epoch": 1.2537195000991868, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 395 - }, - { - "epoch": 1.2695893671890497, - "grad_norm": 0.046142578125, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 400 - }, - { - "epoch": 1.2854592342789128, - "grad_norm": 0.03564453125, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 405 - }, - { - "epoch": 1.301329101368776, - "grad_norm": 0.0341796875, - "learning_rate": 0.0001, - "loss": 0.0185, - "step": 410 - }, - { - "epoch": 1.3171989684586392, - "grad_norm": 0.0286865234375, - "learning_rate": 0.0001, - "loss": 0.0123, - "step": 415 - }, - { - "epoch": 1.3330688355485023, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.2018, - "step": 420 - }, - { - "epoch": 1.3489387026383655, - "grad_norm": 0.060302734375, - "learning_rate": 0.0001, - "loss": 0.1189, - "step": 425 - }, - { - "epoch": 1.3648085697282286, - "grad_norm": 0.046630859375, - "learning_rate": 0.0001, - "loss": 0.0821, - "step": 430 - }, - { - "epoch": 1.3806784368180915, - "grad_norm": 0.0576171875, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 435 - }, - { - "epoch": 1.3965483039079547, - "grad_norm": 0.058349609375, - "learning_rate": 0.0001, - "loss": 0.0567, - "step": 440 - }, - { - "epoch": 1.4124181709978179, - "grad_norm": 0.05908203125, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 445 - }, - { - "epoch": 1.428288038087681, - "grad_norm": 0.054443359375, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 450 - }, - { - "epoch": 1.4441579051775442, - "grad_norm": 0.036376953125, - "learning_rate": 0.0001, - "loss": 0.0283, - "step": 455 - }, - { - "epoch": 1.4600277722674073, - "grad_norm": 0.142578125, - "learning_rate": 0.0001, - "loss": 0.0206, - "step": 460 - }, - { - "epoch": 1.4758976393572705, - "grad_norm": 0.044677734375, - "learning_rate": 0.0001, - "loss": 0.0129, - "step": 465 - }, - { - "epoch": 1.4917675064471334, - "grad_norm": 0.07275390625, - "learning_rate": 0.0001, - "loss": 0.2036, - "step": 470 - }, - { - "epoch": 1.5076373735369968, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.1093, - "step": 475 - } - ], - "logging_steps": 5, - "max_steps": 475, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 90, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.4535297138363187e+18, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -} diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin b/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin deleted file mode 100644 index e91836f3d522bb0f661899abaf03ba42b585e6e7..0000000000000000000000000000000000000000 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c2563f751da0f955348ed5d2c3112b7092683a85415d6f8758379982f01f992 -size 7416 diff --git a/codellama/c/dataflow_c_pretrained/metrics.json b/codellama/c/dataflow_c_pretrained/metrics.json index 582c1d4def10058472bb88fe726a4a3475d4edd7..3bb45134259072f7dc353af834e43d5b6cf31df4 100644 --- a/codellama/c/dataflow_c_pretrained/metrics.json +++ b/codellama/c/dataflow_c_pretrained/metrics.json @@ -1 +1 @@ -{"run_name": "dataflow_c_pretrained", "train_runtime": 39384.0084, "train_samples_per_second": 0.772, "train_steps_per_second": 0.012, "total_flos": 1.4535297138363187e+18, "train_loss": 0.11740684490454824, "epoch": 1.5076373735369968} \ No newline at end of file +{"run_name": "dataflow_c", "train_runtime": 37043.3755, "train_samples_per_second": 0.657, "train_steps_per_second": 0.005, "total_flos": 1.216645538039931e+18, "train_loss": 0.10745409297707834, "epoch": 1.2058706862356208} \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/train_results.json b/codellama/c/dataflow_c_pretrained/train_results.json index 83ca8ccccb78bbd05bfc39fae1f253bc31bdb395..90bfffe14242b2e03c5dd58e37596025330d4548 100644 --- a/codellama/c/dataflow_c_pretrained/train_results.json +++ b/codellama/c/dataflow_c_pretrained/train_results.json @@ -1,8 +1,8 @@ { - "epoch": 1.5076373735369968, - "total_flos": 1.4535297138363187e+18, - "train_loss": 0.11740684490454824, - "train_runtime": 39384.0084, - "train_samples_per_second": 0.772, - "train_steps_per_second": 0.012 + "epoch": 1.2058706862356208, + "total_flos": 1.216645538039931e+18, + "train_loss": 0.10745409297707834, + "train_runtime": 37043.3755, + "train_samples_per_second": 0.657, + "train_steps_per_second": 0.005 } \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/trainer_state.json b/codellama/c/dataflow_c_pretrained/trainer_state.json index d931850d168aa61e9a2922c516b0498bf7347838..a936dfec4596f4802dff746cd493b876c5b5992d 100644 --- a/codellama/c/dataflow_c_pretrained/trainer_state.json +++ b/codellama/c/dataflow_c_pretrained/trainer_state.json @@ -1,690 +1,291 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.5076373735369968, + "epoch": 1.2058706862356208, "eval_steps": 500, - "global_step": 475, + "global_step": 190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.015869867089863123, - "grad_norm": 0.058837890625, + "epoch": 0.0317334391114637, + "grad_norm": 0.060546875, "learning_rate": 0.0001, - "loss": 0.769, + "loss": 0.6421, "step": 5 }, { - "epoch": 0.031739734179726246, + "epoch": 0.0634668782229274, "grad_norm": 0.11572265625, "learning_rate": 0.0001, - "loss": 0.615, + "loss": 0.5213, "step": 10 }, { - "epoch": 0.047609601269589366, - "grad_norm": 0.0634765625, + "epoch": 0.09520031733439112, + "grad_norm": 0.08251953125, "learning_rate": 0.0001, - "loss": 0.3973, + "loss": 0.2925, "step": 15 }, { - "epoch": 0.06347946835945249, - "grad_norm": 0.07470703125, + "epoch": 0.1269337564458548, + "grad_norm": 0.0634765625, "learning_rate": 0.0001, - "loss": 0.2804, + "loss": 0.1978, "step": 20 }, { - "epoch": 0.0793493354493156, - "grad_norm": 0.06884765625, + "epoch": 0.15866719555731854, + "grad_norm": 0.08251953125, "learning_rate": 0.0001, - "loss": 0.2244, + "loss": 0.1538, "step": 25 }, { - "epoch": 0.09521920253917873, - "grad_norm": 0.10498046875, + "epoch": 0.19040063466878224, + "grad_norm": 0.10888671875, "learning_rate": 0.0001, - "loss": 0.1925, + "loss": 0.106, "step": 30 }, { - "epoch": 0.11108906962904186, - "grad_norm": 0.08056640625, + "epoch": 0.22213407378024594, + "grad_norm": 0.049560546875, "learning_rate": 0.0001, - "loss": 0.1477, + "loss": 0.0454, "step": 35 }, { - "epoch": 0.12695893671890499, - "grad_norm": 0.0732421875, + "epoch": 0.2538675128917096, + "grad_norm": 0.310546875, "learning_rate": 0.0001, - "loss": 0.0969, + "loss": 0.1215, "step": 40 }, { - "epoch": 0.1428288038087681, - "grad_norm": 0.07568359375, + "epoch": 0.28560095200317337, + "grad_norm": 0.06494140625, "learning_rate": 0.0001, - "loss": 0.0695, + "loss": 0.2476, "step": 45 }, { - "epoch": 0.1586986708986312, - "grad_norm": 0.125, + "epoch": 0.31733439111463707, + "grad_norm": 0.40234375, "learning_rate": 0.0001, - "loss": 0.046, + "loss": 0.1073, "step": 50 }, { - "epoch": 0.17456853798849434, - "grad_norm": 0.0859375, + "epoch": 0.3490678302261008, + "grad_norm": 0.04052734375, "learning_rate": 0.0001, - "loss": 0.4702, + "loss": 0.0863, "step": 55 }, { - "epoch": 0.19043840507835746, - "grad_norm": 0.06787109375, + "epoch": 0.3808012693375645, + "grad_norm": 0.03369140625, "learning_rate": 0.0001, - "loss": 0.2393, + "loss": 0.0671, "step": 60 }, { - "epoch": 0.2063082721682206, - "grad_norm": 0.045166015625, + "epoch": 0.4125347084490282, + "grad_norm": 0.0274658203125, "learning_rate": 0.0001, - "loss": 0.1604, + "loss": 0.0493, "step": 65 }, { - "epoch": 0.22217813925808372, - "grad_norm": 0.04931640625, + "epoch": 0.4442681475604919, + "grad_norm": 0.0277099609375, "learning_rate": 0.0001, - "loss": 0.1499, + "loss": 0.0311, "step": 70 }, { - "epoch": 0.23804800634794684, - "grad_norm": 0.041748046875, + "epoch": 0.4760015866719556, + "grad_norm": 0.01275634765625, "learning_rate": 0.0001, - "loss": 0.123, + "loss": 0.0125, "step": 75 }, { - "epoch": 0.25391787343780997, - "grad_norm": 0.042236328125, + "epoch": 0.5077350257834192, + "grad_norm": 0.06787109375, "learning_rate": 0.0001, - "loss": 0.1056, + "loss": 0.1307, "step": 80 }, { - "epoch": 0.26978774052767307, - "grad_norm": 0.049560546875, + "epoch": 0.539468464894883, + "grad_norm": 0.050048828125, "learning_rate": 0.0001, - "loss": 0.0801, + "loss": 0.171, "step": 85 }, { - "epoch": 0.2856576076175362, - "grad_norm": 0.043212890625, + "epoch": 0.5712019040063467, + "grad_norm": 0.060791015625, "learning_rate": 0.0001, - "loss": 0.0617, + "loss": 0.0818, "step": 90 }, { - "epoch": 0.3015274747073993, - "grad_norm": 0.037109375, + "epoch": 0.6029353431178104, + "grad_norm": 0.033203125, "learning_rate": 0.0001, - "loss": 0.0423, + "loss": 0.0658, "step": 95 }, { - "epoch": 0.3173973417972624, - "grad_norm": 0.028564453125, + "epoch": 0.6346687822292741, + "grad_norm": 0.0235595703125, "learning_rate": 0.0001, - "loss": 0.0295, + "loss": 0.046, "step": 100 }, { - "epoch": 0.3332672088871256, - "grad_norm": 0.0634765625, + "epoch": 0.6664022213407378, + "grad_norm": 0.0299072265625, "learning_rate": 0.0001, - "loss": 0.3494, + "loss": 0.0384, "step": 105 }, { - "epoch": 0.3491370759769887, - "grad_norm": 0.07958984375, + "epoch": 0.6981356604522015, + "grad_norm": 0.0181884765625, "learning_rate": 0.0001, - "loss": 0.1779, + "loss": 0.0187, "step": 110 }, { - "epoch": 0.36500694306685183, - "grad_norm": 0.040283203125, + "epoch": 0.7298690995636652, + "grad_norm": 0.019775390625, "learning_rate": 0.0001, - "loss": 0.1283, + "loss": 0.0095, "step": 115 }, { - "epoch": 0.38087681015671493, - "grad_norm": 0.038818359375, + "epoch": 0.761602538675129, + "grad_norm": 0.060791015625, "learning_rate": 0.0001, - "loss": 0.111, + "loss": 0.1381, "step": 120 }, { - "epoch": 0.3967466772465781, - "grad_norm": 0.048095703125, + "epoch": 0.7933359777865926, + "grad_norm": 0.038818359375, "learning_rate": 0.0001, - "loss": 0.0945, + "loss": 0.1125, "step": 125 }, { - "epoch": 0.4126165443364412, - "grad_norm": 0.06103515625, + "epoch": 0.8250694168980564, + "grad_norm": 0.032958984375, "learning_rate": 0.0001, - "loss": 0.0833, + "loss": 0.062, "step": 130 }, { - "epoch": 0.4284864114263043, - "grad_norm": 0.05859375, + "epoch": 0.85680285600952, + "grad_norm": 0.03173828125, "learning_rate": 0.0001, - "loss": 0.0702, + "loss": 0.0526, "step": 135 }, { - "epoch": 0.44435627851616744, - "grad_norm": 0.060302734375, + "epoch": 0.8885362951209838, + "grad_norm": 0.02392578125, "learning_rate": 0.0001, - "loss": 0.0509, + "loss": 0.0382, "step": 140 }, { - "epoch": 0.46022614560603053, - "grad_norm": 0.042724609375, + "epoch": 0.9202697342324474, + "grad_norm": 0.027099609375, "learning_rate": 0.0001, - "loss": 0.0363, + "loss": 0.027, "step": 145 }, { - "epoch": 0.4760960126958937, - "grad_norm": 0.048583984375, + "epoch": 0.9520031733439112, + "grad_norm": 0.02294921875, "learning_rate": 0.0001, - "loss": 0.0225, + "loss": 0.0115, "step": 150 }, { - "epoch": 0.4919658797857568, - "grad_norm": 0.056396484375, + "epoch": 0.9837366124553748, + "grad_norm": 0.02099609375, "learning_rate": 0.0001, - "loss": 0.3315, + "loss": 0.005, "step": 155 }, { - "epoch": 0.5078357468756199, - "grad_norm": 0.0478515625, + "epoch": 1.0154700515668384, + "grad_norm": 0.0703125, "learning_rate": 0.0001, - "loss": 0.1585, + "loss": 0.1291, "step": 160 }, { - "epoch": 0.523705613965483, - "grad_norm": 0.07177734375, + "epoch": 1.0472034906783023, + "grad_norm": 0.04052734375, "learning_rate": 0.0001, - "loss": 0.1173, + "loss": 0.1033, "step": 165 }, { - "epoch": 0.5395754810553461, - "grad_norm": 0.050537109375, + "epoch": 1.078936929789766, + "grad_norm": 0.03173828125, "learning_rate": 0.0001, - "loss": 0.1054, + "loss": 0.0539, "step": 170 }, { - "epoch": 0.5554453481452093, - "grad_norm": 0.052734375, + "epoch": 1.1106703689012296, + "grad_norm": 0.0299072265625, "learning_rate": 0.0001, - "loss": 0.0828, + "loss": 0.043, "step": 175 }, { - "epoch": 0.5713152152350724, - "grad_norm": 0.05126953125, + "epoch": 1.1424038080126935, + "grad_norm": 0.0262451171875, "learning_rate": 0.0001, - "loss": 0.0778, + "loss": 0.0303, "step": 180 }, { - "epoch": 0.5871850823249355, - "grad_norm": 0.034423828125, + "epoch": 1.1741372471241571, + "grad_norm": 0.060791015625, "learning_rate": 0.0001, - "loss": 0.0632, + "loss": 0.0239, "step": 185 }, { - "epoch": 0.6030549494147986, - "grad_norm": 0.038330078125, + "epoch": 1.2058706862356208, + "grad_norm": 0.015625, "learning_rate": 0.0001, - "loss": 0.042, + "loss": 0.0095, "step": 190 }, { - "epoch": 0.6189248165046618, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 195 - }, - { - "epoch": 0.6347946835945248, - "grad_norm": 0.08642578125, - "learning_rate": 0.0001, - "loss": 0.0195, - "step": 200 - }, - { - "epoch": 0.650664550684388, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.3038, - "step": 205 - }, - { - "epoch": 0.6665344177742512, - "grad_norm": 0.0556640625, - "learning_rate": 0.0001, - "loss": 0.1574, - "step": 210 - }, - { - "epoch": 0.6824042848641143, - "grad_norm": 0.054443359375, - "learning_rate": 0.0001, - "loss": 0.1049, - "step": 215 - }, - { - "epoch": 0.6982741519539774, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.0955, - "step": 220 - }, - { - "epoch": 0.7141440190438405, - "grad_norm": 0.046630859375, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 225 - }, - { - "epoch": 0.7300138861337037, - "grad_norm": 0.052978515625, - "learning_rate": 0.0001, - "loss": 0.0636, - "step": 230 - }, - { - "epoch": 0.7458837532235667, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.0584, - "step": 235 - }, - { - "epoch": 0.7617536203134299, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 240 - }, - { - "epoch": 0.777623487403293, - "grad_norm": 0.035400390625, - "learning_rate": 0.0001, - "loss": 0.0268, - "step": 245 - }, - { - "epoch": 0.7934933544931562, - "grad_norm": 0.03564453125, - "learning_rate": 0.0001, - "loss": 0.0197, - "step": 250 - }, - { - "epoch": 0.8093632215830192, - "grad_norm": 0.0673828125, - "learning_rate": 0.0001, - "loss": 0.264, - "step": 255 - }, - { - "epoch": 0.8252330886728824, - "grad_norm": 0.050048828125, - "learning_rate": 0.0001, - "loss": 0.1382, - "step": 260 - }, - { - "epoch": 0.8411029557627455, - "grad_norm": 0.053955078125, - "learning_rate": 0.0001, - "loss": 0.0959, - "step": 265 - }, - { - "epoch": 0.8569728228526086, - "grad_norm": 0.055908203125, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 270 - }, - { - "epoch": 0.8728426899424717, - "grad_norm": 0.05322265625, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 275 - }, - { - "epoch": 0.8887125570323349, - "grad_norm": 0.037109375, - "learning_rate": 0.0001, - "loss": 0.0627, - "step": 280 - }, - { - "epoch": 0.904582424122198, - "grad_norm": 0.035888671875, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 285 - }, - { - "epoch": 0.9204522912120611, - "grad_norm": 0.049072265625, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 290 - }, - { - "epoch": 0.9363221583019242, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0259, - "step": 295 - }, - { - "epoch": 0.9521920253917874, - "grad_norm": 0.02490234375, - "learning_rate": 0.0001, - "loss": 0.0168, - "step": 300 - }, - { - "epoch": 0.9680618924816504, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.1856, - "step": 305 - }, - { - "epoch": 0.9839317595715136, - "grad_norm": 0.09814453125, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 310 - }, - { - "epoch": 0.9998016266613767, - "grad_norm": 0.0380859375, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 315 - }, - { - "epoch": 1.0156714937512399, - "grad_norm": 0.07373046875, - "learning_rate": 0.0001, - "loss": 0.2891, - "step": 320 - }, - { - "epoch": 1.031541360841103, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.1519, - "step": 325 - }, - { - "epoch": 1.047411227930966, - "grad_norm": 0.048095703125, - "learning_rate": 0.0001, - "loss": 0.094, - "step": 330 - }, - { - "epoch": 1.0632810950208291, - "grad_norm": 0.051513671875, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 335 - }, - { - "epoch": 1.0791509621106923, - "grad_norm": 0.0517578125, - "learning_rate": 0.0001, - "loss": 0.0695, - "step": 340 - }, - { - "epoch": 1.0950208292005554, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.0586, - "step": 345 - }, - { - "epoch": 1.1108906962904186, - "grad_norm": 0.06201171875, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 350 - }, - { - "epoch": 1.1267605633802817, - "grad_norm": 0.0272216796875, - "learning_rate": 0.0001, - "loss": 0.0278, - "step": 355 - }, - { - "epoch": 1.142630430470145, - "grad_norm": 0.05419921875, - "learning_rate": 0.0001, - "loss": 0.0219, - "step": 360 - }, - { - "epoch": 1.1585002975600078, - "grad_norm": 0.07177734375, - "learning_rate": 0.0001, - "loss": 0.015, - "step": 365 - }, - { - "epoch": 1.174370164649871, - "grad_norm": 0.09521484375, - "learning_rate": 0.0001, - "loss": 0.2371, - "step": 370 - }, - { - "epoch": 1.1902400317397341, - "grad_norm": 0.060791015625, - "learning_rate": 0.0001, - "loss": 0.118, - "step": 375 - }, - { - "epoch": 1.2061098988295973, - "grad_norm": 0.059814453125, - "learning_rate": 0.0001, - "loss": 0.0904, - "step": 380 - }, - { - "epoch": 1.2219797659194604, - "grad_norm": 0.051513671875, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 385 - }, - { - "epoch": 1.2378496330093236, - "grad_norm": 0.05126953125, - "learning_rate": 0.0001, - "loss": 0.0618, - "step": 390 - }, - { - "epoch": 1.2537195000991868, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 395 - }, - { - "epoch": 1.2695893671890497, - "grad_norm": 0.046142578125, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 400 - }, - { - "epoch": 1.2854592342789128, - "grad_norm": 0.03564453125, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 405 - }, - { - "epoch": 1.301329101368776, - "grad_norm": 0.0341796875, - "learning_rate": 0.0001, - "loss": 0.0185, - "step": 410 - }, - { - "epoch": 1.3171989684586392, - "grad_norm": 0.0286865234375, - "learning_rate": 0.0001, - "loss": 0.0123, - "step": 415 - }, - { - "epoch": 1.3330688355485023, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.2018, - "step": 420 - }, - { - "epoch": 1.3489387026383655, - "grad_norm": 0.060302734375, - "learning_rate": 0.0001, - "loss": 0.1189, - "step": 425 - }, - { - "epoch": 1.3648085697282286, - "grad_norm": 0.046630859375, - "learning_rate": 0.0001, - "loss": 0.0821, - "step": 430 - }, - { - "epoch": 1.3806784368180915, - "grad_norm": 0.0576171875, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 435 - }, - { - "epoch": 1.3965483039079547, - "grad_norm": 0.058349609375, - "learning_rate": 0.0001, - "loss": 0.0567, - "step": 440 - }, - { - "epoch": 1.4124181709978179, - "grad_norm": 0.05908203125, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 445 - }, - { - "epoch": 1.428288038087681, - "grad_norm": 0.054443359375, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 450 - }, - { - "epoch": 1.4441579051775442, - "grad_norm": 0.036376953125, - "learning_rate": 0.0001, - "loss": 0.0283, - "step": 455 - }, - { - "epoch": 1.4600277722674073, - "grad_norm": 0.142578125, - "learning_rate": 0.0001, - "loss": 0.0206, - "step": 460 - }, - { - "epoch": 1.4758976393572705, - "grad_norm": 0.044677734375, - "learning_rate": 0.0001, - "loss": 0.0129, - "step": 465 - }, - { - "epoch": 1.4917675064471334, - "grad_norm": 0.07275390625, - "learning_rate": 0.0001, - "loss": 0.2036, - "step": 470 - }, - { - "epoch": 1.5076373735369968, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.1093, - "step": 475 - }, - { - "epoch": 1.5076373735369968, - "step": 475, - "total_flos": 1.4535297138363187e+18, - "train_loss": 0.11740684490454824, - "train_runtime": 39384.0084, - "train_samples_per_second": 0.772, - "train_steps_per_second": 0.012 + "epoch": 1.2058706862356208, + "step": 190, + "total_flos": 1.216645538039931e+18, + "train_loss": 0.10745409297707834, + "train_runtime": 37043.3755, + "train_samples_per_second": 0.657, + "train_steps_per_second": 0.005 } ], "logging_steps": 5, - "max_steps": 475, + "max_steps": 190, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 90, @@ -700,8 +301,8 @@ "attributes": {} } }, - "total_flos": 1.4535297138363187e+18, - "train_batch_size": 4, + "total_flos": 1.216645538039931e+18, + "train_batch_size": 8, "trial_name": null, "trial_params": null } diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/all_results.json b/codellama/c/dmcodegen/dmcodegen_base_c/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..81a7b3a98bd4b836d2774648980f05fe250fbe62 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 2.0779220779220777, + "total_flos": 1.8562430640540058e+18, + "train_loss": 0.45838437411520216, + "train_runtime": 56927.0701, + "train_samples_per_second": 0.405, + "train_steps_per_second": 0.003 +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/README.md b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4a9687079697f7e02d95a96f8f3d174b50c1db --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "down_proj", + "k_proj", + "gate_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model.safetensors b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..318c25b2c70e071dee17af6d80cdc96260e5dda5 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4018337dfdcbc4ca01ac822efa32f00bdad5c13cd3bd3d5c8308564380b060be +size 1156480200 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/README.md b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4a9687079697f7e02d95a96f8f3d174b50c1db --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "down_proj", + "k_proj", + "gate_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_model.safetensors b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..318c25b2c70e071dee17af6d80cdc96260e5dda5 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4018337dfdcbc4ca01ac822efa32f00bdad5c13cd3bd3d5c8308564380b060be +size 1156480200 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/added_tokens.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/added_tokens.json similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/added_tokens.json rename to codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/added_tokens.json diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/optimizer.pt b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..edf0691b549b5f91fc25c87d89de1305e8f88a4e --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c311f9837fead460249a52390564c15ad9137ede61358886a6cf8ab7c563896 +size 2003126962 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/rng_state.pth b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bec7e97107a17ef03e35e48facd6d1094bab3841 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d9a2f5e5c671e23fbed743832c2d77a42f5fdb0981bfd74289171a6b58bdb8 +size 14244 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/scheduler.pt b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd32f24b55247712dc306a7f48b1e67f9136b26b --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244453cd6aad26ed6e8f9d969778193b9354089d8336fe58bfb91c089a53bf6f +size 1064 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/special_tokens_map.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/special_tokens_map.json similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/special_tokens_map.json rename to codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/special_tokens_map.json diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer.model b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/tokenizer.model similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/tokenizer.model rename to codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/tokenizer.model diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/tokenizer_config.json similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/tokenizer_config.json rename to codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/tokenizer_config.json diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/trainer_state.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..15f084256693c84c2283b383b7869144d5d2f569 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/trainer_state.json @@ -0,0 +1,285 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0779220779220777, + "eval_steps": 500, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05772005772005772, + "grad_norm": 0.018310546875, + "learning_rate": 0.0001, + "loss": 0.5558, + "step": 5 + }, + { + "epoch": 0.11544011544011544, + "grad_norm": 0.01544189453125, + "learning_rate": 0.0001, + "loss": 0.4953, + "step": 10 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 0.01611328125, + "learning_rate": 0.0001, + "loss": 0.4465, + "step": 15 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001, + "loss": 0.419, + "step": 20 + }, + { + "epoch": 0.2886002886002886, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.5274, + "step": 25 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 0.020751953125, + "learning_rate": 0.0001, + "loss": 0.5132, + "step": 30 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0001, + "loss": 0.4466, + "step": 35 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001, + "loss": 0.4116, + "step": 40 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 0.026611328125, + "learning_rate": 0.0001, + "loss": 0.4913, + "step": 45 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0001, + "loss": 0.4982, + "step": 50 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.021484375, + "learning_rate": 0.0001, + "loss": 0.4586, + "step": 55 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0001, + "loss": 0.4132, + "step": 60 + }, + { + "epoch": 0.7503607503607503, + "grad_norm": 0.03125, + "learning_rate": 0.0001, + "loss": 0.445, + "step": 65 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0001, + "loss": 0.5186, + "step": 70 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001, + "loss": 0.463, + "step": 75 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 0.02001953125, + "learning_rate": 0.0001, + "loss": 0.4233, + "step": 80 + }, + { + "epoch": 0.9812409812409812, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.4396, + "step": 85 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001, + "loss": 0.4944, + "step": 90 + }, + { + "epoch": 1.0966810966810967, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001, + "loss": 0.4896, + "step": 95 + }, + { + "epoch": 1.1544011544011543, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001, + "loss": 0.4402, + "step": 100 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001, + "loss": 0.3963, + "step": 105 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.037353515625, + "learning_rate": 0.0001, + "loss": 0.4535, + "step": 110 + }, + { + "epoch": 1.3275613275613276, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001, + "loss": 0.5045, + "step": 115 + }, + { + "epoch": 1.3852813852813852, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.4466, + "step": 120 + }, + { + "epoch": 1.443001443001443, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.4095, + "step": 125 + }, + { + "epoch": 1.5007215007215007, + "grad_norm": 0.046630859375, + "learning_rate": 0.0001, + "loss": 0.4346, + "step": 130 + }, + { + "epoch": 1.5584415584415585, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.5046, + "step": 135 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.032958984375, + "learning_rate": 0.0001, + "loss": 0.4556, + "step": 140 + }, + { + "epoch": 1.6738816738816737, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001, + "loss": 0.4245, + "step": 145 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.036865234375, + "learning_rate": 0.0001, + "loss": 0.3834, + "step": 150 + }, + { + "epoch": 1.7893217893217894, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001, + "loss": 0.5163, + "step": 155 + }, + { + "epoch": 1.847041847041847, + "grad_norm": 0.033935546875, + "learning_rate": 0.0001, + "loss": 0.4565, + "step": 160 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.02880859375, + "learning_rate": 0.0001, + "loss": 0.4164, + "step": 165 + }, + { + "epoch": 1.9624819624819625, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001, + "loss": 0.3956, + "step": 170 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.041748046875, + "learning_rate": 0.0001, + "loss": 0.4453, + "step": 175 + }, + { + "epoch": 2.0779220779220777, + "grad_norm": 0.03857421875, + "learning_rate": 0.0001, + "loss": 0.4681, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 180, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8562430640540058e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/training_args.bin b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0133396b57a21e0862c112adcc5a95fdd2e21a0 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f792b2641cd9a5b1462d877d12107fd74f2ec203190ed5f66b658484d1e7b3 +size 7416 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/README.md b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..990f4ef3aba21ef2ebd907f1502b4848d9e6b62e --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "gate_proj", + "q_proj", + "o_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model.safetensors b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3dac47beb474e15b17251aff08416c0421ae1896 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83cea6a8ba3abb411145bffb7c0c1f6cba9fd43721b41ecc8ed32b2c3e179fe +size 1156480200 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/README.md b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..990f4ef3aba21ef2ebd907f1502b4848d9e6b62e --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "gate_proj", + "q_proj", + "o_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_model.safetensors b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3dac47beb474e15b17251aff08416c0421ae1896 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83cea6a8ba3abb411145bffb7c0c1f6cba9fd43721b41ecc8ed32b2c3e179fe +size 1156480200 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/added_tokens.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/optimizer.pt b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c739c4fd846a2353fb97a0f976077cf2de1b534 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119891f1916a60aabb0c81ceee15f0c8178c275d5b6ef81c9059743549e9653d +size 2003126962 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/rng_state.pth b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a5927b8d9d54d38592ba1e6b5779c0e34307afa --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04d21a080b5e438b4b32adc506500b95e99b49e74f9d44a991ff92733e72054 +size 14244 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/scheduler.pt b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5088c269cc64d90dc46ecb1a7fb7927ad6415d8 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c7d5936e70e72bf0e3651da983818a5b36c8198eb19437975051ad543d68cc9 +size 1064 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/special_tokens_map.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁
", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer.model b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer_config.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/trainer_state.json b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3af4476363e7267883b22db47e17c252b711f60 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/trainer_state.json @@ -0,0 +1,159 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0389610389610389, + "eval_steps": 500, + "global_step": 90, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05772005772005772, + "grad_norm": 0.036376953125, + "learning_rate": 0.0001, + "loss": 0.7562, + "step": 5 + }, + { + "epoch": 0.11544011544011544, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0001, + "loss": 0.5993, + "step": 10 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.5209, + "step": 15 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 0.0673828125, + "learning_rate": 0.0001, + "loss": 0.5029, + "step": 20 + }, + { + "epoch": 0.2886002886002886, + "grad_norm": 0.04150390625, + "learning_rate": 0.0001, + "loss": 0.622, + "step": 25 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001, + "loss": 0.5561, + "step": 30 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0001, + "loss": 0.4707, + "step": 35 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001, + "loss": 0.4331, + "step": 40 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0001, + "loss": 0.534, + "step": 45 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001, + "loss": 0.5314, + "step": 50 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0001, + "loss": 0.4802, + "step": 55 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.4332, + "step": 60 + }, + { + "epoch": 0.7503607503607503, + "grad_norm": 0.021728515625, + "learning_rate": 0.0001, + "loss": 0.4812, + "step": 65 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.0155029296875, + "learning_rate": 0.0001, + "loss": 0.5443, + "step": 70 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.0159912109375, + "learning_rate": 0.0001, + "loss": 0.4805, + "step": 75 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 0.0146484375, + "learning_rate": 0.0001, + "loss": 0.4377, + "step": 80 + }, + { + "epoch": 0.9812409812409812, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001, + "loss": 0.4548, + "step": 85 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 0.018310546875, + "learning_rate": 0.0001, + "loss": 0.5309, + "step": 90 + } + ], + "logging_steps": 5, + "max_steps": 270, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.286644455131546e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/training_args.bin b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a43da9a5221efcb269cce9e179e555fc6074535a --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/checkpoint-90/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b5defc64c4beb14d7155cd334736bdf2476d6f1da0691ee9be164112ac9f9c +size 7416 diff --git a/codellama/java/dataflow_pretrained/completed b/codellama/c/dmcodegen/dmcodegen_base_c/completed similarity index 100% rename from codellama/java/dataflow_pretrained/completed rename to codellama/c/dmcodegen/dmcodegen_base_c/completed diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/metrics.json b/codellama/c/dmcodegen/dmcodegen_base_c/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..8efa7ec3b3d25cd4cf6f1a7de74d1d02ebbbbf46 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/metrics.json @@ -0,0 +1 @@ +{"run_name": "dmcodegen_base_c", "train_runtime": 56927.0701, "train_samples_per_second": 0.405, "train_steps_per_second": 0.003, "total_flos": 1.8562430640540058e+18, "train_loss": 0.45838437411520216, "epoch": 2.0779220779220777} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/train_results.json b/codellama/c/dmcodegen/dmcodegen_base_c/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..81a7b3a98bd4b836d2774648980f05fe250fbe62 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 2.0779220779220777, + "total_flos": 1.8562430640540058e+18, + "train_loss": 0.45838437411520216, + "train_runtime": 56927.0701, + "train_samples_per_second": 0.405, + "train_steps_per_second": 0.003 +} \ No newline at end of file diff --git a/codellama/c/dmcodegen/dmcodegen_base_c/trainer_state.json b/codellama/c/dmcodegen/dmcodegen_base_c/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4eeb4d78b6455dc4305baed78d653acd89de8851 --- /dev/null +++ b/codellama/c/dmcodegen/dmcodegen_base_c/trainer_state.json @@ -0,0 +1,294 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0779220779220777, + "eval_steps": 500, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05772005772005772, + "grad_norm": 0.018310546875, + "learning_rate": 0.0001, + "loss": 0.5558, + "step": 5 + }, + { + "epoch": 0.11544011544011544, + "grad_norm": 0.01544189453125, + "learning_rate": 0.0001, + "loss": 0.4953, + "step": 10 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 0.01611328125, + "learning_rate": 0.0001, + "loss": 0.4465, + "step": 15 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001, + "loss": 0.419, + "step": 20 + }, + { + "epoch": 0.2886002886002886, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.5274, + "step": 25 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 0.020751953125, + "learning_rate": 0.0001, + "loss": 0.5132, + "step": 30 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0001, + "loss": 0.4466, + "step": 35 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001, + "loss": 0.4116, + "step": 40 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 0.026611328125, + "learning_rate": 0.0001, + "loss": 0.4913, + "step": 45 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0001, + "loss": 0.4982, + "step": 50 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.021484375, + "learning_rate": 0.0001, + "loss": 0.4586, + "step": 55 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0001, + "loss": 0.4132, + "step": 60 + }, + { + "epoch": 0.7503607503607503, + "grad_norm": 0.03125, + "learning_rate": 0.0001, + "loss": 0.445, + "step": 65 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0001, + "loss": 0.5186, + "step": 70 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001, + "loss": 0.463, + "step": 75 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 0.02001953125, + "learning_rate": 0.0001, + "loss": 0.4233, + "step": 80 + }, + { + "epoch": 0.9812409812409812, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.4396, + "step": 85 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001, + "loss": 0.4944, + "step": 90 + }, + { + "epoch": 1.0966810966810967, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001, + "loss": 0.4896, + "step": 95 + }, + { + "epoch": 1.1544011544011543, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001, + "loss": 0.4402, + "step": 100 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001, + "loss": 0.3963, + "step": 105 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.037353515625, + "learning_rate": 0.0001, + "loss": 0.4535, + "step": 110 + }, + { + "epoch": 1.3275613275613276, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001, + "loss": 0.5045, + "step": 115 + }, + { + "epoch": 1.3852813852813852, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.4466, + "step": 120 + }, + { + "epoch": 1.443001443001443, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.4095, + "step": 125 + }, + { + "epoch": 1.5007215007215007, + "grad_norm": 0.046630859375, + "learning_rate": 0.0001, + "loss": 0.4346, + "step": 130 + }, + { + "epoch": 1.5584415584415585, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.5046, + "step": 135 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.032958984375, + "learning_rate": 0.0001, + "loss": 0.4556, + "step": 140 + }, + { + "epoch": 1.6738816738816737, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001, + "loss": 0.4245, + "step": 145 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.036865234375, + "learning_rate": 0.0001, + "loss": 0.3834, + "step": 150 + }, + { + "epoch": 1.7893217893217894, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001, + "loss": 0.5163, + "step": 155 + }, + { + "epoch": 1.847041847041847, + "grad_norm": 0.033935546875, + "learning_rate": 0.0001, + "loss": 0.4565, + "step": 160 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.02880859375, + "learning_rate": 0.0001, + "loss": 0.4164, + "step": 165 + }, + { + "epoch": 1.9624819624819625, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001, + "loss": 0.3956, + "step": 170 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.041748046875, + "learning_rate": 0.0001, + "loss": 0.4453, + "step": 175 + }, + { + "epoch": 2.0779220779220777, + "grad_norm": 0.03857421875, + "learning_rate": 0.0001, + "loss": 0.4681, + "step": 180 + }, + { + "epoch": 2.0779220779220777, + "step": 180, + "total_flos": 1.8562430640540058e+18, + "train_loss": 0.45838437411520216, + "train_runtime": 56927.0701, + "train_samples_per_second": 0.405, + "train_steps_per_second": 0.003 + } + ], + "logging_steps": 5, + "max_steps": 180, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8562430640540058e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/all_results.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3b669a03e623cd186ec8a3b2b55237d72b2f846d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5357006496853298, + "train_runtime": 11174.9028, + "train_samples_per_second": 0.515, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/README.md b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2ef8f1a20ad1e9d558732a73b15da0bbccfedade --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "k_proj", + "o_proj", + "down_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93d9f00eeb42511b6e4533caa3ad5245535db68c --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0445e318d4db848f2144e65f6c1ede906a680d3b66c4aeb106575502d872b2 +size 1156480200 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2ef8f1a20ad1e9d558732a73b15da0bbccfedade --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "k_proj", + "o_proj", + "down_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93d9f00eeb42511b6e4533caa3ad5245535db68c --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0445e318d4db848f2144e65f6c1ede906a680d3b66c4aeb106575502d872b2 +size 1156480200 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..96ffae124841975958bfda924914974a7da8f03d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1afba3b9a460b6b2a84fcaf925ae469bb288ebfdfd85c58fa9aec5691012d90d +size 2003126962 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9570b02680b4230f9ba15ce25da1b40ddba969a --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585f15ae1d5104d9384b07ae641e0e10926f991dea913b9243bcce14a7965a42 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..638774351b15ff484ffc0ca6c82bd744318f1cb0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e116f8d4565c3c175d4858b7ff08054c3bd13ca8f526b793ad3d5a6f5f8f4fb +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88cd127795af80f7c76a9dee2bc9549425b740d5 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001, + "loss": 0.6016, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0001, + "loss": 0.5999, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.5587, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0001, + "loss": 0.5489, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.526, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.01611328125, + "learning_rate": 0.0001, + "loss": 0.5198, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.041015625, + "learning_rate": 0.0001, + "loss": 0.5079, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.02197265625, + "learning_rate": 0.0001, + "loss": 0.5052, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001, + "loss": 0.4534, + "step": 45 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f0dfcb95b4589ffcb36249b9c81bcd35370adc0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/checkpoint-45/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e016a4c7738d1feeb74102ad9823d33dbace626bb26bab05d8621a3b7111bb +size 7480 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/completed b/codellama/java/codetrans/codetransocean/codetransocean_base_java/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/metrics.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..135d7d40a62ebb4cf0456db9a436d2726d7b20a7 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/metrics.json @@ -0,0 +1 @@ +{"run_name": "codetransocean_base_java", "train_runtime": 11174.9028, "train_samples_per_second": 0.515, "train_steps_per_second": 0.004, "total_flos": 3.685541393109811e+17, "train_loss": 0.5357006496853298, "epoch": 3.5294117647058822} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/train_results.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3b669a03e623cd186ec8a3b2b55237d72b2f846d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5357006496853298, + "train_runtime": 11174.9028, + "train_samples_per_second": 0.515, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_base_java/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_base_java/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..df37e4e267c5f759741a2db5f5d64246aceafca5 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_base_java/trainer_state.json @@ -0,0 +1,105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001, + "loss": 0.6016, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0001, + "loss": 0.5999, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.5587, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0001, + "loss": 0.5489, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.526, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.01611328125, + "learning_rate": 0.0001, + "loss": 0.5198, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.041015625, + "learning_rate": 0.0001, + "loss": 0.5079, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.02197265625, + "learning_rate": 0.0001, + "loss": 0.5052, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001, + "loss": 0.4534, + "step": 45 + }, + { + "epoch": 3.5294117647058822, + "step": 45, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5357006496853298, + "train_runtime": 11174.9028, + "train_samples_per_second": 0.515, + "train_steps_per_second": 0.004 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/all_results.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e749165f7e71cc7d33038aa6b587c45168c1779d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.6542216989729139, + "train_runtime": 11159.7771, + "train_samples_per_second": 0.516, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/README.md b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/README.md similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/README.md rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/README.md diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_config.json similarity index 96% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_config.json index 3bd1bc3d6771bd312ef762b5d7de15f2bf59347b..dccd6b7bf948fe35625c537a5a6a41da3b51f7db 100644 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_config.json @@ -20,13 +20,13 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "v_proj", - "q_proj", - "down_proj", + "k_proj", "o_proj", + "down_proj", + "up_proj", "gate_proj", - "k_proj", - "up_proj" + "q_proj", + "v_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6e6200979e0713757ce3dbcaad47629e62d519b --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56f74b53d9144dd1c04db7b8594194255abb8b796fd695dd3221459f495a5b7 +size 1156480200 diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/README.md similarity index 100% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/README.md rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/README.md diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_config.json similarity index 96% rename from codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_config.json index 3bd1bc3d6771bd312ef762b5d7de15f2bf59347b..dccd6b7bf948fe35625c537a5a6a41da3b51f7db 100644 --- a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_config.json @@ -20,13 +20,13 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "v_proj", - "q_proj", - "down_proj", + "k_proj", "o_proj", + "down_proj", + "up_proj", "gate_proj", - "k_proj", - "up_proj" + "q_proj", + "v_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6e6200979e0713757ce3dbcaad47629e62d519b --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56f74b53d9144dd1c04db7b8594194255abb8b796fd695dd3221459f495a5b7 +size 1156480200 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fea6a29dec41c68a4866de5d130623fa4a1908b --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a222c5e451f1714f72c23228bc1066a13381216a60f9f1e5a70f71d7ae83e9e6 +size 2003127538 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..77dcde2809ddf3bbb0c930b2fd800a8030a028a0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fb345cb7f2521e856ece7ee321c0175bfb55c504e153b5f63e8413250253ff6 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..56a89f3d673b833f4a836161e416e00def3a0060 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25308db591b13d343977ce2367d0ae1afa54461f8777e7abfaef81ec2f99db6d +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1362e1a6df3096b377cc79611c3ea3ea6da76bfd --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/trainer_state.json @@ -0,0 +1,1503 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5074024226110363, + "eval_steps": 500, + "global_step": 1050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007178106774338268, + "grad_norm": 0.65234375, + "learning_rate": 0.0001, + "loss": 6.8185, + "step": 5 + }, + { + "epoch": 0.014356213548676536, + "grad_norm": 0.69921875, + "learning_rate": 0.0001, + "loss": 5.3587, + "step": 10 + }, + { + "epoch": 0.021534320323014805, + "grad_norm": 0.984375, + "learning_rate": 0.0001, + "loss": 3.9044, + "step": 15 + }, + { + "epoch": 0.028712427097353072, + "grad_norm": 0.84765625, + "learning_rate": 0.0001, + "loss": 2.4036, + "step": 20 + }, + { + "epoch": 0.03589053387169134, + "grad_norm": 0.63671875, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 25 + }, + { + "epoch": 0.04306864064602961, + "grad_norm": 0.44921875, + "learning_rate": 0.0001, + "loss": 0.8859, + "step": 30 + }, + { + "epoch": 0.05024674742036788, + "grad_norm": 0.259765625, + "learning_rate": 0.0001, + "loss": 0.3927, + "step": 35 + }, + { + "epoch": 0.057424854194706144, + "grad_norm": 0.11669921875, + "learning_rate": 0.0001, + "loss": 0.1452, + "step": 40 + }, + { + "epoch": 0.06460296096904442, + "grad_norm": 0.10400390625, + "learning_rate": 0.0001, + "loss": 0.0693, + "step": 45 + }, + { + "epoch": 0.07178106774338268, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001, + "loss": 0.0279, + "step": 50 + }, + { + "epoch": 0.07895917451772096, + "grad_norm": 0.46484375, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 55 + }, + { + "epoch": 0.08613728129205922, + "grad_norm": 0.201171875, + "learning_rate": 0.0001, + "loss": 0.9721, + "step": 60 + }, + { + "epoch": 0.09331538806639748, + "grad_norm": 0.1953125, + "learning_rate": 0.0001, + "loss": 0.8273, + "step": 65 + }, + { + "epoch": 0.10049349484073576, + "grad_norm": 0.1259765625, + "learning_rate": 0.0001, + "loss": 0.6694, + "step": 70 + }, + { + "epoch": 0.10767160161507403, + "grad_norm": 0.1171875, + "learning_rate": 0.0001, + "loss": 0.5689, + "step": 75 + }, + { + "epoch": 0.11484970838941229, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001, + "loss": 0.35, + "step": 80 + }, + { + "epoch": 0.12202781516375057, + "grad_norm": 0.06640625, + "learning_rate": 0.0001, + "loss": 0.1548, + "step": 85 + }, + { + "epoch": 0.12920592193808883, + "grad_norm": 0.0791015625, + "learning_rate": 0.0001, + "loss": 0.0625, + "step": 90 + }, + { + "epoch": 0.1363840287124271, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 95 + }, + { + "epoch": 0.14356213548676536, + "grad_norm": 0.0654296875, + "learning_rate": 0.0001, + "loss": 0.0194, + "step": 100 + }, + { + "epoch": 0.15074024226110364, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001, + "loss": 1.1732, + "step": 105 + }, + { + "epoch": 0.1579183490354419, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001, + "loss": 0.87, + "step": 110 + }, + { + "epoch": 0.16509645580978016, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001, + "loss": 0.7213, + "step": 115 + }, + { + "epoch": 0.17227456258411844, + "grad_norm": 0.158203125, + "learning_rate": 0.0001, + "loss": 0.5522, + "step": 120 + }, + { + "epoch": 0.17945266935845672, + "grad_norm": 0.1015625, + "learning_rate": 0.0001, + "loss": 0.4513, + "step": 125 + }, + { + "epoch": 0.18663077613279497, + "grad_norm": 0.1064453125, + "learning_rate": 0.0001, + "loss": 0.2306, + "step": 130 + }, + { + "epoch": 0.19380888290713325, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001, + "loss": 0.0997, + "step": 135 + }, + { + "epoch": 0.20098698968147152, + "grad_norm": 0.060546875, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 140 + }, + { + "epoch": 0.20816509645580977, + "grad_norm": 0.037109375, + "learning_rate": 0.0001, + "loss": 0.0274, + "step": 145 + }, + { + "epoch": 0.21534320323014805, + "grad_norm": 0.0234375, + "learning_rate": 0.0001, + "loss": 0.0054, + "step": 150 + }, + { + "epoch": 0.22252131000448633, + "grad_norm": 0.337890625, + "learning_rate": 0.0001, + "loss": 1.0624, + "step": 155 + }, + { + "epoch": 0.22969941677882458, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001, + "loss": 0.829, + "step": 160 + }, + { + "epoch": 0.23687752355316286, + "grad_norm": 0.15234375, + "learning_rate": 0.0001, + "loss": 0.6497, + "step": 165 + }, + { + "epoch": 0.24405563032750113, + "grad_norm": 0.1142578125, + "learning_rate": 0.0001, + "loss": 0.5721, + "step": 170 + }, + { + "epoch": 0.2512337371018394, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001, + "loss": 0.4299, + "step": 175 + }, + { + "epoch": 0.25841184387617766, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001, + "loss": 0.2842, + "step": 180 + }, + { + "epoch": 0.26558995065051594, + "grad_norm": 0.049560546875, + "learning_rate": 0.0001, + "loss": 0.1096, + "step": 185 + }, + { + "epoch": 0.2727680574248542, + "grad_norm": 0.072265625, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 190 + }, + { + "epoch": 0.27994616419919244, + "grad_norm": 0.0634765625, + "learning_rate": 0.0001, + "loss": 0.0188, + "step": 195 + }, + { + "epoch": 0.2871242709735307, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0001, + "loss": 0.0077, + "step": 200 + }, + { + "epoch": 0.294302377747869, + "grad_norm": 0.2109375, + "learning_rate": 0.0001, + "loss": 1.0719, + "step": 205 + }, + { + "epoch": 0.30148048452220727, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001, + "loss": 0.79, + "step": 210 + }, + { + "epoch": 0.30865859129654555, + "grad_norm": 0.1328125, + "learning_rate": 0.0001, + "loss": 0.6307, + "step": 215 + }, + { + "epoch": 0.3158366980708838, + "grad_norm": 0.126953125, + "learning_rate": 0.0001, + "loss": 0.5041, + "step": 220 + }, + { + "epoch": 0.32301480484522205, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001, + "loss": 0.4389, + "step": 225 + }, + { + "epoch": 0.3301929116195603, + "grad_norm": 0.1181640625, + "learning_rate": 0.0001, + "loss": 0.2337, + "step": 230 + }, + { + "epoch": 0.3373710183938986, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001, + "loss": 0.1152, + "step": 235 + }, + { + "epoch": 0.3445491251682369, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001, + "loss": 0.0224, + "step": 240 + }, + { + "epoch": 0.35172723194257516, + "grad_norm": 0.0703125, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 245 + }, + { + "epoch": 0.35890533871691344, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001, + "loss": 0.0073, + "step": 250 + }, + { + "epoch": 0.36608344549125166, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001, + "loss": 1.0824, + "step": 255 + }, + { + "epoch": 0.37326155226558994, + "grad_norm": 0.12890625, + "learning_rate": 0.0001, + "loss": 0.8525, + "step": 260 + }, + { + "epoch": 0.3804396590399282, + "grad_norm": 0.1484375, + "learning_rate": 0.0001, + "loss": 0.6736, + "step": 265 + }, + { + "epoch": 0.3876177658142665, + "grad_norm": 0.16015625, + "learning_rate": 0.0001, + "loss": 0.5694, + "step": 270 + }, + { + "epoch": 0.39479587258860477, + "grad_norm": 0.146484375, + "learning_rate": 0.0001, + "loss": 0.4329, + "step": 275 + }, + { + "epoch": 0.40197397936294305, + "grad_norm": 0.095703125, + "learning_rate": 0.0001, + "loss": 0.2051, + "step": 280 + }, + { + "epoch": 0.40915208613728127, + "grad_norm": 0.130859375, + "learning_rate": 0.0001, + "loss": 0.1067, + "step": 285 + }, + { + "epoch": 0.41633019291161955, + "grad_norm": 0.10302734375, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 290 + }, + { + "epoch": 0.4235082996859578, + "grad_norm": 0.05126953125, + "learning_rate": 0.0001, + "loss": 0.0252, + "step": 295 + }, + { + "epoch": 0.4306864064602961, + "grad_norm": 0.0029449462890625, + "learning_rate": 0.0001, + "loss": 0.0046, + "step": 300 + }, + { + "epoch": 0.4378645132346344, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001, + "loss": 1.0461, + "step": 305 + }, + { + "epoch": 0.44504262000897266, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001, + "loss": 0.7834, + "step": 310 + }, + { + "epoch": 0.4522207267833109, + "grad_norm": 0.11669921875, + "learning_rate": 0.0001, + "loss": 0.6162, + "step": 315 + }, + { + "epoch": 0.45939883355764916, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001, + "loss": 0.4886, + "step": 320 + }, + { + "epoch": 0.46657694033198743, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001, + "loss": 0.3858, + "step": 325 + }, + { + "epoch": 0.4737550471063257, + "grad_norm": 0.09521484375, + "learning_rate": 0.0001, + "loss": 0.2249, + "step": 330 + }, + { + "epoch": 0.480933153880664, + "grad_norm": 0.061279296875, + "learning_rate": 0.0001, + "loss": 0.0778, + "step": 335 + }, + { + "epoch": 0.48811126065500227, + "grad_norm": 0.04931640625, + "learning_rate": 0.0001, + "loss": 0.0258, + "step": 340 + }, + { + "epoch": 0.4952893674293405, + "grad_norm": 0.0283203125, + "learning_rate": 0.0001, + "loss": 0.0245, + "step": 345 + }, + { + "epoch": 0.5024674742036788, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001, + "loss": 0.0108, + "step": 350 + }, + { + "epoch": 0.509645580978017, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001, + "loss": 1.1229, + "step": 355 + }, + { + "epoch": 0.5168236877523553, + "grad_norm": 0.130859375, + "learning_rate": 0.0001, + "loss": 0.7767, + "step": 360 + }, + { + "epoch": 0.5240017945266936, + "grad_norm": 0.1162109375, + "learning_rate": 0.0001, + "loss": 0.6151, + "step": 365 + }, + { + "epoch": 0.5311799013010319, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001, + "loss": 0.4997, + "step": 370 + }, + { + "epoch": 0.5383580080753702, + "grad_norm": 0.1181640625, + "learning_rate": 0.0001, + "loss": 0.3645, + "step": 375 + }, + { + "epoch": 0.5455361148497084, + "grad_norm": 0.09228515625, + "learning_rate": 0.0001, + "loss": 0.2487, + "step": 380 + }, + { + "epoch": 0.5527142216240467, + "grad_norm": 0.043212890625, + "learning_rate": 0.0001, + "loss": 0.1116, + "step": 385 + }, + { + "epoch": 0.5598923283983849, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.0278, + "step": 390 + }, + { + "epoch": 0.5670704351727232, + "grad_norm": 0.048583984375, + "learning_rate": 0.0001, + "loss": 0.0104, + "step": 395 + }, + { + "epoch": 0.5742485419470614, + "grad_norm": 0.0458984375, + "learning_rate": 0.0001, + "loss": 0.0104, + "step": 400 + }, + { + "epoch": 0.5814266487213997, + "grad_norm": 0.1953125, + "learning_rate": 0.0001, + "loss": 0.9303, + "step": 405 + }, + { + "epoch": 0.588604755495738, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001, + "loss": 0.766, + "step": 410 + }, + { + "epoch": 0.5957828622700763, + "grad_norm": 0.130859375, + "learning_rate": 0.0001, + "loss": 0.5917, + "step": 415 + }, + { + "epoch": 0.6029609690444145, + "grad_norm": 0.10595703125, + "learning_rate": 0.0001, + "loss": 0.5611, + "step": 420 + }, + { + "epoch": 0.6101390758187528, + "grad_norm": 0.1220703125, + "learning_rate": 0.0001, + "loss": 0.3833, + "step": 425 + }, + { + "epoch": 0.6173171825930911, + "grad_norm": 0.11865234375, + "learning_rate": 0.0001, + "loss": 0.2563, + "step": 430 + }, + { + "epoch": 0.6244952893674294, + "grad_norm": 0.07568359375, + "learning_rate": 0.0001, + "loss": 0.1056, + "step": 435 + }, + { + "epoch": 0.6316733961417677, + "grad_norm": 0.087890625, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 440 + }, + { + "epoch": 0.6388515029161059, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001, + "loss": 0.0113, + "step": 445 + }, + { + "epoch": 0.6460296096904441, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0001, + "loss": 0.0062, + "step": 450 + }, + { + "epoch": 0.6532077164647824, + "grad_norm": 0.18359375, + "learning_rate": 0.0001, + "loss": 0.894, + "step": 455 + }, + { + "epoch": 0.6603858232391207, + "grad_norm": 0.158203125, + "learning_rate": 0.0001, + "loss": 0.7454, + "step": 460 + }, + { + "epoch": 0.6675639300134589, + "grad_norm": 0.123046875, + "learning_rate": 0.0001, + "loss": 0.5539, + "step": 465 + }, + { + "epoch": 0.6747420367877972, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001, + "loss": 0.5263, + "step": 470 + }, + { + "epoch": 0.6819201435621355, + "grad_norm": 0.09521484375, + "learning_rate": 0.0001, + "loss": 0.3882, + "step": 475 + }, + { + "epoch": 0.6890982503364738, + "grad_norm": 0.07958984375, + "learning_rate": 0.0001, + "loss": 0.2243, + "step": 480 + }, + { + "epoch": 0.696276357110812, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001, + "loss": 0.0728, + "step": 485 + }, + { + "epoch": 0.7034544638851503, + "grad_norm": 0.048583984375, + "learning_rate": 0.0001, + "loss": 0.0205, + "step": 490 + }, + { + "epoch": 0.7106325706594886, + "grad_norm": 0.06103515625, + "learning_rate": 0.0001, + "loss": 0.0179, + "step": 495 + }, + { + "epoch": 0.7178106774338269, + "grad_norm": 0.031494140625, + "learning_rate": 0.0001, + "loss": 0.0072, + "step": 500 + }, + { + "epoch": 0.7249887842081651, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001, + "loss": 0.9516, + "step": 505 + }, + { + "epoch": 0.7321668909825033, + "grad_norm": 0.1240234375, + "learning_rate": 0.0001, + "loss": 0.6854, + "step": 510 + }, + { + "epoch": 0.7393449977568416, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001, + "loss": 0.5769, + "step": 515 + }, + { + "epoch": 0.7465231045311799, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001, + "loss": 0.4634, + "step": 520 + }, + { + "epoch": 0.7537012113055181, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001, + "loss": 0.3856, + "step": 525 + }, + { + "epoch": 0.7608793180798564, + "grad_norm": 0.10791015625, + "learning_rate": 0.0001, + "loss": 0.2155, + "step": 530 + }, + { + "epoch": 0.7680574248541947, + "grad_norm": 0.0634765625, + "learning_rate": 0.0001, + "loss": 0.0857, + "step": 535 + }, + { + "epoch": 0.775235531628533, + "grad_norm": 0.07861328125, + "learning_rate": 0.0001, + "loss": 0.0233, + "step": 540 + }, + { + "epoch": 0.7824136384028713, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0001, + "loss": 0.013, + "step": 545 + }, + { + "epoch": 0.7895917451772095, + "grad_norm": 0.016845703125, + "learning_rate": 0.0001, + "loss": 0.0061, + "step": 550 + }, + { + "epoch": 0.7967698519515478, + "grad_norm": 0.1796875, + "learning_rate": 0.0001, + "loss": 0.8853, + "step": 555 + }, + { + "epoch": 0.8039479587258861, + "grad_norm": 0.154296875, + "learning_rate": 0.0001, + "loss": 0.726, + "step": 560 + }, + { + "epoch": 0.8111260655002244, + "grad_norm": 0.1328125, + "learning_rate": 0.0001, + "loss": 0.62, + "step": 565 + }, + { + "epoch": 0.8183041722745625, + "grad_norm": 0.126953125, + "learning_rate": 0.0001, + "loss": 0.5036, + "step": 570 + }, + { + "epoch": 0.8254822790489008, + "grad_norm": 0.1279296875, + "learning_rate": 0.0001, + "loss": 0.4053, + "step": 575 + }, + { + "epoch": 0.8326603858232391, + "grad_norm": 0.1142578125, + "learning_rate": 0.0001, + "loss": 0.2355, + "step": 580 + }, + { + "epoch": 0.8398384925975774, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001, + "loss": 0.0751, + "step": 585 + }, + { + "epoch": 0.8470165993719156, + "grad_norm": 0.109375, + "learning_rate": 0.0001, + "loss": 0.0226, + "step": 590 + }, + { + "epoch": 0.8541947061462539, + "grad_norm": 0.027587890625, + "learning_rate": 0.0001, + "loss": 0.0055, + "step": 595 + }, + { + "epoch": 0.8613728129205922, + "grad_norm": 0.05712890625, + "learning_rate": 0.0001, + "loss": 0.0052, + "step": 600 + }, + { + "epoch": 0.8685509196949305, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001, + "loss": 0.9366, + "step": 605 + }, + { + "epoch": 0.8757290264692688, + "grad_norm": 0.1484375, + "learning_rate": 0.0001, + "loss": 0.7429, + "step": 610 + }, + { + "epoch": 0.882907133243607, + "grad_norm": 0.14453125, + "learning_rate": 0.0001, + "loss": 0.564, + "step": 615 + }, + { + "epoch": 0.8900852400179453, + "grad_norm": 0.140625, + "learning_rate": 0.0001, + "loss": 0.5045, + "step": 620 + }, + { + "epoch": 0.8972633467922836, + "grad_norm": 0.1259765625, + "learning_rate": 0.0001, + "loss": 0.3997, + "step": 625 + }, + { + "epoch": 0.9044414535666218, + "grad_norm": 0.0830078125, + "learning_rate": 0.0001, + "loss": 0.1856, + "step": 630 + }, + { + "epoch": 0.91161956034096, + "grad_norm": 0.06298828125, + "learning_rate": 0.0001, + "loss": 0.0583, + "step": 635 + }, + { + "epoch": 0.9187976671152983, + "grad_norm": 0.033935546875, + "learning_rate": 0.0001, + "loss": 0.0274, + "step": 640 + }, + { + "epoch": 0.9259757738896366, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001, + "loss": 0.0078, + "step": 645 + }, + { + "epoch": 0.9331538806639749, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 650 + }, + { + "epoch": 0.9403319874383131, + "grad_norm": 0.220703125, + "learning_rate": 0.0001, + "loss": 0.9234, + "step": 655 + }, + { + "epoch": 0.9475100942126514, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001, + "loss": 0.7145, + "step": 660 + }, + { + "epoch": 0.9546882009869897, + "grad_norm": 0.138671875, + "learning_rate": 0.0001, + "loss": 0.5764, + "step": 665 + }, + { + "epoch": 0.961866307761328, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001, + "loss": 0.4568, + "step": 670 + }, + { + "epoch": 0.9690444145356663, + "grad_norm": 0.10400390625, + "learning_rate": 0.0001, + "loss": 0.2681, + "step": 675 + }, + { + "epoch": 0.9762225213100045, + "grad_norm": 0.080078125, + "learning_rate": 0.0001, + "loss": 0.1399, + "step": 680 + }, + { + "epoch": 0.9834006280843428, + "grad_norm": 0.068359375, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 685 + }, + { + "epoch": 0.990578734858681, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001, + "loss": 0.0108, + "step": 690 + }, + { + "epoch": 0.9977568416330193, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001, + "loss": 0.0082, + "step": 695 + }, + { + "epoch": 1.0049349484073575, + "grad_norm": 0.193359375, + "learning_rate": 0.0001, + "loss": 0.6031, + "step": 700 + }, + { + "epoch": 1.012113055181696, + "grad_norm": 0.1640625, + "learning_rate": 0.0001, + "loss": 0.7291, + "step": 705 + }, + { + "epoch": 1.019291161956034, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001, + "loss": 0.5393, + "step": 710 + }, + { + "epoch": 1.0264692687303723, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001, + "loss": 0.413, + "step": 715 + }, + { + "epoch": 1.0336473755047106, + "grad_norm": 0.11669921875, + "learning_rate": 0.0001, + "loss": 0.3693, + "step": 720 + }, + { + "epoch": 1.0408254822790488, + "grad_norm": 0.123046875, + "learning_rate": 0.0001, + "loss": 0.2104, + "step": 725 + }, + { + "epoch": 1.0480035890533872, + "grad_norm": 0.055908203125, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 730 + }, + { + "epoch": 1.0551816958277254, + "grad_norm": 0.0546875, + "learning_rate": 0.0001, + "loss": 0.0144, + "step": 735 + }, + { + "epoch": 1.0623598026020638, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001, + "loss": 0.0119, + "step": 740 + }, + { + "epoch": 1.069537909376402, + "grad_norm": 0.0034332275390625, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 745 + }, + { + "epoch": 1.0767160161507403, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001, + "loss": 0.5662, + "step": 750 + }, + { + "epoch": 1.0838941229250785, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001, + "loss": 0.7079, + "step": 755 + }, + { + "epoch": 1.0910722296994169, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001, + "loss": 0.5619, + "step": 760 + }, + { + "epoch": 1.098250336473755, + "grad_norm": 0.12890625, + "learning_rate": 0.0001, + "loss": 0.4236, + "step": 765 + }, + { + "epoch": 1.1054284432480934, + "grad_norm": 0.11328125, + "learning_rate": 0.0001, + "loss": 0.3422, + "step": 770 + }, + { + "epoch": 1.1126065500224316, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001, + "loss": 0.2757, + "step": 775 + }, + { + "epoch": 1.1197846567967698, + "grad_norm": 0.1103515625, + "learning_rate": 0.0001, + "loss": 0.101, + "step": 780 + }, + { + "epoch": 1.1269627635711081, + "grad_norm": 0.0615234375, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 785 + }, + { + "epoch": 1.1341408703454463, + "grad_norm": 0.01123046875, + "learning_rate": 0.0001, + "loss": 0.0117, + "step": 790 + }, + { + "epoch": 1.1413189771197847, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001, + "loss": 0.0068, + "step": 795 + }, + { + "epoch": 1.1484970838941229, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001, + "loss": 0.5275, + "step": 800 + }, + { + "epoch": 1.1556751906684612, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001, + "loss": 0.7151, + "step": 805 + }, + { + "epoch": 1.1628532974427994, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001, + "loss": 0.5625, + "step": 810 + }, + { + "epoch": 1.1700314042171378, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001, + "loss": 0.4765, + "step": 815 + }, + { + "epoch": 1.177209510991476, + "grad_norm": 0.1875, + "learning_rate": 0.0001, + "loss": 0.3728, + "step": 820 + }, + { + "epoch": 1.1843876177658144, + "grad_norm": 0.11181640625, + "learning_rate": 0.0001, + "loss": 0.2169, + "step": 825 + }, + { + "epoch": 1.1915657245401525, + "grad_norm": 0.0888671875, + "learning_rate": 0.0001, + "loss": 0.09, + "step": 830 + }, + { + "epoch": 1.198743831314491, + "grad_norm": 0.06396484375, + "learning_rate": 0.0001, + "loss": 0.0261, + "step": 835 + }, + { + "epoch": 1.205921938088829, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001, + "loss": 0.0169, + "step": 840 + }, + { + "epoch": 1.2131000448631672, + "grad_norm": 0.01409912109375, + "learning_rate": 0.0001, + "loss": 0.0041, + "step": 845 + }, + { + "epoch": 1.2202781516375056, + "grad_norm": 0.2265625, + "learning_rate": 0.0001, + "loss": 0.5508, + "step": 850 + }, + { + "epoch": 1.2274562584118438, + "grad_norm": 0.255859375, + "learning_rate": 0.0001, + "loss": 0.7281, + "step": 855 + }, + { + "epoch": 1.2346343651861822, + "grad_norm": 0.212890625, + "learning_rate": 0.0001, + "loss": 0.499, + "step": 860 + }, + { + "epoch": 1.2418124719605204, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001, + "loss": 0.5054, + "step": 865 + }, + { + "epoch": 1.2489905787348587, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001, + "loss": 0.3918, + "step": 870 + }, + { + "epoch": 1.256168685509197, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001, + "loss": 0.2211, + "step": 875 + }, + { + "epoch": 1.263346792283535, + "grad_norm": 0.053955078125, + "learning_rate": 0.0001, + "loss": 0.099, + "step": 880 + }, + { + "epoch": 1.2705248990578735, + "grad_norm": 0.0263671875, + "learning_rate": 0.0001, + "loss": 0.0239, + "step": 885 + }, + { + "epoch": 1.2777030058322119, + "grad_norm": 0.055908203125, + "learning_rate": 0.0001, + "loss": 0.0203, + "step": 890 + }, + { + "epoch": 1.28488111260655, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.0053, + "step": 895 + }, + { + "epoch": 1.2920592193808882, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001, + "loss": 0.4856, + "step": 900 + }, + { + "epoch": 1.2992373261552266, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001, + "loss": 0.7204, + "step": 905 + }, + { + "epoch": 1.3064154329295647, + "grad_norm": 0.19140625, + "learning_rate": 0.0001, + "loss": 0.5374, + "step": 910 + }, + { + "epoch": 1.3135935397039031, + "grad_norm": 0.216796875, + "learning_rate": 0.0001, + "loss": 0.48, + "step": 915 + }, + { + "epoch": 1.3207716464782413, + "grad_norm": 0.19921875, + "learning_rate": 0.0001, + "loss": 0.3897, + "step": 920 + }, + { + "epoch": 1.3279497532525797, + "grad_norm": 0.10205078125, + "learning_rate": 0.0001, + "loss": 0.2242, + "step": 925 + }, + { + "epoch": 1.3351278600269179, + "grad_norm": 0.08251953125, + "learning_rate": 0.0001, + "loss": 0.1292, + "step": 930 + }, + { + "epoch": 1.3423059668012562, + "grad_norm": 0.068359375, + "learning_rate": 0.0001, + "loss": 0.0242, + "step": 935 + }, + { + "epoch": 1.3494840735755944, + "grad_norm": 0.08544921875, + "learning_rate": 0.0001, + "loss": 0.0092, + "step": 940 + }, + { + "epoch": 1.3566621803499328, + "grad_norm": 0.004241943359375, + "learning_rate": 0.0001, + "loss": 0.0039, + "step": 945 + }, + { + "epoch": 1.363840287124271, + "grad_norm": 0.25390625, + "learning_rate": 0.0001, + "loss": 0.5465, + "step": 950 + }, + { + "epoch": 1.3710183938986091, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001, + "loss": 0.6114, + "step": 955 + }, + { + "epoch": 1.3781965006729475, + "grad_norm": 0.244140625, + "learning_rate": 0.0001, + "loss": 0.5226, + "step": 960 + }, + { + "epoch": 1.385374607447286, + "grad_norm": 0.205078125, + "learning_rate": 0.0001, + "loss": 0.4234, + "step": 965 + }, + { + "epoch": 1.392552714221624, + "grad_norm": 0.130859375, + "learning_rate": 0.0001, + "loss": 0.3595, + "step": 970 + }, + { + "epoch": 1.3997308209959622, + "grad_norm": 0.123046875, + "learning_rate": 0.0001, + "loss": 0.2464, + "step": 975 + }, + { + "epoch": 1.4069089277703006, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001, + "loss": 0.11, + "step": 980 + }, + { + "epoch": 1.4140870345446388, + "grad_norm": 0.05322265625, + "learning_rate": 0.0001, + "loss": 0.0205, + "step": 985 + }, + { + "epoch": 1.4212651413189772, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0001, + "loss": 0.0102, + "step": 990 + }, + { + "epoch": 1.4284432480933154, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0001, + "loss": 0.0044, + "step": 995 + }, + { + "epoch": 1.4356213548676537, + "grad_norm": 0.23046875, + "learning_rate": 0.0001, + "loss": 0.4827, + "step": 1000 + }, + { + "epoch": 1.442799461641992, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001, + "loss": 0.6536, + "step": 1005 + }, + { + "epoch": 1.44997756841633, + "grad_norm": 0.1953125, + "learning_rate": 0.0001, + "loss": 0.5993, + "step": 1010 + }, + { + "epoch": 1.4571556751906685, + "grad_norm": 0.158203125, + "learning_rate": 0.0001, + "loss": 0.4176, + "step": 1015 + }, + { + "epoch": 1.4643337819650069, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001, + "loss": 0.307, + "step": 1020 + }, + { + "epoch": 1.471511888739345, + "grad_norm": 0.1005859375, + "learning_rate": 0.0001, + "loss": 0.2381, + "step": 1025 + }, + { + "epoch": 1.4786899955136832, + "grad_norm": 0.06396484375, + "learning_rate": 0.0001, + "loss": 0.084, + "step": 1030 + }, + { + "epoch": 1.4858681022880216, + "grad_norm": 0.01153564453125, + "learning_rate": 0.0001, + "loss": 0.0165, + "step": 1035 + }, + { + "epoch": 1.4930462090623597, + "grad_norm": 0.0283203125, + "learning_rate": 0.0001, + "loss": 0.0059, + "step": 1040 + }, + { + "epoch": 1.500224315836698, + "grad_norm": 0.0380859375, + "learning_rate": 0.0001, + "loss": 0.0051, + "step": 1045 + }, + { + "epoch": 1.5074024226110363, + "grad_norm": 0.296875, + "learning_rate": 0.0001, + "loss": 0.5321, + "step": 1050 + } + ], + "logging_steps": 5, + "max_steps": 1050, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.83809405232513e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f96c3a448688b1b9bdf6bb55e263846630401def --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-1050/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337c706998f7adefea3d36fb9751185f26f30ea6bf7ce24cfb830dd973c3fe15 +size 7416 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/README.md b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_config.json similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_config.json index 725feabb5b40786c81604df22999d165641e135e..cee555b68bda5470d8924cc5b423788b8994d8df 100644 --- a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_config.json @@ -20,12 +20,12 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "gate_proj", + "v_proj", "up_proj", "o_proj", "down_proj", - "v_proj", "k_proj", + "gate_proj", "q_proj" ], "task_type": "CAUSAL_LM", diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82d1d0387c411fdab4f59437a5e07c8bd16bd471 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95b4bd870af822f158c83c9f22dff6c5cff42df4e81335c99eacd38d76426b5 +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_config.json similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json rename to codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_config.json index 725feabb5b40786c81604df22999d165641e135e..cee555b68bda5470d8924cc5b423788b8994d8df 100644 --- a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_config.json @@ -20,12 +20,12 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "gate_proj", + "v_proj", "up_proj", "o_proj", "down_proj", - "v_proj", "k_proj", + "gate_proj", "q_proj" ], "task_type": "CAUSAL_LM", diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82d1d0387c411fdab4f59437a5e07c8bd16bd471 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95b4bd870af822f158c83c9f22dff6c5cff42df4e81335c99eacd38d76426b5 +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..880329d5912840df232d60876f72dcf479acbbc1 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f746b6db1789ba6b768ae40c2e2a4153017b0276da6498aac4130f40e7381bf +size 2003126962 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9570b02680b4230f9ba15ce25da1b40ddba969a --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585f15ae1d5104d9384b07ae641e0e10926f991dea913b9243bcce14a7965a42 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..638774351b15ff484ffc0ca6c82bd744318f1cb0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e116f8d4565c3c175d4858b7ff08054c3bd13ca8f526b793ad3d5a6f5f8f4fb +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4de8b5c0fc2f6179b0cc90e63ef574698c469a7e --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.05810546875, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.042724609375, + "learning_rate": 0.0001, + "loss": 0.619, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.02197265625, + "learning_rate": 0.0001, + "loss": 0.5684, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0001, + "loss": 0.559, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001, + "loss": 0.5347, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.5314, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.02783203125, + "learning_rate": 0.0001, + "loss": 0.5225, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.017333984375, + "learning_rate": 0.0001, + "loss": 0.525, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001, + "loss": 0.4793, + "step": 45 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1ec5c4a43c1dbbd247177f67ddc8c5e7f0dc7dd --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/checkpoint-45/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2abb1148c0ef26127fb936df2993d025cb794302daf263027e6201a3c124bd8 +size 7480 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/completed b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/metrics.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..560f1f85927011573afbb204e080d78e9c572052 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/metrics.json @@ -0,0 +1 @@ +{"run_name": "codetransocean_callgraph_java", "train_runtime": 11159.7771, "train_samples_per_second": 0.516, "train_steps_per_second": 0.004, "total_flos": 3.685541393109811e+17, "train_loss": 0.6542216989729139, "epoch": 3.5294117647058822} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/train_results.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e749165f7e71cc7d33038aa6b587c45168c1779d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.6542216989729139, + "train_runtime": 11159.7771, + "train_samples_per_second": 0.516, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..30335aceac10055f633a314b3276fda1b7c4df5b --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_callgraph_java/trainer_state.json @@ -0,0 +1,105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.05810546875, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.042724609375, + "learning_rate": 0.0001, + "loss": 0.619, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.02197265625, + "learning_rate": 0.0001, + "loss": 0.5684, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0001, + "loss": 0.559, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001, + "loss": 0.5347, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.5314, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.02783203125, + "learning_rate": 0.0001, + "loss": 0.5225, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.017333984375, + "learning_rate": 0.0001, + "loss": 0.525, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001, + "loss": 0.4793, + "step": 45 + }, + { + "epoch": 3.5294117647058822, + "step": 45, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.6542216989729139, + "train_runtime": 11159.7771, + "train_samples_per_second": 0.516, + "train_steps_per_second": 0.004 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/all_results.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..98dc6c2ee4518eb33b44b76dccac890ed4634be6 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5500616497463651, + "train_runtime": 9018.8961, + "train_samples_per_second": 0.639, + "train_steps_per_second": 0.005 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/README.md b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1f6f9e34a7687a5dc07012d16ff7b343f3bade4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/README.md @@ -0,0 +1,202 @@ +--- +base_model: ./CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d8560bd5b632071846880bde6dbf4ca8a6525b5 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..929176ddb1863bc93f58e01647be196697a68b00 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea2d01446692a9772bedb977099704c0a8bda8d4e358da20f64e1bf614deda2 +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1f6f9e34a7687a5dc07012d16ff7b343f3bade4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ./CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d8560bd5b632071846880bde6dbf4ca8a6525b5 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..929176ddb1863bc93f58e01647be196697a68b00 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea2d01446692a9772bedb977099704c0a8bda8d4e358da20f64e1bf614deda2 +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4fe2901bbc9f3c11c1ec6ca4896148b6d0defae --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7972e2c47c4a9adb53fa8be0134f39784457d793d7ff3466aa973c6442c233c9 +size 2003126962 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9570b02680b4230f9ba15ce25da1b40ddba969a --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585f15ae1d5104d9384b07ae641e0e10926f991dea913b9243bcce14a7965a42 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..638774351b15ff484ffc0ca6c82bd744318f1cb0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e116f8d4565c3c175d4858b7ff08054c3bd13ca8f526b793ad3d5a6f5f8f4fb +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..48fe3bf0592e92dece03986677dd34e5564b375a --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.0654296875, + "learning_rate": 0.0001, + "loss": 0.6657, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0001, + "loss": 0.599, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001, + "loss": 0.5605, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0001, + "loss": 0.5525, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.027587890625, + "learning_rate": 0.0001, + "loss": 0.5312, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.01708984375, + "learning_rate": 0.0001, + "loss": 0.529, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.02734375, + "learning_rate": 0.0001, + "loss": 0.5179, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.017578125, + "learning_rate": 0.0001, + "loss": 0.5203, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0001, + "loss": 0.4744, + "step": 45 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bf54a22e74fc827dc1dcf396e6b5384307f103ea --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/checkpoint-45/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bacb3be81a58908cd5b573f9c22710267a1381c4e4b39655345072d3e8da3eb +size 7416 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/completed b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/metrics.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..6148c805290c9589feab097b687262dad66de076 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/metrics.json @@ -0,0 +1 @@ +{"run_name": "codetransocean_dataflow_java", "train_runtime": 9018.8961, "train_samples_per_second": 0.639, "train_steps_per_second": 0.005, "total_flos": 3.685541393109811e+17, "train_loss": 0.5500616497463651, "epoch": 3.5294117647058822} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/train_results.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..98dc6c2ee4518eb33b44b76dccac890ed4634be6 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5500616497463651, + "train_runtime": 9018.8961, + "train_samples_per_second": 0.639, + "train_steps_per_second": 0.005 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..937fc0731718bd59b0da3849da93e370f625c1db --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_dataflow_java/trainer_state.json @@ -0,0 +1,105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.0654296875, + "learning_rate": 0.0001, + "loss": 0.6657, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0001, + "loss": 0.599, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001, + "loss": 0.5605, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0001, + "loss": 0.5525, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.027587890625, + "learning_rate": 0.0001, + "loss": 0.5312, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.01708984375, + "learning_rate": 0.0001, + "loss": 0.529, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.02734375, + "learning_rate": 0.0001, + "loss": 0.5179, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.017578125, + "learning_rate": 0.0001, + "loss": 0.5203, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0001, + "loss": 0.4744, + "step": 45 + }, + { + "epoch": 3.5294117647058822, + "step": 45, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5500616497463651, + "train_runtime": 9018.8961, + "train_samples_per_second": 0.639, + "train_steps_per_second": 0.005 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/all_results.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a4011d30782f214192d8e1542f0d95831d6ae3b0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5359265327453613, + "train_runtime": 11061.1318, + "train_samples_per_second": 0.521, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/README.md b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..db0272acfce83f9f8edfee6525753f7f4a7087e3 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "k_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02908230ce29bbb1e4b765becf34f023dc259932 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508c59aa8df8bf0273160337995d2b025715d72cdd161fa4966516fbc14f93ae +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..db0272acfce83f9f8edfee6525753f7f4a7087e3 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "k_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02908230ce29bbb1e4b765becf34f023dc259932 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508c59aa8df8bf0273160337995d2b025715d72cdd161fa4966516fbc14f93ae +size 500771216 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5fa2e98fec50b8bad11dfa7b026bfe11f1db48d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f5b295addc741c50b650324510e731c9615ba00049ce01623ff7dd87f08aaa +size 2003126962 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9570b02680b4230f9ba15ce25da1b40ddba969a --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585f15ae1d5104d9384b07ae641e0e10926f991dea913b9243bcce14a7965a42 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..638774351b15ff484ffc0ca6c82bd744318f1cb0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e116f8d4565c3c175d4858b7ff08054c3bd13ca8f526b793ad3d5a6f5f8f4fb +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fc54f0c1fe745b48b778a185c4f16f16bfcbed51 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001, + "loss": 0.5995, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.5904, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0001, + "loss": 0.556, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0001, + "loss": 0.5464, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0001, + "loss": 0.5259, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0147705078125, + "learning_rate": 0.0001, + "loss": 0.5226, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.029296875, + "learning_rate": 0.0001, + "loss": 0.5098, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.016357421875, + "learning_rate": 0.0001, + "loss": 0.5106, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001, + "loss": 0.4622, + "step": 45 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..07e7d4dc28ee82f8627ecda29fc4165bf6767b4d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-45/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8e363681d9a0a69ff5b6225fa909e1d35235b8b2e7bda997204fba4ab79a75 +size 7480 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/README.md b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/README.md similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/README.md rename to codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/README.md diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..787e952b4f16ebd29c2406338dfab6b0cd4d639d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "down_proj", + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..975e33b96bccd37d6219e956c8859dad3f7b7029 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e77ca05e88984d4b72dc5f1c8c94d328445cf729e8b17e1afa1fad50606501 +size 1156480200 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/README.md b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/README.md similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/README.md rename to codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/README.md diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..787e952b4f16ebd29c2406338dfab6b0cd4d639d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "down_proj", + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..975e33b96bccd37d6219e956c8859dad3f7b7029 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e77ca05e88984d4b72dc5f1c8c94d328445cf729e8b17e1afa1fad50606501 +size 1156480200 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/added_tokens.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/optimizer.pt b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a595fdc072b9d5f340dcf7788a93f8a950ce319 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:666a0ddc00cba0757ac08ae6f0534170c86020f442f37895bd9444e540899e0f +size 2003127538 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/rng_state.pth b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..812e2783f6865ef8011ac461a289d2729020baf1 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7ded6b50215b6ca731803acc0f628d8869946a601aa0eefc9bc5a3ea634352 +size 14244 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/scheduler.pt b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf093e565147c0e3ffb02b459d87a1f151f8cb5f --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65cb75c8ba291e997d5990244bed326162b84cd48c7d8b2dc9c2d6e13468f82 +size 1064 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/special_tokens_map.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer.model b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer_config.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11eb02d4f21ad16906954fde89b9d7b4886b9836 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/trainer_state.json @@ -0,0 +1,6753 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.072, + "eval_steps": 500, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.0537109375, + "learning_rate": 0.0001, + "loss": 0.3678, + "step": 5 + }, + { + "epoch": 0.0064, + "grad_norm": 0.056884765625, + "learning_rate": 0.0001, + "loss": 0.2519, + "step": 10 + }, + { + "epoch": 0.0096, + "grad_norm": 0.1240234375, + "learning_rate": 0.0001, + "loss": 0.136, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.08642578125, + "learning_rate": 0.0001, + "loss": 0.08, + "step": 20 + }, + { + "epoch": 0.016, + "grad_norm": 0.0419921875, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 25 + }, + { + "epoch": 0.0192, + "grad_norm": 0.08544921875, + "learning_rate": 0.0001, + "loss": 0.0782, + "step": 30 + }, + { + "epoch": 0.0224, + "grad_norm": 0.0654296875, + "learning_rate": 0.0001, + "loss": 0.144, + "step": 35 + }, + { + "epoch": 0.0256, + "grad_norm": 0.07958984375, + "learning_rate": 0.0001, + "loss": 0.1175, + "step": 40 + }, + { + "epoch": 0.0288, + "grad_norm": 0.10791015625, + "learning_rate": 0.0001, + "loss": 0.0657, + "step": 45 + }, + { + "epoch": 0.032, + "grad_norm": 0.08642578125, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 50 + }, + { + "epoch": 0.0352, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 55 + }, + { + "epoch": 0.0384, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 60 + }, + { + "epoch": 0.0416, + "grad_norm": 0.0181884765625, + "learning_rate": 0.0001, + "loss": 0.0111, + "step": 65 + }, + { + "epoch": 0.0448, + "grad_norm": 0.03125, + "learning_rate": 0.0001, + "loss": 0.0184, + "step": 70 + }, + { + "epoch": 0.048, + "grad_norm": 0.00201416015625, + "learning_rate": 0.0001, + "loss": 0.0033, + "step": 75 + }, + { + "epoch": 0.0512, + "grad_norm": 0.023681640625, + "learning_rate": 0.0001, + "loss": 0.0134, + "step": 80 + }, + { + "epoch": 0.0544, + "grad_norm": 0.0018463134765625, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 85 + }, + { + "epoch": 0.0576, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 90 + }, + { + "epoch": 0.0608, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001, + "loss": 0.0031, + "step": 95 + }, + { + "epoch": 0.064, + "grad_norm": 0.017822265625, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 100 + }, + { + "epoch": 0.0672, + "grad_norm": 0.041259765625, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 105 + }, + { + "epoch": 0.0704, + "grad_norm": 0.02880859375, + "learning_rate": 0.0001, + "loss": 0.0152, + "step": 110 + }, + { + "epoch": 0.0736, + "grad_norm": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 0.0095, + "step": 115 + }, + { + "epoch": 0.0768, + "grad_norm": 0.016845703125, + "learning_rate": 0.0001, + "loss": 0.0049, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 0.002166748046875, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 125 + }, + { + "epoch": 0.0832, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0001, + "loss": 0.0061, + "step": 130 + }, + { + "epoch": 0.0864, + "grad_norm": 0.0078125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 135 + }, + { + "epoch": 0.0896, + "grad_norm": 0.00086212158203125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 140 + }, + { + "epoch": 0.0928, + "grad_norm": 0.0006256103515625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 145 + }, + { + "epoch": 0.096, + "grad_norm": 0.00164794921875, + "learning_rate": 0.0001, + "loss": 0.0054, + "step": 150 + }, + { + "epoch": 0.0992, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0001, + "loss": 0.0264, + "step": 155 + }, + { + "epoch": 0.1024, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001, + "loss": 0.0108, + "step": 160 + }, + { + "epoch": 0.1056, + "grad_norm": 0.007354736328125, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 165 + }, + { + "epoch": 0.1088, + "grad_norm": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 0.004, + "step": 170 + }, + { + "epoch": 0.112, + "grad_norm": 0.0013580322265625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 175 + }, + { + "epoch": 0.1152, + "grad_norm": 0.001434326171875, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 180 + }, + { + "epoch": 0.1184, + "grad_norm": 0.00102996826171875, + "learning_rate": 0.0001, + "loss": 0.0104, + "step": 185 + }, + { + "epoch": 0.1216, + "grad_norm": 0.001708984375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 190 + }, + { + "epoch": 0.1248, + "grad_norm": 0.02099609375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 195 + }, + { + "epoch": 0.128, + "grad_norm": 0.0014190673828125, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 200 + }, + { + "epoch": 0.1312, + "grad_norm": 0.031005859375, + "learning_rate": 0.0001, + "loss": 0.0165, + "step": 205 + }, + { + "epoch": 0.1344, + "grad_norm": 0.00836181640625, + "learning_rate": 0.0001, + "loss": 0.0063, + "step": 210 + }, + { + "epoch": 0.1376, + "grad_norm": 0.0111083984375, + "learning_rate": 0.0001, + "loss": 0.0018, + "step": 215 + }, + { + "epoch": 0.1408, + "grad_norm": 0.036865234375, + "learning_rate": 0.0001, + "loss": 0.0048, + "step": 220 + }, + { + "epoch": 0.144, + "grad_norm": 0.00061798095703125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 225 + }, + { + "epoch": 0.1472, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 230 + }, + { + "epoch": 0.1504, + "grad_norm": 0.00077056884765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 235 + }, + { + "epoch": 0.1536, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 240 + }, + { + "epoch": 0.1568, + "grad_norm": 0.00579833984375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 245 + }, + { + "epoch": 0.16, + "grad_norm": 0.00121307373046875, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 250 + }, + { + "epoch": 0.1632, + "grad_norm": 0.0115966796875, + "learning_rate": 0.0001, + "loss": 0.0077, + "step": 255 + }, + { + "epoch": 0.1664, + "grad_norm": 0.01806640625, + "learning_rate": 0.0001, + "loss": 0.0033, + "step": 260 + }, + { + "epoch": 0.1696, + "grad_norm": 0.000514984130859375, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 265 + }, + { + "epoch": 0.1728, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0001, + "loss": 0.0025, + "step": 270 + }, + { + "epoch": 0.176, + "grad_norm": 0.00579833984375, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 275 + }, + { + "epoch": 0.1792, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 280 + }, + { + "epoch": 0.1824, + "grad_norm": 0.000598907470703125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 285 + }, + { + "epoch": 0.1856, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 290 + }, + { + "epoch": 0.1888, + "grad_norm": 0.0008087158203125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 295 + }, + { + "epoch": 0.192, + "grad_norm": 0.0029296875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 300 + }, + { + "epoch": 0.1952, + "grad_norm": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 0.0038, + "step": 305 + }, + { + "epoch": 0.1984, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 310 + }, + { + "epoch": 0.2016, + "grad_norm": 0.0128173828125, + "learning_rate": 0.0001, + "loss": 0.0025, + "step": 315 + }, + { + "epoch": 0.2048, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 320 + }, + { + "epoch": 0.208, + "grad_norm": 0.0096435546875, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 325 + }, + { + "epoch": 0.2112, + "grad_norm": 0.01458740234375, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 330 + }, + { + "epoch": 0.2144, + "grad_norm": 0.004150390625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 335 + }, + { + "epoch": 0.2176, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 340 + }, + { + "epoch": 0.2208, + "grad_norm": 0.0010986328125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 345 + }, + { + "epoch": 0.224, + "grad_norm": 0.001220703125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 350 + }, + { + "epoch": 0.2272, + "grad_norm": 0.01348876953125, + "learning_rate": 0.0001, + "loss": 0.0051, + "step": 355 + }, + { + "epoch": 0.2304, + "grad_norm": 0.01025390625, + "learning_rate": 0.0001, + "loss": 0.0024, + "step": 360 + }, + { + "epoch": 0.2336, + "grad_norm": 0.0037994384765625, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 365 + }, + { + "epoch": 0.2368, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 370 + }, + { + "epoch": 0.24, + "grad_norm": 0.0001373291015625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 375 + }, + { + "epoch": 0.2432, + "grad_norm": 0.006500244140625, + "learning_rate": 0.0001, + "loss": 0.0025, + "step": 380 + }, + { + "epoch": 0.2464, + "grad_norm": 0.00020503997802734375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 385 + }, + { + "epoch": 0.2496, + "grad_norm": 0.00022983551025390625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 390 + }, + { + "epoch": 0.2528, + "grad_norm": 0.00018787384033203125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 395 + }, + { + "epoch": 0.256, + "grad_norm": 0.000614166259765625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 400 + }, + { + "epoch": 0.2592, + "grad_norm": 0.016845703125, + "learning_rate": 0.0001, + "loss": 0.0102, + "step": 405 + }, + { + "epoch": 0.2624, + "grad_norm": 0.00946044921875, + "learning_rate": 0.0001, + "loss": 0.0018, + "step": 410 + }, + { + "epoch": 0.2656, + "grad_norm": 0.00098419189453125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 415 + }, + { + "epoch": 0.2688, + "grad_norm": 0.01025390625, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 420 + }, + { + "epoch": 0.272, + "grad_norm": 0.000278472900390625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 425 + }, + { + "epoch": 0.2752, + "grad_norm": 0.006866455078125, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 430 + }, + { + "epoch": 0.2784, + "grad_norm": 0.0003032684326171875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 435 + }, + { + "epoch": 0.2816, + "grad_norm": 0.01123046875, + "learning_rate": 0.0001, + "loss": 0.0072, + "step": 440 + }, + { + "epoch": 0.2848, + "grad_norm": 0.00022602081298828125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 445 + }, + { + "epoch": 0.288, + "grad_norm": 0.000621795654296875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 450 + }, + { + "epoch": 0.2912, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0001, + "loss": 0.0156, + "step": 455 + }, + { + "epoch": 0.2944, + "grad_norm": 0.0038604736328125, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 460 + }, + { + "epoch": 0.2976, + "grad_norm": 0.00179290771484375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 465 + }, + { + "epoch": 0.3008, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 470 + }, + { + "epoch": 0.304, + "grad_norm": 0.0004405975341796875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 475 + }, + { + "epoch": 0.3072, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 480 + }, + { + "epoch": 0.3104, + "grad_norm": 0.000926971435546875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 485 + }, + { + "epoch": 0.3136, + "grad_norm": 0.0003108978271484375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 490 + }, + { + "epoch": 0.3168, + "grad_norm": 0.00020503997802734375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 495 + }, + { + "epoch": 0.32, + "grad_norm": 0.0005950927734375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 500 + }, + { + "epoch": 0.3232, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001, + "loss": 0.0051, + "step": 505 + }, + { + "epoch": 0.3264, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001, + "loss": 0.0038, + "step": 510 + }, + { + "epoch": 0.3296, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001, + "loss": 0.0018, + "step": 515 + }, + { + "epoch": 0.3328, + "grad_norm": 0.0038299560546875, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 520 + }, + { + "epoch": 0.336, + "grad_norm": 0.0002880096435546875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 525 + }, + { + "epoch": 0.3392, + "grad_norm": 0.007049560546875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 530 + }, + { + "epoch": 0.3424, + "grad_norm": 0.01397705078125, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 535 + }, + { + "epoch": 0.3456, + "grad_norm": 0.00147247314453125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 540 + }, + { + "epoch": 0.3488, + "grad_norm": 0.00238037109375, + "learning_rate": 0.0001, + "loss": 0.0064, + "step": 545 + }, + { + "epoch": 0.352, + "grad_norm": 0.001953125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 550 + }, + { + "epoch": 0.3552, + "grad_norm": 0.0047607421875, + "learning_rate": 0.0001, + "loss": 0.0067, + "step": 555 + }, + { + "epoch": 0.3584, + "grad_norm": 0.0040283203125, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 560 + }, + { + "epoch": 0.3616, + "grad_norm": 0.0001983642578125, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 565 + }, + { + "epoch": 0.3648, + "grad_norm": 0.0072021484375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 570 + }, + { + "epoch": 0.368, + "grad_norm": 6.246566772460938e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 575 + }, + { + "epoch": 0.3712, + "grad_norm": 0.00811767578125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 580 + }, + { + "epoch": 0.3744, + "grad_norm": 0.00020694732666015625, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 585 + }, + { + "epoch": 0.3776, + "grad_norm": 0.0098876953125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 590 + }, + { + "epoch": 0.3808, + "grad_norm": 0.00012874603271484375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 595 + }, + { + "epoch": 0.384, + "grad_norm": 0.0003490447998046875, + "learning_rate": 0.0001, + "loss": 0.0025, + "step": 600 + }, + { + "epoch": 0.3872, + "grad_norm": 0.023681640625, + "learning_rate": 0.0001, + "loss": 0.0029, + "step": 605 + }, + { + "epoch": 0.3904, + "grad_norm": 0.010986328125, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 610 + }, + { + "epoch": 0.3936, + "grad_norm": 0.000507354736328125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 615 + }, + { + "epoch": 0.3968, + "grad_norm": 0.0081787109375, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 620 + }, + { + "epoch": 0.4, + "grad_norm": 0.00032806396484375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 625 + }, + { + "epoch": 0.4032, + "grad_norm": 0.0032806396484375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 630 + }, + { + "epoch": 0.4064, + "grad_norm": 0.000125885009765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 635 + }, + { + "epoch": 0.4096, + "grad_norm": 0.0002956390380859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 640 + }, + { + "epoch": 0.4128, + "grad_norm": 0.00010919570922851562, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 645 + }, + { + "epoch": 0.416, + "grad_norm": 0.0001983642578125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 650 + }, + { + "epoch": 0.4192, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.0037, + "step": 655 + }, + { + "epoch": 0.4224, + "grad_norm": 0.01007080078125, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 660 + }, + { + "epoch": 0.4256, + "grad_norm": 0.000640869140625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 665 + }, + { + "epoch": 0.4288, + "grad_norm": 0.009033203125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 670 + }, + { + "epoch": 0.432, + "grad_norm": 6.580352783203125e-05, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 675 + }, + { + "epoch": 0.4352, + "grad_norm": 0.004730224609375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 680 + }, + { + "epoch": 0.4384, + "grad_norm": 0.0024871826171875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 685 + }, + { + "epoch": 0.4416, + "grad_norm": 0.00958251953125, + "learning_rate": 0.0001, + "loss": 0.0051, + "step": 690 + }, + { + "epoch": 0.4448, + "grad_norm": 0.00011110305786132812, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 695 + }, + { + "epoch": 0.448, + "grad_norm": 0.00019931793212890625, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 700 + }, + { + "epoch": 0.4512, + "grad_norm": 0.00946044921875, + "learning_rate": 0.0001, + "loss": 0.0031, + "step": 705 + }, + { + "epoch": 0.4544, + "grad_norm": 0.00885009765625, + "learning_rate": 0.0001, + "loss": 0.0065, + "step": 710 + }, + { + "epoch": 0.4576, + "grad_norm": 0.0101318359375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 715 + }, + { + "epoch": 0.4608, + "grad_norm": 0.00579833984375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 720 + }, + { + "epoch": 0.464, + "grad_norm": 4.649162292480469e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 725 + }, + { + "epoch": 0.4672, + "grad_norm": 0.01806640625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 730 + }, + { + "epoch": 0.4704, + "grad_norm": 0.00010061264038085938, + "learning_rate": 0.0001, + "loss": 0.0104, + "step": 735 + }, + { + "epoch": 0.4736, + "grad_norm": 0.0101318359375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 740 + }, + { + "epoch": 0.4768, + "grad_norm": 0.00017452239990234375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 745 + }, + { + "epoch": 0.48, + "grad_norm": 0.0003948211669921875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 750 + }, + { + "epoch": 0.4832, + "grad_norm": 0.0024566650390625, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 755 + }, + { + "epoch": 0.4864, + "grad_norm": 0.00628662109375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 760 + }, + { + "epoch": 0.4896, + "grad_norm": 0.000637054443359375, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 765 + }, + { + "epoch": 0.4928, + "grad_norm": 0.03466796875, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 770 + }, + { + "epoch": 0.496, + "grad_norm": 3.743171691894531e-05, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 775 + }, + { + "epoch": 0.4992, + "grad_norm": 0.0018310546875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 780 + }, + { + "epoch": 0.5024, + "grad_norm": 0.00066375732421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 785 + }, + { + "epoch": 0.5056, + "grad_norm": 0.00020503997802734375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 790 + }, + { + "epoch": 0.5088, + "grad_norm": 0.0002899169921875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 795 + }, + { + "epoch": 0.512, + "grad_norm": 0.00012159347534179688, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 800 + }, + { + "epoch": 0.5152, + "grad_norm": 0.00151824951171875, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 805 + }, + { + "epoch": 0.5184, + "grad_norm": 0.01953125, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 810 + }, + { + "epoch": 0.5216, + "grad_norm": 0.0002727508544921875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 815 + }, + { + "epoch": 0.5248, + "grad_norm": 0.00087738037109375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 820 + }, + { + "epoch": 0.528, + "grad_norm": 4.100799560546875e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 825 + }, + { + "epoch": 0.5312, + "grad_norm": 0.00110626220703125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 830 + }, + { + "epoch": 0.5344, + "grad_norm": 0.000926971435546875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 835 + }, + { + "epoch": 0.5376, + "grad_norm": 0.01214599609375, + "learning_rate": 0.0001, + "loss": 0.0059, + "step": 840 + }, + { + "epoch": 0.5408, + "grad_norm": 0.000423431396484375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 845 + }, + { + "epoch": 0.544, + "grad_norm": 0.00445556640625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 850 + }, + { + "epoch": 0.5472, + "grad_norm": 0.00958251953125, + "learning_rate": 0.0001, + "loss": 0.0028, + "step": 855 + }, + { + "epoch": 0.5504, + "grad_norm": 0.0155029296875, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 860 + }, + { + "epoch": 0.5536, + "grad_norm": 0.01611328125, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 865 + }, + { + "epoch": 0.5568, + "grad_norm": 0.0128173828125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 870 + }, + { + "epoch": 0.56, + "grad_norm": 0.000263214111328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 875 + }, + { + "epoch": 0.5632, + "grad_norm": 0.0016937255859375, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 880 + }, + { + "epoch": 0.5664, + "grad_norm": 0.0004329681396484375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 885 + }, + { + "epoch": 0.5696, + "grad_norm": 8.726119995117188e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 890 + }, + { + "epoch": 0.5728, + "grad_norm": 0.0001392364501953125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 895 + }, + { + "epoch": 0.576, + "grad_norm": 0.0032806396484375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 900 + }, + { + "epoch": 0.5792, + "grad_norm": 0.016357421875, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 905 + }, + { + "epoch": 0.5824, + "grad_norm": 0.0130615234375, + "learning_rate": 0.0001, + "loss": 0.0029, + "step": 910 + }, + { + "epoch": 0.5856, + "grad_norm": 0.00136566162109375, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 915 + }, + { + "epoch": 0.5888, + "grad_norm": 0.0028076171875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 920 + }, + { + "epoch": 0.592, + "grad_norm": 0.000263214111328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 925 + }, + { + "epoch": 0.5952, + "grad_norm": 0.004302978515625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 930 + }, + { + "epoch": 0.5984, + "grad_norm": 0.0001239776611328125, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 935 + }, + { + "epoch": 0.6016, + "grad_norm": 8.821487426757812e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 940 + }, + { + "epoch": 0.6048, + "grad_norm": 9.822845458984375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 945 + }, + { + "epoch": 0.608, + "grad_norm": 0.00013446807861328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 950 + }, + { + "epoch": 0.6112, + "grad_norm": 0.0181884765625, + "learning_rate": 0.0001, + "loss": 0.0028, + "step": 955 + }, + { + "epoch": 0.6144, + "grad_norm": 0.0020599365234375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 960 + }, + { + "epoch": 0.6176, + "grad_norm": 0.00193023681640625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 965 + }, + { + "epoch": 0.6208, + "grad_norm": 0.00020885467529296875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 970 + }, + { + "epoch": 0.624, + "grad_norm": 3.600120544433594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 975 + }, + { + "epoch": 0.6272, + "grad_norm": 0.000385284423828125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 980 + }, + { + "epoch": 0.6304, + "grad_norm": 0.01385498046875, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 985 + }, + { + "epoch": 0.6336, + "grad_norm": 0.00010728836059570312, + "learning_rate": 0.0001, + "loss": 0.0045, + "step": 990 + }, + { + "epoch": 0.6368, + "grad_norm": 7.2479248046875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 995 + }, + { + "epoch": 0.64, + "grad_norm": 0.00011730194091796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1000 + }, + { + "epoch": 0.6432, + "grad_norm": 0.0142822265625, + "learning_rate": 0.0001, + "loss": 0.0033, + "step": 1005 + }, + { + "epoch": 0.6464, + "grad_norm": 0.0048828125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 1010 + }, + { + "epoch": 0.6496, + "grad_norm": 0.0040283203125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1015 + }, + { + "epoch": 0.6528, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 1020 + }, + { + "epoch": 0.656, + "grad_norm": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1025 + }, + { + "epoch": 0.6592, + "grad_norm": 0.00054168701171875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1030 + }, + { + "epoch": 0.6624, + "grad_norm": 0.001129150390625, + "learning_rate": 0.0001, + "loss": 0.0096, + "step": 1035 + }, + { + "epoch": 0.6656, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1040 + }, + { + "epoch": 0.6688, + "grad_norm": 0.002716064453125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1045 + }, + { + "epoch": 0.672, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1050 + }, + { + "epoch": 0.6752, + "grad_norm": 0.005126953125, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 1055 + }, + { + "epoch": 0.6784, + "grad_norm": 0.00110626220703125, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1060 + }, + { + "epoch": 0.6816, + "grad_norm": 7.867813110351562e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1065 + }, + { + "epoch": 0.6848, + "grad_norm": 0.01239013671875, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 1070 + }, + { + "epoch": 0.688, + "grad_norm": 0.0002841949462890625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1075 + }, + { + "epoch": 0.6912, + "grad_norm": 0.0002117156982421875, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1080 + }, + { + "epoch": 0.6944, + "grad_norm": 0.00010156631469726562, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1085 + }, + { + "epoch": 0.6976, + "grad_norm": 0.0006256103515625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1090 + }, + { + "epoch": 0.7008, + "grad_norm": 9.870529174804688e-05, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1095 + }, + { + "epoch": 0.704, + "grad_norm": 0.000858306884765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1100 + }, + { + "epoch": 0.7072, + "grad_norm": 0.0023040771484375, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1105 + }, + { + "epoch": 0.7104, + "grad_norm": 0.013916015625, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1110 + }, + { + "epoch": 0.7136, + "grad_norm": 0.000110626220703125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1115 + }, + { + "epoch": 0.7168, + "grad_norm": 0.0002536773681640625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1120 + }, + { + "epoch": 0.72, + "grad_norm": 0.0001163482666015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1125 + }, + { + "epoch": 0.7232, + "grad_norm": 0.00014495849609375, + "learning_rate": 0.0001, + "loss": 0.0091, + "step": 1130 + }, + { + "epoch": 0.7264, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1135 + }, + { + "epoch": 0.7296, + "grad_norm": 9.5367431640625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1140 + }, + { + "epoch": 0.7328, + "grad_norm": 8.249282836914062e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1145 + }, + { + "epoch": 0.736, + "grad_norm": 0.001007080078125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 1150 + }, + { + "epoch": 0.7392, + "grad_norm": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 1155 + }, + { + "epoch": 0.7424, + "grad_norm": 0.005828857421875, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 1160 + }, + { + "epoch": 0.7456, + "grad_norm": 5.6743621826171875e-05, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 1165 + }, + { + "epoch": 0.7488, + "grad_norm": 0.0002899169921875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1170 + }, + { + "epoch": 0.752, + "grad_norm": 0.0003147125244140625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1175 + }, + { + "epoch": 0.7552, + "grad_norm": 0.015625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1180 + }, + { + "epoch": 0.7584, + "grad_norm": 0.000274658203125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1185 + }, + { + "epoch": 0.7616, + "grad_norm": 0.00093841552734375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1190 + }, + { + "epoch": 0.7648, + "grad_norm": 0.000335693359375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1195 + }, + { + "epoch": 0.768, + "grad_norm": 0.0003681182861328125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1200 + }, + { + "epoch": 0.7712, + "grad_norm": 0.0115966796875, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 1205 + }, + { + "epoch": 0.7744, + "grad_norm": 0.00811767578125, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 1210 + }, + { + "epoch": 0.7776, + "grad_norm": 0.00017547607421875, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 1215 + }, + { + "epoch": 0.7808, + "grad_norm": 0.000782012939453125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1220 + }, + { + "epoch": 0.784, + "grad_norm": 6.198883056640625e-05, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1225 + }, + { + "epoch": 0.7872, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1230 + }, + { + "epoch": 0.7904, + "grad_norm": 0.00180816650390625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1235 + }, + { + "epoch": 0.7936, + "grad_norm": 0.015380859375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1240 + }, + { + "epoch": 0.7968, + "grad_norm": 0.001373291015625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1245 + }, + { + "epoch": 0.8, + "grad_norm": 0.003997802734375, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 0.0087, + "step": 1255 + }, + { + "epoch": 0.8064, + "grad_norm": 0.00616455078125, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 1260 + }, + { + "epoch": 0.8096, + "grad_norm": 0.00077056884765625, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1265 + }, + { + "epoch": 0.8128, + "grad_norm": 0.00128936767578125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1270 + }, + { + "epoch": 0.816, + "grad_norm": 9.679794311523438e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1275 + }, + { + "epoch": 0.8192, + "grad_norm": 0.0008087158203125, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 1280 + }, + { + "epoch": 0.8224, + "grad_norm": 0.00019359588623046875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1285 + }, + { + "epoch": 0.8256, + "grad_norm": 0.000812530517578125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1290 + }, + { + "epoch": 0.8288, + "grad_norm": 0.0006256103515625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1295 + }, + { + "epoch": 0.832, + "grad_norm": 0.00067901611328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1300 + }, + { + "epoch": 0.8352, + "grad_norm": 0.017822265625, + "learning_rate": 0.0001, + "loss": 0.0045, + "step": 1305 + }, + { + "epoch": 0.8384, + "grad_norm": 0.000347137451171875, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1310 + }, + { + "epoch": 0.8416, + "grad_norm": 0.00016117095947265625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1315 + }, + { + "epoch": 0.8448, + "grad_norm": 0.0023956298828125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1320 + }, + { + "epoch": 0.848, + "grad_norm": 4.6253204345703125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1325 + }, + { + "epoch": 0.8512, + "grad_norm": 0.00543212890625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1330 + }, + { + "epoch": 0.8544, + "grad_norm": 7.009506225585938e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1335 + }, + { + "epoch": 0.8576, + "grad_norm": 0.00101470947265625, + "learning_rate": 0.0001, + "loss": 0.0048, + "step": 1340 + }, + { + "epoch": 0.8608, + "grad_norm": 0.00011491775512695312, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1345 + }, + { + "epoch": 0.864, + "grad_norm": 0.0004730224609375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1350 + }, + { + "epoch": 0.8672, + "grad_norm": 0.00885009765625, + "learning_rate": 0.0001, + "loss": 0.0045, + "step": 1355 + }, + { + "epoch": 0.8704, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 1360 + }, + { + "epoch": 0.8736, + "grad_norm": 0.0002651214599609375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1365 + }, + { + "epoch": 0.8768, + "grad_norm": 0.002777099609375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 1370 + }, + { + "epoch": 0.88, + "grad_norm": 7.390975952148438e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1375 + }, + { + "epoch": 0.8832, + "grad_norm": 0.00023937225341796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1380 + }, + { + "epoch": 0.8864, + "grad_norm": 0.01202392578125, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1385 + }, + { + "epoch": 0.8896, + "grad_norm": 0.00543212890625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1390 + }, + { + "epoch": 0.8928, + "grad_norm": 8.487701416015625e-05, + "learning_rate": 0.0001, + "loss": 0.0047, + "step": 1395 + }, + { + "epoch": 0.896, + "grad_norm": 0.007110595703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1400 + }, + { + "epoch": 0.8992, + "grad_norm": 0.026611328125, + "learning_rate": 0.0001, + "loss": 0.0038, + "step": 1405 + }, + { + "epoch": 0.9024, + "grad_norm": 0.02099609375, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1410 + }, + { + "epoch": 0.9056, + "grad_norm": 9.918212890625e-05, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1415 + }, + { + "epoch": 0.9088, + "grad_norm": 0.0013580322265625, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 1420 + }, + { + "epoch": 0.912, + "grad_norm": 0.000720977783203125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1425 + }, + { + "epoch": 0.9152, + "grad_norm": 0.0030670166015625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1430 + }, + { + "epoch": 0.9184, + "grad_norm": 0.00010013580322265625, + "learning_rate": 0.0001, + "loss": 0.0088, + "step": 1435 + }, + { + "epoch": 0.9216, + "grad_norm": 0.00153350830078125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1440 + }, + { + "epoch": 0.9248, + "grad_norm": 6.079673767089844e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1445 + }, + { + "epoch": 0.928, + "grad_norm": 0.0001621246337890625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1450 + }, + { + "epoch": 0.9312, + "grad_norm": 0.0111083984375, + "learning_rate": 0.0001, + "loss": 0.0028, + "step": 1455 + }, + { + "epoch": 0.9344, + "grad_norm": 0.020751953125, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1460 + }, + { + "epoch": 0.9376, + "grad_norm": 0.00250244140625, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1465 + }, + { + "epoch": 0.9408, + "grad_norm": 0.00299072265625, + "learning_rate": 0.0001, + "loss": 0.0027, + "step": 1470 + }, + { + "epoch": 0.944, + "grad_norm": 0.000110626220703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1475 + }, + { + "epoch": 0.9472, + "grad_norm": 0.0003757476806640625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1480 + }, + { + "epoch": 0.9504, + "grad_norm": 7.05718994140625e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1485 + }, + { + "epoch": 0.9536, + "grad_norm": 0.0004425048828125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1490 + }, + { + "epoch": 0.9568, + "grad_norm": 0.0001220703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1495 + }, + { + "epoch": 0.96, + "grad_norm": 0.00020503997802734375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1500 + }, + { + "epoch": 0.9632, + "grad_norm": 0.014892578125, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 1505 + }, + { + "epoch": 0.9664, + "grad_norm": 0.01470947265625, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1510 + }, + { + "epoch": 0.9696, + "grad_norm": 0.0004863739013671875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1515 + }, + { + "epoch": 0.9728, + "grad_norm": 0.0019683837890625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1520 + }, + { + "epoch": 0.976, + "grad_norm": 4.6253204345703125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1525 + }, + { + "epoch": 0.9792, + "grad_norm": 0.00469970703125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1530 + }, + { + "epoch": 0.9824, + "grad_norm": 0.00494384765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1535 + }, + { + "epoch": 0.9856, + "grad_norm": 0.00011920928955078125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1540 + }, + { + "epoch": 0.9888, + "grad_norm": 4.029273986816406e-05, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 1545 + }, + { + "epoch": 0.992, + "grad_norm": 0.00013828277587890625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1550 + }, + { + "epoch": 0.9952, + "grad_norm": 7.581710815429688e-05, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1555 + }, + { + "epoch": 0.9984, + "grad_norm": 0.000148773193359375, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 1560 + }, + { + "epoch": 1.0016, + "grad_norm": 0.051025390625, + "learning_rate": 0.0001, + "loss": 0.006, + "step": 1565 + }, + { + "epoch": 1.0048, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001, + "loss": 0.0044, + "step": 1570 + }, + { + "epoch": 1.008, + "grad_norm": 0.01556396484375, + "learning_rate": 0.0001, + "loss": 0.004, + "step": 1575 + }, + { + "epoch": 1.0112, + "grad_norm": 0.01068115234375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 1580 + }, + { + "epoch": 1.0144, + "grad_norm": 0.0007781982421875, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1585 + }, + { + "epoch": 1.0176, + "grad_norm": 0.008056640625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1590 + }, + { + "epoch": 1.0208, + "grad_norm": 0.000514984130859375, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1595 + }, + { + "epoch": 1.024, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1600 + }, + { + "epoch": 1.0272, + "grad_norm": 0.001129150390625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1605 + }, + { + "epoch": 1.0304, + "grad_norm": 0.11474609375, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 1610 + }, + { + "epoch": 1.0336, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 1615 + }, + { + "epoch": 1.0368, + "grad_norm": 0.01446533203125, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 1620 + }, + { + "epoch": 1.04, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 1625 + }, + { + "epoch": 1.0432, + "grad_norm": 0.00640869140625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1630 + }, + { + "epoch": 1.0464, + "grad_norm": 0.0018310546875, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 1635 + }, + { + "epoch": 1.0496, + "grad_norm": 0.0004405975341796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1640 + }, + { + "epoch": 1.0528, + "grad_norm": 0.0034332275390625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1645 + }, + { + "epoch": 1.056, + "grad_norm": 0.000762939453125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1650 + }, + { + "epoch": 1.0592, + "grad_norm": 0.0002536773681640625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1655 + }, + { + "epoch": 1.0624, + "grad_norm": 0.0125732421875, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 1660 + }, + { + "epoch": 1.0656, + "grad_norm": 0.01055908203125, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 1665 + }, + { + "epoch": 1.0688, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 1670 + }, + { + "epoch": 1.072, + "grad_norm": 0.00064849853515625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1675 + }, + { + "epoch": 1.0752, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1680 + }, + { + "epoch": 1.0784, + "grad_norm": 0.000354766845703125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1685 + }, + { + "epoch": 1.0816, + "grad_norm": 0.00518798828125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1690 + }, + { + "epoch": 1.0848, + "grad_norm": 0.0026092529296875, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1695 + }, + { + "epoch": 1.088, + "grad_norm": 0.00125885009765625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1700 + }, + { + "epoch": 1.0912, + "grad_norm": 0.00156402587890625, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1705 + }, + { + "epoch": 1.0944, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1710 + }, + { + "epoch": 1.0976, + "grad_norm": 0.0028533935546875, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 1715 + }, + { + "epoch": 1.1008, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1720 + }, + { + "epoch": 1.104, + "grad_norm": 0.001251220703125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1725 + }, + { + "epoch": 1.1072, + "grad_norm": 0.0032196044921875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1730 + }, + { + "epoch": 1.1104, + "grad_norm": 5.245208740234375e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1735 + }, + { + "epoch": 1.1136, + "grad_norm": 0.000293731689453125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1740 + }, + { + "epoch": 1.1168, + "grad_norm": 0.000751495361328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1745 + }, + { + "epoch": 1.12, + "grad_norm": 0.000194549560546875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1750 + }, + { + "epoch": 1.1232, + "grad_norm": 5.793571472167969e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1755 + }, + { + "epoch": 1.1264, + "grad_norm": 9.489059448242188e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1760 + }, + { + "epoch": 1.1296, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 1765 + }, + { + "epoch": 1.1328, + "grad_norm": 0.00244140625, + "learning_rate": 0.0001, + "loss": 0.0042, + "step": 1770 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.0003643035888671875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1775 + }, + { + "epoch": 1.1392, + "grad_norm": 0.00762939453125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1780 + }, + { + "epoch": 1.1424, + "grad_norm": 1.5616416931152344e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1785 + }, + { + "epoch": 1.1456, + "grad_norm": 4.482269287109375e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1790 + }, + { + "epoch": 1.1488, + "grad_norm": 0.00013256072998046875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1795 + }, + { + "epoch": 1.152, + "grad_norm": 3.600120544433594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1800 + }, + { + "epoch": 1.1552, + "grad_norm": 3.981590270996094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1805 + }, + { + "epoch": 1.1584, + "grad_norm": 0.0009765625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1810 + }, + { + "epoch": 1.1616, + "grad_norm": 0.0034637451171875, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 1815 + }, + { + "epoch": 1.1648, + "grad_norm": 0.00775146484375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1820 + }, + { + "epoch": 1.168, + "grad_norm": 0.00029754638671875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1825 + }, + { + "epoch": 1.1712, + "grad_norm": 3.2901763916015625e-05, + "learning_rate": 0.0001, + "loss": 0.0024, + "step": 1830 + }, + { + "epoch": 1.1743999999999999, + "grad_norm": 0.0003795623779296875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1835 + }, + { + "epoch": 1.1776, + "grad_norm": 2.765655517578125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1840 + }, + { + "epoch": 1.1808, + "grad_norm": 0.1875, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 1845 + }, + { + "epoch": 1.184, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0001, + "loss": 0.0039, + "step": 1850 + }, + { + "epoch": 1.1872, + "grad_norm": 0.0027008056640625, + "learning_rate": 0.0001, + "loss": 0.005, + "step": 1855 + }, + { + "epoch": 1.1904, + "grad_norm": 0.0017852783203125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 1860 + }, + { + "epoch": 1.1936, + "grad_norm": 0.041259765625, + "learning_rate": 0.0001, + "loss": 0.0096, + "step": 1865 + }, + { + "epoch": 1.1968, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001, + "loss": 0.0081, + "step": 1870 + }, + { + "epoch": 1.2, + "grad_norm": 0.0086669921875, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1875 + }, + { + "epoch": 1.2032, + "grad_norm": 0.000690460205078125, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1880 + }, + { + "epoch": 1.2064, + "grad_norm": 0.03076171875, + "learning_rate": 0.0001, + "loss": 0.0027, + "step": 1885 + }, + { + "epoch": 1.2096, + "grad_norm": 0.00012969970703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1890 + }, + { + "epoch": 1.2128, + "grad_norm": 0.003631591796875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1895 + }, + { + "epoch": 1.216, + "grad_norm": 0.0004482269287109375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1900 + }, + { + "epoch": 1.2192, + "grad_norm": 9.584426879882812e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1905 + }, + { + "epoch": 1.2224, + "grad_norm": 0.00075531005859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1910 + }, + { + "epoch": 1.2256, + "grad_norm": 0.00628662109375, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1915 + }, + { + "epoch": 1.2288000000000001, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 1920 + }, + { + "epoch": 1.232, + "grad_norm": 0.027587890625, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1925 + }, + { + "epoch": 1.2352, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1930 + }, + { + "epoch": 1.2384, + "grad_norm": 0.00016689300537109375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1935 + }, + { + "epoch": 1.2416, + "grad_norm": 0.00014495849609375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1940 + }, + { + "epoch": 1.2448, + "grad_norm": 0.0002803802490234375, + "learning_rate": 0.0001, + "loss": 0.0089, + "step": 1945 + }, + { + "epoch": 1.248, + "grad_norm": 0.0087890625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1950 + }, + { + "epoch": 1.2511999999999999, + "grad_norm": 8.106231689453125e-05, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 1955 + }, + { + "epoch": 1.2544, + "grad_norm": 0.0004405975341796875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1960 + }, + { + "epoch": 1.2576, + "grad_norm": 0.015625, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 1965 + }, + { + "epoch": 1.2608, + "grad_norm": 0.00543212890625, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1970 + }, + { + "epoch": 1.264, + "grad_norm": 0.00104522705078125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1975 + }, + { + "epoch": 1.2671999999999999, + "grad_norm": 0.015625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1980 + }, + { + "epoch": 1.2704, + "grad_norm": 0.00013256072998046875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1985 + }, + { + "epoch": 1.2736, + "grad_norm": 0.0091552734375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1990 + }, + { + "epoch": 1.2768, + "grad_norm": 0.000385284423828125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1995 + }, + { + "epoch": 1.28, + "grad_norm": 5.435943603515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2000 + }, + { + "epoch": 1.2832, + "grad_norm": 8.392333984375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2005 + }, + { + "epoch": 1.2864, + "grad_norm": 0.00024127960205078125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2010 + }, + { + "epoch": 1.2896, + "grad_norm": 0.00421142578125, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 2015 + }, + { + "epoch": 1.2928, + "grad_norm": 0.00872802734375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2020 + }, + { + "epoch": 1.296, + "grad_norm": 0.000392913818359375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2025 + }, + { + "epoch": 1.2992, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 2030 + }, + { + "epoch": 1.3024, + "grad_norm": 0.00051116943359375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2035 + }, + { + "epoch": 1.3056, + "grad_norm": 0.0001697540283203125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2040 + }, + { + "epoch": 1.3088, + "grad_norm": 0.01336669921875, + "learning_rate": 0.0001, + "loss": 0.008, + "step": 2045 + }, + { + "epoch": 1.312, + "grad_norm": 5.435943603515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2050 + }, + { + "epoch": 1.3152, + "grad_norm": 0.000164031982421875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2055 + }, + { + "epoch": 1.3184, + "grad_norm": 0.0004558563232421875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2060 + }, + { + "epoch": 1.3216, + "grad_norm": 0.020751953125, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 2065 + }, + { + "epoch": 1.3248, + "grad_norm": 0.000415802001953125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 2070 + }, + { + "epoch": 1.328, + "grad_norm": 0.00104522705078125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2075 + }, + { + "epoch": 1.3312, + "grad_norm": 0.0003910064697265625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2080 + }, + { + "epoch": 1.3344, + "grad_norm": 0.00020313262939453125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2085 + }, + { + "epoch": 1.3376000000000001, + "grad_norm": 5.030632019042969e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2090 + }, + { + "epoch": 1.3408, + "grad_norm": 0.00090789794921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2095 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.00037384033203125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2100 + }, + { + "epoch": 1.3472, + "grad_norm": 0.00014400482177734375, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 2105 + }, + { + "epoch": 1.3504, + "grad_norm": 0.00188446044921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2110 + }, + { + "epoch": 1.3536000000000001, + "grad_norm": 0.0023956298828125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2115 + }, + { + "epoch": 1.3568, + "grad_norm": 0.015869140625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2120 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 2125 + }, + { + "epoch": 1.3632, + "grad_norm": 0.000926971435546875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2130 + }, + { + "epoch": 1.3664, + "grad_norm": 3.0159950256347656e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2135 + }, + { + "epoch": 1.3696, + "grad_norm": 0.00174713134765625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2140 + }, + { + "epoch": 1.3728, + "grad_norm": 3.1948089599609375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2145 + }, + { + "epoch": 1.376, + "grad_norm": 0.00030517578125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2150 + }, + { + "epoch": 1.3792, + "grad_norm": 3.0279159545898438e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2155 + }, + { + "epoch": 1.3824, + "grad_norm": 5.030632019042969e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2160 + }, + { + "epoch": 1.3856, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 2165 + }, + { + "epoch": 1.3888, + "grad_norm": 0.00439453125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2170 + }, + { + "epoch": 1.392, + "grad_norm": 0.0004425048828125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2175 + }, + { + "epoch": 1.3952, + "grad_norm": 0.005523681640625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2180 + }, + { + "epoch": 1.3984, + "grad_norm": 0.0007781982421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2185 + }, + { + "epoch": 1.4016, + "grad_norm": 2.8252601623535156e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2190 + }, + { + "epoch": 1.4048, + "grad_norm": 0.000560760498046875, + "learning_rate": 0.0001, + "loss": 0.0043, + "step": 2195 + }, + { + "epoch": 1.408, + "grad_norm": 0.0002574920654296875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2200 + }, + { + "epoch": 1.4112, + "grad_norm": 5.507469177246094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2205 + }, + { + "epoch": 1.4144, + "grad_norm": 0.00087738037109375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2210 + }, + { + "epoch": 1.4176, + "grad_norm": 0.0057373046875, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 2215 + }, + { + "epoch": 1.4208, + "grad_norm": 0.0093994140625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2220 + }, + { + "epoch": 1.424, + "grad_norm": 0.000213623046875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2225 + }, + { + "epoch": 1.4272, + "grad_norm": 0.00421142578125, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 2230 + }, + { + "epoch": 1.4304000000000001, + "grad_norm": 0.0002956390380859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2235 + }, + { + "epoch": 1.4336, + "grad_norm": 0.00482177734375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2240 + }, + { + "epoch": 1.4368, + "grad_norm": 7.05718994140625e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2245 + }, + { + "epoch": 1.44, + "grad_norm": 0.016845703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2250 + }, + { + "epoch": 1.4432, + "grad_norm": 4.1484832763671875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2255 + }, + { + "epoch": 1.4464000000000001, + "grad_norm": 0.000701904296875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2260 + }, + { + "epoch": 1.4496, + "grad_norm": 0.0123291015625, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2265 + }, + { + "epoch": 1.4527999999999999, + "grad_norm": 0.007110595703125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2270 + }, + { + "epoch": 1.456, + "grad_norm": 0.00049591064453125, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2275 + }, + { + "epoch": 1.4592, + "grad_norm": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2280 + }, + { + "epoch": 1.4624, + "grad_norm": 9.965896606445312e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2285 + }, + { + "epoch": 1.4656, + "grad_norm": 0.00040435791015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2290 + }, + { + "epoch": 1.4687999999999999, + "grad_norm": 0.0001773834228515625, + "learning_rate": 0.0001, + "loss": 0.0077, + "step": 2295 + }, + { + "epoch": 1.472, + "grad_norm": 2.6226043701171875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2300 + }, + { + "epoch": 1.4752, + "grad_norm": 3.039836883544922e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2305 + }, + { + "epoch": 1.4784, + "grad_norm": 0.00025177001953125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2310 + }, + { + "epoch": 1.4816, + "grad_norm": 0.006805419921875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2315 + }, + { + "epoch": 1.4848, + "grad_norm": 0.002532958984375, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 2320 + }, + { + "epoch": 1.488, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2325 + }, + { + "epoch": 1.4912, + "grad_norm": 8.726119995117188e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2330 + }, + { + "epoch": 1.4944, + "grad_norm": 1.7762184143066406e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2335 + }, + { + "epoch": 1.4976, + "grad_norm": 8.630752563476562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2340 + }, + { + "epoch": 1.5008, + "grad_norm": 0.00075531005859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2345 + }, + { + "epoch": 1.504, + "grad_norm": 1.7523765563964844e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2350 + }, + { + "epoch": 1.5072, + "grad_norm": 1.990795135498047e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2355 + }, + { + "epoch": 1.5104, + "grad_norm": 9.870529174804688e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2360 + }, + { + "epoch": 1.5135999999999998, + "grad_norm": 0.00115966796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2365 + }, + { + "epoch": 1.5168, + "grad_norm": 0.001068115234375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2370 + }, + { + "epoch": 1.52, + "grad_norm": 0.0001659393310546875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2375 + }, + { + "epoch": 1.5232, + "grad_norm": 0.00011730194091796875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2380 + }, + { + "epoch": 1.5264, + "grad_norm": 1.9550323486328125e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2385 + }, + { + "epoch": 1.5295999999999998, + "grad_norm": 9.107589721679688e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2390 + }, + { + "epoch": 1.5328, + "grad_norm": 7.82012939453125e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2395 + }, + { + "epoch": 1.536, + "grad_norm": 1.6927719116210938e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2400 + }, + { + "epoch": 1.5392000000000001, + "grad_norm": 1.6689300537109375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2405 + }, + { + "epoch": 1.5424, + "grad_norm": 7.05718994140625e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2410 + }, + { + "epoch": 1.5455999999999999, + "grad_norm": 0.00075531005859375, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2415 + }, + { + "epoch": 1.5488, + "grad_norm": 0.01153564453125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 2420 + }, + { + "epoch": 1.552, + "grad_norm": 0.000339508056640625, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2425 + }, + { + "epoch": 1.5552000000000001, + "grad_norm": 0.0022735595703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2430 + }, + { + "epoch": 1.5584, + "grad_norm": 0.000431060791015625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2435 + }, + { + "epoch": 1.5615999999999999, + "grad_norm": 0.001129150390625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2440 + }, + { + "epoch": 1.5648, + "grad_norm": 0.00066375732421875, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2445 + }, + { + "epoch": 1.568, + "grad_norm": 0.00010156631469726562, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2450 + }, + { + "epoch": 1.5712000000000002, + "grad_norm": 0.00031280517578125, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 2455 + }, + { + "epoch": 1.5744, + "grad_norm": 0.0007476806640625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2460 + }, + { + "epoch": 1.5776, + "grad_norm": 0.00994873046875, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2465 + }, + { + "epoch": 1.5808, + "grad_norm": 0.01287841796875, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2470 + }, + { + "epoch": 1.584, + "grad_norm": 0.0098876953125, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2475 + }, + { + "epoch": 1.5872000000000002, + "grad_norm": 0.000102996826171875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2480 + }, + { + "epoch": 1.5904, + "grad_norm": 0.00010824203491210938, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2485 + }, + { + "epoch": 1.5936, + "grad_norm": 0.00016307830810546875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2490 + }, + { + "epoch": 1.5968, + "grad_norm": 6.580352783203125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2495 + }, + { + "epoch": 1.6, + "grad_norm": 0.01251220703125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2500 + }, + { + "epoch": 1.6032, + "grad_norm": 0.00018596649169921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2505 + }, + { + "epoch": 1.6064, + "grad_norm": 7.62939453125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2510 + }, + { + "epoch": 1.6096, + "grad_norm": 0.0015869140625, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 2515 + }, + { + "epoch": 1.6128, + "grad_norm": 0.00020122528076171875, + "learning_rate": 0.0001, + "loss": 0.0028, + "step": 2520 + }, + { + "epoch": 1.616, + "grad_norm": 0.00058746337890625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2525 + }, + { + "epoch": 1.6192, + "grad_norm": 0.00017070770263671875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2530 + }, + { + "epoch": 1.6223999999999998, + "grad_norm": 5.340576171875e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2535 + }, + { + "epoch": 1.6256, + "grad_norm": 6.818771362304688e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2540 + }, + { + "epoch": 1.6288, + "grad_norm": 0.00014495849609375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2545 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 4.673004150390625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2550 + }, + { + "epoch": 1.6352, + "grad_norm": 6.580352783203125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2555 + }, + { + "epoch": 1.6383999999999999, + "grad_norm": 0.00011396408081054688, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2560 + }, + { + "epoch": 1.6416, + "grad_norm": 0.0036773681640625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2565 + }, + { + "epoch": 1.6448, + "grad_norm": 0.004425048828125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2570 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 8.678436279296875e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2575 + }, + { + "epoch": 1.6512, + "grad_norm": 0.00040435791015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2580 + }, + { + "epoch": 1.6543999999999999, + "grad_norm": 5.435943603515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2585 + }, + { + "epoch": 1.6576, + "grad_norm": 0.0019378662109375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2590 + }, + { + "epoch": 1.6608, + "grad_norm": 2.5153160095214844e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2595 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.47955322265625e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2600 + }, + { + "epoch": 1.6672, + "grad_norm": 6.437301635742188e-05, + "learning_rate": 0.0001, + "loss": 0.0055, + "step": 2605 + }, + { + "epoch": 1.6703999999999999, + "grad_norm": 4.410743713378906e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2610 + }, + { + "epoch": 1.6736, + "grad_norm": 0.01953125, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 2615 + }, + { + "epoch": 1.6768, + "grad_norm": 0.000286102294921875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2620 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 4.9591064453125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2625 + }, + { + "epoch": 1.6832, + "grad_norm": 0.009765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2630 + }, + { + "epoch": 1.6864, + "grad_norm": 0.0001964569091796875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2635 + }, + { + "epoch": 1.6896, + "grad_norm": 0.001983642578125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2640 + }, + { + "epoch": 1.6928, + "grad_norm": 0.0001049041748046875, + "learning_rate": 0.0001, + "loss": 0.002, + "step": 2645 + }, + { + "epoch": 1.696, + "grad_norm": 4.839897155761719e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2650 + }, + { + "epoch": 1.6992, + "grad_norm": 6.4849853515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2655 + }, + { + "epoch": 1.7024, + "grad_norm": 0.00010347366333007812, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2660 + }, + { + "epoch": 1.7056, + "grad_norm": 0.00347900390625, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 2665 + }, + { + "epoch": 1.7088, + "grad_norm": 0.00022220611572265625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2670 + }, + { + "epoch": 1.712, + "grad_norm": 0.000514984130859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2675 + }, + { + "epoch": 1.7151999999999998, + "grad_norm": 0.000308990478515625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2680 + }, + { + "epoch": 1.7184, + "grad_norm": 4.00543212890625e-05, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2685 + }, + { + "epoch": 1.7216, + "grad_norm": 0.007568359375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2690 + }, + { + "epoch": 1.7248, + "grad_norm": 0.00115966796875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2695 + }, + { + "epoch": 1.728, + "grad_norm": 0.0076904296875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2700 + }, + { + "epoch": 1.7311999999999999, + "grad_norm": 0.00022602081298828125, + "learning_rate": 0.0001, + "loss": 0.0026, + "step": 2705 + }, + { + "epoch": 1.7344, + "grad_norm": 9.632110595703125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2710 + }, + { + "epoch": 1.7376, + "grad_norm": 0.0118408203125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2715 + }, + { + "epoch": 1.7408000000000001, + "grad_norm": 0.00146484375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2720 + }, + { + "epoch": 1.744, + "grad_norm": 0.0023345947265625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2725 + }, + { + "epoch": 1.7471999999999999, + "grad_norm": 9.822845458984375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2730 + }, + { + "epoch": 1.7504, + "grad_norm": 2.4318695068359375e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2735 + }, + { + "epoch": 1.7536, + "grad_norm": 9.393692016601562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2740 + }, + { + "epoch": 1.7568000000000001, + "grad_norm": 1.823902130126953e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2745 + }, + { + "epoch": 1.76, + "grad_norm": 0.0001773834228515625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2750 + }, + { + "epoch": 1.7631999999999999, + "grad_norm": 1.6927719116210938e-05, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 2755 + }, + { + "epoch": 1.7664, + "grad_norm": 0.0002307891845703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2760 + }, + { + "epoch": 1.7696, + "grad_norm": 0.0021820068359375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2765 + }, + { + "epoch": 1.7728000000000002, + "grad_norm": 0.01446533203125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2770 + }, + { + "epoch": 1.776, + "grad_norm": 0.0062255859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2775 + }, + { + "epoch": 1.7792, + "grad_norm": 7.2479248046875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2780 + }, + { + "epoch": 1.7824, + "grad_norm": 6.031990051269531e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2785 + }, + { + "epoch": 1.7856, + "grad_norm": 3.314018249511719e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2790 + }, + { + "epoch": 1.7888, + "grad_norm": 0.00011873245239257812, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2795 + }, + { + "epoch": 1.792, + "grad_norm": 0.00274658203125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2800 + }, + { + "epoch": 1.7952, + "grad_norm": 1.7762184143066406e-05, + "learning_rate": 0.0001, + "loss": 0.0026, + "step": 2805 + }, + { + "epoch": 1.7984, + "grad_norm": 0.00110626220703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2810 + }, + { + "epoch": 1.8016, + "grad_norm": 0.000553131103515625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2815 + }, + { + "epoch": 1.8048, + "grad_norm": 0.00390625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2820 + }, + { + "epoch": 1.808, + "grad_norm": 8.440017700195312e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2825 + }, + { + "epoch": 1.8112, + "grad_norm": 0.01214599609375, + "learning_rate": 0.0001, + "loss": 0.002, + "step": 2830 + }, + { + "epoch": 1.8144, + "grad_norm": 1.3053417205810547e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2835 + }, + { + "epoch": 1.8176, + "grad_norm": 0.000514984130859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2840 + }, + { + "epoch": 1.8208, + "grad_norm": 1.990795135498047e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2845 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 2.968311309814453e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2850 + }, + { + "epoch": 1.8272, + "grad_norm": 2.2292137145996094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2855 + }, + { + "epoch": 1.8304, + "grad_norm": 5.078315734863281e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2860 + }, + { + "epoch": 1.8336000000000001, + "grad_norm": 0.000812530517578125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2865 + }, + { + "epoch": 1.8368, + "grad_norm": 0.00077056884765625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2870 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.007781982421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2875 + }, + { + "epoch": 1.8432, + "grad_norm": 0.01385498046875, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 2880 + }, + { + "epoch": 1.8464, + "grad_norm": 0.01507568359375, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2885 + }, + { + "epoch": 1.8496000000000001, + "grad_norm": 8.96453857421875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2890 + }, + { + "epoch": 1.8528, + "grad_norm": 0.003997802734375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2895 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.000301361083984375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2900 + }, + { + "epoch": 1.8592, + "grad_norm": 2.4199485778808594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2905 + }, + { + "epoch": 1.8624, + "grad_norm": 0.000530242919921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2910 + }, + { + "epoch": 1.8656000000000001, + "grad_norm": 0.0048828125, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 2915 + }, + { + "epoch": 1.8688, + "grad_norm": 0.00115203857421875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2920 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.001678466796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2925 + }, + { + "epoch": 1.8752, + "grad_norm": 8.296966552734375e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2930 + }, + { + "epoch": 1.8784, + "grad_norm": 7.724761962890625e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2935 + }, + { + "epoch": 1.8816000000000002, + "grad_norm": 0.00075531005859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2940 + }, + { + "epoch": 1.8848, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.0082, + "step": 2945 + }, + { + "epoch": 1.888, + "grad_norm": 9.34600830078125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2950 + }, + { + "epoch": 1.8912, + "grad_norm": 6.341934204101562e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2955 + }, + { + "epoch": 1.8944, + "grad_norm": 0.000598907470703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2960 + }, + { + "epoch": 1.8976, + "grad_norm": 0.000759124755859375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2965 + }, + { + "epoch": 1.9008, + "grad_norm": 0.00115203857421875, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 2970 + }, + { + "epoch": 1.904, + "grad_norm": 0.00017452239990234375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2975 + }, + { + "epoch": 1.9072, + "grad_norm": 0.0005645751953125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 2980 + }, + { + "epoch": 1.9104, + "grad_norm": 0.0007781982421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2985 + }, + { + "epoch": 1.9136, + "grad_norm": 0.00010824203491210938, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2990 + }, + { + "epoch": 1.9167999999999998, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001, + "loss": 0.0083, + "step": 2995 + }, + { + "epoch": 1.92, + "grad_norm": 5.245208740234375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3000 + }, + { + "epoch": 1.9232, + "grad_norm": 4.315376281738281e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3005 + }, + { + "epoch": 1.9264000000000001, + "grad_norm": 0.00014400482177734375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3010 + }, + { + "epoch": 1.9296, + "grad_norm": 0.01171875, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 3015 + }, + { + "epoch": 1.9327999999999999, + "grad_norm": 0.000598907470703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3020 + }, + { + "epoch": 1.936, + "grad_norm": 3.504753112792969e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3025 + }, + { + "epoch": 1.9392, + "grad_norm": 6.628036499023438e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3030 + }, + { + "epoch": 1.9424000000000001, + "grad_norm": 5.8650970458984375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3035 + }, + { + "epoch": 1.9456, + "grad_norm": 6.866455078125e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3040 + }, + { + "epoch": 1.9487999999999999, + "grad_norm": 0.00048828125, + "learning_rate": 0.0001, + "loss": 0.0041, + "step": 3045 + }, + { + "epoch": 1.952, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3050 + }, + { + "epoch": 1.9552, + "grad_norm": 0.000919342041015625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3055 + }, + { + "epoch": 1.9584000000000001, + "grad_norm": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3060 + }, + { + "epoch": 1.9616, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 3065 + }, + { + "epoch": 1.9647999999999999, + "grad_norm": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 3070 + }, + { + "epoch": 1.968, + "grad_norm": 0.00089263916015625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3075 + }, + { + "epoch": 1.9712, + "grad_norm": 0.0076904296875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3080 + }, + { + "epoch": 1.9744000000000002, + "grad_norm": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 3085 + }, + { + "epoch": 1.9776, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3090 + }, + { + "epoch": 1.9808, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3095 + }, + { + "epoch": 1.984, + "grad_norm": 4.935264587402344e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3100 + }, + { + "epoch": 1.9872, + "grad_norm": 0.00018596649169921875, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 3105 + }, + { + "epoch": 1.9904, + "grad_norm": 0.000698089599609375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 1.9936, + "grad_norm": 0.00113677978515625, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 3115 + }, + { + "epoch": 1.9968, + "grad_norm": 7.200241088867188e-05, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 3120 + }, + { + "epoch": 2.0, + "grad_norm": 9.5367431640625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3125 + }, + { + "epoch": 2.0032, + "grad_norm": 0.007568359375, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 3130 + }, + { + "epoch": 2.0064, + "grad_norm": 0.0019073486328125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3135 + }, + { + "epoch": 2.0096, + "grad_norm": 0.0081787109375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3140 + }, + { + "epoch": 2.0128, + "grad_norm": 0.00014019012451171875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3145 + }, + { + "epoch": 2.016, + "grad_norm": 1.8358230590820312e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 2.0192, + "grad_norm": 0.005828857421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3155 + }, + { + "epoch": 2.0224, + "grad_norm": 4.3392181396484375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3160 + }, + { + "epoch": 2.0256, + "grad_norm": 0.0002460479736328125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3165 + }, + { + "epoch": 2.0288, + "grad_norm": 0.000545501708984375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3170 + }, + { + "epoch": 2.032, + "grad_norm": 2.849102020263672e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3175 + }, + { + "epoch": 2.0352, + "grad_norm": 0.007781982421875, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 3180 + }, + { + "epoch": 2.0384, + "grad_norm": 0.0003719329833984375, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3185 + }, + { + "epoch": 2.0416, + "grad_norm": 1.1682510375976562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3190 + }, + { + "epoch": 2.0448, + "grad_norm": 0.000732421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3195 + }, + { + "epoch": 2.048, + "grad_norm": 1.633167266845703e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3200 + }, + { + "epoch": 2.0512, + "grad_norm": 0.000431060791015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3205 + }, + { + "epoch": 2.0544, + "grad_norm": 3.123283386230469e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3210 + }, + { + "epoch": 2.0576, + "grad_norm": 2.491474151611328e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3215 + }, + { + "epoch": 2.0608, + "grad_norm": 1.8358230590820312e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3220 + }, + { + "epoch": 2.064, + "grad_norm": 2.7179718017578125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3225 + }, + { + "epoch": 2.0672, + "grad_norm": 0.0028839111328125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3230 + }, + { + "epoch": 2.0704, + "grad_norm": 0.00506591796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3235 + }, + { + "epoch": 2.0736, + "grad_norm": 0.0004749298095703125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3240 + }, + { + "epoch": 2.0768, + "grad_norm": 0.001251220703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3245 + }, + { + "epoch": 2.08, + "grad_norm": 1.633167266845703e-05, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 3250 + }, + { + "epoch": 2.0832, + "grad_norm": 5.817413330078125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3255 + }, + { + "epoch": 2.0864, + "grad_norm": 5.817413330078125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3260 + }, + { + "epoch": 2.0896, + "grad_norm": 0.007598876953125, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 3265 + }, + { + "epoch": 2.0928, + "grad_norm": 7.200241088867188e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3270 + }, + { + "epoch": 2.096, + "grad_norm": 6.866455078125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3275 + }, + { + "epoch": 2.0992, + "grad_norm": 0.000553131103515625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3280 + }, + { + "epoch": 2.1024, + "grad_norm": 0.015380859375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 3285 + }, + { + "epoch": 2.1056, + "grad_norm": 8.58306884765625e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3290 + }, + { + "epoch": 2.1088, + "grad_norm": 0.01165771484375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3295 + }, + { + "epoch": 2.112, + "grad_norm": 1.2755393981933594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3300 + }, + { + "epoch": 2.1152, + "grad_norm": 0.000213623046875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3305 + }, + { + "epoch": 2.1184, + "grad_norm": 2.0503997802734375e-05, + "learning_rate": 0.0001, + "loss": 0.0048, + "step": 3310 + }, + { + "epoch": 2.1216, + "grad_norm": 0.00011348724365234375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3315 + }, + { + "epoch": 2.1248, + "grad_norm": 0.000301361083984375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3320 + }, + { + "epoch": 2.128, + "grad_norm": 0.00041961669921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3325 + }, + { + "epoch": 2.1312, + "grad_norm": 0.0007781982421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3330 + }, + { + "epoch": 2.1344, + "grad_norm": 0.00171661376953125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3335 + }, + { + "epoch": 2.1376, + "grad_norm": 8.7738037109375e-05, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3340 + }, + { + "epoch": 2.1408, + "grad_norm": 0.000270843505859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3345 + }, + { + "epoch": 2.144, + "grad_norm": 7.867813110351562e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3350 + }, + { + "epoch": 2.1471999999999998, + "grad_norm": 0.0002269744873046875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3355 + }, + { + "epoch": 2.1504, + "grad_norm": 1.4781951904296875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 2.1536, + "grad_norm": 0.0087890625, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 3365 + }, + { + "epoch": 2.1568, + "grad_norm": 1.4066696166992188e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3370 + }, + { + "epoch": 2.16, + "grad_norm": 3.4332275390625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3375 + }, + { + "epoch": 2.1632, + "grad_norm": 0.0024566650390625, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3380 + }, + { + "epoch": 2.1664, + "grad_norm": 0.01202392578125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3385 + }, + { + "epoch": 2.1696, + "grad_norm": 8.058547973632812e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 2.1728, + "grad_norm": 0.00136566162109375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3395 + }, + { + "epoch": 2.176, + "grad_norm": 5.53131103515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 2.1792, + "grad_norm": 0.00021266937255859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3405 + }, + { + "epoch": 2.1824, + "grad_norm": 2.1576881408691406e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3410 + }, + { + "epoch": 2.1856, + "grad_norm": 2.288818359375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3415 + }, + { + "epoch": 2.1888, + "grad_norm": 2.0623207092285156e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3420 + }, + { + "epoch": 2.192, + "grad_norm": 0.0003108978271484375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3425 + }, + { + "epoch": 2.1952, + "grad_norm": 0.001983642578125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3430 + }, + { + "epoch": 2.1984, + "grad_norm": 0.00531005859375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3435 + }, + { + "epoch": 2.2016, + "grad_norm": 0.0015716552734375, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 3440 + }, + { + "epoch": 2.2048, + "grad_norm": 0.0003223419189453125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3445 + }, + { + "epoch": 2.208, + "grad_norm": 0.0003376007080078125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3450 + }, + { + "epoch": 2.2112, + "grad_norm": 3.0040740966796875e-05, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 3455 + }, + { + "epoch": 2.2144, + "grad_norm": 0.000858306884765625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3460 + }, + { + "epoch": 2.2176, + "grad_norm": 0.00054168701171875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3465 + }, + { + "epoch": 2.2208, + "grad_norm": 1.9311904907226562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3470 + }, + { + "epoch": 2.224, + "grad_norm": 0.0002803802490234375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3475 + }, + { + "epoch": 2.2272, + "grad_norm": 0.0125732421875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3480 + }, + { + "epoch": 2.2304, + "grad_norm": 4.76837158203125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3485 + }, + { + "epoch": 2.2336, + "grad_norm": 5.1975250244140625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3490 + }, + { + "epoch": 2.2368, + "grad_norm": 7.915496826171875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3495 + }, + { + "epoch": 2.24, + "grad_norm": 7.212162017822266e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3500 + }, + { + "epoch": 2.2432, + "grad_norm": 7.05718994140625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3505 + }, + { + "epoch": 2.2464, + "grad_norm": 1.4960765838623047e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3510 + }, + { + "epoch": 2.2496, + "grad_norm": 0.017578125, + "learning_rate": 0.0001, + "loss": 0.0027, + "step": 3515 + }, + { + "epoch": 2.2528, + "grad_norm": 1.1146068572998047e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3520 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.0012664794921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3525 + }, + { + "epoch": 2.2592, + "grad_norm": 0.00010824203491210938, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3530 + }, + { + "epoch": 2.2624, + "grad_norm": 0.003509521484375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3535 + }, + { + "epoch": 2.2656, + "grad_norm": 7.510185241699219e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3540 + }, + { + "epoch": 2.2688, + "grad_norm": 0.00157928466796875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3545 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 4.500150680541992e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3550 + }, + { + "epoch": 2.2752, + "grad_norm": 2.086162567138672e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3555 + }, + { + "epoch": 2.2784, + "grad_norm": 0.00011110305786132812, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3560 + }, + { + "epoch": 2.2816, + "grad_norm": 1.4722347259521484e-05, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3565 + }, + { + "epoch": 2.2848, + "grad_norm": 1.0788440704345703e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3570 + }, + { + "epoch": 2.288, + "grad_norm": 5.245208740234375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3575 + }, + { + "epoch": 2.2912, + "grad_norm": 0.0019989013671875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3580 + }, + { + "epoch": 2.2944, + "grad_norm": 0.0069580078125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3585 + }, + { + "epoch": 2.2976, + "grad_norm": 6.22868537902832e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3590 + }, + { + "epoch": 2.3008, + "grad_norm": 6.580352783203125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3595 + }, + { + "epoch": 2.304, + "grad_norm": 5.185604095458984e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3600 + }, + { + "epoch": 2.3072, + "grad_norm": 2.288818359375e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3605 + }, + { + "epoch": 2.3104, + "grad_norm": 6.29425048828125e-05, + "learning_rate": 0.0001, + "loss": 0.0067, + "step": 3610 + }, + { + "epoch": 2.3136, + "grad_norm": 1.9669532775878906e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3615 + }, + { + "epoch": 2.3168, + "grad_norm": 1.2755393981933594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3620 + }, + { + "epoch": 2.32, + "grad_norm": 2.9206275939941406e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3625 + }, + { + "epoch": 2.3232, + "grad_norm": 0.0078125, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 3630 + }, + { + "epoch": 2.3264, + "grad_norm": 0.0005950927734375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3635 + }, + { + "epoch": 2.3296, + "grad_norm": 0.0009613037109375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3640 + }, + { + "epoch": 2.3327999999999998, + "grad_norm": 3.409385681152344e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3645 + }, + { + "epoch": 2.336, + "grad_norm": 0.000598907470703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3650 + }, + { + "epoch": 2.3392, + "grad_norm": 0.002532958984375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3655 + }, + { + "epoch": 2.3424, + "grad_norm": 1.3053417205810547e-05, + "learning_rate": 0.0001, + "loss": 0.0065, + "step": 3660 + }, + { + "epoch": 2.3456, + "grad_norm": 3.4809112548828125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3665 + }, + { + "epoch": 2.3487999999999998, + "grad_norm": 9.655952453613281e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3670 + }, + { + "epoch": 2.352, + "grad_norm": 2.4199485778808594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3675 + }, + { + "epoch": 2.3552, + "grad_norm": 0.00072479248046875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3680 + }, + { + "epoch": 2.3584, + "grad_norm": 0.000553131103515625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3685 + }, + { + "epoch": 2.3616, + "grad_norm": 5.7220458984375e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3690 + }, + { + "epoch": 2.3648, + "grad_norm": 4.2438507080078125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3695 + }, + { + "epoch": 2.368, + "grad_norm": 0.0003528594970703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 2.3712, + "grad_norm": 3.552436828613281e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3705 + }, + { + "epoch": 2.3744, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3710 + }, + { + "epoch": 2.3776, + "grad_norm": 2.2172927856445312e-05, + "learning_rate": 0.0001, + "loss": 0.004, + "step": 3715 + }, + { + "epoch": 2.3808, + "grad_norm": 1.9669532775878906e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3720 + }, + { + "epoch": 2.384, + "grad_norm": 2.110004425048828e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3725 + }, + { + "epoch": 2.3872, + "grad_norm": 0.006866455078125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3730 + }, + { + "epoch": 2.3904, + "grad_norm": 0.00118255615234375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3735 + }, + { + "epoch": 2.3936, + "grad_norm": 1.2099742889404297e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3740 + }, + { + "epoch": 2.3968, + "grad_norm": 0.00213623046875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3745 + }, + { + "epoch": 2.4, + "grad_norm": 1.0848045349121094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3750 + }, + { + "epoch": 2.4032, + "grad_norm": 2.8371810913085938e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3755 + }, + { + "epoch": 2.4064, + "grad_norm": 1.4662742614746094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3760 + }, + { + "epoch": 2.4096, + "grad_norm": 1.6927719116210938e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3765 + }, + { + "epoch": 2.4128, + "grad_norm": 1.2516975402832031e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3770 + }, + { + "epoch": 2.416, + "grad_norm": 1.8835067749023438e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3775 + }, + { + "epoch": 2.4192, + "grad_norm": 0.000213623046875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3780 + }, + { + "epoch": 2.4224, + "grad_norm": 8.153915405273438e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3785 + }, + { + "epoch": 2.4256, + "grad_norm": 2.288818359375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3790 + }, + { + "epoch": 2.4288, + "grad_norm": 0.0001430511474609375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3795 + }, + { + "epoch": 2.432, + "grad_norm": 6.288290023803711e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3800 + }, + { + "epoch": 2.4352, + "grad_norm": 0.0002727508544921875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3805 + }, + { + "epoch": 2.4384, + "grad_norm": 1.4603137969970703e-05, + "learning_rate": 0.0001, + "loss": 0.0082, + "step": 3810 + }, + { + "epoch": 2.4416, + "grad_norm": 2.0742416381835938e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3815 + }, + { + "epoch": 2.4448, + "grad_norm": 9.000301361083984e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3820 + }, + { + "epoch": 2.448, + "grad_norm": 1.4185905456542969e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3825 + }, + { + "epoch": 2.4512, + "grad_norm": 0.00021648406982421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3830 + }, + { + "epoch": 2.4544, + "grad_norm": 4.172325134277344e-05, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3835 + }, + { + "epoch": 2.4576000000000002, + "grad_norm": 6.377696990966797e-06, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3840 + }, + { + "epoch": 2.4608, + "grad_norm": 0.00145721435546875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3845 + }, + { + "epoch": 2.464, + "grad_norm": 9.59634780883789e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3850 + }, + { + "epoch": 2.4672, + "grad_norm": 0.0015869140625, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3855 + }, + { + "epoch": 2.4704, + "grad_norm": 1.7404556274414062e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3860 + }, + { + "epoch": 2.4736000000000002, + "grad_norm": 0.00075531005859375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3865 + }, + { + "epoch": 2.4768, + "grad_norm": 7.420778274536133e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3870 + }, + { + "epoch": 2.48, + "grad_norm": 1.4483928680419922e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3875 + }, + { + "epoch": 2.4832, + "grad_norm": 4.6253204345703125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3880 + }, + { + "epoch": 2.4864, + "grad_norm": 0.0013275146484375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3885 + }, + { + "epoch": 2.4896, + "grad_norm": 0.000568389892578125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3890 + }, + { + "epoch": 2.4928, + "grad_norm": 0.04541015625, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 3895 + }, + { + "epoch": 2.496, + "grad_norm": 0.00555419921875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3900 + }, + { + "epoch": 2.4992, + "grad_norm": 0.00024318695068359375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3905 + }, + { + "epoch": 2.5023999999999997, + "grad_norm": 3.7670135498046875e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3910 + }, + { + "epoch": 2.5056000000000003, + "grad_norm": 0.0002765655517578125, + "learning_rate": 0.0001, + "loss": 0.0036, + "step": 3915 + }, + { + "epoch": 2.5088, + "grad_norm": 0.005462646484375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3920 + }, + { + "epoch": 2.512, + "grad_norm": 3.886222839355469e-05, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3925 + }, + { + "epoch": 2.5152, + "grad_norm": 0.0020751953125, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 3930 + }, + { + "epoch": 2.5183999999999997, + "grad_norm": 0.003082275390625, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 3935 + }, + { + "epoch": 2.5216, + "grad_norm": 0.0002231597900390625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3940 + }, + { + "epoch": 2.5248, + "grad_norm": 0.0017547607421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3945 + }, + { + "epoch": 2.528, + "grad_norm": 8.487701416015625e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3950 + }, + { + "epoch": 2.5312, + "grad_norm": 0.000263214111328125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3955 + }, + { + "epoch": 2.5343999999999998, + "grad_norm": 0.0002307891845703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3960 + }, + { + "epoch": 2.5376, + "grad_norm": 0.000244140625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3965 + }, + { + "epoch": 2.5408, + "grad_norm": 5.459785461425781e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3970 + }, + { + "epoch": 2.544, + "grad_norm": 3.075599670410156e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3975 + }, + { + "epoch": 2.5472, + "grad_norm": 0.0008087158203125, + "learning_rate": 0.0001, + "loss": 0.0024, + "step": 3980 + }, + { + "epoch": 2.5504, + "grad_norm": 0.007781982421875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3985 + }, + { + "epoch": 2.5536, + "grad_norm": 0.0004863739013671875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3990 + }, + { + "epoch": 2.5568, + "grad_norm": 0.023193359375, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 3995 + }, + { + "epoch": 2.56, + "grad_norm": 0.010009765625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4000 + }, + { + "epoch": 2.5632, + "grad_norm": 0.006134033203125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4005 + }, + { + "epoch": 2.5664, + "grad_norm": 0.0010833740234375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4010 + }, + { + "epoch": 2.5696, + "grad_norm": 3.695487976074219e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4015 + }, + { + "epoch": 2.5728, + "grad_norm": 4.2438507080078125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4020 + }, + { + "epoch": 2.576, + "grad_norm": 2.4199485778808594e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4025 + }, + { + "epoch": 2.5792, + "grad_norm": 0.00144195556640625, + "learning_rate": 0.0001, + "loss": 0.0026, + "step": 4030 + }, + { + "epoch": 2.5824, + "grad_norm": 0.0015716552734375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 4035 + }, + { + "epoch": 2.5856, + "grad_norm": 0.00049591064453125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4040 + }, + { + "epoch": 2.5888, + "grad_norm": 0.0022430419921875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4045 + }, + { + "epoch": 2.592, + "grad_norm": 0.0001544952392578125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4050 + }, + { + "epoch": 2.5952, + "grad_norm": 0.0076904296875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4055 + }, + { + "epoch": 2.5984, + "grad_norm": 0.000446319580078125, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 4060 + }, + { + "epoch": 2.6016, + "grad_norm": 6.341934204101562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4065 + }, + { + "epoch": 2.6048, + "grad_norm": 3.6716461181640625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4070 + }, + { + "epoch": 2.608, + "grad_norm": 0.000461578369140625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4075 + }, + { + "epoch": 2.6112, + "grad_norm": 0.003204345703125, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 4080 + }, + { + "epoch": 2.6144, + "grad_norm": 0.00286865234375, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 4085 + }, + { + "epoch": 2.6176, + "grad_norm": 6.67572021484375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4090 + }, + { + "epoch": 2.6208, + "grad_norm": 0.00110626220703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4095 + }, + { + "epoch": 2.624, + "grad_norm": 0.0003604888916015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4100 + }, + { + "epoch": 2.6272, + "grad_norm": 0.004302978515625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4105 + }, + { + "epoch": 2.6304, + "grad_norm": 6.4849853515625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4110 + }, + { + "epoch": 2.6336, + "grad_norm": 0.000171661376953125, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 4115 + }, + { + "epoch": 2.6368, + "grad_norm": 5.781650543212891e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4120 + }, + { + "epoch": 2.64, + "grad_norm": 9.000301361083984e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4125 + }, + { + "epoch": 2.6432, + "grad_norm": 0.0030975341796875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4130 + }, + { + "epoch": 2.6464, + "grad_norm": 0.00131988525390625, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 4135 + }, + { + "epoch": 2.6496, + "grad_norm": 0.000354766845703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4140 + }, + { + "epoch": 2.6528, + "grad_norm": 7.724761962890625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4145 + }, + { + "epoch": 2.656, + "grad_norm": 4.231929779052734e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4150 + }, + { + "epoch": 2.6592000000000002, + "grad_norm": 1.704692840576172e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4155 + }, + { + "epoch": 2.6624, + "grad_norm": 6.616115570068359e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4160 + }, + { + "epoch": 2.6656, + "grad_norm": 6.884336471557617e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4165 + }, + { + "epoch": 2.6688, + "grad_norm": 5.453824996948242e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4170 + }, + { + "epoch": 2.672, + "grad_norm": 0.000179290771484375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4175 + }, + { + "epoch": 2.6752000000000002, + "grad_norm": 0.013916015625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4180 + }, + { + "epoch": 2.6784, + "grad_norm": 0.0031890869140625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4185 + }, + { + "epoch": 2.6816, + "grad_norm": 1.7404556274414062e-05, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 4190 + }, + { + "epoch": 2.6848, + "grad_norm": 4.291534423828125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4195 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 7.420778274536133e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4200 + }, + { + "epoch": 2.6912000000000003, + "grad_norm": 0.0001087188720703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4205 + }, + { + "epoch": 2.6944, + "grad_norm": 3.647804260253906e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4210 + }, + { + "epoch": 2.6976, + "grad_norm": 0.01220703125, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 4215 + }, + { + "epoch": 2.7008, + "grad_norm": 6.109476089477539e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4220 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 1.8477439880371094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4225 + }, + { + "epoch": 2.7072000000000003, + "grad_norm": 0.0009765625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4230 + }, + { + "epoch": 2.7104, + "grad_norm": 0.0002918243408203125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4235 + }, + { + "epoch": 2.7136, + "grad_norm": 7.62939453125e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4240 + }, + { + "epoch": 2.7168, + "grad_norm": 5.340576171875e-05, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4245 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 5.930662155151367e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4250 + }, + { + "epoch": 2.7232, + "grad_norm": 0.0009613037109375, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 4255 + }, + { + "epoch": 2.7264, + "grad_norm": 2.3603439331054688e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4260 + }, + { + "epoch": 2.7296, + "grad_norm": 2.110004425048828e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4265 + }, + { + "epoch": 2.7328, + "grad_norm": 1.055002212524414e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4270 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 0.00011110305786132812, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4275 + }, + { + "epoch": 2.7392, + "grad_norm": 0.0032196044921875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4280 + }, + { + "epoch": 2.7424, + "grad_norm": 0.01239013671875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4285 + }, + { + "epoch": 2.7456, + "grad_norm": 5.626678466796875e-05, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 4290 + }, + { + "epoch": 2.7488, + "grad_norm": 0.000469207763671875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4295 + }, + { + "epoch": 2.752, + "grad_norm": 2.372264862060547e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4300 + }, + { + "epoch": 2.7552, + "grad_norm": 0.00115203857421875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4305 + }, + { + "epoch": 2.7584, + "grad_norm": 4.57763671875e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4310 + }, + { + "epoch": 2.7616, + "grad_norm": 0.0001354217529296875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 4315 + }, + { + "epoch": 2.7648, + "grad_norm": 1.4185905456542969e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4320 + }, + { + "epoch": 2.768, + "grad_norm": 0.00012159347534179688, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4325 + }, + { + "epoch": 2.7712, + "grad_norm": 0.0087890625, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 4330 + }, + { + "epoch": 2.7744, + "grad_norm": 0.00128936767578125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4335 + }, + { + "epoch": 2.7776, + "grad_norm": 7.0035457611083984e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4340 + }, + { + "epoch": 2.7808, + "grad_norm": 4.458427429199219e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4345 + }, + { + "epoch": 2.784, + "grad_norm": 1.4960765838623047e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4350 + }, + { + "epoch": 2.7872, + "grad_norm": 0.00145721435546875, + "learning_rate": 0.0001, + "loss": 0.0077, + "step": 4355 + }, + { + "epoch": 2.7904, + "grad_norm": 1.2636184692382812e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4360 + }, + { + "epoch": 2.7936, + "grad_norm": 0.01092529296875, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 4365 + }, + { + "epoch": 2.7968, + "grad_norm": 1.3828277587890625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4370 + }, + { + "epoch": 2.8, + "grad_norm": 0.0002593994140625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4375 + }, + { + "epoch": 2.8032, + "grad_norm": 8.96453857421875e-05, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 4380 + }, + { + "epoch": 2.8064, + "grad_norm": 0.00038909912109375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4385 + }, + { + "epoch": 2.8096, + "grad_norm": 1.7404556274414062e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4390 + }, + { + "epoch": 2.8128, + "grad_norm": 0.0181884765625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4395 + }, + { + "epoch": 2.816, + "grad_norm": 1.9788742065429688e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4400 + }, + { + "epoch": 2.8192, + "grad_norm": 0.000514984130859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4405 + }, + { + "epoch": 2.8224, + "grad_norm": 3.981590270996094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4410 + }, + { + "epoch": 2.8256, + "grad_norm": 1.3172626495361328e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4415 + }, + { + "epoch": 2.8288, + "grad_norm": 1.0192394256591797e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4420 + }, + { + "epoch": 2.832, + "grad_norm": 0.00017070770263671875, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 4425 + }, + { + "epoch": 2.8352, + "grad_norm": 0.0004100799560546875, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4430 + }, + { + "epoch": 2.8384, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4435 + }, + { + "epoch": 2.8416, + "grad_norm": 0.000797271728515625, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4440 + }, + { + "epoch": 2.8448, + "grad_norm": 4.00543212890625e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4445 + }, + { + "epoch": 2.848, + "grad_norm": 5.513429641723633e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4450 + }, + { + "epoch": 2.8512, + "grad_norm": 2.0265579223632812e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4455 + }, + { + "epoch": 2.8544, + "grad_norm": 0.00010013580322265625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4460 + }, + { + "epoch": 2.8576, + "grad_norm": 0.00189971923828125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4465 + }, + { + "epoch": 2.8608000000000002, + "grad_norm": 1.9311904907226562e-05, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 4470 + }, + { + "epoch": 2.864, + "grad_norm": 0.0150146484375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4475 + }, + { + "epoch": 2.8672, + "grad_norm": 0.000732421875, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 4480 + }, + { + "epoch": 2.8704, + "grad_norm": 0.00018596649169921875, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 4485 + }, + { + "epoch": 2.8736, + "grad_norm": 0.00049591064453125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4490 + }, + { + "epoch": 2.8768000000000002, + "grad_norm": 0.0032501220703125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4495 + }, + { + "epoch": 2.88, + "grad_norm": 1.055002212524414e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4500 + }, + { + "epoch": 2.8832, + "grad_norm": 6.151199340820312e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4505 + }, + { + "epoch": 2.8864, + "grad_norm": 0.0003452301025390625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4510 + }, + { + "epoch": 2.8895999999999997, + "grad_norm": 2.9325485229492188e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4515 + }, + { + "epoch": 2.8928000000000003, + "grad_norm": 0.0172119140625, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 4520 + }, + { + "epoch": 2.896, + "grad_norm": 0.00994873046875, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 4525 + }, + { + "epoch": 2.8992, + "grad_norm": 0.0004177093505859375, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 4530 + }, + { + "epoch": 2.9024, + "grad_norm": 0.000476837158203125, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4535 + }, + { + "epoch": 2.9055999999999997, + "grad_norm": 0.0004215240478515625, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 4540 + }, + { + "epoch": 2.9088000000000003, + "grad_norm": 0.000247955322265625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4545 + }, + { + "epoch": 2.912, + "grad_norm": 0.0040283203125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4550 + }, + { + "epoch": 2.9152, + "grad_norm": 0.00124359130859375, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4555 + }, + { + "epoch": 2.9184, + "grad_norm": 4.6253204345703125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4560 + }, + { + "epoch": 2.9215999999999998, + "grad_norm": 0.0004825592041015625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4565 + }, + { + "epoch": 2.9248, + "grad_norm": 0.0002574920654296875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4570 + }, + { + "epoch": 2.928, + "grad_norm": 1.9431114196777344e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4575 + }, + { + "epoch": 2.9312, + "grad_norm": 0.003387451171875, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4580 + }, + { + "epoch": 2.9344, + "grad_norm": 0.00160980224609375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4585 + }, + { + "epoch": 2.9375999999999998, + "grad_norm": 2.5987625122070312e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4590 + }, + { + "epoch": 2.9408, + "grad_norm": 7.009506225585938e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4595 + }, + { + "epoch": 2.944, + "grad_norm": 1.341104507446289e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 2.9472, + "grad_norm": 9.393692016601562e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4605 + }, + { + "epoch": 2.9504, + "grad_norm": 1.919269561767578e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4610 + }, + { + "epoch": 2.9536, + "grad_norm": 4.76837158203125e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4615 + }, + { + "epoch": 2.9568, + "grad_norm": 9.417533874511719e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4620 + }, + { + "epoch": 2.96, + "grad_norm": 1.1980533599853516e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4625 + }, + { + "epoch": 2.9632, + "grad_norm": 0.00116729736328125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4630 + }, + { + "epoch": 2.9664, + "grad_norm": 0.0016632080078125, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4635 + }, + { + "epoch": 2.9696, + "grad_norm": 0.001373291015625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4640 + }, + { + "epoch": 2.9728, + "grad_norm": 0.00014972686767578125, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4645 + }, + { + "epoch": 2.976, + "grad_norm": 5.125999450683594e-06, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 4650 + }, + { + "epoch": 2.9792, + "grad_norm": 6.67572021484375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4655 + }, + { + "epoch": 2.9824, + "grad_norm": 1.1086463928222656e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4660 + }, + { + "epoch": 2.9856, + "grad_norm": 0.0002803802490234375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4665 + }, + { + "epoch": 2.9888, + "grad_norm": 3.218650817871094e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4670 + }, + { + "epoch": 2.992, + "grad_norm": 1.1086463928222656e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4675 + }, + { + "epoch": 2.9952, + "grad_norm": 1.6689300537109375e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4680 + }, + { + "epoch": 2.9984, + "grad_norm": 9.47713851928711e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4685 + }, + { + "epoch": 3.0016, + "grad_norm": 0.005615234375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4690 + }, + { + "epoch": 3.0048, + "grad_norm": 0.00191497802734375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4695 + }, + { + "epoch": 3.008, + "grad_norm": 0.0022125244140625, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 3.0112, + "grad_norm": 0.000110626220703125, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4705 + }, + { + "epoch": 3.0144, + "grad_norm": 2.1576881408691406e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4710 + }, + { + "epoch": 3.0176, + "grad_norm": 4.172325134277344e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4715 + }, + { + "epoch": 3.0208, + "grad_norm": 0.000225067138671875, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4720 + }, + { + "epoch": 3.024, + "grad_norm": 1.341104507446289e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4725 + }, + { + "epoch": 3.0272, + "grad_norm": 8.940696716308594e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4730 + }, + { + "epoch": 3.0304, + "grad_norm": 7.772445678710938e-05, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 4735 + }, + { + "epoch": 3.0336, + "grad_norm": 5.316734313964844e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4740 + }, + { + "epoch": 3.0368, + "grad_norm": 2.2172927856445312e-05, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4745 + }, + { + "epoch": 3.04, + "grad_norm": 0.00162506103515625, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4750 + }, + { + "epoch": 3.0432, + "grad_norm": 3.528594970703125e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4755 + }, + { + "epoch": 3.0464, + "grad_norm": 1.2099742889404297e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4760 + }, + { + "epoch": 3.0496, + "grad_norm": 0.003631591796875, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4765 + }, + { + "epoch": 3.0528, + "grad_norm": 2.7298927307128906e-05, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 4770 + }, + { + "epoch": 3.056, + "grad_norm": 3.266334533691406e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4775 + }, + { + "epoch": 3.0592, + "grad_norm": 7.063150405883789e-06, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4780 + }, + { + "epoch": 3.0624, + "grad_norm": 1.2516975402832031e-05, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4785 + }, + { + "epoch": 3.0656, + "grad_norm": 0.0146484375, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 4790 + }, + { + "epoch": 3.0688, + "grad_norm": 0.00020503997802734375, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 4795 + }, + { + "epoch": 3.072, + "grad_norm": 0.00091552734375, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 4800 + } + ], + "logging_steps": 5, + "max_steps": 4800, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.883056636914381e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/training_args.bin b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29f75dce5b29053c93ee48c9b3f647e3f5e83f58 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bfb5a7396260331223e1b3fd2f19765dd4d7b0a41660ebb1d64c6e7fa95fe90 +size 7416 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/completed b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/metrics.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..7c36e203c08897fb5720186c59224bbada85b822 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/metrics.json @@ -0,0 +1 @@ +{"run_name": "codetransocean_srcml_java", "train_runtime": 11061.1318, "train_samples_per_second": 0.521, "train_steps_per_second": 0.004, "total_flos": 3.685541393109811e+17, "train_loss": 0.5359265327453613, "epoch": 3.5294117647058822} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/train_results.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a4011d30782f214192d8e1542f0d95831d6ae3b0 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.5294117647058822, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5359265327453613, + "train_runtime": 11061.1318, + "train_samples_per_second": 0.521, + "train_steps_per_second": 0.004 +} \ No newline at end of file diff --git a/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/trainer_state.json b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5664d739288de376a9927b278c54b462626de23 --- /dev/null +++ b/codellama/java/codetrans/codetransocean/codetransocean_srcml_java/trainer_state.json @@ -0,0 +1,105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5294117647058822, + "eval_steps": 500, + "global_step": 45, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.39215686274509803, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001, + "loss": 0.5995, + "step": 5 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.5904, + "step": 10 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0001, + "loss": 0.556, + "step": 15 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0001, + "loss": 0.5464, + "step": 20 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0001, + "loss": 0.5259, + "step": 25 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0147705078125, + "learning_rate": 0.0001, + "loss": 0.5226, + "step": 30 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.029296875, + "learning_rate": 0.0001, + "loss": 0.5098, + "step": 35 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.016357421875, + "learning_rate": 0.0001, + "loss": 0.5106, + "step": 40 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001, + "loss": 0.4622, + "step": 45 + }, + { + "epoch": 3.5294117647058822, + "step": 45, + "total_flos": 3.685541393109811e+17, + "train_loss": 0.5359265327453613, + "train_runtime": 11061.1318, + "train_samples_per_second": 0.521, + "train_steps_per_second": 0.004 + } + ], + "logging_steps": 5, + "max_steps": 45, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 180, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.685541393109811e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/dataflow_java_pretrained/all_results.json b/codellama/java/dataflow_java_pretrained/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bce81059a7b1dd1728248f3f77c43d2df68ad77d --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 0.9049489395129615, + "total_flos": 1.6220305320330854e+18, + "train_loss": 0.07024859038905965, + "train_runtime": 50041.181, + "train_samples_per_second": 0.921, + "train_steps_per_second": 0.007 +} \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/README.md b/codellama/java/dataflow_java_pretrained/checkpoint-360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_config.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6187bba1151b14c1207088bf6aefc2a05c33523e --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "up_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model.safetensors b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8c9e9cca9f6765775c083697e3ca540cc68c21c --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9255ef4d476c8cbd01b41ea21b7c34815d6f169ffc89989dacc449605fbb6204 +size 1156480200 diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/README.md b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f701e106913179e53b07103ec61ffc10178fd6c0 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: ../CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_config.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6187bba1151b14c1207088bf6aefc2a05c33523e --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "up_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_model.safetensors b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8c9e9cca9f6765775c083697e3ca540cc68c21c --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9255ef4d476c8cbd01b41ea21b7c34815d6f169ffc89989dacc449605fbb6204 +size 1156480200 diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/added_tokens.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/optimizer.pt b/codellama/java/dataflow_java_pretrained/checkpoint-360/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..795ecde19f4d82dda36e4d31d1aa25f1861d633c --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510d26930880ee5a4bd5c07dd609e378cb1f5b78285942f6effe2385e829bc08 +size 2003127538 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/rng_state.pth b/codellama/java/dataflow_java_pretrained/checkpoint-360/rng_state.pth similarity index 100% rename from codellama/java/dataflow_pretrained/checkpoint-720/rng_state.pth rename to codellama/java/dataflow_java_pretrained/checkpoint-360/rng_state.pth diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/scheduler.pt b/codellama/java/dataflow_java_pretrained/checkpoint-360/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffb7177b487c41d6b9f78f59fcdd9023706925df --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baba7c5dff09a1d575a7ff0a27f1158d5dd92adec2a108211e3ca605cfdd03a6 +size 1064 diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/special_tokens_map.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer.model b/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer_config.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/tokenizer_config.json @@ -0,0 +1,94 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": " ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "▁ ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "▁ ", + "▁", + "▁ ", + "▁ " + ], + "bos_token": " ", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<", + "eot_token": "▁>\\n' + system_message + '\\n< >\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "fill_token": " ", + "legacy": null, + "middle_token": "▁ ", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "padding_side": "right", + "prefix_token": "▁ ", + "sp_model_kwargs": {}, + "suffix_first": false, + "suffix_token": "▁", + "tokenizer_class": "CodeLlamaTokenizer", + "unk_token": " ", + "use_default_system_prompt": false +} diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/trainer_state.json b/codellama/java/dataflow_java_pretrained/checkpoint-360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bb7849dea0ea7d8fa11ad3a3eb0470935e25cdd3 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/trainer_state.json @@ -0,0 +1,537 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9049489395129615, + "eval_steps": 500, + "global_step": 360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012568735271013355, + "grad_norm": 0.06298828125, + "learning_rate": 0.0001, + "loss": 0.6012, + "step": 5 + }, + { + "epoch": 0.02513747054202671, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001, + "loss": 0.3895, + "step": 10 + }, + { + "epoch": 0.037706205813040065, + "grad_norm": 0.0908203125, + "learning_rate": 0.0001, + "loss": 0.2298, + "step": 15 + }, + { + "epoch": 0.05027494108405342, + "grad_norm": 0.068359375, + "learning_rate": 0.0001, + "loss": 0.1486, + "step": 20 + }, + { + "epoch": 0.06284367635506677, + "grad_norm": 0.06396484375, + "learning_rate": 0.0001, + "loss": 0.1333, + "step": 25 + }, + { + "epoch": 0.07541241162608013, + "grad_norm": 0.0849609375, + "learning_rate": 0.0001, + "loss": 0.1203, + "step": 30 + }, + { + "epoch": 0.08798114689709348, + "grad_norm": 0.0908203125, + "learning_rate": 0.0001, + "loss": 0.0904, + "step": 35 + }, + { + "epoch": 0.10054988216810684, + "grad_norm": 0.05859375, + "learning_rate": 0.0001, + "loss": 0.0617, + "step": 40 + }, + { + "epoch": 0.11311861743912019, + "grad_norm": 0.0478515625, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 45 + }, + { + "epoch": 0.12568735271013354, + "grad_norm": 0.0634765625, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 50 + }, + { + "epoch": 0.13825608798114689, + "grad_norm": 0.07421875, + "learning_rate": 0.0001, + "loss": 0.2835, + "step": 55 + }, + { + "epoch": 0.15082482325216026, + "grad_norm": 0.057861328125, + "learning_rate": 0.0001, + "loss": 0.0973, + "step": 60 + }, + { + "epoch": 0.1633935585231736, + "grad_norm": 0.026611328125, + "learning_rate": 0.0001, + "loss": 0.0755, + "step": 65 + }, + { + "epoch": 0.17596229379418696, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 70 + }, + { + "epoch": 0.1885310290652003, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001, + "loss": 0.0638, + "step": 75 + }, + { + "epoch": 0.20109976433621368, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 80 + }, + { + "epoch": 0.21366849960722703, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 85 + }, + { + "epoch": 0.22623723487824038, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 90 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 95 + }, + { + "epoch": 0.2513747054202671, + "grad_norm": 0.02734375, + "learning_rate": 0.0001, + "loss": 0.0168, + "step": 100 + }, + { + "epoch": 0.26394344069128045, + "grad_norm": 0.0556640625, + "learning_rate": 0.0001, + "loss": 0.2346, + "step": 105 + }, + { + "epoch": 0.27651217596229377, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 110 + }, + { + "epoch": 0.28908091123330715, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 115 + }, + { + "epoch": 0.3016496465043205, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 120 + }, + { + "epoch": 0.31421838177533384, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 125 + }, + { + "epoch": 0.3267871170463472, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 130 + }, + { + "epoch": 0.33935585231736054, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 135 + }, + { + "epoch": 0.3519245875883739, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 140 + }, + { + "epoch": 0.3644933228593873, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 145 + }, + { + "epoch": 0.3770620581304006, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001, + "loss": 0.0142, + "step": 150 + }, + { + "epoch": 0.389630793401414, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001, + "loss": 0.2053, + "step": 155 + }, + { + "epoch": 0.40219952867242736, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001, + "loss": 0.0658, + "step": 160 + }, + { + "epoch": 0.4147682639434407, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 165 + }, + { + "epoch": 0.42733699921445406, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 170 + }, + { + "epoch": 0.4399057344854674, + "grad_norm": 0.0263671875, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 175 + }, + { + "epoch": 0.45247446975648076, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 180 + }, + { + "epoch": 0.46504320502749413, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 185 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0257, + "step": 190 + }, + { + "epoch": 0.49018067556952083, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001, + "loss": 0.0217, + "step": 195 + }, + { + "epoch": 0.5027494108405341, + "grad_norm": 0.006866455078125, + "learning_rate": 0.0001, + "loss": 0.0073, + "step": 200 + }, + { + "epoch": 0.5153181461115475, + "grad_norm": 0.04443359375, + "learning_rate": 0.0001, + "loss": 0.1655, + "step": 205 + }, + { + "epoch": 0.5278868813825609, + "grad_norm": 0.056640625, + "learning_rate": 0.0001, + "loss": 0.051, + "step": 210 + }, + { + "epoch": 0.5404556166535742, + "grad_norm": 0.026123046875, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 215 + }, + { + "epoch": 0.5530243519245875, + "grad_norm": 0.01806640625, + "learning_rate": 0.0001, + "loss": 0.0255, + "step": 220 + }, + { + "epoch": 0.565593087195601, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 225 + }, + { + "epoch": 0.5781618224666143, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 230 + }, + { + "epoch": 0.5907305577376276, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0235, + "step": 235 + }, + { + "epoch": 0.603299293008641, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001, + "loss": 0.0208, + "step": 240 + }, + { + "epoch": 0.6158680282796544, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001, + "loss": 0.0119, + "step": 245 + }, + { + "epoch": 0.6284367635506677, + "grad_norm": 0.0125732421875, + "learning_rate": 0.0001, + "loss": 0.0093, + "step": 250 + }, + { + "epoch": 0.6410054988216811, + "grad_norm": 0.051025390625, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 255 + }, + { + "epoch": 0.6535742340926944, + "grad_norm": 0.0546875, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 260 + }, + { + "epoch": 0.6661429693637078, + "grad_norm": 0.03564453125, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 265 + }, + { + "epoch": 0.6787117046347211, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001, + "loss": 0.024, + "step": 270 + }, + { + "epoch": 0.6912804399057345, + "grad_norm": 0.0234375, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 275 + }, + { + "epoch": 0.7038491751767478, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001, + "loss": 0.0249, + "step": 280 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.029541015625, + "learning_rate": 0.0001, + "loss": 0.0199, + "step": 285 + }, + { + "epoch": 0.7289866457187746, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001, + "loss": 0.0154, + "step": 290 + }, + { + "epoch": 0.7415553809897879, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0001, + "loss": 0.0116, + "step": 295 + }, + { + "epoch": 0.7541241162608012, + "grad_norm": 0.00531005859375, + "learning_rate": 0.0001, + "loss": 0.0058, + "step": 300 + }, + { + "epoch": 0.7666928515318147, + "grad_norm": 0.049560546875, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 305 + }, + { + "epoch": 0.779261586802828, + "grad_norm": 0.140625, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 310 + }, + { + "epoch": 0.7918303220738413, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 315 + }, + { + "epoch": 0.8043990573448547, + "grad_norm": 0.036865234375, + "learning_rate": 0.0001, + "loss": 0.0244, + "step": 320 + }, + { + "epoch": 0.816967792615868, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001, + "loss": 0.0263, + "step": 325 + }, + { + "epoch": 0.8295365278868814, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001, + "loss": 0.0218, + "step": 330 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0001, + "loss": 0.0182, + "step": 335 + }, + { + "epoch": 0.8546739984289081, + "grad_norm": 0.02880859375, + "learning_rate": 0.0001, + "loss": 0.014, + "step": 340 + }, + { + "epoch": 0.8672427336999214, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001, + "loss": 0.0109, + "step": 345 + }, + { + "epoch": 0.8798114689709348, + "grad_norm": 0.01483154296875, + "learning_rate": 0.0001, + "loss": 0.0044, + "step": 350 + }, + { + "epoch": 0.8923802042419482, + "grad_norm": 0.03955078125, + "learning_rate": 0.0001, + "loss": 0.1312, + "step": 355 + }, + { + "epoch": 0.9049489395129615, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 360 + } + ], + "logging_steps": 5, + "max_steps": 360, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6220305320330854e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/dataflow_java_pretrained/checkpoint-360/training_args.bin b/codellama/java/dataflow_java_pretrained/checkpoint-360/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..59f7a00edbdbd6b3221150f65609c5c8a5ec2f18 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/checkpoint-360/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecc1afb72b58dca46f5cfa652b2afb64f998044182e1b761ed2f00cbb47fd9de +size 7416 diff --git a/codellama/java/dataflow_java_pretrained/completed b/codellama/java/dataflow_java_pretrained/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/codellama/java/dataflow_java_pretrained/metrics.json b/codellama/java/dataflow_java_pretrained/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..6424a174064e14a147f24390c009c96aff9d8a3b --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/metrics.json @@ -0,0 +1 @@ +{"run_name": "dataflow_java", "train_runtime": 50041.181, "train_samples_per_second": 0.921, "train_steps_per_second": 0.007, "total_flos": 1.6220305320330854e+18, "train_loss": 0.07024859038905965, "epoch": 0.9049489395129615} \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/train_results.json b/codellama/java/dataflow_java_pretrained/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bce81059a7b1dd1728248f3f77c43d2df68ad77d --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 0.9049489395129615, + "total_flos": 1.6220305320330854e+18, + "train_loss": 0.07024859038905965, + "train_runtime": 50041.181, + "train_samples_per_second": 0.921, + "train_steps_per_second": 0.007 +} \ No newline at end of file diff --git a/codellama/java/dataflow_java_pretrained/trainer_state.json b/codellama/java/dataflow_java_pretrained/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3eb848702cc1e00738b73844f3e656a6409e7802 --- /dev/null +++ b/codellama/java/dataflow_java_pretrained/trainer_state.json @@ -0,0 +1,546 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9049489395129615, + "eval_steps": 500, + "global_step": 360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012568735271013355, + "grad_norm": 0.06298828125, + "learning_rate": 0.0001, + "loss": 0.6012, + "step": 5 + }, + { + "epoch": 0.02513747054202671, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001, + "loss": 0.3895, + "step": 10 + }, + { + "epoch": 0.037706205813040065, + "grad_norm": 0.0908203125, + "learning_rate": 0.0001, + "loss": 0.2298, + "step": 15 + }, + { + "epoch": 0.05027494108405342, + "grad_norm": 0.068359375, + "learning_rate": 0.0001, + "loss": 0.1486, + "step": 20 + }, + { + "epoch": 0.06284367635506677, + "grad_norm": 0.06396484375, + "learning_rate": 0.0001, + "loss": 0.1333, + "step": 25 + }, + { + "epoch": 0.07541241162608013, + "grad_norm": 0.0849609375, + "learning_rate": 0.0001, + "loss": 0.1203, + "step": 30 + }, + { + "epoch": 0.08798114689709348, + "grad_norm": 0.0908203125, + "learning_rate": 0.0001, + "loss": 0.0904, + "step": 35 + }, + { + "epoch": 0.10054988216810684, + "grad_norm": 0.05859375, + "learning_rate": 0.0001, + "loss": 0.0617, + "step": 40 + }, + { + "epoch": 0.11311861743912019, + "grad_norm": 0.0478515625, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 45 + }, + { + "epoch": 0.12568735271013354, + "grad_norm": 0.0634765625, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 50 + }, + { + "epoch": 0.13825608798114689, + "grad_norm": 0.07421875, + "learning_rate": 0.0001, + "loss": 0.2835, + "step": 55 + }, + { + "epoch": 0.15082482325216026, + "grad_norm": 0.057861328125, + "learning_rate": 0.0001, + "loss": 0.0973, + "step": 60 + }, + { + "epoch": 0.1633935585231736, + "grad_norm": 0.026611328125, + "learning_rate": 0.0001, + "loss": 0.0755, + "step": 65 + }, + { + "epoch": 0.17596229379418696, + "grad_norm": 0.0244140625, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 70 + }, + { + "epoch": 0.1885310290652003, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001, + "loss": 0.0638, + "step": 75 + }, + { + "epoch": 0.20109976433621368, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 80 + }, + { + "epoch": 0.21366849960722703, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 85 + }, + { + "epoch": 0.22623723487824038, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 90 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 95 + }, + { + "epoch": 0.2513747054202671, + "grad_norm": 0.02734375, + "learning_rate": 0.0001, + "loss": 0.0168, + "step": 100 + }, + { + "epoch": 0.26394344069128045, + "grad_norm": 0.0556640625, + "learning_rate": 0.0001, + "loss": 0.2346, + "step": 105 + }, + { + "epoch": 0.27651217596229377, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 110 + }, + { + "epoch": 0.28908091123330715, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 115 + }, + { + "epoch": 0.3016496465043205, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 120 + }, + { + "epoch": 0.31421838177533384, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 125 + }, + { + "epoch": 0.3267871170463472, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 130 + }, + { + "epoch": 0.33935585231736054, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 135 + }, + { + "epoch": 0.3519245875883739, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 140 + }, + { + "epoch": 0.3644933228593873, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 145 + }, + { + "epoch": 0.3770620581304006, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001, + "loss": 0.0142, + "step": 150 + }, + { + "epoch": 0.389630793401414, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001, + "loss": 0.2053, + "step": 155 + }, + { + "epoch": 0.40219952867242736, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001, + "loss": 0.0658, + "step": 160 + }, + { + "epoch": 0.4147682639434407, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 165 + }, + { + "epoch": 0.42733699921445406, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 170 + }, + { + "epoch": 0.4399057344854674, + "grad_norm": 0.0263671875, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 175 + }, + { + "epoch": 0.45247446975648076, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 180 + }, + { + "epoch": 0.46504320502749413, + "grad_norm": 0.022705078125, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 185 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0257, + "step": 190 + }, + { + "epoch": 0.49018067556952083, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001, + "loss": 0.0217, + "step": 195 + }, + { + "epoch": 0.5027494108405341, + "grad_norm": 0.006866455078125, + "learning_rate": 0.0001, + "loss": 0.0073, + "step": 200 + }, + { + "epoch": 0.5153181461115475, + "grad_norm": 0.04443359375, + "learning_rate": 0.0001, + "loss": 0.1655, + "step": 205 + }, + { + "epoch": 0.5278868813825609, + "grad_norm": 0.056640625, + "learning_rate": 0.0001, + "loss": 0.051, + "step": 210 + }, + { + "epoch": 0.5404556166535742, + "grad_norm": 0.026123046875, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 215 + }, + { + "epoch": 0.5530243519245875, + "grad_norm": 0.01806640625, + "learning_rate": 0.0001, + "loss": 0.0255, + "step": 220 + }, + { + "epoch": 0.565593087195601, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 225 + }, + { + "epoch": 0.5781618224666143, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 230 + }, + { + "epoch": 0.5907305577376276, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0001, + "loss": 0.0235, + "step": 235 + }, + { + "epoch": 0.603299293008641, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001, + "loss": 0.0208, + "step": 240 + }, + { + "epoch": 0.6158680282796544, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001, + "loss": 0.0119, + "step": 245 + }, + { + "epoch": 0.6284367635506677, + "grad_norm": 0.0125732421875, + "learning_rate": 0.0001, + "loss": 0.0093, + "step": 250 + }, + { + "epoch": 0.6410054988216811, + "grad_norm": 0.051025390625, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 255 + }, + { + "epoch": 0.6535742340926944, + "grad_norm": 0.0546875, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 260 + }, + { + "epoch": 0.6661429693637078, + "grad_norm": 0.03564453125, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 265 + }, + { + "epoch": 0.6787117046347211, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001, + "loss": 0.024, + "step": 270 + }, + { + "epoch": 0.6912804399057345, + "grad_norm": 0.0234375, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 275 + }, + { + "epoch": 0.7038491751767478, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001, + "loss": 0.0249, + "step": 280 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.029541015625, + "learning_rate": 0.0001, + "loss": 0.0199, + "step": 285 + }, + { + "epoch": 0.7289866457187746, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001, + "loss": 0.0154, + "step": 290 + }, + { + "epoch": 0.7415553809897879, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0001, + "loss": 0.0116, + "step": 295 + }, + { + "epoch": 0.7541241162608012, + "grad_norm": 0.00531005859375, + "learning_rate": 0.0001, + "loss": 0.0058, + "step": 300 + }, + { + "epoch": 0.7666928515318147, + "grad_norm": 0.049560546875, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 305 + }, + { + "epoch": 0.779261586802828, + "grad_norm": 0.140625, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 310 + }, + { + "epoch": 0.7918303220738413, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 315 + }, + { + "epoch": 0.8043990573448547, + "grad_norm": 0.036865234375, + "learning_rate": 0.0001, + "loss": 0.0244, + "step": 320 + }, + { + "epoch": 0.816967792615868, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001, + "loss": 0.0263, + "step": 325 + }, + { + "epoch": 0.8295365278868814, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001, + "loss": 0.0218, + "step": 330 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0001, + "loss": 0.0182, + "step": 335 + }, + { + "epoch": 0.8546739984289081, + "grad_norm": 0.02880859375, + "learning_rate": 0.0001, + "loss": 0.014, + "step": 340 + }, + { + "epoch": 0.8672427336999214, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001, + "loss": 0.0109, + "step": 345 + }, + { + "epoch": 0.8798114689709348, + "grad_norm": 0.01483154296875, + "learning_rate": 0.0001, + "loss": 0.0044, + "step": 350 + }, + { + "epoch": 0.8923802042419482, + "grad_norm": 0.03955078125, + "learning_rate": 0.0001, + "loss": 0.1312, + "step": 355 + }, + { + "epoch": 0.9049489395129615, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 360 + }, + { + "epoch": 0.9049489395129615, + "step": 360, + "total_flos": 1.6220305320330854e+18, + "train_loss": 0.07024859038905965, + "train_runtime": 50041.181, + "train_samples_per_second": 0.921, + "train_steps_per_second": 0.007 + } + ], + "logging_steps": 5, + "max_steps": 360, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 90, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6220305320330854e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/codellama/java/dataflow_pretrained/all_results.json b/codellama/java/dataflow_pretrained/all_results.json deleted file mode 100644 index 64bfe2710811ec1b306126c677d1dd39ec762ea4..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/all_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 0.905020032995522, - "total_flos": 1.5364568007927398e+18, - "train_loss": 0.11899957797593541, - "train_runtime": 69215.1765, - "train_samples_per_second": 0.666, - "train_steps_per_second": 0.01 -} \ No newline at end of file diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors deleted file mode 100644 index 2cfdf7bed0df57ec1c9f14be31ccdc570473e0ee..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cf8f1c4cc300ca5094e08295cc0dcffacce527b464e1372de75271bb4d522a9 -size 1156480200 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors deleted file mode 100644 index 2cfdf7bed0df57ec1c9f14be31ccdc570473e0ee..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cf8f1c4cc300ca5094e08295cc0dcffacce527b464e1372de75271bb4d522a9 -size 1156480200 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt b/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt deleted file mode 100644 index 2865a6353c416caaf540a8718eeace538916ccc1..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:31c4a8fb04732973611d06dc14c79dd69c2644d9167a680a4d9760a3cdc9059d -size 2003127538 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt b/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt deleted file mode 100644 index f14a4ae58aa46cb66c004b49dfe361461655b55b..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c48ea2f606cbbb6177c782dd71ba690a6d43d7f02de58760a50cf5c03d3d9324 -size 1064 diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json b/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json deleted file mode 100644 index 3c48eb6bf4df5be4d6ad6819fc6cea34ab2deca1..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json +++ /dev/null @@ -1,1041 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.905020032995522, - "eval_steps": 500, - "global_step": 720, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006284861340246681, - "grad_norm": 0.0712890625, - "learning_rate": 0.0001, - "loss": 0.7884, - "step": 5 - }, - { - "epoch": 0.012569722680493361, - "grad_norm": 0.1318359375, - "learning_rate": 0.0001, - "loss": 0.5229, - "step": 10 - }, - { - "epoch": 0.018854584020740042, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.3535, - "step": 15 - }, - { - "epoch": 0.025139445360986723, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.2525, - "step": 20 - }, - { - "epoch": 0.031424306701233404, - "grad_norm": 0.0751953125, - "learning_rate": 0.0001, - "loss": 0.229, - "step": 25 - }, - { - "epoch": 0.037709168041480084, - "grad_norm": 0.10888671875, - "learning_rate": 0.0001, - "loss": 0.204, - "step": 30 - }, - { - "epoch": 0.043994029381726765, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.1598, - "step": 35 - }, - { - "epoch": 0.050278890721973446, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1241, - "step": 40 - }, - { - "epoch": 0.05656375206222013, - "grad_norm": 0.059814453125, - "learning_rate": 0.0001, - "loss": 0.1026, - "step": 45 - }, - { - "epoch": 0.06284861340246681, - "grad_norm": 0.2265625, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 50 - }, - { - "epoch": 0.06913347474271349, - "grad_norm": 0.08349609375, - "learning_rate": 0.0001, - "loss": 0.5241, - "step": 55 - }, - { - "epoch": 0.07541833608296017, - "grad_norm": 0.07958984375, - "learning_rate": 0.0001, - "loss": 0.1898, - "step": 60 - }, - { - "epoch": 0.08170319742320685, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.1542, - "step": 65 - }, - { - "epoch": 0.08798805876345353, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.1152, - "step": 70 - }, - { - "epoch": 0.09427292010370021, - "grad_norm": 0.058349609375, - "learning_rate": 0.0001, - "loss": 0.1399, - "step": 75 - }, - { - "epoch": 0.10055778144394689, - "grad_norm": 0.04052734375, - "learning_rate": 0.0001, - "loss": 0.1282, - "step": 80 - }, - { - "epoch": 0.10684264278419357, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.1135, - "step": 85 - }, - { - "epoch": 0.11312750412444025, - "grad_norm": 0.037109375, - "learning_rate": 0.0001, - "loss": 0.0923, - "step": 90 - }, - { - "epoch": 0.11941236546468693, - "grad_norm": 0.050048828125, - "learning_rate": 0.0001, - "loss": 0.0895, - "step": 95 - }, - { - "epoch": 0.12569722680493361, - "grad_norm": 0.064453125, - "learning_rate": 0.0001, - "loss": 0.0574, - "step": 100 - }, - { - "epoch": 0.1319820881451803, - "grad_norm": 0.0625, - "learning_rate": 0.0001, - "loss": 0.3794, - "step": 105 - }, - { - "epoch": 0.13826694948542698, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.1638, - "step": 110 - }, - { - "epoch": 0.14455181082567367, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.1154, - "step": 115 - }, - { - "epoch": 0.15083667216592034, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.0967, - "step": 120 - }, - { - "epoch": 0.15712153350616703, - "grad_norm": 0.04248046875, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 125 - }, - { - "epoch": 0.1634063948464137, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.11, - "step": 130 - }, - { - "epoch": 0.1696912561866604, - "grad_norm": 0.039794921875, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 135 - }, - { - "epoch": 0.17597611752690706, - "grad_norm": 0.042724609375, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 140 - }, - { - "epoch": 0.18226097886715376, - "grad_norm": 0.049072265625, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 145 - }, - { - "epoch": 0.18854584020740042, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 150 - }, - { - "epoch": 0.19483070154764712, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.3499, - "step": 155 - }, - { - "epoch": 0.20111556288789378, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.1535, - "step": 160 - }, - { - "epoch": 0.20740042422814048, - "grad_norm": 0.045166015625, - "learning_rate": 0.0001, - "loss": 0.1166, - "step": 165 - }, - { - "epoch": 0.21368528556838715, - "grad_norm": 0.047119140625, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 170 - }, - { - "epoch": 0.21997014690863384, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.1164, - "step": 175 - }, - { - "epoch": 0.2262550082488805, - "grad_norm": 0.0478515625, - "learning_rate": 0.0001, - "loss": 0.1004, - "step": 180 - }, - { - "epoch": 0.2325398695891272, - "grad_norm": 0.06103515625, - "learning_rate": 0.0001, - "loss": 0.092, - "step": 185 - }, - { - "epoch": 0.23882473092937387, - "grad_norm": 0.0458984375, - "learning_rate": 0.0001, - "loss": 0.0815, - "step": 190 - }, - { - "epoch": 0.24510959226962056, - "grad_norm": 0.05419921875, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 195 - }, - { - "epoch": 0.25139445360986723, - "grad_norm": 0.07763671875, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 200 - }, - { - "epoch": 0.2576793149501139, - "grad_norm": 0.11376953125, - "learning_rate": 0.0001, - "loss": 0.3435, - "step": 205 - }, - { - "epoch": 0.2639641762903606, - "grad_norm": 0.057373046875, - "learning_rate": 0.0001, - "loss": 0.1445, - "step": 210 - }, - { - "epoch": 0.27024903763060726, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.1052, - "step": 215 - }, - { - "epoch": 0.27653389897085395, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0789, - "step": 220 - }, - { - "epoch": 0.28281876031110065, - "grad_norm": 0.043212890625, - "learning_rate": 0.0001, - "loss": 0.1068, - "step": 225 - }, - { - "epoch": 0.28910362165134734, - "grad_norm": 0.043212890625, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 230 - }, - { - "epoch": 0.295388482991594, - "grad_norm": 0.04052734375, - "learning_rate": 0.0001, - "loss": 0.0817, - "step": 235 - }, - { - "epoch": 0.3016733443318407, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.07, - "step": 240 - }, - { - "epoch": 0.30795820567208737, - "grad_norm": 0.047607421875, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 245 - }, - { - "epoch": 0.31424306701233407, - "grad_norm": 0.06884765625, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 250 - }, - { - "epoch": 0.3205279283525807, - "grad_norm": 0.08154296875, - "learning_rate": 0.0001, - "loss": 0.3339, - "step": 255 - }, - { - "epoch": 0.3268127896928274, - "grad_norm": 0.048583984375, - "learning_rate": 0.0001, - "loss": 0.1467, - "step": 260 - }, - { - "epoch": 0.3330976510330741, - "grad_norm": 0.035400390625, - "learning_rate": 0.0001, - "loss": 0.1058, - "step": 265 - }, - { - "epoch": 0.3393825123733208, - "grad_norm": 0.034423828125, - "learning_rate": 0.0001, - "loss": 0.0701, - "step": 270 - }, - { - "epoch": 0.3456673737135674, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.1052, - "step": 275 - }, - { - "epoch": 0.3519522350538141, - "grad_norm": 0.047119140625, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 280 - }, - { - "epoch": 0.3582370963940608, - "grad_norm": 0.033447265625, - "learning_rate": 0.0001, - "loss": 0.0784, - "step": 285 - }, - { - "epoch": 0.3645219577343075, - "grad_norm": 0.051025390625, - "learning_rate": 0.0001, - "loss": 0.0671, - "step": 290 - }, - { - "epoch": 0.37080681907455415, - "grad_norm": 0.0673828125, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 295 - }, - { - "epoch": 0.37709168041480084, - "grad_norm": 0.08203125, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 300 - }, - { - "epoch": 0.38337654175504754, - "grad_norm": 0.06201171875, - "learning_rate": 0.0001, - "loss": 0.3533, - "step": 305 - }, - { - "epoch": 0.38966140309529423, - "grad_norm": 0.055419921875, - "learning_rate": 0.0001, - "loss": 0.1495, - "step": 310 - }, - { - "epoch": 0.3959462644355409, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.0914, - "step": 315 - }, - { - "epoch": 0.40223112577578757, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 320 - }, - { - "epoch": 0.40851598711603426, - "grad_norm": 0.0439453125, - "learning_rate": 0.0001, - "loss": 0.0956, - "step": 325 - }, - { - "epoch": 0.41480084845628096, - "grad_norm": 0.042724609375, - "learning_rate": 0.0001, - "loss": 0.0874, - "step": 330 - }, - { - "epoch": 0.4210857097965276, - "grad_norm": 0.045166015625, - "learning_rate": 0.0001, - "loss": 0.0697, - "step": 335 - }, - { - "epoch": 0.4273705711367743, - "grad_norm": 0.140625, - "learning_rate": 0.0001, - "loss": 0.0647, - "step": 340 - }, - { - "epoch": 0.433655432477021, - "grad_norm": 0.0439453125, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 345 - }, - { - "epoch": 0.4399402938172677, - "grad_norm": 0.05029296875, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 350 - }, - { - "epoch": 0.4462251551575143, - "grad_norm": 0.0556640625, - "learning_rate": 0.0001, - "loss": 0.3265, - "step": 355 - }, - { - "epoch": 0.452510016497761, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.1376, - "step": 360 - }, - { - "epoch": 0.4587948778380077, - "grad_norm": 0.034423828125, - "learning_rate": 0.0001, - "loss": 0.0982, - "step": 365 - }, - { - "epoch": 0.4650797391782544, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 370 - }, - { - "epoch": 0.47136460051850104, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0947, - "step": 375 - }, - { - "epoch": 0.47764946185874774, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.0847, - "step": 380 - }, - { - "epoch": 0.48393432319899443, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0738, - "step": 385 - }, - { - "epoch": 0.4902191845392411, - "grad_norm": 0.043701171875, - "learning_rate": 0.0001, - "loss": 0.062, - "step": 390 - }, - { - "epoch": 0.49650404587948777, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 395 - }, - { - "epoch": 0.5027889072197345, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 400 - }, - { - "epoch": 0.5090737685599811, - "grad_norm": 0.062255859375, - "learning_rate": 0.0001, - "loss": 0.3109, - "step": 405 - }, - { - "epoch": 0.5153586299002278, - "grad_norm": 0.06689453125, - "learning_rate": 0.0001, - "loss": 0.1447, - "step": 410 - }, - { - "epoch": 0.5216434912404745, - "grad_norm": 0.033935546875, - "learning_rate": 0.0001, - "loss": 0.0943, - "step": 415 - }, - { - "epoch": 0.5279283525807212, - "grad_norm": 0.037353515625, - "learning_rate": 0.0001, - "loss": 0.0724, - "step": 420 - }, - { - "epoch": 0.5342132139209679, - "grad_norm": 0.03466796875, - "learning_rate": 0.0001, - "loss": 0.1063, - "step": 425 - }, - { - "epoch": 0.5404980752612145, - "grad_norm": 0.068359375, - "learning_rate": 0.0001, - "loss": 0.0855, - "step": 430 - }, - { - "epoch": 0.5467829366014613, - "grad_norm": 0.044677734375, - "learning_rate": 0.0001, - "loss": 0.076, - "step": 435 - }, - { - "epoch": 0.5530677979417079, - "grad_norm": 0.04638671875, - "learning_rate": 0.0001, - "loss": 0.0608, - "step": 440 - }, - { - "epoch": 0.5593526592819545, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 445 - }, - { - "epoch": 0.5656375206222013, - "grad_norm": 0.02099609375, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 450 - }, - { - "epoch": 0.5719223819624479, - "grad_norm": 0.059326171875, - "learning_rate": 0.0001, - "loss": 0.2604, - "step": 455 - }, - { - "epoch": 0.5782072433026947, - "grad_norm": 0.07470703125, - "learning_rate": 0.0001, - "loss": 0.1273, - "step": 460 - }, - { - "epoch": 0.5844921046429413, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.094, - "step": 465 - }, - { - "epoch": 0.590776965983188, - "grad_norm": 0.021240234375, - "learning_rate": 0.0001, - "loss": 0.0642, - "step": 470 - }, - { - "epoch": 0.5970618273234347, - "grad_norm": 0.032958984375, - "learning_rate": 0.0001, - "loss": 0.0914, - "step": 475 - }, - { - "epoch": 0.6033466886636814, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.08, - "step": 480 - }, - { - "epoch": 0.609631550003928, - "grad_norm": 0.046875, - "learning_rate": 0.0001, - "loss": 0.0709, - "step": 485 - }, - { - "epoch": 0.6159164113441747, - "grad_norm": 0.048828125, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 490 - }, - { - "epoch": 0.6222012726844214, - "grad_norm": 0.056884765625, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 495 - }, - { - "epoch": 0.6284861340246681, - "grad_norm": 0.041259765625, - "learning_rate": 0.0001, - "loss": 0.0281, - "step": 500 - }, - { - "epoch": 0.6347709953649148, - "grad_norm": 0.064453125, - "learning_rate": 0.0001, - "loss": 0.2518, - "step": 505 - }, - { - "epoch": 0.6410558567051614, - "grad_norm": 0.058837890625, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 510 - }, - { - "epoch": 0.6473407180454082, - "grad_norm": 0.034912109375, - "learning_rate": 0.0001, - "loss": 0.086, - "step": 515 - }, - { - "epoch": 0.6536255793856548, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0677, - "step": 520 - }, - { - "epoch": 0.6599104407259014, - "grad_norm": 0.03369140625, - "learning_rate": 0.0001, - "loss": 0.0934, - "step": 525 - }, - { - "epoch": 0.6661953020661482, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 530 - }, - { - "epoch": 0.6724801634063948, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0638, - "step": 535 - }, - { - "epoch": 0.6787650247466416, - "grad_norm": 0.035888671875, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 540 - }, - { - "epoch": 0.6850498860868882, - "grad_norm": 0.0341796875, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 545 - }, - { - "epoch": 0.6913347474271349, - "grad_norm": 0.03271484375, - "learning_rate": 0.0001, - "loss": 0.0278, - "step": 550 - }, - { - "epoch": 0.6976196087673816, - "grad_norm": 0.055419921875, - "learning_rate": 0.0001, - "loss": 0.2516, - "step": 555 - }, - { - "epoch": 0.7039044701076282, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.1206, - "step": 560 - }, - { - "epoch": 0.7101893314478749, - "grad_norm": 0.038818359375, - "learning_rate": 0.0001, - "loss": 0.0805, - "step": 565 - }, - { - "epoch": 0.7164741927881216, - "grad_norm": 0.036865234375, - "learning_rate": 0.0001, - "loss": 0.0648, - "step": 570 - }, - { - "epoch": 0.7227590541283683, - "grad_norm": 0.03857421875, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 575 - }, - { - "epoch": 0.729043915468615, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0773, - "step": 580 - }, - { - "epoch": 0.7353287768088617, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.0609, - "step": 585 - }, - { - "epoch": 0.7416136381491083, - "grad_norm": 0.05224609375, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 590 - }, - { - "epoch": 0.747898499489355, - "grad_norm": 0.1240234375, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 595 - }, - { - "epoch": 0.7541833608296017, - "grad_norm": 0.0206298828125, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 600 - }, - { - "epoch": 0.7604682221698483, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.243, - "step": 605 - }, - { - "epoch": 0.7667530835100951, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1263, - "step": 610 - }, - { - "epoch": 0.7730379448503417, - "grad_norm": 0.0537109375, - "learning_rate": 0.0001, - "loss": 0.088, - "step": 615 - }, - { - "epoch": 0.7793228061905885, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 620 - }, - { - "epoch": 0.7856076675308351, - "grad_norm": 0.047607421875, - "learning_rate": 0.0001, - "loss": 0.0853, - "step": 625 - }, - { - "epoch": 0.7918925288710817, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0715, - "step": 630 - }, - { - "epoch": 0.7981773902113285, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.0598, - "step": 635 - }, - { - "epoch": 0.8044622515515751, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 640 - }, - { - "epoch": 0.8107471128918218, - "grad_norm": 0.043701171875, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 645 - }, - { - "epoch": 0.8170319742320685, - "grad_norm": 0.033935546875, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 650 - }, - { - "epoch": 0.8233168355723152, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.2336, - "step": 655 - }, - { - "epoch": 0.8296016969125619, - "grad_norm": 0.053955078125, - "learning_rate": 0.0001, - "loss": 0.1183, - "step": 660 - }, - { - "epoch": 0.8358865582528086, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 665 - }, - { - "epoch": 0.8421714195930552, - "grad_norm": 0.046142578125, - "learning_rate": 0.0001, - "loss": 0.0657, - "step": 670 - }, - { - "epoch": 0.8484562809333019, - "grad_norm": 0.04248046875, - "learning_rate": 0.0001, - "loss": 0.0845, - "step": 675 - }, - { - "epoch": 0.8547411422735486, - "grad_norm": 0.048828125, - "learning_rate": 0.0001, - "loss": 0.0663, - "step": 680 - }, - { - "epoch": 0.8610260036137952, - "grad_norm": 0.0625, - "learning_rate": 0.0001, - "loss": 0.0565, - "step": 685 - }, - { - "epoch": 0.867310864954042, - "grad_norm": 0.05810546875, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 690 - }, - { - "epoch": 0.8735957262942886, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 695 - }, - { - "epoch": 0.8798805876345354, - "grad_norm": 0.037353515625, - "learning_rate": 0.0001, - "loss": 0.0268, - "step": 700 - }, - { - "epoch": 0.886165448974782, - "grad_norm": 0.09521484375, - "learning_rate": 0.0001, - "loss": 0.2371, - "step": 705 - }, - { - "epoch": 0.8924503103150286, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1144, - "step": 710 - }, - { - "epoch": 0.8987351716552754, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0906, - "step": 715 - }, - { - "epoch": 0.905020032995522, - "grad_norm": 0.033203125, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 720 - } - ], - "logging_steps": 5, - "max_steps": 720, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 90, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.5364568007927398e+18, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -} diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin b/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin deleted file mode 100644 index 63447c8ef1abaa098f00b023ed64c96e71210d61..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:824c938bee04d46c16bd5438c177873620e56e36a6e51c3a35b2b80c6e87b25b -size 7416 diff --git a/codellama/java/dataflow_pretrained/metrics.json b/codellama/java/dataflow_pretrained/metrics.json deleted file mode 100644 index 4ba5fb8953fd9541518eb85f3a79275a34fa88c1..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"run_name": "dataflow_pretrained_java", "train_runtime": 69215.1765, "train_samples_per_second": 0.666, "train_steps_per_second": 0.01, "total_flos": 1.5364568007927398e+18, "train_loss": 0.11899957797593541, "epoch": 0.905020032995522} \ No newline at end of file diff --git a/codellama/java/dataflow_pretrained/train_results.json b/codellama/java/dataflow_pretrained/train_results.json deleted file mode 100644 index 64bfe2710811ec1b306126c677d1dd39ec762ea4..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/train_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 0.905020032995522, - "total_flos": 1.5364568007927398e+18, - "train_loss": 0.11899957797593541, - "train_runtime": 69215.1765, - "train_samples_per_second": 0.666, - "train_steps_per_second": 0.01 -} \ No newline at end of file diff --git a/codellama/java/dataflow_pretrained/trainer_state.json b/codellama/java/dataflow_pretrained/trainer_state.json deleted file mode 100644 index ef68b916109e586c454f1fd1c4f3eb75ecb265e4..0000000000000000000000000000000000000000 --- a/codellama/java/dataflow_pretrained/trainer_state.json +++ /dev/null @@ -1,1050 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.905020032995522, - "eval_steps": 500, - "global_step": 720, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006284861340246681, - "grad_norm": 0.0712890625, - "learning_rate": 0.0001, - "loss": 0.7884, - "step": 5 - }, - { - "epoch": 0.012569722680493361, - "grad_norm": 0.1318359375, - "learning_rate": 0.0001, - "loss": 0.5229, - "step": 10 - }, - { - "epoch": 0.018854584020740042, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.3535, - "step": 15 - }, - { - "epoch": 0.025139445360986723, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.2525, - "step": 20 - }, - { - "epoch": 0.031424306701233404, - "grad_norm": 0.0751953125, - "learning_rate": 0.0001, - "loss": 0.229, - "step": 25 - }, - { - "epoch": 0.037709168041480084, - "grad_norm": 0.10888671875, - "learning_rate": 0.0001, - "loss": 0.204, - "step": 30 - }, - { - "epoch": 0.043994029381726765, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.1598, - "step": 35 - }, - { - "epoch": 0.050278890721973446, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1241, - "step": 40 - }, - { - "epoch": 0.05656375206222013, - "grad_norm": 0.059814453125, - "learning_rate": 0.0001, - "loss": 0.1026, - "step": 45 - }, - { - "epoch": 0.06284861340246681, - "grad_norm": 0.2265625, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 50 - }, - { - "epoch": 0.06913347474271349, - "grad_norm": 0.08349609375, - "learning_rate": 0.0001, - "loss": 0.5241, - "step": 55 - }, - { - "epoch": 0.07541833608296017, - "grad_norm": 0.07958984375, - "learning_rate": 0.0001, - "loss": 0.1898, - "step": 60 - }, - { - "epoch": 0.08170319742320685, - "grad_norm": 0.052490234375, - "learning_rate": 0.0001, - "loss": 0.1542, - "step": 65 - }, - { - "epoch": 0.08798805876345353, - "grad_norm": 0.0546875, - "learning_rate": 0.0001, - "loss": 0.1152, - "step": 70 - }, - { - "epoch": 0.09427292010370021, - "grad_norm": 0.058349609375, - "learning_rate": 0.0001, - "loss": 0.1399, - "step": 75 - }, - { - "epoch": 0.10055778144394689, - "grad_norm": 0.04052734375, - "learning_rate": 0.0001, - "loss": 0.1282, - "step": 80 - }, - { - "epoch": 0.10684264278419357, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.1135, - "step": 85 - }, - { - "epoch": 0.11312750412444025, - "grad_norm": 0.037109375, - "learning_rate": 0.0001, - "loss": 0.0923, - "step": 90 - }, - { - "epoch": 0.11941236546468693, - "grad_norm": 0.050048828125, - "learning_rate": 0.0001, - "loss": 0.0895, - "step": 95 - }, - { - "epoch": 0.12569722680493361, - "grad_norm": 0.064453125, - "learning_rate": 0.0001, - "loss": 0.0574, - "step": 100 - }, - { - "epoch": 0.1319820881451803, - "grad_norm": 0.0625, - "learning_rate": 0.0001, - "loss": 0.3794, - "step": 105 - }, - { - "epoch": 0.13826694948542698, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.1638, - "step": 110 - }, - { - "epoch": 0.14455181082567367, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.1154, - "step": 115 - }, - { - "epoch": 0.15083667216592034, - "grad_norm": 0.04931640625, - "learning_rate": 0.0001, - "loss": 0.0967, - "step": 120 - }, - { - "epoch": 0.15712153350616703, - "grad_norm": 0.04248046875, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 125 - }, - { - "epoch": 0.1634063948464137, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.11, - "step": 130 - }, - { - "epoch": 0.1696912561866604, - "grad_norm": 0.039794921875, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 135 - }, - { - "epoch": 0.17597611752690706, - "grad_norm": 0.042724609375, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 140 - }, - { - "epoch": 0.18226097886715376, - "grad_norm": 0.049072265625, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 145 - }, - { - "epoch": 0.18854584020740042, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 150 - }, - { - "epoch": 0.19483070154764712, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.3499, - "step": 155 - }, - { - "epoch": 0.20111556288789378, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.1535, - "step": 160 - }, - { - "epoch": 0.20740042422814048, - "grad_norm": 0.045166015625, - "learning_rate": 0.0001, - "loss": 0.1166, - "step": 165 - }, - { - "epoch": 0.21368528556838715, - "grad_norm": 0.047119140625, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 170 - }, - { - "epoch": 0.21997014690863384, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.1164, - "step": 175 - }, - { - "epoch": 0.2262550082488805, - "grad_norm": 0.0478515625, - "learning_rate": 0.0001, - "loss": 0.1004, - "step": 180 - }, - { - "epoch": 0.2325398695891272, - "grad_norm": 0.06103515625, - "learning_rate": 0.0001, - "loss": 0.092, - "step": 185 - }, - { - "epoch": 0.23882473092937387, - "grad_norm": 0.0458984375, - "learning_rate": 0.0001, - "loss": 0.0815, - "step": 190 - }, - { - "epoch": 0.24510959226962056, - "grad_norm": 0.05419921875, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 195 - }, - { - "epoch": 0.25139445360986723, - "grad_norm": 0.07763671875, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 200 - }, - { - "epoch": 0.2576793149501139, - "grad_norm": 0.11376953125, - "learning_rate": 0.0001, - "loss": 0.3435, - "step": 205 - }, - { - "epoch": 0.2639641762903606, - "grad_norm": 0.057373046875, - "learning_rate": 0.0001, - "loss": 0.1445, - "step": 210 - }, - { - "epoch": 0.27024903763060726, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.1052, - "step": 215 - }, - { - "epoch": 0.27653389897085395, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0789, - "step": 220 - }, - { - "epoch": 0.28281876031110065, - "grad_norm": 0.043212890625, - "learning_rate": 0.0001, - "loss": 0.1068, - "step": 225 - }, - { - "epoch": 0.28910362165134734, - "grad_norm": 0.043212890625, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 230 - }, - { - "epoch": 0.295388482991594, - "grad_norm": 0.04052734375, - "learning_rate": 0.0001, - "loss": 0.0817, - "step": 235 - }, - { - "epoch": 0.3016733443318407, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.07, - "step": 240 - }, - { - "epoch": 0.30795820567208737, - "grad_norm": 0.047607421875, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 245 - }, - { - "epoch": 0.31424306701233407, - "grad_norm": 0.06884765625, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 250 - }, - { - "epoch": 0.3205279283525807, - "grad_norm": 0.08154296875, - "learning_rate": 0.0001, - "loss": 0.3339, - "step": 255 - }, - { - "epoch": 0.3268127896928274, - "grad_norm": 0.048583984375, - "learning_rate": 0.0001, - "loss": 0.1467, - "step": 260 - }, - { - "epoch": 0.3330976510330741, - "grad_norm": 0.035400390625, - "learning_rate": 0.0001, - "loss": 0.1058, - "step": 265 - }, - { - "epoch": 0.3393825123733208, - "grad_norm": 0.034423828125, - "learning_rate": 0.0001, - "loss": 0.0701, - "step": 270 - }, - { - "epoch": 0.3456673737135674, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.1052, - "step": 275 - }, - { - "epoch": 0.3519522350538141, - "grad_norm": 0.047119140625, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 280 - }, - { - "epoch": 0.3582370963940608, - "grad_norm": 0.033447265625, - "learning_rate": 0.0001, - "loss": 0.0784, - "step": 285 - }, - { - "epoch": 0.3645219577343075, - "grad_norm": 0.051025390625, - "learning_rate": 0.0001, - "loss": 0.0671, - "step": 290 - }, - { - "epoch": 0.37080681907455415, - "grad_norm": 0.0673828125, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 295 - }, - { - "epoch": 0.37709168041480084, - "grad_norm": 0.08203125, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 300 - }, - { - "epoch": 0.38337654175504754, - "grad_norm": 0.06201171875, - "learning_rate": 0.0001, - "loss": 0.3533, - "step": 305 - }, - { - "epoch": 0.38966140309529423, - "grad_norm": 0.055419921875, - "learning_rate": 0.0001, - "loss": 0.1495, - "step": 310 - }, - { - "epoch": 0.3959462644355409, - "grad_norm": 0.044189453125, - "learning_rate": 0.0001, - "loss": 0.0914, - "step": 315 - }, - { - "epoch": 0.40223112577578757, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 320 - }, - { - "epoch": 0.40851598711603426, - "grad_norm": 0.0439453125, - "learning_rate": 0.0001, - "loss": 0.0956, - "step": 325 - }, - { - "epoch": 0.41480084845628096, - "grad_norm": 0.042724609375, - "learning_rate": 0.0001, - "loss": 0.0874, - "step": 330 - }, - { - "epoch": 0.4210857097965276, - "grad_norm": 0.045166015625, - "learning_rate": 0.0001, - "loss": 0.0697, - "step": 335 - }, - { - "epoch": 0.4273705711367743, - "grad_norm": 0.140625, - "learning_rate": 0.0001, - "loss": 0.0647, - "step": 340 - }, - { - "epoch": 0.433655432477021, - "grad_norm": 0.0439453125, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 345 - }, - { - "epoch": 0.4399402938172677, - "grad_norm": 0.05029296875, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 350 - }, - { - "epoch": 0.4462251551575143, - "grad_norm": 0.0556640625, - "learning_rate": 0.0001, - "loss": 0.3265, - "step": 355 - }, - { - "epoch": 0.452510016497761, - "grad_norm": 0.06982421875, - "learning_rate": 0.0001, - "loss": 0.1376, - "step": 360 - }, - { - "epoch": 0.4587948778380077, - "grad_norm": 0.034423828125, - "learning_rate": 0.0001, - "loss": 0.0982, - "step": 365 - }, - { - "epoch": 0.4650797391782544, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 370 - }, - { - "epoch": 0.47136460051850104, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0947, - "step": 375 - }, - { - "epoch": 0.47764946185874774, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.0847, - "step": 380 - }, - { - "epoch": 0.48393432319899443, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0738, - "step": 385 - }, - { - "epoch": 0.4902191845392411, - "grad_norm": 0.043701171875, - "learning_rate": 0.0001, - "loss": 0.062, - "step": 390 - }, - { - "epoch": 0.49650404587948777, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 395 - }, - { - "epoch": 0.5027889072197345, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 400 - }, - { - "epoch": 0.5090737685599811, - "grad_norm": 0.062255859375, - "learning_rate": 0.0001, - "loss": 0.3109, - "step": 405 - }, - { - "epoch": 0.5153586299002278, - "grad_norm": 0.06689453125, - "learning_rate": 0.0001, - "loss": 0.1447, - "step": 410 - }, - { - "epoch": 0.5216434912404745, - "grad_norm": 0.033935546875, - "learning_rate": 0.0001, - "loss": 0.0943, - "step": 415 - }, - { - "epoch": 0.5279283525807212, - "grad_norm": 0.037353515625, - "learning_rate": 0.0001, - "loss": 0.0724, - "step": 420 - }, - { - "epoch": 0.5342132139209679, - "grad_norm": 0.03466796875, - "learning_rate": 0.0001, - "loss": 0.1063, - "step": 425 - }, - { - "epoch": 0.5404980752612145, - "grad_norm": 0.068359375, - "learning_rate": 0.0001, - "loss": 0.0855, - "step": 430 - }, - { - "epoch": 0.5467829366014613, - "grad_norm": 0.044677734375, - "learning_rate": 0.0001, - "loss": 0.076, - "step": 435 - }, - { - "epoch": 0.5530677979417079, - "grad_norm": 0.04638671875, - "learning_rate": 0.0001, - "loss": 0.0608, - "step": 440 - }, - { - "epoch": 0.5593526592819545, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 445 - }, - { - "epoch": 0.5656375206222013, - "grad_norm": 0.02099609375, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 450 - }, - { - "epoch": 0.5719223819624479, - "grad_norm": 0.059326171875, - "learning_rate": 0.0001, - "loss": 0.2604, - "step": 455 - }, - { - "epoch": 0.5782072433026947, - "grad_norm": 0.07470703125, - "learning_rate": 0.0001, - "loss": 0.1273, - "step": 460 - }, - { - "epoch": 0.5844921046429413, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.094, - "step": 465 - }, - { - "epoch": 0.590776965983188, - "grad_norm": 0.021240234375, - "learning_rate": 0.0001, - "loss": 0.0642, - "step": 470 - }, - { - "epoch": 0.5970618273234347, - "grad_norm": 0.032958984375, - "learning_rate": 0.0001, - "loss": 0.0914, - "step": 475 - }, - { - "epoch": 0.6033466886636814, - "grad_norm": 0.0400390625, - "learning_rate": 0.0001, - "loss": 0.08, - "step": 480 - }, - { - "epoch": 0.609631550003928, - "grad_norm": 0.046875, - "learning_rate": 0.0001, - "loss": 0.0709, - "step": 485 - }, - { - "epoch": 0.6159164113441747, - "grad_norm": 0.048828125, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 490 - }, - { - "epoch": 0.6222012726844214, - "grad_norm": 0.056884765625, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 495 - }, - { - "epoch": 0.6284861340246681, - "grad_norm": 0.041259765625, - "learning_rate": 0.0001, - "loss": 0.0281, - "step": 500 - }, - { - "epoch": 0.6347709953649148, - "grad_norm": 0.064453125, - "learning_rate": 0.0001, - "loss": 0.2518, - "step": 505 - }, - { - "epoch": 0.6410558567051614, - "grad_norm": 0.058837890625, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 510 - }, - { - "epoch": 0.6473407180454082, - "grad_norm": 0.034912109375, - "learning_rate": 0.0001, - "loss": 0.086, - "step": 515 - }, - { - "epoch": 0.6536255793856548, - "grad_norm": 0.042236328125, - "learning_rate": 0.0001, - "loss": 0.0677, - "step": 520 - }, - { - "epoch": 0.6599104407259014, - "grad_norm": 0.03369140625, - "learning_rate": 0.0001, - "loss": 0.0934, - "step": 525 - }, - { - "epoch": 0.6661953020661482, - "grad_norm": 0.040771484375, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 530 - }, - { - "epoch": 0.6724801634063948, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0638, - "step": 535 - }, - { - "epoch": 0.6787650247466416, - "grad_norm": 0.035888671875, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 540 - }, - { - "epoch": 0.6850498860868882, - "grad_norm": 0.0341796875, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 545 - }, - { - "epoch": 0.6913347474271349, - "grad_norm": 0.03271484375, - "learning_rate": 0.0001, - "loss": 0.0278, - "step": 550 - }, - { - "epoch": 0.6976196087673816, - "grad_norm": 0.055419921875, - "learning_rate": 0.0001, - "loss": 0.2516, - "step": 555 - }, - { - "epoch": 0.7039044701076282, - "grad_norm": 0.0634765625, - "learning_rate": 0.0001, - "loss": 0.1206, - "step": 560 - }, - { - "epoch": 0.7101893314478749, - "grad_norm": 0.038818359375, - "learning_rate": 0.0001, - "loss": 0.0805, - "step": 565 - }, - { - "epoch": 0.7164741927881216, - "grad_norm": 0.036865234375, - "learning_rate": 0.0001, - "loss": 0.0648, - "step": 570 - }, - { - "epoch": 0.7227590541283683, - "grad_norm": 0.03857421875, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 575 - }, - { - "epoch": 0.729043915468615, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0773, - "step": 580 - }, - { - "epoch": 0.7353287768088617, - "grad_norm": 0.04443359375, - "learning_rate": 0.0001, - "loss": 0.0609, - "step": 585 - }, - { - "epoch": 0.7416136381491083, - "grad_norm": 0.05224609375, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 590 - }, - { - "epoch": 0.747898499489355, - "grad_norm": 0.1240234375, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 595 - }, - { - "epoch": 0.7541833608296017, - "grad_norm": 0.0206298828125, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 600 - }, - { - "epoch": 0.7604682221698483, - "grad_norm": 0.07080078125, - "learning_rate": 0.0001, - "loss": 0.243, - "step": 605 - }, - { - "epoch": 0.7667530835100951, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1263, - "step": 610 - }, - { - "epoch": 0.7730379448503417, - "grad_norm": 0.0537109375, - "learning_rate": 0.0001, - "loss": 0.088, - "step": 615 - }, - { - "epoch": 0.7793228061905885, - "grad_norm": 0.03515625, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 620 - }, - { - "epoch": 0.7856076675308351, - "grad_norm": 0.047607421875, - "learning_rate": 0.0001, - "loss": 0.0853, - "step": 625 - }, - { - "epoch": 0.7918925288710817, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0715, - "step": 630 - }, - { - "epoch": 0.7981773902113285, - "grad_norm": 0.0927734375, - "learning_rate": 0.0001, - "loss": 0.0598, - "step": 635 - }, - { - "epoch": 0.8044622515515751, - "grad_norm": 0.0419921875, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 640 - }, - { - "epoch": 0.8107471128918218, - "grad_norm": 0.043701171875, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 645 - }, - { - "epoch": 0.8170319742320685, - "grad_norm": 0.033935546875, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 650 - }, - { - "epoch": 0.8233168355723152, - "grad_norm": 0.08251953125, - "learning_rate": 0.0001, - "loss": 0.2336, - "step": 655 - }, - { - "epoch": 0.8296016969125619, - "grad_norm": 0.053955078125, - "learning_rate": 0.0001, - "loss": 0.1183, - "step": 660 - }, - { - "epoch": 0.8358865582528086, - "grad_norm": 0.03759765625, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 665 - }, - { - "epoch": 0.8421714195930552, - "grad_norm": 0.046142578125, - "learning_rate": 0.0001, - "loss": 0.0657, - "step": 670 - }, - { - "epoch": 0.8484562809333019, - "grad_norm": 0.04248046875, - "learning_rate": 0.0001, - "loss": 0.0845, - "step": 675 - }, - { - "epoch": 0.8547411422735486, - "grad_norm": 0.048828125, - "learning_rate": 0.0001, - "loss": 0.0663, - "step": 680 - }, - { - "epoch": 0.8610260036137952, - "grad_norm": 0.0625, - "learning_rate": 0.0001, - "loss": 0.0565, - "step": 685 - }, - { - "epoch": 0.867310864954042, - "grad_norm": 0.05810546875, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 690 - }, - { - "epoch": 0.8735957262942886, - "grad_norm": 0.054931640625, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 695 - }, - { - "epoch": 0.8798805876345354, - "grad_norm": 0.037353515625, - "learning_rate": 0.0001, - "loss": 0.0268, - "step": 700 - }, - { - "epoch": 0.886165448974782, - "grad_norm": 0.09521484375, - "learning_rate": 0.0001, - "loss": 0.2371, - "step": 705 - }, - { - "epoch": 0.8924503103150286, - "grad_norm": 0.06494140625, - "learning_rate": 0.0001, - "loss": 0.1144, - "step": 710 - }, - { - "epoch": 0.8987351716552754, - "grad_norm": 0.041748046875, - "learning_rate": 0.0001, - "loss": 0.0906, - "step": 715 - }, - { - "epoch": 0.905020032995522, - "grad_norm": 0.033203125, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 720 - }, - { - "epoch": 0.905020032995522, - "step": 720, - "total_flos": 1.5364568007927398e+18, - "train_loss": 0.11899957797593541, - "train_runtime": 69215.1765, - "train_samples_per_second": 0.666, - "train_steps_per_second": 0.01 - } - ], - "logging_steps": 5, - "max_steps": 720, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 90, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.5364568007927398e+18, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -} diff --git a/jam/jam-dataflow/ckpt.pt b/jam/jam-dataflow/ckpt.pt index 0062281a6c0edd54f20dd76a6eb1770035a2af8a..d248bc1b78070e9e53278bb3e7b24e904213be0f 100644 --- a/jam/jam-dataflow/ckpt.pt +++ b/jam/jam-dataflow/ckpt.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a48545ac3c2f22e1c75a304ee2c8426b58cb452858a8a070d712ccff156fcce -size 4255365370 +oid sha256:fb546c749c22378e34791cb47bc6f8195300a3f538e2c30150d592a71d9afc1a +size 4255365797