dzungpham commited on
Commit
4f8c4ed
·
verified ·
1 Parent(s): 221c903

upload checkpoints

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. graphcodebert-swa-from-epoch-1/checkpoint-1100/config.json +29 -0
  2. graphcodebert-swa-from-epoch-1/checkpoint-1100/config_hyperparams.json +53 -0
  3. graphcodebert-swa-from-epoch-1/checkpoint-1100/merges.txt +0 -0
  4. graphcodebert-swa-from-epoch-1/checkpoint-1100/model.safetensors +3 -0
  5. graphcodebert-swa-from-epoch-1/checkpoint-1100/optimizer.pt +3 -0
  6. graphcodebert-swa-from-epoch-1/checkpoint-1100/rng_state.pth +3 -0
  7. graphcodebert-swa-from-epoch-1/checkpoint-1100/scaler.pt +3 -0
  8. graphcodebert-swa-from-epoch-1/checkpoint-1100/scheduler.pt +3 -0
  9. graphcodebert-swa-from-epoch-1/checkpoint-1100/special_tokens_map.json +51 -0
  10. graphcodebert-swa-from-epoch-1/checkpoint-1100/tokenizer.json +0 -0
  11. graphcodebert-swa-from-epoch-1/checkpoint-1100/tokenizer_config.json +58 -0
  12. graphcodebert-swa-from-epoch-1/checkpoint-1100/trainer_state.json +1720 -0
  13. graphcodebert-swa-from-epoch-1/checkpoint-1100/training_args.bin +3 -0
  14. graphcodebert-swa-from-epoch-1/checkpoint-1100/vocab.json +0 -0
  15. graphcodebert-swa-from-epoch-1/checkpoint-1200/config.json +29 -0
  16. graphcodebert-swa-from-epoch-1/checkpoint-1200/config_hyperparams.json +53 -0
  17. graphcodebert-swa-from-epoch-1/checkpoint-1200/merges.txt +0 -0
  18. graphcodebert-swa-from-epoch-1/checkpoint-1200/model.safetensors +3 -0
  19. graphcodebert-swa-from-epoch-1/checkpoint-1200/optimizer.pt +3 -0
  20. graphcodebert-swa-from-epoch-1/checkpoint-1200/rng_state.pth +3 -0
  21. graphcodebert-swa-from-epoch-1/checkpoint-1200/scaler.pt +3 -0
  22. graphcodebert-swa-from-epoch-1/checkpoint-1200/scheduler.pt +3 -0
  23. graphcodebert-swa-from-epoch-1/checkpoint-1200/special_tokens_map.json +51 -0
  24. graphcodebert-swa-from-epoch-1/checkpoint-1200/tokenizer.json +0 -0
  25. graphcodebert-swa-from-epoch-1/checkpoint-1200/tokenizer_config.json +58 -0
  26. graphcodebert-swa-from-epoch-1/checkpoint-1200/trainer_state.json +1872 -0
  27. graphcodebert-swa-from-epoch-1/checkpoint-1200/training_args.bin +3 -0
  28. graphcodebert-swa-from-epoch-1/checkpoint-1200/vocab.json +0 -0
  29. graphcodebert-swa-from-epoch-1/checkpoint-1300/config.json +29 -0
  30. graphcodebert-swa-from-epoch-1/checkpoint-1300/config_hyperparams.json +53 -0
  31. graphcodebert-swa-from-epoch-1/checkpoint-1300/merges.txt +0 -0
  32. graphcodebert-swa-from-epoch-1/checkpoint-1300/model.safetensors +3 -0
  33. graphcodebert-swa-from-epoch-1/checkpoint-1300/optimizer.pt +3 -0
  34. graphcodebert-swa-from-epoch-1/checkpoint-1300/rng_state.pth +3 -0
  35. graphcodebert-swa-from-epoch-1/checkpoint-1300/scaler.pt +3 -0
  36. graphcodebert-swa-from-epoch-1/checkpoint-1300/scheduler.pt +3 -0
  37. graphcodebert-swa-from-epoch-1/checkpoint-1300/special_tokens_map.json +51 -0
  38. graphcodebert-swa-from-epoch-1/checkpoint-1300/tokenizer.json +0 -0
  39. graphcodebert-swa-from-epoch-1/checkpoint-1300/tokenizer_config.json +58 -0
  40. graphcodebert-swa-from-epoch-1/checkpoint-1300/trainer_state.json +2024 -0
  41. graphcodebert-swa-from-epoch-1/checkpoint-1300/training_args.bin +3 -0
  42. graphcodebert-swa-from-epoch-1/checkpoint-1300/vocab.json +0 -0
  43. graphcodebert-swa-from-epoch-1/checkpoint-1400/config.json +29 -0
  44. graphcodebert-swa-from-epoch-1/checkpoint-1400/config_hyperparams.json +53 -0
  45. graphcodebert-swa-from-epoch-1/checkpoint-1400/merges.txt +0 -0
  46. graphcodebert-swa-from-epoch-1/checkpoint-1400/model.safetensors +3 -0
  47. graphcodebert-swa-from-epoch-1/checkpoint-1400/optimizer.pt +3 -0
  48. graphcodebert-swa-from-epoch-1/checkpoint-1400/rng_state.pth +3 -0
  49. graphcodebert-swa-from-epoch-1/checkpoint-1400/scaler.pt +3 -0
  50. graphcodebert-swa-from-epoch-1/checkpoint-1400/scheduler.pt +3 -0
graphcodebert-swa-from-epoch-1/checkpoint-1100/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.3,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": 0.3,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.3,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "absolute",
24
+ "problem_type": "single_label_classification",
25
+ "transformers_version": "4.56.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 50265
29
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1100/config_hyperparams.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_config": {
3
+ "model_name": "/kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1",
4
+ "num_epochs": 4,
5
+ "batch_size": 1024,
6
+ "learning_rate": 1e-06,
7
+ "max_length": 512,
8
+ "num_labels": 2,
9
+ "loss_type": "r-drop",
10
+ "focal_alpha": 1.0,
11
+ "focal_gamma": 2.0,
12
+ "r_drop_alpha": 10.0,
13
+ "infonce_temperature": 0.07,
14
+ "infonce_weight": 0.5,
15
+ "label_smoothing": 0.5,
16
+ "adversarial_epsilon": 0.5,
17
+ "use_swa": true,
18
+ "swa_start_epoch": 1,
19
+ "swa_lr": 1e-05,
20
+ "data_augmentation": true,
21
+ "aug_rename_prob": 0.8,
22
+ "aug_format_prob": 0.8,
23
+ "freeze_base": true,
24
+ "seed": 42,
25
+ "use_wandb": false,
26
+ "mixup_alpha": 1.0,
27
+ "low_pass_keep_ratio": 0.5,
28
+ "freq_consistency_weight": 0.5
29
+ },
30
+ "training_arguments": {
31
+ "output_dir": "graphcodebert-swa-from-epoch-1/",
32
+ "num_train_epochs": 4,
33
+ "per_device_train_batch_size": 1024,
34
+ "per_device_eval_batch_size": 2048,
35
+ "learning_rate": 1e-06,
36
+ "warmup_steps": 195,
37
+ "weight_decay": 0.1,
38
+ "logging_steps": 5,
39
+ "eval_steps": 100,
40
+ "save_steps": 100,
41
+ "metric_for_best_model": "macro_f1",
42
+ "greater_is_better": true,
43
+ "save_total_limit": 5,
44
+ "fp16": true,
45
+ "seed": 42
46
+ },
47
+ "training_state": {
48
+ "global_step": 1100,
49
+ "epoch": 2.2494887525562373,
50
+ "best_metric": 0.6353491904387377,
51
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1100"
52
+ }
53
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c9417a2936e7f0ce73d8c66376e5076e086152e1065d08a19c88e74a6d9d60b
3
+ size 498612824
graphcodebert-swa-from-epoch-1/checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3659f041e4c8d82648f51304b5085f2588948bfb88a4a567f372a2f90da828
3
+ size 4741923
graphcodebert-swa-from-epoch-1/checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:569f2d754635d1166e6fa072476908b41ee096442e70e30c444a50d0f1ad79a2
3
+ size 14709
graphcodebert-swa-from-epoch-1/checkpoint-1100/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ecbbd6c6dff6f228661737c64adc040f47ef9a21f9a0d2159df5b5b4adb3e9d
3
+ size 1383
graphcodebert-swa-from-epoch-1/checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23673245f9d38511ddb278bc62eff92cbf79c78692024951bcecc31aafc6e59e
3
+ size 1465
graphcodebert-swa-from-epoch-1/checkpoint-1100/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1100/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,1720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1100,
3
+ "best_metric": 0.6353491904387377,
4
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1100",
5
+ "epoch": 2.2494887525562373,
6
+ "eval_steps": 100,
7
+ "global_step": 1100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010224948875255624,
14
+ "grad_norm": 2.4707133769989014,
15
+ "learning_rate": 2.0512820512820512e-08,
16
+ "loss": 0.8431,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.02044989775051125,
21
+ "grad_norm": 3.114851951599121,
22
+ "learning_rate": 4.615384615384615e-08,
23
+ "loss": 0.844,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.03067484662576687,
28
+ "grad_norm": 2.2256007194519043,
29
+ "learning_rate": 7.179487179487178e-08,
30
+ "loss": 0.847,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.0408997955010225,
35
+ "grad_norm": 2.5343081951141357,
36
+ "learning_rate": 9.743589743589743e-08,
37
+ "loss": 0.8492,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.05112474437627812,
42
+ "grad_norm": 3.1964163780212402,
43
+ "learning_rate": 1.2307692307692308e-07,
44
+ "loss": 0.8475,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.06134969325153374,
49
+ "grad_norm": 2.0466485023498535,
50
+ "learning_rate": 1.4871794871794872e-07,
51
+ "loss": 0.8445,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.07157464212678936,
56
+ "grad_norm": 2.164569139480591,
57
+ "learning_rate": 1.7435897435897435e-07,
58
+ "loss": 0.8452,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.081799591002045,
63
+ "grad_norm": 2.56343150138855,
64
+ "learning_rate": 2e-07,
65
+ "loss": 0.8473,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.09202453987730061,
70
+ "grad_norm": 2.5742437839508057,
71
+ "learning_rate": 2.2564102564102563e-07,
72
+ "loss": 0.848,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.10224948875255624,
77
+ "grad_norm": 2.587480306625366,
78
+ "learning_rate": 2.5128205128205126e-07,
79
+ "loss": 0.8409,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.11247443762781185,
84
+ "grad_norm": 2.5737764835357666,
85
+ "learning_rate": 2.7692307692307693e-07,
86
+ "loss": 0.8471,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.12269938650306748,
91
+ "grad_norm": 3.044358730316162,
92
+ "learning_rate": 3.0256410256410254e-07,
93
+ "loss": 0.8448,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.1329243353783231,
98
+ "grad_norm": 2.326373815536499,
99
+ "learning_rate": 3.282051282051282e-07,
100
+ "loss": 0.8517,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.14314928425357873,
105
+ "grad_norm": 2.267547607421875,
106
+ "learning_rate": 3.5384615384615386e-07,
107
+ "loss": 0.8387,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.15337423312883436,
112
+ "grad_norm": 2.609232187271118,
113
+ "learning_rate": 3.7948717948717947e-07,
114
+ "loss": 0.841,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.16359918200409,
119
+ "grad_norm": 2.9532523155212402,
120
+ "learning_rate": 4.0512820512820514e-07,
121
+ "loss": 0.8509,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.1738241308793456,
126
+ "grad_norm": 3.002154588699341,
127
+ "learning_rate": 4.307692307692308e-07,
128
+ "loss": 0.8482,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.18404907975460122,
133
+ "grad_norm": 2.701613187789917,
134
+ "learning_rate": 4.5641025641025636e-07,
135
+ "loss": 0.8422,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.19427402862985685,
140
+ "grad_norm": 2.7430365085601807,
141
+ "learning_rate": 4.82051282051282e-07,
142
+ "loss": 0.846,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.20449897750511248,
147
+ "grad_norm": 2.8101418018341064,
148
+ "learning_rate": 5.076923076923076e-07,
149
+ "loss": 0.8444,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.20449897750511248,
154
+ "eval_accuracy": 0.52033,
155
+ "eval_loss": 0.6922348141670227,
156
+ "eval_macro_f1": 0.4427650399783254,
157
+ "eval_precision": 0.6036606007378691,
158
+ "eval_recall": 0.5386742448919869,
159
+ "eval_runtime": 80.6812,
160
+ "eval_samples_per_second": 1239.446,
161
+ "eval_steps_per_second": 0.607,
162
+ "step": 100
163
+ },
164
+ {
165
+ "epoch": 0.2147239263803681,
166
+ "grad_norm": 2.5835089683532715,
167
+ "learning_rate": 5.333333333333333e-07,
168
+ "loss": 0.8437,
169
+ "step": 105
170
+ },
171
+ {
172
+ "epoch": 0.2249488752556237,
173
+ "grad_norm": 2.7237253189086914,
174
+ "learning_rate": 5.58974358974359e-07,
175
+ "loss": 0.8431,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 0.23517382413087934,
180
+ "grad_norm": 2.4648072719573975,
181
+ "learning_rate": 5.846153846153847e-07,
182
+ "loss": 0.8399,
183
+ "step": 115
184
+ },
185
+ {
186
+ "epoch": 0.24539877300613497,
187
+ "grad_norm": 2.7011852264404297,
188
+ "learning_rate": 6.102564102564103e-07,
189
+ "loss": 0.8409,
190
+ "step": 120
191
+ },
192
+ {
193
+ "epoch": 0.2556237218813906,
194
+ "grad_norm": 2.3170969486236572,
195
+ "learning_rate": 6.358974358974358e-07,
196
+ "loss": 0.8361,
197
+ "step": 125
198
+ },
199
+ {
200
+ "epoch": 0.2658486707566462,
201
+ "grad_norm": 2.517194986343384,
202
+ "learning_rate": 6.615384615384615e-07,
203
+ "loss": 0.839,
204
+ "step": 130
205
+ },
206
+ {
207
+ "epoch": 0.27607361963190186,
208
+ "grad_norm": 2.5092124938964844,
209
+ "learning_rate": 6.871794871794871e-07,
210
+ "loss": 0.8438,
211
+ "step": 135
212
+ },
213
+ {
214
+ "epoch": 0.28629856850715746,
215
+ "grad_norm": 2.3993237018585205,
216
+ "learning_rate": 7.128205128205128e-07,
217
+ "loss": 0.8349,
218
+ "step": 140
219
+ },
220
+ {
221
+ "epoch": 0.2965235173824131,
222
+ "grad_norm": 2.1388165950775146,
223
+ "learning_rate": 7.384615384615384e-07,
224
+ "loss": 0.8363,
225
+ "step": 145
226
+ },
227
+ {
228
+ "epoch": 0.3067484662576687,
229
+ "grad_norm": 1.8425891399383545,
230
+ "learning_rate": 7.64102564102564e-07,
231
+ "loss": 0.8325,
232
+ "step": 150
233
+ },
234
+ {
235
+ "epoch": 0.3169734151329243,
236
+ "grad_norm": 1.8665552139282227,
237
+ "learning_rate": 7.897435897435897e-07,
238
+ "loss": 0.835,
239
+ "step": 155
240
+ },
241
+ {
242
+ "epoch": 0.32719836400818,
243
+ "grad_norm": 1.8765455484390259,
244
+ "learning_rate": 8.153846153846154e-07,
245
+ "loss": 0.8328,
246
+ "step": 160
247
+ },
248
+ {
249
+ "epoch": 0.3374233128834356,
250
+ "grad_norm": 2.640779495239258,
251
+ "learning_rate": 8.41025641025641e-07,
252
+ "loss": 0.8388,
253
+ "step": 165
254
+ },
255
+ {
256
+ "epoch": 0.3476482617586912,
257
+ "grad_norm": 2.174116373062134,
258
+ "learning_rate": 8.666666666666667e-07,
259
+ "loss": 0.8336,
260
+ "step": 170
261
+ },
262
+ {
263
+ "epoch": 0.35787321063394684,
264
+ "grad_norm": 1.8411178588867188,
265
+ "learning_rate": 8.923076923076923e-07,
266
+ "loss": 0.8384,
267
+ "step": 175
268
+ },
269
+ {
270
+ "epoch": 0.36809815950920244,
271
+ "grad_norm": 2.3652143478393555,
272
+ "learning_rate": 9.179487179487179e-07,
273
+ "loss": 0.8318,
274
+ "step": 180
275
+ },
276
+ {
277
+ "epoch": 0.3783231083844581,
278
+ "grad_norm": 1.9870903491973877,
279
+ "learning_rate": 9.435897435897435e-07,
280
+ "loss": 0.8306,
281
+ "step": 185
282
+ },
283
+ {
284
+ "epoch": 0.3885480572597137,
285
+ "grad_norm": 2.458887815475464,
286
+ "learning_rate": 9.692307692307691e-07,
287
+ "loss": 0.8342,
288
+ "step": 190
289
+ },
290
+ {
291
+ "epoch": 0.3987730061349693,
292
+ "grad_norm": 1.9105890989303589,
293
+ "learning_rate": 9.948717948717949e-07,
294
+ "loss": 0.8301,
295
+ "step": 195
296
+ },
297
+ {
298
+ "epoch": 0.40899795501022496,
299
+ "grad_norm": 2.04896879196167,
300
+ "learning_rate": 9.999490793845076e-07,
301
+ "loss": 0.8291,
302
+ "step": 200
303
+ },
304
+ {
305
+ "epoch": 0.40899795501022496,
306
+ "eval_accuracy": 0.52697,
307
+ "eval_loss": 0.6913915872573853,
308
+ "eval_macro_f1": 0.4511625248903547,
309
+ "eval_precision": 0.6198512746424523,
310
+ "eval_recall": 0.5452618609595298,
311
+ "eval_runtime": 80.6395,
312
+ "eval_samples_per_second": 1240.088,
313
+ "eval_steps_per_second": 0.608,
314
+ "step": 200
315
+ },
316
+ {
317
+ "epoch": 0.41922290388548056,
318
+ "grad_norm": 2.394630193710327,
319
+ "learning_rate": 9.997422321595486e-07,
320
+ "loss": 0.8311,
321
+ "step": 205
322
+ },
323
+ {
324
+ "epoch": 0.4294478527607362,
325
+ "grad_norm": 1.7013665437698364,
326
+ "learning_rate": 9.993763415653074e-07,
327
+ "loss": 0.8264,
328
+ "step": 210
329
+ },
330
+ {
331
+ "epoch": 0.4396728016359918,
332
+ "grad_norm": 2.1158103942871094,
333
+ "learning_rate": 9.988515240467613e-07,
334
+ "loss": 0.8262,
335
+ "step": 215
336
+ },
337
+ {
338
+ "epoch": 0.4498977505112474,
339
+ "grad_norm": 1.5985370874404907,
340
+ "learning_rate": 9.981679466275095e-07,
341
+ "loss": 0.8296,
342
+ "step": 220
343
+ },
344
+ {
345
+ "epoch": 0.4601226993865031,
346
+ "grad_norm": 2.0426042079925537,
347
+ "learning_rate": 9.973258268566182e-07,
348
+ "loss": 0.8233,
349
+ "step": 225
350
+ },
351
+ {
352
+ "epoch": 0.4703476482617587,
353
+ "grad_norm": 1.7411834001541138,
354
+ "learning_rate": 9.963254327393853e-07,
355
+ "loss": 0.8269,
356
+ "step": 230
357
+ },
358
+ {
359
+ "epoch": 0.48057259713701433,
360
+ "grad_norm": 2.1182405948638916,
361
+ "learning_rate": 9.95167082652047e-07,
362
+ "loss": 0.8247,
363
+ "step": 235
364
+ },
365
+ {
366
+ "epoch": 0.49079754601226994,
367
+ "grad_norm": 2.0239953994750977,
368
+ "learning_rate": 9.938511452404547e-07,
369
+ "loss": 0.8308,
370
+ "step": 240
371
+ },
372
+ {
373
+ "epoch": 0.5010224948875256,
374
+ "grad_norm": 2.366060495376587,
375
+ "learning_rate": 9.923780393027534e-07,
376
+ "loss": 0.8205,
377
+ "step": 245
378
+ },
379
+ {
380
+ "epoch": 0.5112474437627812,
381
+ "grad_norm": 1.848169207572937,
382
+ "learning_rate": 9.907482336560982e-07,
383
+ "loss": 0.825,
384
+ "step": 250
385
+ },
386
+ {
387
+ "epoch": 0.5214723926380368,
388
+ "grad_norm": 1.8216668367385864,
389
+ "learning_rate": 9.889622469874535e-07,
390
+ "loss": 0.8271,
391
+ "step": 255
392
+ },
393
+ {
394
+ "epoch": 0.5316973415132924,
395
+ "grad_norm": 1.507730484008789,
396
+ "learning_rate": 9.8702064768852e-07,
397
+ "loss": 0.8147,
398
+ "step": 260
399
+ },
400
+ {
401
+ "epoch": 0.5419222903885481,
402
+ "grad_norm": 1.7608263492584229,
403
+ "learning_rate": 9.849240536748438e-07,
404
+ "loss": 0.8221,
405
+ "step": 265
406
+ },
407
+ {
408
+ "epoch": 0.5521472392638037,
409
+ "grad_norm": 2.203326940536499,
410
+ "learning_rate": 9.826731321891641e-07,
411
+ "loss": 0.8292,
412
+ "step": 270
413
+ },
414
+ {
415
+ "epoch": 0.5623721881390593,
416
+ "grad_norm": 1.9529740810394287,
417
+ "learning_rate": 9.802685995890632e-07,
418
+ "loss": 0.8228,
419
+ "step": 275
420
+ },
421
+ {
422
+ "epoch": 0.5725971370143149,
423
+ "grad_norm": 1.6214399337768555,
424
+ "learning_rate": 9.777112211189841e-07,
425
+ "loss": 0.8149,
426
+ "step": 280
427
+ },
428
+ {
429
+ "epoch": 0.5828220858895705,
430
+ "grad_norm": 2.07482647895813,
431
+ "learning_rate": 9.750018106666924e-07,
432
+ "loss": 0.8143,
433
+ "step": 285
434
+ },
435
+ {
436
+ "epoch": 0.5930470347648262,
437
+ "grad_norm": 1.7083203792572021,
438
+ "learning_rate": 9.721412305042538e-07,
439
+ "loss": 0.8188,
440
+ "step": 290
441
+ },
442
+ {
443
+ "epoch": 0.6032719836400818,
444
+ "grad_norm": 2.0022943019866943,
445
+ "learning_rate": 9.69130391013617e-07,
446
+ "loss": 0.8195,
447
+ "step": 295
448
+ },
449
+ {
450
+ "epoch": 0.6134969325153374,
451
+ "grad_norm": 1.5799461603164673,
452
+ "learning_rate": 9.659702503968834e-07,
453
+ "loss": 0.8146,
454
+ "step": 300
455
+ },
456
+ {
457
+ "epoch": 0.6134969325153374,
458
+ "eval_accuracy": 0.55052,
459
+ "eval_loss": 0.6896406412124634,
460
+ "eval_macro_f1": 0.49869783315905847,
461
+ "eval_precision": 0.6264643684302231,
462
+ "eval_recall": 0.5665461014402418,
463
+ "eval_runtime": 80.6145,
464
+ "eval_samples_per_second": 1240.472,
465
+ "eval_steps_per_second": 0.608,
466
+ "step": 300
467
+ },
468
+ {
469
+ "epoch": 0.623721881390593,
470
+ "grad_norm": 1.9373347759246826,
471
+ "learning_rate": 9.626618143713586e-07,
472
+ "loss": 0.8166,
473
+ "step": 305
474
+ },
475
+ {
476
+ "epoch": 0.6339468302658486,
477
+ "grad_norm": 1.6276922225952148,
478
+ "learning_rate": 9.592061358494813e-07,
479
+ "loss": 0.8176,
480
+ "step": 310
481
+ },
482
+ {
483
+ "epoch": 0.6441717791411042,
484
+ "grad_norm": 1.9373250007629395,
485
+ "learning_rate": 9.556043146037337e-07,
486
+ "loss": 0.8168,
487
+ "step": 315
488
+ },
489
+ {
490
+ "epoch": 0.65439672801636,
491
+ "grad_norm": 1.320465087890625,
492
+ "learning_rate": 9.518574969166391e-07,
493
+ "loss": 0.8101,
494
+ "step": 320
495
+ },
496
+ {
497
+ "epoch": 0.6646216768916156,
498
+ "grad_norm": 1.8596330881118774,
499
+ "learning_rate": 9.47966875215954e-07,
500
+ "loss": 0.8167,
501
+ "step": 325
502
+ },
503
+ {
504
+ "epoch": 0.6748466257668712,
505
+ "grad_norm": 1.304662823677063,
506
+ "learning_rate": 9.439336876951793e-07,
507
+ "loss": 0.815,
508
+ "step": 330
509
+ },
510
+ {
511
+ "epoch": 0.6850715746421268,
512
+ "grad_norm": 1.8063029050827026,
513
+ "learning_rate": 9.397592179195033e-07,
514
+ "loss": 0.8121,
515
+ "step": 335
516
+ },
517
+ {
518
+ "epoch": 0.6952965235173824,
519
+ "grad_norm": 1.7432739734649658,
520
+ "learning_rate": 9.354447944173059e-07,
521
+ "loss": 0.8104,
522
+ "step": 340
523
+ },
524
+ {
525
+ "epoch": 0.7055214723926381,
526
+ "grad_norm": 1.4523797035217285,
527
+ "learning_rate": 9.309917902573533e-07,
528
+ "loss": 0.8098,
529
+ "step": 345
530
+ },
531
+ {
532
+ "epoch": 0.7157464212678937,
533
+ "grad_norm": 1.681409478187561,
534
+ "learning_rate": 9.264016226118188e-07,
535
+ "loss": 0.8107,
536
+ "step": 350
537
+ },
538
+ {
539
+ "epoch": 0.7259713701431493,
540
+ "grad_norm": 1.5168694257736206,
541
+ "learning_rate": 9.216757523052652e-07,
542
+ "loss": 0.8085,
543
+ "step": 355
544
+ },
545
+ {
546
+ "epoch": 0.7361963190184049,
547
+ "grad_norm": 1.2200194597244263,
548
+ "learning_rate": 9.168156833497371e-07,
549
+ "loss": 0.8109,
550
+ "step": 360
551
+ },
552
+ {
553
+ "epoch": 0.7464212678936605,
554
+ "grad_norm": 1.2745580673217773,
555
+ "learning_rate": 9.118229624661078e-07,
556
+ "loss": 0.8096,
557
+ "step": 365
558
+ },
559
+ {
560
+ "epoch": 0.7566462167689162,
561
+ "grad_norm": 1.8339142799377441,
562
+ "learning_rate": 9.066991785918333e-07,
563
+ "loss": 0.808,
564
+ "step": 370
565
+ },
566
+ {
567
+ "epoch": 0.7668711656441718,
568
+ "grad_norm": 1.2315114736557007,
569
+ "learning_rate": 9.01445962375273e-07,
570
+ "loss": 0.805,
571
+ "step": 375
572
+ },
573
+ {
574
+ "epoch": 0.7770961145194274,
575
+ "grad_norm": 1.3081412315368652,
576
+ "learning_rate": 8.960649856567333e-07,
577
+ "loss": 0.8066,
578
+ "step": 380
579
+ },
580
+ {
581
+ "epoch": 0.787321063394683,
582
+ "grad_norm": 1.5145998001098633,
583
+ "learning_rate": 8.90557960936404e-07,
584
+ "loss": 0.8028,
585
+ "step": 385
586
+ },
587
+ {
588
+ "epoch": 0.7975460122699386,
589
+ "grad_norm": 1.5990959405899048,
590
+ "learning_rate": 8.84926640829353e-07,
591
+ "loss": 0.8035,
592
+ "step": 390
593
+ },
594
+ {
595
+ "epoch": 0.8077709611451943,
596
+ "grad_norm": 1.2120558023452759,
597
+ "learning_rate": 8.79172817507756e-07,
598
+ "loss": 0.802,
599
+ "step": 395
600
+ },
601
+ {
602
+ "epoch": 0.8179959100204499,
603
+ "grad_norm": 1.5799622535705566,
604
+ "learning_rate": 8.73298322130535e-07,
605
+ "loss": 0.8037,
606
+ "step": 400
607
+ },
608
+ {
609
+ "epoch": 0.8179959100204499,
610
+ "eval_accuracy": 0.58537,
611
+ "eval_loss": 0.6877263784408569,
612
+ "eval_macro_f1": 0.5630337315451738,
613
+ "eval_precision": 0.628845494567806,
614
+ "eval_recall": 0.5970616303474306,
615
+ "eval_runtime": 81.293,
616
+ "eval_samples_per_second": 1230.118,
617
+ "eval_steps_per_second": 0.603,
618
+ "step": 400
619
+ },
620
+ {
621
+ "epoch": 0.8282208588957055,
622
+ "grad_norm": 1.3475037813186646,
623
+ "learning_rate": 8.673050242605921e-07,
624
+ "loss": 0.8067,
625
+ "step": 405
626
+ },
627
+ {
628
+ "epoch": 0.8384458077709611,
629
+ "grad_norm": 1.2836309671401978,
630
+ "learning_rate": 8.611948312698179e-07,
631
+ "loss": 0.7996,
632
+ "step": 410
633
+ },
634
+ {
635
+ "epoch": 0.8486707566462167,
636
+ "grad_norm": 1.460316777229309,
637
+ "learning_rate": 8.5496968773207e-07,
638
+ "loss": 0.802,
639
+ "step": 415
640
+ },
641
+ {
642
+ "epoch": 0.8588957055214724,
643
+ "grad_norm": 1.33119797706604,
644
+ "learning_rate": 8.486315748043109e-07,
645
+ "loss": 0.798,
646
+ "step": 420
647
+ },
648
+ {
649
+ "epoch": 0.869120654396728,
650
+ "grad_norm": 1.9951454401016235,
651
+ "learning_rate": 8.42182509596102e-07,
652
+ "loss": 0.8013,
653
+ "step": 425
654
+ },
655
+ {
656
+ "epoch": 0.8793456032719836,
657
+ "grad_norm": 1.2590746879577637,
658
+ "learning_rate": 8.356245445276584e-07,
659
+ "loss": 0.7963,
660
+ "step": 430
661
+ },
662
+ {
663
+ "epoch": 0.8895705521472392,
664
+ "grad_norm": 1.1192667484283447,
665
+ "learning_rate": 8.28959766676663e-07,
666
+ "loss": 0.8004,
667
+ "step": 435
668
+ },
669
+ {
670
+ "epoch": 0.8997955010224948,
671
+ "grad_norm": 1.1180275678634644,
672
+ "learning_rate": 8.221902971140535e-07,
673
+ "loss": 0.8041,
674
+ "step": 440
675
+ },
676
+ {
677
+ "epoch": 0.9100204498977505,
678
+ "grad_norm": 1.1210858821868896,
679
+ "learning_rate": 8.153182902289897e-07,
680
+ "loss": 0.7991,
681
+ "step": 445
682
+ },
683
+ {
684
+ "epoch": 0.9202453987730062,
685
+ "grad_norm": 1.1266220808029175,
686
+ "learning_rate": 8.083459330432164e-07,
687
+ "loss": 0.8002,
688
+ "step": 450
689
+ },
690
+ {
691
+ "epoch": 0.9304703476482618,
692
+ "grad_norm": 1.0373694896697998,
693
+ "learning_rate": 8.012754445150434e-07,
694
+ "loss": 0.7974,
695
+ "step": 455
696
+ },
697
+ {
698
+ "epoch": 0.9406952965235174,
699
+ "grad_norm": 1.2223235368728638,
700
+ "learning_rate": 7.941090748331589e-07,
701
+ "loss": 0.8001,
702
+ "step": 460
703
+ },
704
+ {
705
+ "epoch": 0.950920245398773,
706
+ "grad_norm": 1.4549195766448975,
707
+ "learning_rate": 7.868491047005065e-07,
708
+ "loss": 0.7993,
709
+ "step": 465
710
+ },
711
+ {
712
+ "epoch": 0.9611451942740287,
713
+ "grad_norm": 1.3064852952957153,
714
+ "learning_rate": 7.794978446084483e-07,
715
+ "loss": 0.8006,
716
+ "step": 470
717
+ },
718
+ {
719
+ "epoch": 0.9713701431492843,
720
+ "grad_norm": 1.2408719062805176,
721
+ "learning_rate": 7.720576341014498e-07,
722
+ "loss": 0.7983,
723
+ "step": 475
724
+ },
725
+ {
726
+ "epoch": 0.9815950920245399,
727
+ "grad_norm": 1.2148370742797852,
728
+ "learning_rate": 7.645308410325187e-07,
729
+ "loss": 0.7959,
730
+ "step": 480
731
+ },
732
+ {
733
+ "epoch": 0.9918200408997955,
734
+ "grad_norm": 1.0927603244781494,
735
+ "learning_rate": 7.569198608096317e-07,
736
+ "loss": 0.7978,
737
+ "step": 485
738
+ },
739
+ {
740
+ "SWA": "started",
741
+ "epoch": 1.0,
742
+ "step": 489
743
+ },
744
+ {
745
+ "epoch": 1.0020449897750512,
746
+ "grad_norm": 1.245108723640442,
747
+ "learning_rate": 7.492271156333967e-07,
748
+ "loss": 0.7965,
749
+ "step": 490
750
+ },
751
+ {
752
+ "epoch": 1.0122699386503067,
753
+ "grad_norm": 1.3393553495407104,
754
+ "learning_rate": 7.414550537261828e-07,
755
+ "loss": 0.795,
756
+ "step": 495
757
+ },
758
+ {
759
+ "epoch": 1.0224948875255624,
760
+ "grad_norm": 1.2823072671890259,
761
+ "learning_rate": 7.336061485529738e-07,
762
+ "loss": 0.8014,
763
+ "step": 500
764
+ },
765
+ {
766
+ "epoch": 1.0224948875255624,
767
+ "eval_accuracy": 0.60723,
768
+ "eval_loss": 0.6864892244338989,
769
+ "eval_macro_f1": 0.5966241921587988,
770
+ "eval_precision": 0.6341761761282843,
771
+ "eval_recall": 0.6160142746967282,
772
+ "eval_runtime": 81.931,
773
+ "eval_samples_per_second": 1220.539,
774
+ "eval_steps_per_second": 0.598,
775
+ "step": 500
776
+ },
777
+ {
778
+ "epoch": 1.032719836400818,
779
+ "grad_norm": 1.1278107166290283,
780
+ "learning_rate": 7.256828980341846e-07,
781
+ "loss": 0.7977,
782
+ "step": 505
783
+ },
784
+ {
785
+ "epoch": 1.0429447852760736,
786
+ "grad_norm": 1.110093355178833,
787
+ "learning_rate": 7.176878237506965e-07,
788
+ "loss": 0.7954,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 1.0531697341513293,
793
+ "grad_norm": 1.2248748540878296,
794
+ "learning_rate": 7.096234701413617e-07,
795
+ "loss": 0.7957,
796
+ "step": 515
797
+ },
798
+ {
799
+ "epoch": 1.0633946830265848,
800
+ "grad_norm": 1.2420642375946045,
801
+ "learning_rate": 7.014924036932345e-07,
802
+ "loss": 0.7935,
803
+ "step": 520
804
+ },
805
+ {
806
+ "epoch": 1.0736196319018405,
807
+ "grad_norm": 1.0777639150619507,
808
+ "learning_rate": 6.932972121247831e-07,
809
+ "loss": 0.796,
810
+ "step": 525
811
+ },
812
+ {
813
+ "epoch": 1.0838445807770962,
814
+ "grad_norm": 1.3830324411392212,
815
+ "learning_rate": 6.850405035623481e-07,
816
+ "loss": 0.7929,
817
+ "step": 530
818
+ },
819
+ {
820
+ "epoch": 1.0940695296523517,
821
+ "grad_norm": 0.9407713413238525,
822
+ "learning_rate": 6.767249057101023e-07,
823
+ "loss": 0.7964,
824
+ "step": 535
825
+ },
826
+ {
827
+ "epoch": 1.1042944785276074,
828
+ "grad_norm": 1.1688194274902344,
829
+ "learning_rate": 6.683530650137832e-07,
830
+ "loss": 0.7944,
831
+ "step": 540
832
+ },
833
+ {
834
+ "epoch": 1.114519427402863,
835
+ "grad_norm": 0.9509923458099365,
836
+ "learning_rate": 6.599276458184588e-07,
837
+ "loss": 0.7912,
838
+ "step": 545
839
+ },
840
+ {
841
+ "epoch": 1.1247443762781186,
842
+ "grad_norm": 1.0683159828186035,
843
+ "learning_rate": 6.514513295205969e-07,
844
+ "loss": 0.7931,
845
+ "step": 550
846
+ },
847
+ {
848
+ "epoch": 1.1349693251533743,
849
+ "grad_norm": 0.9022642374038696,
850
+ "learning_rate": 6.429268137147104e-07,
851
+ "loss": 0.7945,
852
+ "step": 555
853
+ },
854
+ {
855
+ "epoch": 1.1451942740286298,
856
+ "grad_norm": 1.1609984636306763,
857
+ "learning_rate": 6.343568113348441e-07,
858
+ "loss": 0.7913,
859
+ "step": 560
860
+ },
861
+ {
862
+ "epoch": 1.1554192229038855,
863
+ "grad_norm": 1.2184994220733643,
864
+ "learning_rate": 6.257440497911815e-07,
865
+ "loss": 0.7919,
866
+ "step": 565
867
+ },
868
+ {
869
+ "epoch": 1.165644171779141,
870
+ "grad_norm": 1.0256582498550415,
871
+ "learning_rate": 6.170912701020454e-07,
872
+ "loss": 0.7912,
873
+ "step": 570
874
+ },
875
+ {
876
+ "epoch": 1.1758691206543967,
877
+ "grad_norm": 0.8725862503051758,
878
+ "learning_rate": 6.084012260215645e-07,
879
+ "loss": 0.7907,
880
+ "step": 575
881
+ },
882
+ {
883
+ "epoch": 1.1860940695296525,
884
+ "grad_norm": 1.5192348957061768,
885
+ "learning_rate": 5.996766831632912e-07,
886
+ "loss": 0.7913,
887
+ "step": 580
888
+ },
889
+ {
890
+ "epoch": 1.196319018404908,
891
+ "grad_norm": 1.109052062034607,
892
+ "learning_rate": 5.909204181200414e-07,
893
+ "loss": 0.795,
894
+ "step": 585
895
+ },
896
+ {
897
+ "epoch": 1.2065439672801637,
898
+ "grad_norm": 1.0413333177566528,
899
+ "learning_rate": 5.821352175802419e-07,
900
+ "loss": 0.7924,
901
+ "step": 590
902
+ },
903
+ {
904
+ "epoch": 1.2167689161554192,
905
+ "grad_norm": 0.8926281929016113,
906
+ "learning_rate": 5.733238774410647e-07,
907
+ "loss": 0.7921,
908
+ "step": 595
909
+ },
910
+ {
911
+ "epoch": 1.2269938650306749,
912
+ "grad_norm": 0.9231971502304077,
913
+ "learning_rate": 5.644892019186307e-07,
914
+ "loss": 0.7894,
915
+ "step": 600
916
+ },
917
+ {
918
+ "epoch": 1.2269938650306749,
919
+ "eval_accuracy": 0.62182,
920
+ "eval_loss": 0.6853985786437988,
921
+ "eval_macro_f1": 0.6195549574374046,
922
+ "eval_precision": 0.6317310781859349,
923
+ "eval_recall": 0.6267089641577176,
924
+ "eval_runtime": 81.4512,
925
+ "eval_samples_per_second": 1227.728,
926
+ "eval_steps_per_second": 0.602,
927
+ "step": 600
928
+ },
929
+ {
930
+ "epoch": 1.2372188139059306,
931
+ "grad_norm": 0.9845394492149353,
932
+ "learning_rate": 5.556340026555653e-07,
933
+ "loss": 0.7918,
934
+ "step": 605
935
+ },
936
+ {
937
+ "epoch": 1.247443762781186,
938
+ "grad_norm": 1.3759487867355347,
939
+ "learning_rate": 5.467610978261906e-07,
940
+ "loss": 0.7904,
941
+ "step": 610
942
+ },
943
+ {
944
+ "epoch": 1.2576687116564418,
945
+ "grad_norm": 1.1568200588226318,
946
+ "learning_rate": 5.378733112396398e-07,
947
+ "loss": 0.7923,
948
+ "step": 615
949
+ },
950
+ {
951
+ "epoch": 1.2678936605316973,
952
+ "grad_norm": 1.4351176023483276,
953
+ "learning_rate": 5.289734714411775e-07,
954
+ "loss": 0.7905,
955
+ "step": 620
956
+ },
957
+ {
958
+ "epoch": 1.278118609406953,
959
+ "grad_norm": 1.178076982498169,
960
+ "learning_rate": 5.200644108120121e-07,
961
+ "loss": 0.7947,
962
+ "step": 625
963
+ },
964
+ {
965
+ "epoch": 1.2883435582822087,
966
+ "grad_norm": 1.2398017644882202,
967
+ "learning_rate": 5.111489646678896e-07,
968
+ "loss": 0.796,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 1.2985685071574642,
973
+ "grad_norm": 1.1236284971237183,
974
+ "learning_rate": 5.022299703567508e-07,
975
+ "loss": 0.7895,
976
+ "step": 635
977
+ },
978
+ {
979
+ "epoch": 1.30879345603272,
980
+ "grad_norm": 1.0112528800964355,
981
+ "learning_rate": 4.933102663557439e-07,
982
+ "loss": 0.79,
983
+ "step": 640
984
+ },
985
+ {
986
+ "epoch": 1.3190184049079754,
987
+ "grad_norm": 1.3201746940612793,
988
+ "learning_rate": 4.843926913678757e-07,
989
+ "loss": 0.7897,
990
+ "step": 645
991
+ },
992
+ {
993
+ "epoch": 1.329243353783231,
994
+ "grad_norm": 0.969918429851532,
995
+ "learning_rate": 4.7548008341859384e-07,
996
+ "loss": 0.7912,
997
+ "step": 650
998
+ },
999
+ {
1000
+ "epoch": 1.3394683026584868,
1001
+ "grad_norm": 0.8914945125579834,
1002
+ "learning_rate": 4.665752789525812e-07,
1003
+ "loss": 0.7964,
1004
+ "step": 655
1005
+ },
1006
+ {
1007
+ "epoch": 1.3496932515337423,
1008
+ "grad_norm": 0.906989574432373,
1009
+ "learning_rate": 4.576811119310563e-07,
1010
+ "loss": 0.7924,
1011
+ "step": 660
1012
+ },
1013
+ {
1014
+ "epoch": 1.359918200408998,
1015
+ "grad_norm": 1.2423877716064453,
1016
+ "learning_rate": 4.488004129298618e-07,
1017
+ "loss": 0.7904,
1018
+ "step": 665
1019
+ },
1020
+ {
1021
+ "epoch": 1.3701431492842535,
1022
+ "grad_norm": 1.2455909252166748,
1023
+ "learning_rate": 4.3993600823863256e-07,
1024
+ "loss": 0.7875,
1025
+ "step": 670
1026
+ },
1027
+ {
1028
+ "epoch": 1.3803680981595092,
1029
+ "grad_norm": 1.4931528568267822,
1030
+ "learning_rate": 4.3109071896132574e-07,
1031
+ "loss": 0.7947,
1032
+ "step": 675
1033
+ },
1034
+ {
1035
+ "epoch": 1.390593047034765,
1036
+ "grad_norm": 1.0538350343704224,
1037
+ "learning_rate": 4.222673601184029e-07,
1038
+ "loss": 0.7886,
1039
+ "step": 680
1040
+ },
1041
+ {
1042
+ "epoch": 1.4008179959100204,
1043
+ "grad_norm": 0.9246828556060791,
1044
+ "learning_rate": 4.134687397509467e-07,
1045
+ "loss": 0.7884,
1046
+ "step": 685
1047
+ },
1048
+ {
1049
+ "epoch": 1.4110429447852761,
1050
+ "grad_norm": 1.0383715629577637,
1051
+ "learning_rate": 4.0469765802700033e-07,
1052
+ "loss": 0.7943,
1053
+ "step": 690
1054
+ },
1055
+ {
1056
+ "epoch": 1.4212678936605316,
1057
+ "grad_norm": 1.0180901288986206,
1058
+ "learning_rate": 3.9595690635041145e-07,
1059
+ "loss": 0.7895,
1060
+ "step": 695
1061
+ },
1062
+ {
1063
+ "epoch": 1.4314928425357873,
1064
+ "grad_norm": 0.9119181632995605,
1065
+ "learning_rate": 3.8724926647246536e-07,
1066
+ "loss": 0.7864,
1067
+ "step": 700
1068
+ },
1069
+ {
1070
+ "epoch": 1.4314928425357873,
1071
+ "eval_accuracy": 0.62357,
1072
+ "eval_loss": 0.6852650046348572,
1073
+ "eval_macro_f1": 0.6215147432652665,
1074
+ "eval_precision": 0.6330088346022082,
1075
+ "eval_recall": 0.628302383508456,
1076
+ "eval_runtime": 80.5998,
1077
+ "eval_samples_per_second": 1240.698,
1078
+ "eval_steps_per_second": 0.608,
1079
+ "step": 700
1080
+ },
1081
+ {
1082
+ "epoch": 1.441717791411043,
1083
+ "grad_norm": 0.8882152438163757,
1084
+ "learning_rate": 3.785775096065909e-07,
1085
+ "loss": 0.7858,
1086
+ "step": 705
1087
+ },
1088
+ {
1089
+ "epoch": 1.4519427402862985,
1090
+ "grad_norm": 1.5290203094482422,
1091
+ "learning_rate": 3.699443955464192e-07,
1092
+ "loss": 0.7837,
1093
+ "step": 710
1094
+ },
1095
+ {
1096
+ "epoch": 1.4621676891615543,
1097
+ "grad_norm": 0.881521463394165,
1098
+ "learning_rate": 3.613526717874774e-07,
1099
+ "loss": 0.7858,
1100
+ "step": 715
1101
+ },
1102
+ {
1103
+ "epoch": 1.4723926380368098,
1104
+ "grad_norm": 0.9955899119377136,
1105
+ "learning_rate": 3.5280507265279555e-07,
1106
+ "loss": 0.7907,
1107
+ "step": 720
1108
+ },
1109
+ {
1110
+ "epoch": 1.4826175869120655,
1111
+ "grad_norm": 1.3247544765472412,
1112
+ "learning_rate": 3.443043184227067e-07,
1113
+ "loss": 0.79,
1114
+ "step": 725
1115
+ },
1116
+ {
1117
+ "epoch": 1.4928425357873212,
1118
+ "grad_norm": 1.200223445892334,
1119
+ "learning_rate": 3.358531144691148e-07,
1120
+ "loss": 0.7874,
1121
+ "step": 730
1122
+ },
1123
+ {
1124
+ "epoch": 1.5030674846625767,
1125
+ "grad_norm": 0.9952226281166077,
1126
+ "learning_rate": 3.2745415039450867e-07,
1127
+ "loss": 0.7874,
1128
+ "step": 735
1129
+ },
1130
+ {
1131
+ "epoch": 1.5132924335378322,
1132
+ "grad_norm": 1.2515606880187988,
1133
+ "learning_rate": 3.19110099175993e-07,
1134
+ "loss": 0.789,
1135
+ "step": 740
1136
+ },
1137
+ {
1138
+ "epoch": 1.5235173824130879,
1139
+ "grad_norm": 0.8901408314704895,
1140
+ "learning_rate": 3.10823616314612e-07,
1141
+ "loss": 0.7853,
1142
+ "step": 745
1143
+ },
1144
+ {
1145
+ "epoch": 1.5337423312883436,
1146
+ "grad_norm": 1.0439373254776,
1147
+ "learning_rate": 3.0259733899023345e-07,
1148
+ "loss": 0.7899,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 1.5439672801635993,
1153
+ "grad_norm": 1.0658971071243286,
1154
+ "learning_rate": 2.944338852222643e-07,
1155
+ "loss": 0.7868,
1156
+ "step": 755
1157
+ },
1158
+ {
1159
+ "epoch": 1.5541922290388548,
1160
+ "grad_norm": 0.927455484867096,
1161
+ "learning_rate": 2.8633585303646413e-07,
1162
+ "loss": 0.7904,
1163
+ "step": 760
1164
+ },
1165
+ {
1166
+ "epoch": 1.5644171779141103,
1167
+ "grad_norm": 0.9637423753738403,
1168
+ "learning_rate": 2.783058196381214e-07,
1169
+ "loss": 0.7856,
1170
+ "step": 765
1171
+ },
1172
+ {
1173
+ "epoch": 1.574642126789366,
1174
+ "grad_norm": 1.396472692489624,
1175
+ "learning_rate": 2.7034634059185437e-07,
1176
+ "loss": 0.7903,
1177
+ "step": 770
1178
+ },
1179
+ {
1180
+ "epoch": 1.5848670756646217,
1181
+ "grad_norm": 0.7922792434692383,
1182
+ "learning_rate": 2.6245994900830257e-07,
1183
+ "loss": 0.7843,
1184
+ "step": 775
1185
+ },
1186
+ {
1187
+ "epoch": 1.5950920245398774,
1188
+ "grad_norm": 0.8896881341934204,
1189
+ "learning_rate": 2.546491547379619e-07,
1190
+ "loss": 0.787,
1191
+ "step": 780
1192
+ },
1193
+ {
1194
+ "epoch": 1.605316973415133,
1195
+ "grad_norm": 0.8732028007507324,
1196
+ "learning_rate": 2.469164435724212e-07,
1197
+ "loss": 0.7856,
1198
+ "step": 785
1199
+ },
1200
+ {
1201
+ "epoch": 1.6155419222903884,
1202
+ "grad_norm": 1.0021744966506958,
1203
+ "learning_rate": 2.3926427645325875e-07,
1204
+ "loss": 0.7867,
1205
+ "step": 790
1206
+ },
1207
+ {
1208
+ "epoch": 1.6257668711656441,
1209
+ "grad_norm": 1.1783545017242432,
1210
+ "learning_rate": 2.3169508868884453e-07,
1211
+ "loss": 0.7897,
1212
+ "step": 795
1213
+ },
1214
+ {
1215
+ "epoch": 1.6359918200408998,
1216
+ "grad_norm": 0.9119800329208374,
1217
+ "learning_rate": 2.2421128917930243e-07,
1218
+ "loss": 0.7845,
1219
+ "step": 800
1220
+ },
1221
+ {
1222
+ "epoch": 1.6359918200408998,
1223
+ "eval_accuracy": 0.62896,
1224
+ "eval_loss": 0.6847647428512573,
1225
+ "eval_macro_f1": 0.6281943240633717,
1226
+ "eval_precision": 0.6346364525627035,
1227
+ "eval_recall": 0.6323959922867678,
1228
+ "eval_runtime": 80.6105,
1229
+ "eval_samples_per_second": 1240.533,
1230
+ "eval_steps_per_second": 0.608,
1231
+ "step": 800
1232
+ },
1233
+ {
1234
+ "epoch": 1.6462167689161555,
1235
+ "grad_norm": 0.8903971314430237,
1236
+ "learning_rate": 2.1681525964987474e-07,
1237
+ "loss": 0.7824,
1238
+ "step": 805
1239
+ },
1240
+ {
1241
+ "epoch": 1.656441717791411,
1242
+ "grad_norm": 1.115395188331604,
1243
+ "learning_rate": 2.0950935389293656e-07,
1244
+ "loss": 0.7824,
1245
+ "step": 810
1246
+ },
1247
+ {
1248
+ "epoch": 1.6666666666666665,
1249
+ "grad_norm": 0.9636144638061523,
1250
+ "learning_rate": 2.022958970189001e-07,
1251
+ "loss": 0.7917,
1252
+ "step": 815
1253
+ },
1254
+ {
1255
+ "epoch": 1.6768916155419222,
1256
+ "grad_norm": 0.8787257075309753,
1257
+ "learning_rate": 1.9517718471624532e-07,
1258
+ "loss": 0.7869,
1259
+ "step": 820
1260
+ },
1261
+ {
1262
+ "epoch": 1.687116564417178,
1263
+ "grad_norm": 1.0157173871994019,
1264
+ "learning_rate": 1.88155482520916e-07,
1265
+ "loss": 0.7844,
1266
+ "step": 825
1267
+ },
1268
+ {
1269
+ "epoch": 1.6973415132924337,
1270
+ "grad_norm": 0.9504719972610474,
1271
+ "learning_rate": 1.812330250953107e-07,
1272
+ "loss": 0.7872,
1273
+ "step": 830
1274
+ },
1275
+ {
1276
+ "epoch": 1.7075664621676891,
1277
+ "grad_norm": 0.893625795841217,
1278
+ "learning_rate": 1.7441201551710016e-07,
1279
+ "loss": 0.7879,
1280
+ "step": 835
1281
+ },
1282
+ {
1283
+ "epoch": 1.7177914110429446,
1284
+ "grad_norm": 0.8460310101509094,
1285
+ "learning_rate": 1.6769462457809536e-07,
1286
+ "loss": 0.7853,
1287
+ "step": 840
1288
+ },
1289
+ {
1290
+ "epoch": 1.7280163599182004,
1291
+ "grad_norm": 0.9349818229675293,
1292
+ "learning_rate": 1.610829900933917e-07,
1293
+ "loss": 0.7862,
1294
+ "step": 845
1295
+ },
1296
+ {
1297
+ "epoch": 1.738241308793456,
1298
+ "grad_norm": 0.859866738319397,
1299
+ "learning_rate": 1.545792162210074e-07,
1300
+ "loss": 0.7836,
1301
+ "step": 850
1302
+ },
1303
+ {
1304
+ "epoch": 1.7484662576687118,
1305
+ "grad_norm": 1.0148438215255737,
1306
+ "learning_rate": 1.481853727922341e-07,
1307
+ "loss": 0.7859,
1308
+ "step": 855
1309
+ },
1310
+ {
1311
+ "epoch": 1.7586912065439673,
1312
+ "grad_norm": 0.8861204385757446,
1313
+ "learning_rate": 1.4190349465291035e-07,
1314
+ "loss": 0.7909,
1315
+ "step": 860
1316
+ },
1317
+ {
1318
+ "epoch": 1.7689161554192228,
1319
+ "grad_norm": 0.7679073214530945,
1320
+ "learning_rate": 1.3573558101583105e-07,
1321
+ "loss": 0.785,
1322
+ "step": 865
1323
+ },
1324
+ {
1325
+ "epoch": 1.7791411042944785,
1326
+ "grad_norm": 0.7364144325256348,
1327
+ "learning_rate": 1.2968359482449636e-07,
1328
+ "loss": 0.7824,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 1.7893660531697342,
1333
+ "grad_norm": 0.945924699306488,
1334
+ "learning_rate": 1.2374946212840288e-07,
1335
+ "loss": 0.7864,
1336
+ "step": 875
1337
+ },
1338
+ {
1339
+ "epoch": 1.79959100204499,
1340
+ "grad_norm": 1.1060514450073242,
1341
+ "learning_rate": 1.1793507147007714e-07,
1342
+ "loss": 0.7866,
1343
+ "step": 880
1344
+ },
1345
+ {
1346
+ "epoch": 1.8098159509202454,
1347
+ "grad_norm": 0.9230445623397827,
1348
+ "learning_rate": 1.1224227328404534e-07,
1349
+ "loss": 0.7895,
1350
+ "step": 885
1351
+ },
1352
+ {
1353
+ "epoch": 1.8200408997955009,
1354
+ "grad_norm": 0.9153196811676025,
1355
+ "learning_rate": 1.0667287930793151e-07,
1356
+ "loss": 0.7835,
1357
+ "step": 890
1358
+ },
1359
+ {
1360
+ "epoch": 1.8302658486707566,
1361
+ "grad_norm": 0.9513780474662781,
1362
+ "learning_rate": 1.0122866200586944e-07,
1363
+ "loss": 0.7846,
1364
+ "step": 895
1365
+ },
1366
+ {
1367
+ "epoch": 1.8404907975460123,
1368
+ "grad_norm": 0.8672247529029846,
1369
+ "learning_rate": 9.591135400441552e-08,
1370
+ "loss": 0.7839,
1371
+ "step": 900
1372
+ },
1373
+ {
1374
+ "epoch": 1.8404907975460123,
1375
+ "eval_accuracy": 0.63125,
1376
+ "eval_loss": 0.6845182776451111,
1377
+ "eval_macro_f1": 0.6309538076224105,
1378
+ "eval_precision": 0.6350446377333951,
1379
+ "eval_recall": 0.6339031903992685,
1380
+ "eval_runtime": 80.5646,
1381
+ "eval_samples_per_second": 1241.24,
1382
+ "eval_steps_per_second": 0.608,
1383
+ "step": 900
1384
+ },
1385
+ {
1386
+ "epoch": 1.850715746421268,
1387
+ "grad_norm": 1.2127219438552856,
1388
+ "learning_rate": 9.072264754113912e-08,
1389
+ "loss": 0.7876,
1390
+ "step": 905
1391
+ },
1392
+ {
1393
+ "epoch": 1.8609406952965235,
1394
+ "grad_norm": 0.875455379486084,
1395
+ "learning_rate": 8.566419392606544e-08,
1396
+ "loss": 0.787,
1397
+ "step": 910
1398
+ },
1399
+ {
1400
+ "epoch": 1.871165644171779,
1401
+ "grad_norm": 0.92503821849823,
1402
+ "learning_rate": 8.073760301614596e-08,
1403
+ "loss": 0.7834,
1404
+ "step": 915
1405
+ },
1406
+ {
1407
+ "epoch": 1.8813905930470347,
1408
+ "grad_norm": 1.1361068487167358,
1409
+ "learning_rate": 7.594444270291922e-08,
1410
+ "loss": 0.7821,
1411
+ "step": 920
1412
+ },
1413
+ {
1414
+ "epoch": 1.8916155419222904,
1415
+ "grad_norm": 1.1415101289749146,
1416
+ "learning_rate": 7.128623841352916e-08,
1417
+ "loss": 0.7877,
1418
+ "step": 925
1419
+ },
1420
+ {
1421
+ "epoch": 1.9018404907975461,
1422
+ "grad_norm": 0.9358757138252258,
1423
+ "learning_rate": 6.676447262525547e-08,
1424
+ "loss": 0.7867,
1425
+ "step": 930
1426
+ },
1427
+ {
1428
+ "epoch": 1.9120654396728016,
1429
+ "grad_norm": 0.912706732749939,
1430
+ "learning_rate": 6.238058439371479e-08,
1431
+ "loss": 0.7884,
1432
+ "step": 935
1433
+ },
1434
+ {
1435
+ "epoch": 1.9222903885480571,
1436
+ "grad_norm": 0.9449842572212219,
1437
+ "learning_rate": 5.813596889488009e-08,
1438
+ "loss": 0.7893,
1439
+ "step": 940
1440
+ },
1441
+ {
1442
+ "epoch": 1.9325153374233128,
1443
+ "grad_norm": 0.8449825048446655,
1444
+ "learning_rate": 5.403197698106432e-08,
1445
+ "loss": 0.7828,
1446
+ "step": 945
1447
+ },
1448
+ {
1449
+ "epoch": 1.9427402862985685,
1450
+ "grad_norm": 0.9307764768600464,
1451
+ "learning_rate": 5.0069914751010913e-08,
1452
+ "loss": 0.785,
1453
+ "step": 950
1454
+ },
1455
+ {
1456
+ "epoch": 1.9529652351738243,
1457
+ "grad_norm": 1.3704556226730347,
1458
+ "learning_rate": 4.625104313422673e-08,
1459
+ "loss": 0.7874,
1460
+ "step": 955
1461
+ },
1462
+ {
1463
+ "epoch": 1.9631901840490797,
1464
+ "grad_norm": 1.0163496732711792,
1465
+ "learning_rate": 4.257657748969046e-08,
1466
+ "loss": 0.7834,
1467
+ "step": 960
1468
+ },
1469
+ {
1470
+ "epoch": 1.9734151329243352,
1471
+ "grad_norm": 0.8112438321113586,
1472
+ "learning_rate": 3.904768721906304e-08,
1473
+ "loss": 0.7852,
1474
+ "step": 965
1475
+ },
1476
+ {
1477
+ "epoch": 1.983640081799591,
1478
+ "grad_norm": 0.885705828666687,
1479
+ "learning_rate": 3.566549539452529e-08,
1480
+ "loss": 0.7792,
1481
+ "step": 970
1482
+ },
1483
+ {
1484
+ "epoch": 1.9938650306748467,
1485
+ "grad_norm": 0.8692009449005127,
1486
+ "learning_rate": 3.243107840135878e-08,
1487
+ "loss": 0.7822,
1488
+ "step": 975
1489
+ },
1490
+ {
1491
+ "epoch": 2.0040899795501024,
1492
+ "grad_norm": 0.8909807205200195,
1493
+ "learning_rate": 2.9345465595385866e-08,
1494
+ "loss": 0.7826,
1495
+ "step": 980
1496
+ },
1497
+ {
1498
+ "epoch": 2.014314928425358,
1499
+ "grad_norm": 0.9065344333648682,
1500
+ "learning_rate": 2.6409638975375737e-08,
1501
+ "loss": 0.7849,
1502
+ "step": 985
1503
+ },
1504
+ {
1505
+ "epoch": 2.0245398773006134,
1506
+ "grad_norm": 0.8145809173583984,
1507
+ "learning_rate": 2.3624532870522962e-08,
1508
+ "loss": 0.7885,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 2.034764826175869,
1513
+ "grad_norm": 0.9461153149604797,
1514
+ "learning_rate": 2.0991033643096457e-08,
1515
+ "loss": 0.7853,
1516
+ "step": 995
1517
+ },
1518
+ {
1519
+ "epoch": 2.044989775051125,
1520
+ "grad_norm": 0.8470706343650818,
1521
+ "learning_rate": 1.8509979406353794e-08,
1522
+ "loss": 0.7881,
1523
+ "step": 1000
1524
+ },
1525
+ {
1526
+ "epoch": 2.044989775051125,
1527
+ "eval_accuracy": 0.63202,
1528
+ "eval_loss": 0.6844514012336731,
1529
+ "eval_macro_f1": 0.6318036560759084,
1530
+ "eval_precision": 0.6354113747156731,
1531
+ "eval_recall": 0.6344858797364747,
1532
+ "eval_runtime": 81.1838,
1533
+ "eval_samples_per_second": 1231.772,
1534
+ "eval_steps_per_second": 0.604,
1535
+ "step": 1000
1536
+ },
1537
+ {
1538
+ "epoch": 2.0552147239263805,
1539
+ "grad_norm": 0.8817445635795593,
1540
+ "learning_rate": 1.6182159757810897e-08,
1541
+ "loss": 0.7879,
1542
+ "step": 1005
1543
+ },
1544
+ {
1545
+ "epoch": 2.065439672801636,
1546
+ "grad_norm": 0.856109082698822,
1547
+ "learning_rate": 1.400831552795234e-08,
1548
+ "loss": 0.7868,
1549
+ "step": 1010
1550
+ },
1551
+ {
1552
+ "epoch": 2.0756646216768915,
1553
+ "grad_norm": 0.956066370010376,
1554
+ "learning_rate": 1.1989138544461375e-08,
1555
+ "loss": 0.7845,
1556
+ "step": 1015
1557
+ },
1558
+ {
1559
+ "epoch": 2.085889570552147,
1560
+ "grad_norm": 0.930978000164032,
1561
+ "learning_rate": 1.0125271412044666e-08,
1562
+ "loss": 0.7876,
1563
+ "step": 1020
1564
+ },
1565
+ {
1566
+ "epoch": 2.096114519427403,
1567
+ "grad_norm": 0.9799636602401733,
1568
+ "learning_rate": 8.417307307923615e-09,
1569
+ "loss": 0.7861,
1570
+ "step": 1025
1571
+ },
1572
+ {
1573
+ "epoch": 2.1063394683026586,
1574
+ "grad_norm": 0.9991019368171692,
1575
+ "learning_rate": 6.8657897930547435e-09,
1576
+ "loss": 0.7852,
1577
+ "step": 1030
1578
+ },
1579
+ {
1580
+ "epoch": 2.116564417177914,
1581
+ "grad_norm": 1.076750636100769,
1582
+ "learning_rate": 5.471212639141132e-09,
1583
+ "loss": 0.7789,
1584
+ "step": 1035
1585
+ },
1586
+ {
1587
+ "epoch": 2.1267893660531696,
1588
+ "grad_norm": 0.9805507063865662,
1589
+ "learning_rate": 4.23401967148912e-09,
1590
+ "loss": 0.7829,
1591
+ "step": 1040
1592
+ },
1593
+ {
1594
+ "epoch": 2.1370143149284253,
1595
+ "grad_norm": 0.7899750471115112,
1596
+ "learning_rate": 3.154604627760571e-09,
1597
+ "loss": 0.7839,
1598
+ "step": 1045
1599
+ },
1600
+ {
1601
+ "epoch": 2.147239263803681,
1602
+ "grad_norm": 1.1698967218399048,
1603
+ "learning_rate": 2.2333110326655526e-09,
1604
+ "loss": 0.7869,
1605
+ "step": 1050
1606
+ },
1607
+ {
1608
+ "epoch": 2.1574642126789367,
1609
+ "grad_norm": 0.9302964806556702,
1610
+ "learning_rate": 1.4704320886352873e-09,
1611
+ "loss": 0.7832,
1612
+ "step": 1055
1613
+ },
1614
+ {
1615
+ "epoch": 2.1676891615541924,
1616
+ "grad_norm": 1.057986855506897,
1617
+ "learning_rate": 8.662105825103517e-10,
1618
+ "loss": 0.7864,
1619
+ "step": 1060
1620
+ },
1621
+ {
1622
+ "epoch": 2.1779141104294477,
1623
+ "grad_norm": 1.0347933769226074,
1624
+ "learning_rate": 4.208388082733161e-10,
1625
+ "loss": 0.7822,
1626
+ "step": 1065
1627
+ },
1628
+ {
1629
+ "epoch": 2.1881390593047034,
1630
+ "grad_norm": 0.9827083945274353,
1631
+ "learning_rate": 1.3445850585130924e-10,
1632
+ "loss": 0.784,
1633
+ "step": 1070
1634
+ },
1635
+ {
1636
+ "epoch": 2.198364008179959,
1637
+ "grad_norm": 0.8463678956031799,
1638
+ "learning_rate": 7.160816007045767e-12,
1639
+ "loss": 0.7811,
1640
+ "step": 1075
1641
+ },
1642
+ {
1643
+ "epoch": 2.208588957055215,
1644
+ "grad_norm": 0.9141009449958801,
1645
+ "learning_rate": 9.999610137486667e-07,
1646
+ "loss": 0.7828,
1647
+ "step": 1080
1648
+ },
1649
+ {
1650
+ "epoch": 2.21881390593047,
1651
+ "grad_norm": 0.8992940783500671,
1652
+ "learning_rate": 9.997700753166407e-07,
1653
+ "loss": 0.7843,
1654
+ "step": 1085
1655
+ },
1656
+ {
1657
+ "epoch": 2.229038854805726,
1658
+ "grad_norm": 0.9198014140129089,
1659
+ "learning_rate": 9.99420084654225e-07,
1660
+ "loss": 0.7867,
1661
+ "step": 1090
1662
+ },
1663
+ {
1664
+ "epoch": 2.2392638036809815,
1665
+ "grad_norm": 0.841385006904602,
1666
+ "learning_rate": 9.98911153146231e-07,
1667
+ "loss": 0.7899,
1668
+ "step": 1095
1669
+ },
1670
+ {
1671
+ "epoch": 2.2494887525562373,
1672
+ "grad_norm": 0.9428244233131409,
1673
+ "learning_rate": 9.982434427605222e-07,
1674
+ "loss": 0.783,
1675
+ "step": 1100
1676
+ },
1677
+ {
1678
+ "epoch": 2.2494887525562373,
1679
+ "eval_accuracy": 0.63535,
1680
+ "eval_loss": 0.6841139197349548,
1681
+ "eval_macro_f1": 0.6353491904387377,
1682
+ "eval_precision": 0.6368108503242846,
1683
+ "eval_recall": 0.6367719631437929,
1684
+ "eval_runtime": 81.1976,
1685
+ "eval_samples_per_second": 1231.563,
1686
+ "eval_steps_per_second": 0.603,
1687
+ "step": 1100
1688
+ }
1689
+ ],
1690
+ "logging_steps": 5,
1691
+ "max_steps": 1956,
1692
+ "num_input_tokens_seen": 0,
1693
+ "num_train_epochs": 4,
1694
+ "save_steps": 100,
1695
+ "stateful_callbacks": {
1696
+ "EarlyStoppingCallback": {
1697
+ "args": {
1698
+ "early_stopping_patience": 3,
1699
+ "early_stopping_threshold": 0.0
1700
+ },
1701
+ "attributes": {
1702
+ "early_stopping_patience_counter": 0
1703
+ }
1704
+ },
1705
+ "TrainerControl": {
1706
+ "args": {
1707
+ "should_epoch_stop": false,
1708
+ "should_evaluate": false,
1709
+ "should_log": false,
1710
+ "should_save": true,
1711
+ "should_training_stop": false
1712
+ },
1713
+ "attributes": {}
1714
+ }
1715
+ },
1716
+ "total_flos": 2.959809932840141e+17,
1717
+ "train_batch_size": 1024,
1718
+ "trial_name": null,
1719
+ "trial_params": null
1720
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2212b57ced9fbe3464bd23d4ac0f4d8e75b4b021597f160058a4a19990d9f0d3
3
+ size 5841
graphcodebert-swa-from-epoch-1/checkpoint-1100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1200/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.3,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": 0.3,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.3,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "absolute",
24
+ "problem_type": "single_label_classification",
25
+ "transformers_version": "4.56.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 50265
29
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1200/config_hyperparams.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_config": {
3
+ "model_name": "/kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1",
4
+ "num_epochs": 4,
5
+ "batch_size": 1024,
6
+ "learning_rate": 1e-06,
7
+ "max_length": 512,
8
+ "num_labels": 2,
9
+ "loss_type": "r-drop",
10
+ "focal_alpha": 1.0,
11
+ "focal_gamma": 2.0,
12
+ "r_drop_alpha": 10.0,
13
+ "infonce_temperature": 0.07,
14
+ "infonce_weight": 0.5,
15
+ "label_smoothing": 0.5,
16
+ "adversarial_epsilon": 0.5,
17
+ "use_swa": true,
18
+ "swa_start_epoch": 1,
19
+ "swa_lr": 1e-05,
20
+ "data_augmentation": true,
21
+ "aug_rename_prob": 0.8,
22
+ "aug_format_prob": 0.8,
23
+ "freeze_base": true,
24
+ "seed": 42,
25
+ "use_wandb": false,
26
+ "mixup_alpha": 1.0,
27
+ "low_pass_keep_ratio": 0.5,
28
+ "freq_consistency_weight": 0.5
29
+ },
30
+ "training_arguments": {
31
+ "output_dir": "graphcodebert-swa-from-epoch-1/",
32
+ "num_train_epochs": 4,
33
+ "per_device_train_batch_size": 1024,
34
+ "per_device_eval_batch_size": 2048,
35
+ "learning_rate": 1e-06,
36
+ "warmup_steps": 195,
37
+ "weight_decay": 0.1,
38
+ "logging_steps": 5,
39
+ "eval_steps": 100,
40
+ "save_steps": 100,
41
+ "metric_for_best_model": "macro_f1",
42
+ "greater_is_better": true,
43
+ "save_total_limit": 5,
44
+ "fp16": true,
45
+ "seed": 42
46
+ },
47
+ "training_state": {
48
+ "global_step": 1200,
49
+ "epoch": 2.4539877300613497,
50
+ "best_metric": 0.6440359919423964,
51
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1200"
52
+ }
53
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:638928033229238d9d8b14410b8c8884341bf5076f986fddea82390a6ad61185
3
+ size 498612824
graphcodebert-swa-from-epoch-1/checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a356d7e7578338726980fb7d72e7c4d6b1e5408ca3e2f10c6a71174205d890
3
+ size 4741923
graphcodebert-swa-from-epoch-1/checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7a88290d1fe94434568e18f7de3e57f24dd9a631fd2b477ff8367461c0ed128
3
+ size 14645
graphcodebert-swa-from-epoch-1/checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18783150ac09b6b81cea5af47876a10bfe5f36c3d76aca4ffce5382bdfaf7b28
3
+ size 1383
graphcodebert-swa-from-epoch-1/checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223ff4ad0572c1075186f0d43c6aebd5581a30e1282be5d602abe8def7241268
3
+ size 1465
graphcodebert-swa-from-epoch-1/checkpoint-1200/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1200/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,1872 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1200,
3
+ "best_metric": 0.6440359919423964,
4
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1200",
5
+ "epoch": 2.4539877300613497,
6
+ "eval_steps": 100,
7
+ "global_step": 1200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010224948875255624,
14
+ "grad_norm": 2.4707133769989014,
15
+ "learning_rate": 2.0512820512820512e-08,
16
+ "loss": 0.8431,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.02044989775051125,
21
+ "grad_norm": 3.114851951599121,
22
+ "learning_rate": 4.615384615384615e-08,
23
+ "loss": 0.844,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.03067484662576687,
28
+ "grad_norm": 2.2256007194519043,
29
+ "learning_rate": 7.179487179487178e-08,
30
+ "loss": 0.847,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.0408997955010225,
35
+ "grad_norm": 2.5343081951141357,
36
+ "learning_rate": 9.743589743589743e-08,
37
+ "loss": 0.8492,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.05112474437627812,
42
+ "grad_norm": 3.1964163780212402,
43
+ "learning_rate": 1.2307692307692308e-07,
44
+ "loss": 0.8475,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.06134969325153374,
49
+ "grad_norm": 2.0466485023498535,
50
+ "learning_rate": 1.4871794871794872e-07,
51
+ "loss": 0.8445,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.07157464212678936,
56
+ "grad_norm": 2.164569139480591,
57
+ "learning_rate": 1.7435897435897435e-07,
58
+ "loss": 0.8452,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.081799591002045,
63
+ "grad_norm": 2.56343150138855,
64
+ "learning_rate": 2e-07,
65
+ "loss": 0.8473,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.09202453987730061,
70
+ "grad_norm": 2.5742437839508057,
71
+ "learning_rate": 2.2564102564102563e-07,
72
+ "loss": 0.848,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.10224948875255624,
77
+ "grad_norm": 2.587480306625366,
78
+ "learning_rate": 2.5128205128205126e-07,
79
+ "loss": 0.8409,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.11247443762781185,
84
+ "grad_norm": 2.5737764835357666,
85
+ "learning_rate": 2.7692307692307693e-07,
86
+ "loss": 0.8471,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.12269938650306748,
91
+ "grad_norm": 3.044358730316162,
92
+ "learning_rate": 3.0256410256410254e-07,
93
+ "loss": 0.8448,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.1329243353783231,
98
+ "grad_norm": 2.326373815536499,
99
+ "learning_rate": 3.282051282051282e-07,
100
+ "loss": 0.8517,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.14314928425357873,
105
+ "grad_norm": 2.267547607421875,
106
+ "learning_rate": 3.5384615384615386e-07,
107
+ "loss": 0.8387,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.15337423312883436,
112
+ "grad_norm": 2.609232187271118,
113
+ "learning_rate": 3.7948717948717947e-07,
114
+ "loss": 0.841,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.16359918200409,
119
+ "grad_norm": 2.9532523155212402,
120
+ "learning_rate": 4.0512820512820514e-07,
121
+ "loss": 0.8509,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.1738241308793456,
126
+ "grad_norm": 3.002154588699341,
127
+ "learning_rate": 4.307692307692308e-07,
128
+ "loss": 0.8482,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.18404907975460122,
133
+ "grad_norm": 2.701613187789917,
134
+ "learning_rate": 4.5641025641025636e-07,
135
+ "loss": 0.8422,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.19427402862985685,
140
+ "grad_norm": 2.7430365085601807,
141
+ "learning_rate": 4.82051282051282e-07,
142
+ "loss": 0.846,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.20449897750511248,
147
+ "grad_norm": 2.8101418018341064,
148
+ "learning_rate": 5.076923076923076e-07,
149
+ "loss": 0.8444,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.20449897750511248,
154
+ "eval_accuracy": 0.52033,
155
+ "eval_loss": 0.6922348141670227,
156
+ "eval_macro_f1": 0.4427650399783254,
157
+ "eval_precision": 0.6036606007378691,
158
+ "eval_recall": 0.5386742448919869,
159
+ "eval_runtime": 80.6812,
160
+ "eval_samples_per_second": 1239.446,
161
+ "eval_steps_per_second": 0.607,
162
+ "step": 100
163
+ },
164
+ {
165
+ "epoch": 0.2147239263803681,
166
+ "grad_norm": 2.5835089683532715,
167
+ "learning_rate": 5.333333333333333e-07,
168
+ "loss": 0.8437,
169
+ "step": 105
170
+ },
171
+ {
172
+ "epoch": 0.2249488752556237,
173
+ "grad_norm": 2.7237253189086914,
174
+ "learning_rate": 5.58974358974359e-07,
175
+ "loss": 0.8431,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 0.23517382413087934,
180
+ "grad_norm": 2.4648072719573975,
181
+ "learning_rate": 5.846153846153847e-07,
182
+ "loss": 0.8399,
183
+ "step": 115
184
+ },
185
+ {
186
+ "epoch": 0.24539877300613497,
187
+ "grad_norm": 2.7011852264404297,
188
+ "learning_rate": 6.102564102564103e-07,
189
+ "loss": 0.8409,
190
+ "step": 120
191
+ },
192
+ {
193
+ "epoch": 0.2556237218813906,
194
+ "grad_norm": 2.3170969486236572,
195
+ "learning_rate": 6.358974358974358e-07,
196
+ "loss": 0.8361,
197
+ "step": 125
198
+ },
199
+ {
200
+ "epoch": 0.2658486707566462,
201
+ "grad_norm": 2.517194986343384,
202
+ "learning_rate": 6.615384615384615e-07,
203
+ "loss": 0.839,
204
+ "step": 130
205
+ },
206
+ {
207
+ "epoch": 0.27607361963190186,
208
+ "grad_norm": 2.5092124938964844,
209
+ "learning_rate": 6.871794871794871e-07,
210
+ "loss": 0.8438,
211
+ "step": 135
212
+ },
213
+ {
214
+ "epoch": 0.28629856850715746,
215
+ "grad_norm": 2.3993237018585205,
216
+ "learning_rate": 7.128205128205128e-07,
217
+ "loss": 0.8349,
218
+ "step": 140
219
+ },
220
+ {
221
+ "epoch": 0.2965235173824131,
222
+ "grad_norm": 2.1388165950775146,
223
+ "learning_rate": 7.384615384615384e-07,
224
+ "loss": 0.8363,
225
+ "step": 145
226
+ },
227
+ {
228
+ "epoch": 0.3067484662576687,
229
+ "grad_norm": 1.8425891399383545,
230
+ "learning_rate": 7.64102564102564e-07,
231
+ "loss": 0.8325,
232
+ "step": 150
233
+ },
234
+ {
235
+ "epoch": 0.3169734151329243,
236
+ "grad_norm": 1.8665552139282227,
237
+ "learning_rate": 7.897435897435897e-07,
238
+ "loss": 0.835,
239
+ "step": 155
240
+ },
241
+ {
242
+ "epoch": 0.32719836400818,
243
+ "grad_norm": 1.8765455484390259,
244
+ "learning_rate": 8.153846153846154e-07,
245
+ "loss": 0.8328,
246
+ "step": 160
247
+ },
248
+ {
249
+ "epoch": 0.3374233128834356,
250
+ "grad_norm": 2.640779495239258,
251
+ "learning_rate": 8.41025641025641e-07,
252
+ "loss": 0.8388,
253
+ "step": 165
254
+ },
255
+ {
256
+ "epoch": 0.3476482617586912,
257
+ "grad_norm": 2.174116373062134,
258
+ "learning_rate": 8.666666666666667e-07,
259
+ "loss": 0.8336,
260
+ "step": 170
261
+ },
262
+ {
263
+ "epoch": 0.35787321063394684,
264
+ "grad_norm": 1.8411178588867188,
265
+ "learning_rate": 8.923076923076923e-07,
266
+ "loss": 0.8384,
267
+ "step": 175
268
+ },
269
+ {
270
+ "epoch": 0.36809815950920244,
271
+ "grad_norm": 2.3652143478393555,
272
+ "learning_rate": 9.179487179487179e-07,
273
+ "loss": 0.8318,
274
+ "step": 180
275
+ },
276
+ {
277
+ "epoch": 0.3783231083844581,
278
+ "grad_norm": 1.9870903491973877,
279
+ "learning_rate": 9.435897435897435e-07,
280
+ "loss": 0.8306,
281
+ "step": 185
282
+ },
283
+ {
284
+ "epoch": 0.3885480572597137,
285
+ "grad_norm": 2.458887815475464,
286
+ "learning_rate": 9.692307692307691e-07,
287
+ "loss": 0.8342,
288
+ "step": 190
289
+ },
290
+ {
291
+ "epoch": 0.3987730061349693,
292
+ "grad_norm": 1.9105890989303589,
293
+ "learning_rate": 9.948717948717949e-07,
294
+ "loss": 0.8301,
295
+ "step": 195
296
+ },
297
+ {
298
+ "epoch": 0.40899795501022496,
299
+ "grad_norm": 2.04896879196167,
300
+ "learning_rate": 9.999490793845076e-07,
301
+ "loss": 0.8291,
302
+ "step": 200
303
+ },
304
+ {
305
+ "epoch": 0.40899795501022496,
306
+ "eval_accuracy": 0.52697,
307
+ "eval_loss": 0.6913915872573853,
308
+ "eval_macro_f1": 0.4511625248903547,
309
+ "eval_precision": 0.6198512746424523,
310
+ "eval_recall": 0.5452618609595298,
311
+ "eval_runtime": 80.6395,
312
+ "eval_samples_per_second": 1240.088,
313
+ "eval_steps_per_second": 0.608,
314
+ "step": 200
315
+ },
316
+ {
317
+ "epoch": 0.41922290388548056,
318
+ "grad_norm": 2.394630193710327,
319
+ "learning_rate": 9.997422321595486e-07,
320
+ "loss": 0.8311,
321
+ "step": 205
322
+ },
323
+ {
324
+ "epoch": 0.4294478527607362,
325
+ "grad_norm": 1.7013665437698364,
326
+ "learning_rate": 9.993763415653074e-07,
327
+ "loss": 0.8264,
328
+ "step": 210
329
+ },
330
+ {
331
+ "epoch": 0.4396728016359918,
332
+ "grad_norm": 2.1158103942871094,
333
+ "learning_rate": 9.988515240467613e-07,
334
+ "loss": 0.8262,
335
+ "step": 215
336
+ },
337
+ {
338
+ "epoch": 0.4498977505112474,
339
+ "grad_norm": 1.5985370874404907,
340
+ "learning_rate": 9.981679466275095e-07,
341
+ "loss": 0.8296,
342
+ "step": 220
343
+ },
344
+ {
345
+ "epoch": 0.4601226993865031,
346
+ "grad_norm": 2.0426042079925537,
347
+ "learning_rate": 9.973258268566182e-07,
348
+ "loss": 0.8233,
349
+ "step": 225
350
+ },
351
+ {
352
+ "epoch": 0.4703476482617587,
353
+ "grad_norm": 1.7411834001541138,
354
+ "learning_rate": 9.963254327393853e-07,
355
+ "loss": 0.8269,
356
+ "step": 230
357
+ },
358
+ {
359
+ "epoch": 0.48057259713701433,
360
+ "grad_norm": 2.1182405948638916,
361
+ "learning_rate": 9.95167082652047e-07,
362
+ "loss": 0.8247,
363
+ "step": 235
364
+ },
365
+ {
366
+ "epoch": 0.49079754601226994,
367
+ "grad_norm": 2.0239953994750977,
368
+ "learning_rate": 9.938511452404547e-07,
369
+ "loss": 0.8308,
370
+ "step": 240
371
+ },
372
+ {
373
+ "epoch": 0.5010224948875256,
374
+ "grad_norm": 2.366060495376587,
375
+ "learning_rate": 9.923780393027534e-07,
376
+ "loss": 0.8205,
377
+ "step": 245
378
+ },
379
+ {
380
+ "epoch": 0.5112474437627812,
381
+ "grad_norm": 1.848169207572937,
382
+ "learning_rate": 9.907482336560982e-07,
383
+ "loss": 0.825,
384
+ "step": 250
385
+ },
386
+ {
387
+ "epoch": 0.5214723926380368,
388
+ "grad_norm": 1.8216668367385864,
389
+ "learning_rate": 9.889622469874535e-07,
390
+ "loss": 0.8271,
391
+ "step": 255
392
+ },
393
+ {
394
+ "epoch": 0.5316973415132924,
395
+ "grad_norm": 1.507730484008789,
396
+ "learning_rate": 9.8702064768852e-07,
397
+ "loss": 0.8147,
398
+ "step": 260
399
+ },
400
+ {
401
+ "epoch": 0.5419222903885481,
402
+ "grad_norm": 1.7608263492584229,
403
+ "learning_rate": 9.849240536748438e-07,
404
+ "loss": 0.8221,
405
+ "step": 265
406
+ },
407
+ {
408
+ "epoch": 0.5521472392638037,
409
+ "grad_norm": 2.203326940536499,
410
+ "learning_rate": 9.826731321891641e-07,
411
+ "loss": 0.8292,
412
+ "step": 270
413
+ },
414
+ {
415
+ "epoch": 0.5623721881390593,
416
+ "grad_norm": 1.9529740810394287,
417
+ "learning_rate": 9.802685995890632e-07,
418
+ "loss": 0.8228,
419
+ "step": 275
420
+ },
421
+ {
422
+ "epoch": 0.5725971370143149,
423
+ "grad_norm": 1.6214399337768555,
424
+ "learning_rate": 9.777112211189841e-07,
425
+ "loss": 0.8149,
426
+ "step": 280
427
+ },
428
+ {
429
+ "epoch": 0.5828220858895705,
430
+ "grad_norm": 2.07482647895813,
431
+ "learning_rate": 9.750018106666924e-07,
432
+ "loss": 0.8143,
433
+ "step": 285
434
+ },
435
+ {
436
+ "epoch": 0.5930470347648262,
437
+ "grad_norm": 1.7083203792572021,
438
+ "learning_rate": 9.721412305042538e-07,
439
+ "loss": 0.8188,
440
+ "step": 290
441
+ },
442
+ {
443
+ "epoch": 0.6032719836400818,
444
+ "grad_norm": 2.0022943019866943,
445
+ "learning_rate": 9.69130391013617e-07,
446
+ "loss": 0.8195,
447
+ "step": 295
448
+ },
449
+ {
450
+ "epoch": 0.6134969325153374,
451
+ "grad_norm": 1.5799461603164673,
452
+ "learning_rate": 9.659702503968834e-07,
453
+ "loss": 0.8146,
454
+ "step": 300
455
+ },
456
+ {
457
+ "epoch": 0.6134969325153374,
458
+ "eval_accuracy": 0.55052,
459
+ "eval_loss": 0.6896406412124634,
460
+ "eval_macro_f1": 0.49869783315905847,
461
+ "eval_precision": 0.6264643684302231,
462
+ "eval_recall": 0.5665461014402418,
463
+ "eval_runtime": 80.6145,
464
+ "eval_samples_per_second": 1240.472,
465
+ "eval_steps_per_second": 0.608,
466
+ "step": 300
467
+ },
468
+ {
469
+ "epoch": 0.623721881390593,
470
+ "grad_norm": 1.9373347759246826,
471
+ "learning_rate": 9.626618143713586e-07,
472
+ "loss": 0.8166,
473
+ "step": 305
474
+ },
475
+ {
476
+ "epoch": 0.6339468302658486,
477
+ "grad_norm": 1.6276922225952148,
478
+ "learning_rate": 9.592061358494813e-07,
479
+ "loss": 0.8176,
480
+ "step": 310
481
+ },
482
+ {
483
+ "epoch": 0.6441717791411042,
484
+ "grad_norm": 1.9373250007629395,
485
+ "learning_rate": 9.556043146037337e-07,
486
+ "loss": 0.8168,
487
+ "step": 315
488
+ },
489
+ {
490
+ "epoch": 0.65439672801636,
491
+ "grad_norm": 1.320465087890625,
492
+ "learning_rate": 9.518574969166391e-07,
493
+ "loss": 0.8101,
494
+ "step": 320
495
+ },
496
+ {
497
+ "epoch": 0.6646216768916156,
498
+ "grad_norm": 1.8596330881118774,
499
+ "learning_rate": 9.47966875215954e-07,
500
+ "loss": 0.8167,
501
+ "step": 325
502
+ },
503
+ {
504
+ "epoch": 0.6748466257668712,
505
+ "grad_norm": 1.304662823677063,
506
+ "learning_rate": 9.439336876951793e-07,
507
+ "loss": 0.815,
508
+ "step": 330
509
+ },
510
+ {
511
+ "epoch": 0.6850715746421268,
512
+ "grad_norm": 1.8063029050827026,
513
+ "learning_rate": 9.397592179195033e-07,
514
+ "loss": 0.8121,
515
+ "step": 335
516
+ },
517
+ {
518
+ "epoch": 0.6952965235173824,
519
+ "grad_norm": 1.7432739734649658,
520
+ "learning_rate": 9.354447944173059e-07,
521
+ "loss": 0.8104,
522
+ "step": 340
523
+ },
524
+ {
525
+ "epoch": 0.7055214723926381,
526
+ "grad_norm": 1.4523797035217285,
527
+ "learning_rate": 9.309917902573533e-07,
528
+ "loss": 0.8098,
529
+ "step": 345
530
+ },
531
+ {
532
+ "epoch": 0.7157464212678937,
533
+ "grad_norm": 1.681409478187561,
534
+ "learning_rate": 9.264016226118188e-07,
535
+ "loss": 0.8107,
536
+ "step": 350
537
+ },
538
+ {
539
+ "epoch": 0.7259713701431493,
540
+ "grad_norm": 1.5168694257736206,
541
+ "learning_rate": 9.216757523052652e-07,
542
+ "loss": 0.8085,
543
+ "step": 355
544
+ },
545
+ {
546
+ "epoch": 0.7361963190184049,
547
+ "grad_norm": 1.2200194597244263,
548
+ "learning_rate": 9.168156833497371e-07,
549
+ "loss": 0.8109,
550
+ "step": 360
551
+ },
552
+ {
553
+ "epoch": 0.7464212678936605,
554
+ "grad_norm": 1.2745580673217773,
555
+ "learning_rate": 9.118229624661078e-07,
556
+ "loss": 0.8096,
557
+ "step": 365
558
+ },
559
+ {
560
+ "epoch": 0.7566462167689162,
561
+ "grad_norm": 1.8339142799377441,
562
+ "learning_rate": 9.066991785918333e-07,
563
+ "loss": 0.808,
564
+ "step": 370
565
+ },
566
+ {
567
+ "epoch": 0.7668711656441718,
568
+ "grad_norm": 1.2315114736557007,
569
+ "learning_rate": 9.01445962375273e-07,
570
+ "loss": 0.805,
571
+ "step": 375
572
+ },
573
+ {
574
+ "epoch": 0.7770961145194274,
575
+ "grad_norm": 1.3081412315368652,
576
+ "learning_rate": 8.960649856567333e-07,
577
+ "loss": 0.8066,
578
+ "step": 380
579
+ },
580
+ {
581
+ "epoch": 0.787321063394683,
582
+ "grad_norm": 1.5145998001098633,
583
+ "learning_rate": 8.90557960936404e-07,
584
+ "loss": 0.8028,
585
+ "step": 385
586
+ },
587
+ {
588
+ "epoch": 0.7975460122699386,
589
+ "grad_norm": 1.5990959405899048,
590
+ "learning_rate": 8.84926640829353e-07,
591
+ "loss": 0.8035,
592
+ "step": 390
593
+ },
594
+ {
595
+ "epoch": 0.8077709611451943,
596
+ "grad_norm": 1.2120558023452759,
597
+ "learning_rate": 8.79172817507756e-07,
598
+ "loss": 0.802,
599
+ "step": 395
600
+ },
601
+ {
602
+ "epoch": 0.8179959100204499,
603
+ "grad_norm": 1.5799622535705566,
604
+ "learning_rate": 8.73298322130535e-07,
605
+ "loss": 0.8037,
606
+ "step": 400
607
+ },
608
+ {
609
+ "epoch": 0.8179959100204499,
610
+ "eval_accuracy": 0.58537,
611
+ "eval_loss": 0.6877263784408569,
612
+ "eval_macro_f1": 0.5630337315451738,
613
+ "eval_precision": 0.628845494567806,
614
+ "eval_recall": 0.5970616303474306,
615
+ "eval_runtime": 81.293,
616
+ "eval_samples_per_second": 1230.118,
617
+ "eval_steps_per_second": 0.603,
618
+ "step": 400
619
+ },
620
+ {
621
+ "epoch": 0.8282208588957055,
622
+ "grad_norm": 1.3475037813186646,
623
+ "learning_rate": 8.673050242605921e-07,
624
+ "loss": 0.8067,
625
+ "step": 405
626
+ },
627
+ {
628
+ "epoch": 0.8384458077709611,
629
+ "grad_norm": 1.2836309671401978,
630
+ "learning_rate": 8.611948312698179e-07,
631
+ "loss": 0.7996,
632
+ "step": 410
633
+ },
634
+ {
635
+ "epoch": 0.8486707566462167,
636
+ "grad_norm": 1.460316777229309,
637
+ "learning_rate": 8.5496968773207e-07,
638
+ "loss": 0.802,
639
+ "step": 415
640
+ },
641
+ {
642
+ "epoch": 0.8588957055214724,
643
+ "grad_norm": 1.33119797706604,
644
+ "learning_rate": 8.486315748043109e-07,
645
+ "loss": 0.798,
646
+ "step": 420
647
+ },
648
+ {
649
+ "epoch": 0.869120654396728,
650
+ "grad_norm": 1.9951454401016235,
651
+ "learning_rate": 8.42182509596102e-07,
652
+ "loss": 0.8013,
653
+ "step": 425
654
+ },
655
+ {
656
+ "epoch": 0.8793456032719836,
657
+ "grad_norm": 1.2590746879577637,
658
+ "learning_rate": 8.356245445276584e-07,
659
+ "loss": 0.7963,
660
+ "step": 430
661
+ },
662
+ {
663
+ "epoch": 0.8895705521472392,
664
+ "grad_norm": 1.1192667484283447,
665
+ "learning_rate": 8.28959766676663e-07,
666
+ "loss": 0.8004,
667
+ "step": 435
668
+ },
669
+ {
670
+ "epoch": 0.8997955010224948,
671
+ "grad_norm": 1.1180275678634644,
672
+ "learning_rate": 8.221902971140535e-07,
673
+ "loss": 0.8041,
674
+ "step": 440
675
+ },
676
+ {
677
+ "epoch": 0.9100204498977505,
678
+ "grad_norm": 1.1210858821868896,
679
+ "learning_rate": 8.153182902289897e-07,
680
+ "loss": 0.7991,
681
+ "step": 445
682
+ },
683
+ {
684
+ "epoch": 0.9202453987730062,
685
+ "grad_norm": 1.1266220808029175,
686
+ "learning_rate": 8.083459330432164e-07,
687
+ "loss": 0.8002,
688
+ "step": 450
689
+ },
690
+ {
691
+ "epoch": 0.9304703476482618,
692
+ "grad_norm": 1.0373694896697998,
693
+ "learning_rate": 8.012754445150434e-07,
694
+ "loss": 0.7974,
695
+ "step": 455
696
+ },
697
+ {
698
+ "epoch": 0.9406952965235174,
699
+ "grad_norm": 1.2223235368728638,
700
+ "learning_rate": 7.941090748331589e-07,
701
+ "loss": 0.8001,
702
+ "step": 460
703
+ },
704
+ {
705
+ "epoch": 0.950920245398773,
706
+ "grad_norm": 1.4549195766448975,
707
+ "learning_rate": 7.868491047005065e-07,
708
+ "loss": 0.7993,
709
+ "step": 465
710
+ },
711
+ {
712
+ "epoch": 0.9611451942740287,
713
+ "grad_norm": 1.3064852952957153,
714
+ "learning_rate": 7.794978446084483e-07,
715
+ "loss": 0.8006,
716
+ "step": 470
717
+ },
718
+ {
719
+ "epoch": 0.9713701431492843,
720
+ "grad_norm": 1.2408719062805176,
721
+ "learning_rate": 7.720576341014498e-07,
722
+ "loss": 0.7983,
723
+ "step": 475
724
+ },
725
+ {
726
+ "epoch": 0.9815950920245399,
727
+ "grad_norm": 1.2148370742797852,
728
+ "learning_rate": 7.645308410325187e-07,
729
+ "loss": 0.7959,
730
+ "step": 480
731
+ },
732
+ {
733
+ "epoch": 0.9918200408997955,
734
+ "grad_norm": 1.0927603244781494,
735
+ "learning_rate": 7.569198608096317e-07,
736
+ "loss": 0.7978,
737
+ "step": 485
738
+ },
739
+ {
740
+ "SWA": "started",
741
+ "epoch": 1.0,
742
+ "step": 489
743
+ },
744
+ {
745
+ "epoch": 1.0020449897750512,
746
+ "grad_norm": 1.245108723640442,
747
+ "learning_rate": 7.492271156333967e-07,
748
+ "loss": 0.7965,
749
+ "step": 490
750
+ },
751
+ {
752
+ "epoch": 1.0122699386503067,
753
+ "grad_norm": 1.3393553495407104,
754
+ "learning_rate": 7.414550537261828e-07,
755
+ "loss": 0.795,
756
+ "step": 495
757
+ },
758
+ {
759
+ "epoch": 1.0224948875255624,
760
+ "grad_norm": 1.2823072671890259,
761
+ "learning_rate": 7.336061485529738e-07,
762
+ "loss": 0.8014,
763
+ "step": 500
764
+ },
765
+ {
766
+ "epoch": 1.0224948875255624,
767
+ "eval_accuracy": 0.60723,
768
+ "eval_loss": 0.6864892244338989,
769
+ "eval_macro_f1": 0.5966241921587988,
770
+ "eval_precision": 0.6341761761282843,
771
+ "eval_recall": 0.6160142746967282,
772
+ "eval_runtime": 81.931,
773
+ "eval_samples_per_second": 1220.539,
774
+ "eval_steps_per_second": 0.598,
775
+ "step": 500
776
+ },
777
+ {
778
+ "epoch": 1.032719836400818,
779
+ "grad_norm": 1.1278107166290283,
780
+ "learning_rate": 7.256828980341846e-07,
781
+ "loss": 0.7977,
782
+ "step": 505
783
+ },
784
+ {
785
+ "epoch": 1.0429447852760736,
786
+ "grad_norm": 1.110093355178833,
787
+ "learning_rate": 7.176878237506965e-07,
788
+ "loss": 0.7954,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 1.0531697341513293,
793
+ "grad_norm": 1.2248748540878296,
794
+ "learning_rate": 7.096234701413617e-07,
795
+ "loss": 0.7957,
796
+ "step": 515
797
+ },
798
+ {
799
+ "epoch": 1.0633946830265848,
800
+ "grad_norm": 1.2420642375946045,
801
+ "learning_rate": 7.014924036932345e-07,
802
+ "loss": 0.7935,
803
+ "step": 520
804
+ },
805
+ {
806
+ "epoch": 1.0736196319018405,
807
+ "grad_norm": 1.0777639150619507,
808
+ "learning_rate": 6.932972121247831e-07,
809
+ "loss": 0.796,
810
+ "step": 525
811
+ },
812
+ {
813
+ "epoch": 1.0838445807770962,
814
+ "grad_norm": 1.3830324411392212,
815
+ "learning_rate": 6.850405035623481e-07,
816
+ "loss": 0.7929,
817
+ "step": 530
818
+ },
819
+ {
820
+ "epoch": 1.0940695296523517,
821
+ "grad_norm": 0.9407713413238525,
822
+ "learning_rate": 6.767249057101023e-07,
823
+ "loss": 0.7964,
824
+ "step": 535
825
+ },
826
+ {
827
+ "epoch": 1.1042944785276074,
828
+ "grad_norm": 1.1688194274902344,
829
+ "learning_rate": 6.683530650137832e-07,
830
+ "loss": 0.7944,
831
+ "step": 540
832
+ },
833
+ {
834
+ "epoch": 1.114519427402863,
835
+ "grad_norm": 0.9509923458099365,
836
+ "learning_rate": 6.599276458184588e-07,
837
+ "loss": 0.7912,
838
+ "step": 545
839
+ },
840
+ {
841
+ "epoch": 1.1247443762781186,
842
+ "grad_norm": 1.0683159828186035,
843
+ "learning_rate": 6.514513295205969e-07,
844
+ "loss": 0.7931,
845
+ "step": 550
846
+ },
847
+ {
848
+ "epoch": 1.1349693251533743,
849
+ "grad_norm": 0.9022642374038696,
850
+ "learning_rate": 6.429268137147104e-07,
851
+ "loss": 0.7945,
852
+ "step": 555
853
+ },
854
+ {
855
+ "epoch": 1.1451942740286298,
856
+ "grad_norm": 1.1609984636306763,
857
+ "learning_rate": 6.343568113348441e-07,
858
+ "loss": 0.7913,
859
+ "step": 560
860
+ },
861
+ {
862
+ "epoch": 1.1554192229038855,
863
+ "grad_norm": 1.2184994220733643,
864
+ "learning_rate": 6.257440497911815e-07,
865
+ "loss": 0.7919,
866
+ "step": 565
867
+ },
868
+ {
869
+ "epoch": 1.165644171779141,
870
+ "grad_norm": 1.0256582498550415,
871
+ "learning_rate": 6.170912701020454e-07,
872
+ "loss": 0.7912,
873
+ "step": 570
874
+ },
875
+ {
876
+ "epoch": 1.1758691206543967,
877
+ "grad_norm": 0.8725862503051758,
878
+ "learning_rate": 6.084012260215645e-07,
879
+ "loss": 0.7907,
880
+ "step": 575
881
+ },
882
+ {
883
+ "epoch": 1.1860940695296525,
884
+ "grad_norm": 1.5192348957061768,
885
+ "learning_rate": 5.996766831632912e-07,
886
+ "loss": 0.7913,
887
+ "step": 580
888
+ },
889
+ {
890
+ "epoch": 1.196319018404908,
891
+ "grad_norm": 1.109052062034607,
892
+ "learning_rate": 5.909204181200414e-07,
893
+ "loss": 0.795,
894
+ "step": 585
895
+ },
896
+ {
897
+ "epoch": 1.2065439672801637,
898
+ "grad_norm": 1.0413333177566528,
899
+ "learning_rate": 5.821352175802419e-07,
900
+ "loss": 0.7924,
901
+ "step": 590
902
+ },
903
+ {
904
+ "epoch": 1.2167689161554192,
905
+ "grad_norm": 0.8926281929016113,
906
+ "learning_rate": 5.733238774410647e-07,
907
+ "loss": 0.7921,
908
+ "step": 595
909
+ },
910
+ {
911
+ "epoch": 1.2269938650306749,
912
+ "grad_norm": 0.9231971502304077,
913
+ "learning_rate": 5.644892019186307e-07,
914
+ "loss": 0.7894,
915
+ "step": 600
916
+ },
917
+ {
918
+ "epoch": 1.2269938650306749,
919
+ "eval_accuracy": 0.62182,
920
+ "eval_loss": 0.6853985786437988,
921
+ "eval_macro_f1": 0.6195549574374046,
922
+ "eval_precision": 0.6317310781859349,
923
+ "eval_recall": 0.6267089641577176,
924
+ "eval_runtime": 81.4512,
925
+ "eval_samples_per_second": 1227.728,
926
+ "eval_steps_per_second": 0.602,
927
+ "step": 600
928
+ },
929
+ {
930
+ "epoch": 1.2372188139059306,
931
+ "grad_norm": 0.9845394492149353,
932
+ "learning_rate": 5.556340026555653e-07,
933
+ "loss": 0.7918,
934
+ "step": 605
935
+ },
936
+ {
937
+ "epoch": 1.247443762781186,
938
+ "grad_norm": 1.3759487867355347,
939
+ "learning_rate": 5.467610978261906e-07,
940
+ "loss": 0.7904,
941
+ "step": 610
942
+ },
943
+ {
944
+ "epoch": 1.2576687116564418,
945
+ "grad_norm": 1.1568200588226318,
946
+ "learning_rate": 5.378733112396398e-07,
947
+ "loss": 0.7923,
948
+ "step": 615
949
+ },
950
+ {
951
+ "epoch": 1.2678936605316973,
952
+ "grad_norm": 1.4351176023483276,
953
+ "learning_rate": 5.289734714411775e-07,
954
+ "loss": 0.7905,
955
+ "step": 620
956
+ },
957
+ {
958
+ "epoch": 1.278118609406953,
959
+ "grad_norm": 1.178076982498169,
960
+ "learning_rate": 5.200644108120121e-07,
961
+ "loss": 0.7947,
962
+ "step": 625
963
+ },
964
+ {
965
+ "epoch": 1.2883435582822087,
966
+ "grad_norm": 1.2398017644882202,
967
+ "learning_rate": 5.111489646678896e-07,
968
+ "loss": 0.796,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 1.2985685071574642,
973
+ "grad_norm": 1.1236284971237183,
974
+ "learning_rate": 5.022299703567508e-07,
975
+ "loss": 0.7895,
976
+ "step": 635
977
+ },
978
+ {
979
+ "epoch": 1.30879345603272,
980
+ "grad_norm": 1.0112528800964355,
981
+ "learning_rate": 4.933102663557439e-07,
982
+ "loss": 0.79,
983
+ "step": 640
984
+ },
985
+ {
986
+ "epoch": 1.3190184049079754,
987
+ "grad_norm": 1.3201746940612793,
988
+ "learning_rate": 4.843926913678757e-07,
989
+ "loss": 0.7897,
990
+ "step": 645
991
+ },
992
+ {
993
+ "epoch": 1.329243353783231,
994
+ "grad_norm": 0.969918429851532,
995
+ "learning_rate": 4.7548008341859384e-07,
996
+ "loss": 0.7912,
997
+ "step": 650
998
+ },
999
+ {
1000
+ "epoch": 1.3394683026584868,
1001
+ "grad_norm": 0.8914945125579834,
1002
+ "learning_rate": 4.665752789525812e-07,
1003
+ "loss": 0.7964,
1004
+ "step": 655
1005
+ },
1006
+ {
1007
+ "epoch": 1.3496932515337423,
1008
+ "grad_norm": 0.906989574432373,
1009
+ "learning_rate": 4.576811119310563e-07,
1010
+ "loss": 0.7924,
1011
+ "step": 660
1012
+ },
1013
+ {
1014
+ "epoch": 1.359918200408998,
1015
+ "grad_norm": 1.2423877716064453,
1016
+ "learning_rate": 4.488004129298618e-07,
1017
+ "loss": 0.7904,
1018
+ "step": 665
1019
+ },
1020
+ {
1021
+ "epoch": 1.3701431492842535,
1022
+ "grad_norm": 1.2455909252166748,
1023
+ "learning_rate": 4.3993600823863256e-07,
1024
+ "loss": 0.7875,
1025
+ "step": 670
1026
+ },
1027
+ {
1028
+ "epoch": 1.3803680981595092,
1029
+ "grad_norm": 1.4931528568267822,
1030
+ "learning_rate": 4.3109071896132574e-07,
1031
+ "loss": 0.7947,
1032
+ "step": 675
1033
+ },
1034
+ {
1035
+ "epoch": 1.390593047034765,
1036
+ "grad_norm": 1.0538350343704224,
1037
+ "learning_rate": 4.222673601184029e-07,
1038
+ "loss": 0.7886,
1039
+ "step": 680
1040
+ },
1041
+ {
1042
+ "epoch": 1.4008179959100204,
1043
+ "grad_norm": 0.9246828556060791,
1044
+ "learning_rate": 4.134687397509467e-07,
1045
+ "loss": 0.7884,
1046
+ "step": 685
1047
+ },
1048
+ {
1049
+ "epoch": 1.4110429447852761,
1050
+ "grad_norm": 1.0383715629577637,
1051
+ "learning_rate": 4.0469765802700033e-07,
1052
+ "loss": 0.7943,
1053
+ "step": 690
1054
+ },
1055
+ {
1056
+ "epoch": 1.4212678936605316,
1057
+ "grad_norm": 1.0180901288986206,
1058
+ "learning_rate": 3.9595690635041145e-07,
1059
+ "loss": 0.7895,
1060
+ "step": 695
1061
+ },
1062
+ {
1063
+ "epoch": 1.4314928425357873,
1064
+ "grad_norm": 0.9119181632995605,
1065
+ "learning_rate": 3.8724926647246536e-07,
1066
+ "loss": 0.7864,
1067
+ "step": 700
1068
+ },
1069
+ {
1070
+ "epoch": 1.4314928425357873,
1071
+ "eval_accuracy": 0.62357,
1072
+ "eval_loss": 0.6852650046348572,
1073
+ "eval_macro_f1": 0.6215147432652665,
1074
+ "eval_precision": 0.6330088346022082,
1075
+ "eval_recall": 0.628302383508456,
1076
+ "eval_runtime": 80.5998,
1077
+ "eval_samples_per_second": 1240.698,
1078
+ "eval_steps_per_second": 0.608,
1079
+ "step": 700
1080
+ },
1081
+ {
1082
+ "epoch": 1.441717791411043,
1083
+ "grad_norm": 0.8882152438163757,
1084
+ "learning_rate": 3.785775096065909e-07,
1085
+ "loss": 0.7858,
1086
+ "step": 705
1087
+ },
1088
+ {
1089
+ "epoch": 1.4519427402862985,
1090
+ "grad_norm": 1.5290203094482422,
1091
+ "learning_rate": 3.699443955464192e-07,
1092
+ "loss": 0.7837,
1093
+ "step": 710
1094
+ },
1095
+ {
1096
+ "epoch": 1.4621676891615543,
1097
+ "grad_norm": 0.881521463394165,
1098
+ "learning_rate": 3.613526717874774e-07,
1099
+ "loss": 0.7858,
1100
+ "step": 715
1101
+ },
1102
+ {
1103
+ "epoch": 1.4723926380368098,
1104
+ "grad_norm": 0.9955899119377136,
1105
+ "learning_rate": 3.5280507265279555e-07,
1106
+ "loss": 0.7907,
1107
+ "step": 720
1108
+ },
1109
+ {
1110
+ "epoch": 1.4826175869120655,
1111
+ "grad_norm": 1.3247544765472412,
1112
+ "learning_rate": 3.443043184227067e-07,
1113
+ "loss": 0.79,
1114
+ "step": 725
1115
+ },
1116
+ {
1117
+ "epoch": 1.4928425357873212,
1118
+ "grad_norm": 1.200223445892334,
1119
+ "learning_rate": 3.358531144691148e-07,
1120
+ "loss": 0.7874,
1121
+ "step": 730
1122
+ },
1123
+ {
1124
+ "epoch": 1.5030674846625767,
1125
+ "grad_norm": 0.9952226281166077,
1126
+ "learning_rate": 3.2745415039450867e-07,
1127
+ "loss": 0.7874,
1128
+ "step": 735
1129
+ },
1130
+ {
1131
+ "epoch": 1.5132924335378322,
1132
+ "grad_norm": 1.2515606880187988,
1133
+ "learning_rate": 3.19110099175993e-07,
1134
+ "loss": 0.789,
1135
+ "step": 740
1136
+ },
1137
+ {
1138
+ "epoch": 1.5235173824130879,
1139
+ "grad_norm": 0.8901408314704895,
1140
+ "learning_rate": 3.10823616314612e-07,
1141
+ "loss": 0.7853,
1142
+ "step": 745
1143
+ },
1144
+ {
1145
+ "epoch": 1.5337423312883436,
1146
+ "grad_norm": 1.0439373254776,
1147
+ "learning_rate": 3.0259733899023345e-07,
1148
+ "loss": 0.7899,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 1.5439672801635993,
1153
+ "grad_norm": 1.0658971071243286,
1154
+ "learning_rate": 2.944338852222643e-07,
1155
+ "loss": 0.7868,
1156
+ "step": 755
1157
+ },
1158
+ {
1159
+ "epoch": 1.5541922290388548,
1160
+ "grad_norm": 0.927455484867096,
1161
+ "learning_rate": 2.8633585303646413e-07,
1162
+ "loss": 0.7904,
1163
+ "step": 760
1164
+ },
1165
+ {
1166
+ "epoch": 1.5644171779141103,
1167
+ "grad_norm": 0.9637423753738403,
1168
+ "learning_rate": 2.783058196381214e-07,
1169
+ "loss": 0.7856,
1170
+ "step": 765
1171
+ },
1172
+ {
1173
+ "epoch": 1.574642126789366,
1174
+ "grad_norm": 1.396472692489624,
1175
+ "learning_rate": 2.7034634059185437e-07,
1176
+ "loss": 0.7903,
1177
+ "step": 770
1178
+ },
1179
+ {
1180
+ "epoch": 1.5848670756646217,
1181
+ "grad_norm": 0.7922792434692383,
1182
+ "learning_rate": 2.6245994900830257e-07,
1183
+ "loss": 0.7843,
1184
+ "step": 775
1185
+ },
1186
+ {
1187
+ "epoch": 1.5950920245398774,
1188
+ "grad_norm": 0.8896881341934204,
1189
+ "learning_rate": 2.546491547379619e-07,
1190
+ "loss": 0.787,
1191
+ "step": 780
1192
+ },
1193
+ {
1194
+ "epoch": 1.605316973415133,
1195
+ "grad_norm": 0.8732028007507324,
1196
+ "learning_rate": 2.469164435724212e-07,
1197
+ "loss": 0.7856,
1198
+ "step": 785
1199
+ },
1200
+ {
1201
+ "epoch": 1.6155419222903884,
1202
+ "grad_norm": 1.0021744966506958,
1203
+ "learning_rate": 2.3926427645325875e-07,
1204
+ "loss": 0.7867,
1205
+ "step": 790
1206
+ },
1207
+ {
1208
+ "epoch": 1.6257668711656441,
1209
+ "grad_norm": 1.1783545017242432,
1210
+ "learning_rate": 2.3169508868884453e-07,
1211
+ "loss": 0.7897,
1212
+ "step": 795
1213
+ },
1214
+ {
1215
+ "epoch": 1.6359918200408998,
1216
+ "grad_norm": 0.9119800329208374,
1217
+ "learning_rate": 2.2421128917930243e-07,
1218
+ "loss": 0.7845,
1219
+ "step": 800
1220
+ },
1221
+ {
1222
+ "epoch": 1.6359918200408998,
1223
+ "eval_accuracy": 0.62896,
1224
+ "eval_loss": 0.6847647428512573,
1225
+ "eval_macro_f1": 0.6281943240633717,
1226
+ "eval_precision": 0.6346364525627035,
1227
+ "eval_recall": 0.6323959922867678,
1228
+ "eval_runtime": 80.6105,
1229
+ "eval_samples_per_second": 1240.533,
1230
+ "eval_steps_per_second": 0.608,
1231
+ "step": 800
1232
+ },
1233
+ {
1234
+ "epoch": 1.6462167689161555,
1235
+ "grad_norm": 0.8903971314430237,
1236
+ "learning_rate": 2.1681525964987474e-07,
1237
+ "loss": 0.7824,
1238
+ "step": 805
1239
+ },
1240
+ {
1241
+ "epoch": 1.656441717791411,
1242
+ "grad_norm": 1.115395188331604,
1243
+ "learning_rate": 2.0950935389293656e-07,
1244
+ "loss": 0.7824,
1245
+ "step": 810
1246
+ },
1247
+ {
1248
+ "epoch": 1.6666666666666665,
1249
+ "grad_norm": 0.9636144638061523,
1250
+ "learning_rate": 2.022958970189001e-07,
1251
+ "loss": 0.7917,
1252
+ "step": 815
1253
+ },
1254
+ {
1255
+ "epoch": 1.6768916155419222,
1256
+ "grad_norm": 0.8787257075309753,
1257
+ "learning_rate": 1.9517718471624532e-07,
1258
+ "loss": 0.7869,
1259
+ "step": 820
1260
+ },
1261
+ {
1262
+ "epoch": 1.687116564417178,
1263
+ "grad_norm": 1.0157173871994019,
1264
+ "learning_rate": 1.88155482520916e-07,
1265
+ "loss": 0.7844,
1266
+ "step": 825
1267
+ },
1268
+ {
1269
+ "epoch": 1.6973415132924337,
1270
+ "grad_norm": 0.9504719972610474,
1271
+ "learning_rate": 1.812330250953107e-07,
1272
+ "loss": 0.7872,
1273
+ "step": 830
1274
+ },
1275
+ {
1276
+ "epoch": 1.7075664621676891,
1277
+ "grad_norm": 0.893625795841217,
1278
+ "learning_rate": 1.7441201551710016e-07,
1279
+ "loss": 0.7879,
1280
+ "step": 835
1281
+ },
1282
+ {
1283
+ "epoch": 1.7177914110429446,
1284
+ "grad_norm": 0.8460310101509094,
1285
+ "learning_rate": 1.6769462457809536e-07,
1286
+ "loss": 0.7853,
1287
+ "step": 840
1288
+ },
1289
+ {
1290
+ "epoch": 1.7280163599182004,
1291
+ "grad_norm": 0.9349818229675293,
1292
+ "learning_rate": 1.610829900933917e-07,
1293
+ "loss": 0.7862,
1294
+ "step": 845
1295
+ },
1296
+ {
1297
+ "epoch": 1.738241308793456,
1298
+ "grad_norm": 0.859866738319397,
1299
+ "learning_rate": 1.545792162210074e-07,
1300
+ "loss": 0.7836,
1301
+ "step": 850
1302
+ },
1303
+ {
1304
+ "epoch": 1.7484662576687118,
1305
+ "grad_norm": 1.0148438215255737,
1306
+ "learning_rate": 1.481853727922341e-07,
1307
+ "loss": 0.7859,
1308
+ "step": 855
1309
+ },
1310
+ {
1311
+ "epoch": 1.7586912065439673,
1312
+ "grad_norm": 0.8861204385757446,
1313
+ "learning_rate": 1.4190349465291035e-07,
1314
+ "loss": 0.7909,
1315
+ "step": 860
1316
+ },
1317
+ {
1318
+ "epoch": 1.7689161554192228,
1319
+ "grad_norm": 0.7679073214530945,
1320
+ "learning_rate": 1.3573558101583105e-07,
1321
+ "loss": 0.785,
1322
+ "step": 865
1323
+ },
1324
+ {
1325
+ "epoch": 1.7791411042944785,
1326
+ "grad_norm": 0.7364144325256348,
1327
+ "learning_rate": 1.2968359482449636e-07,
1328
+ "loss": 0.7824,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 1.7893660531697342,
1333
+ "grad_norm": 0.945924699306488,
1334
+ "learning_rate": 1.2374946212840288e-07,
1335
+ "loss": 0.7864,
1336
+ "step": 875
1337
+ },
1338
+ {
1339
+ "epoch": 1.79959100204499,
1340
+ "grad_norm": 1.1060514450073242,
1341
+ "learning_rate": 1.1793507147007714e-07,
1342
+ "loss": 0.7866,
1343
+ "step": 880
1344
+ },
1345
+ {
1346
+ "epoch": 1.8098159509202454,
1347
+ "grad_norm": 0.9230445623397827,
1348
+ "learning_rate": 1.1224227328404534e-07,
1349
+ "loss": 0.7895,
1350
+ "step": 885
1351
+ },
1352
+ {
1353
+ "epoch": 1.8200408997955009,
1354
+ "grad_norm": 0.9153196811676025,
1355
+ "learning_rate": 1.0667287930793151e-07,
1356
+ "loss": 0.7835,
1357
+ "step": 890
1358
+ },
1359
+ {
1360
+ "epoch": 1.8302658486707566,
1361
+ "grad_norm": 0.9513780474662781,
1362
+ "learning_rate": 1.0122866200586944e-07,
1363
+ "loss": 0.7846,
1364
+ "step": 895
1365
+ },
1366
+ {
1367
+ "epoch": 1.8404907975460123,
1368
+ "grad_norm": 0.8672247529029846,
1369
+ "learning_rate": 9.591135400441552e-08,
1370
+ "loss": 0.7839,
1371
+ "step": 900
1372
+ },
1373
+ {
1374
+ "epoch": 1.8404907975460123,
1375
+ "eval_accuracy": 0.63125,
1376
+ "eval_loss": 0.6845182776451111,
1377
+ "eval_macro_f1": 0.6309538076224105,
1378
+ "eval_precision": 0.6350446377333951,
1379
+ "eval_recall": 0.6339031903992685,
1380
+ "eval_runtime": 80.5646,
1381
+ "eval_samples_per_second": 1241.24,
1382
+ "eval_steps_per_second": 0.608,
1383
+ "step": 900
1384
+ },
1385
+ {
1386
+ "epoch": 1.850715746421268,
1387
+ "grad_norm": 1.2127219438552856,
1388
+ "learning_rate": 9.072264754113912e-08,
1389
+ "loss": 0.7876,
1390
+ "step": 905
1391
+ },
1392
+ {
1393
+ "epoch": 1.8609406952965235,
1394
+ "grad_norm": 0.875455379486084,
1395
+ "learning_rate": 8.566419392606544e-08,
1396
+ "loss": 0.787,
1397
+ "step": 910
1398
+ },
1399
+ {
1400
+ "epoch": 1.871165644171779,
1401
+ "grad_norm": 0.92503821849823,
1402
+ "learning_rate": 8.073760301614596e-08,
1403
+ "loss": 0.7834,
1404
+ "step": 915
1405
+ },
1406
+ {
1407
+ "epoch": 1.8813905930470347,
1408
+ "grad_norm": 1.1361068487167358,
1409
+ "learning_rate": 7.594444270291922e-08,
1410
+ "loss": 0.7821,
1411
+ "step": 920
1412
+ },
1413
+ {
1414
+ "epoch": 1.8916155419222904,
1415
+ "grad_norm": 1.1415101289749146,
1416
+ "learning_rate": 7.128623841352916e-08,
1417
+ "loss": 0.7877,
1418
+ "step": 925
1419
+ },
1420
+ {
1421
+ "epoch": 1.9018404907975461,
1422
+ "grad_norm": 0.9358757138252258,
1423
+ "learning_rate": 6.676447262525547e-08,
1424
+ "loss": 0.7867,
1425
+ "step": 930
1426
+ },
1427
+ {
1428
+ "epoch": 1.9120654396728016,
1429
+ "grad_norm": 0.912706732749939,
1430
+ "learning_rate": 6.238058439371479e-08,
1431
+ "loss": 0.7884,
1432
+ "step": 935
1433
+ },
1434
+ {
1435
+ "epoch": 1.9222903885480571,
1436
+ "grad_norm": 0.9449842572212219,
1437
+ "learning_rate": 5.813596889488009e-08,
1438
+ "loss": 0.7893,
1439
+ "step": 940
1440
+ },
1441
+ {
1442
+ "epoch": 1.9325153374233128,
1443
+ "grad_norm": 0.8449825048446655,
1444
+ "learning_rate": 5.403197698106432e-08,
1445
+ "loss": 0.7828,
1446
+ "step": 945
1447
+ },
1448
+ {
1449
+ "epoch": 1.9427402862985685,
1450
+ "grad_norm": 0.9307764768600464,
1451
+ "learning_rate": 5.0069914751010913e-08,
1452
+ "loss": 0.785,
1453
+ "step": 950
1454
+ },
1455
+ {
1456
+ "epoch": 1.9529652351738243,
1457
+ "grad_norm": 1.3704556226730347,
1458
+ "learning_rate": 4.625104313422673e-08,
1459
+ "loss": 0.7874,
1460
+ "step": 955
1461
+ },
1462
+ {
1463
+ "epoch": 1.9631901840490797,
1464
+ "grad_norm": 1.0163496732711792,
1465
+ "learning_rate": 4.257657748969046e-08,
1466
+ "loss": 0.7834,
1467
+ "step": 960
1468
+ },
1469
+ {
1470
+ "epoch": 1.9734151329243352,
1471
+ "grad_norm": 0.8112438321113586,
1472
+ "learning_rate": 3.904768721906304e-08,
1473
+ "loss": 0.7852,
1474
+ "step": 965
1475
+ },
1476
+ {
1477
+ "epoch": 1.983640081799591,
1478
+ "grad_norm": 0.885705828666687,
1479
+ "learning_rate": 3.566549539452529e-08,
1480
+ "loss": 0.7792,
1481
+ "step": 970
1482
+ },
1483
+ {
1484
+ "epoch": 1.9938650306748467,
1485
+ "grad_norm": 0.8692009449005127,
1486
+ "learning_rate": 3.243107840135878e-08,
1487
+ "loss": 0.7822,
1488
+ "step": 975
1489
+ },
1490
+ {
1491
+ "epoch": 2.0040899795501024,
1492
+ "grad_norm": 0.8909807205200195,
1493
+ "learning_rate": 2.9345465595385866e-08,
1494
+ "loss": 0.7826,
1495
+ "step": 980
1496
+ },
1497
+ {
1498
+ "epoch": 2.014314928425358,
1499
+ "grad_norm": 0.9065344333648682,
1500
+ "learning_rate": 2.6409638975375737e-08,
1501
+ "loss": 0.7849,
1502
+ "step": 985
1503
+ },
1504
+ {
1505
+ "epoch": 2.0245398773006134,
1506
+ "grad_norm": 0.8145809173583984,
1507
+ "learning_rate": 2.3624532870522962e-08,
1508
+ "loss": 0.7885,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 2.034764826175869,
1513
+ "grad_norm": 0.9461153149604797,
1514
+ "learning_rate": 2.0991033643096457e-08,
1515
+ "loss": 0.7853,
1516
+ "step": 995
1517
+ },
1518
+ {
1519
+ "epoch": 2.044989775051125,
1520
+ "grad_norm": 0.8470706343650818,
1521
+ "learning_rate": 1.8509979406353794e-08,
1522
+ "loss": 0.7881,
1523
+ "step": 1000
1524
+ },
1525
+ {
1526
+ "epoch": 2.044989775051125,
1527
+ "eval_accuracy": 0.63202,
1528
+ "eval_loss": 0.6844514012336731,
1529
+ "eval_macro_f1": 0.6318036560759084,
1530
+ "eval_precision": 0.6354113747156731,
1531
+ "eval_recall": 0.6344858797364747,
1532
+ "eval_runtime": 81.1838,
1533
+ "eval_samples_per_second": 1231.772,
1534
+ "eval_steps_per_second": 0.604,
1535
+ "step": 1000
1536
+ },
1537
+ {
1538
+ "epoch": 2.0552147239263805,
1539
+ "grad_norm": 0.8817445635795593,
1540
+ "learning_rate": 1.6182159757810897e-08,
1541
+ "loss": 0.7879,
1542
+ "step": 1005
1543
+ },
1544
+ {
1545
+ "epoch": 2.065439672801636,
1546
+ "grad_norm": 0.856109082698822,
1547
+ "learning_rate": 1.400831552795234e-08,
1548
+ "loss": 0.7868,
1549
+ "step": 1010
1550
+ },
1551
+ {
1552
+ "epoch": 2.0756646216768915,
1553
+ "grad_norm": 0.956066370010376,
1554
+ "learning_rate": 1.1989138544461375e-08,
1555
+ "loss": 0.7845,
1556
+ "step": 1015
1557
+ },
1558
+ {
1559
+ "epoch": 2.085889570552147,
1560
+ "grad_norm": 0.930978000164032,
1561
+ "learning_rate": 1.0125271412044666e-08,
1562
+ "loss": 0.7876,
1563
+ "step": 1020
1564
+ },
1565
+ {
1566
+ "epoch": 2.096114519427403,
1567
+ "grad_norm": 0.9799636602401733,
1568
+ "learning_rate": 8.417307307923615e-09,
1569
+ "loss": 0.7861,
1570
+ "step": 1025
1571
+ },
1572
+ {
1573
+ "epoch": 2.1063394683026586,
1574
+ "grad_norm": 0.9991019368171692,
1575
+ "learning_rate": 6.8657897930547435e-09,
1576
+ "loss": 0.7852,
1577
+ "step": 1030
1578
+ },
1579
+ {
1580
+ "epoch": 2.116564417177914,
1581
+ "grad_norm": 1.076750636100769,
1582
+ "learning_rate": 5.471212639141132e-09,
1583
+ "loss": 0.7789,
1584
+ "step": 1035
1585
+ },
1586
+ {
1587
+ "epoch": 2.1267893660531696,
1588
+ "grad_norm": 0.9805507063865662,
1589
+ "learning_rate": 4.23401967148912e-09,
1590
+ "loss": 0.7829,
1591
+ "step": 1040
1592
+ },
1593
+ {
1594
+ "epoch": 2.1370143149284253,
1595
+ "grad_norm": 0.7899750471115112,
1596
+ "learning_rate": 3.154604627760571e-09,
1597
+ "loss": 0.7839,
1598
+ "step": 1045
1599
+ },
1600
+ {
1601
+ "epoch": 2.147239263803681,
1602
+ "grad_norm": 1.1698967218399048,
1603
+ "learning_rate": 2.2333110326655526e-09,
1604
+ "loss": 0.7869,
1605
+ "step": 1050
1606
+ },
1607
+ {
1608
+ "epoch": 2.1574642126789367,
1609
+ "grad_norm": 0.9302964806556702,
1610
+ "learning_rate": 1.4704320886352873e-09,
1611
+ "loss": 0.7832,
1612
+ "step": 1055
1613
+ },
1614
+ {
1615
+ "epoch": 2.1676891615541924,
1616
+ "grad_norm": 1.057986855506897,
1617
+ "learning_rate": 8.662105825103517e-10,
1618
+ "loss": 0.7864,
1619
+ "step": 1060
1620
+ },
1621
+ {
1622
+ "epoch": 2.1779141104294477,
1623
+ "grad_norm": 1.0347933769226074,
1624
+ "learning_rate": 4.208388082733161e-10,
1625
+ "loss": 0.7822,
1626
+ "step": 1065
1627
+ },
1628
+ {
1629
+ "epoch": 2.1881390593047034,
1630
+ "grad_norm": 0.9827083945274353,
1631
+ "learning_rate": 1.3445850585130924e-10,
1632
+ "loss": 0.784,
1633
+ "step": 1070
1634
+ },
1635
+ {
1636
+ "epoch": 2.198364008179959,
1637
+ "grad_norm": 0.8463678956031799,
1638
+ "learning_rate": 7.160816007045767e-12,
1639
+ "loss": 0.7811,
1640
+ "step": 1075
1641
+ },
1642
+ {
1643
+ "epoch": 2.208588957055215,
1644
+ "grad_norm": 0.9141009449958801,
1645
+ "learning_rate": 9.999610137486667e-07,
1646
+ "loss": 0.7828,
1647
+ "step": 1080
1648
+ },
1649
+ {
1650
+ "epoch": 2.21881390593047,
1651
+ "grad_norm": 0.8992940783500671,
1652
+ "learning_rate": 9.997700753166407e-07,
1653
+ "loss": 0.7843,
1654
+ "step": 1085
1655
+ },
1656
+ {
1657
+ "epoch": 2.229038854805726,
1658
+ "grad_norm": 0.9198014140129089,
1659
+ "learning_rate": 9.99420084654225e-07,
1660
+ "loss": 0.7867,
1661
+ "step": 1090
1662
+ },
1663
+ {
1664
+ "epoch": 2.2392638036809815,
1665
+ "grad_norm": 0.841385006904602,
1666
+ "learning_rate": 9.98911153146231e-07,
1667
+ "loss": 0.7899,
1668
+ "step": 1095
1669
+ },
1670
+ {
1671
+ "epoch": 2.2494887525562373,
1672
+ "grad_norm": 0.9428244233131409,
1673
+ "learning_rate": 9.982434427605222e-07,
1674
+ "loss": 0.783,
1675
+ "step": 1100
1676
+ },
1677
+ {
1678
+ "epoch": 2.2494887525562373,
1679
+ "eval_accuracy": 0.63535,
1680
+ "eval_loss": 0.6841139197349548,
1681
+ "eval_macro_f1": 0.6353491904387377,
1682
+ "eval_precision": 0.6368108503242846,
1683
+ "eval_recall": 0.6367719631437929,
1684
+ "eval_runtime": 81.1976,
1685
+ "eval_samples_per_second": 1231.563,
1686
+ "eval_steps_per_second": 0.603,
1687
+ "step": 1100
1688
+ },
1689
+ {
1690
+ "epoch": 2.259713701431493,
1691
+ "grad_norm": 0.8474355936050415,
1692
+ "learning_rate": 9.974171659964687e-07,
1693
+ "loss": 0.7805,
1694
+ "step": 1105
1695
+ },
1696
+ {
1697
+ "epoch": 2.2699386503067487,
1698
+ "grad_norm": 0.8366284370422363,
1699
+ "learning_rate": 9.964325858173184e-07,
1700
+ "loss": 0.7821,
1701
+ "step": 1110
1702
+ },
1703
+ {
1704
+ "epoch": 2.280163599182004,
1705
+ "grad_norm": 1.102426290512085,
1706
+ "learning_rate": 9.952900155665089e-07,
1707
+ "loss": 0.7854,
1708
+ "step": 1115
1709
+ },
1710
+ {
1711
+ "epoch": 2.2903885480572597,
1712
+ "grad_norm": 0.8815932273864746,
1713
+ "learning_rate": 9.939898188679462e-07,
1714
+ "loss": 0.7835,
1715
+ "step": 1120
1716
+ },
1717
+ {
1718
+ "epoch": 2.3006134969325154,
1719
+ "grad_norm": 0.8016415238380432,
1720
+ "learning_rate": 9.925324095102806e-07,
1721
+ "loss": 0.7842,
1722
+ "step": 1125
1723
+ },
1724
+ {
1725
+ "epoch": 2.310838445807771,
1726
+ "grad_norm": 0.8805480599403381,
1727
+ "learning_rate": 9.909182513152177e-07,
1728
+ "loss": 0.7791,
1729
+ "step": 1130
1730
+ },
1731
+ {
1732
+ "epoch": 2.3210633946830264,
1733
+ "grad_norm": 0.9736661314964294,
1734
+ "learning_rate": 9.891478579899078e-07,
1735
+ "loss": 0.7825,
1736
+ "step": 1135
1737
+ },
1738
+ {
1739
+ "epoch": 2.331288343558282,
1740
+ "grad_norm": 0.8331109285354614,
1741
+ "learning_rate": 9.872217929634573e-07,
1742
+ "loss": 0.7852,
1743
+ "step": 1140
1744
+ },
1745
+ {
1746
+ "epoch": 2.341513292433538,
1747
+ "grad_norm": 0.8597177267074585,
1748
+ "learning_rate": 9.851406692076183e-07,
1749
+ "loss": 0.7817,
1750
+ "step": 1145
1751
+ },
1752
+ {
1753
+ "epoch": 2.3517382413087935,
1754
+ "grad_norm": 0.7928445339202881,
1755
+ "learning_rate": 9.829051490417071e-07,
1756
+ "loss": 0.7765,
1757
+ "step": 1150
1758
+ },
1759
+ {
1760
+ "epoch": 2.361963190184049,
1761
+ "grad_norm": 0.8488237857818604,
1762
+ "learning_rate": 9.80515943921824e-07,
1763
+ "loss": 0.7836,
1764
+ "step": 1155
1765
+ },
1766
+ {
1767
+ "epoch": 2.372188139059305,
1768
+ "grad_norm": 0.7608004212379456,
1769
+ "learning_rate": 9.77973814214429e-07,
1770
+ "loss": 0.7834,
1771
+ "step": 1160
1772
+ },
1773
+ {
1774
+ "epoch": 2.38241308793456,
1775
+ "grad_norm": 0.8542405962944031,
1776
+ "learning_rate": 9.752795689543563e-07,
1777
+ "loss": 0.7777,
1778
+ "step": 1165
1779
+ },
1780
+ {
1781
+ "epoch": 2.392638036809816,
1782
+ "grad_norm": 0.8797897100448608,
1783
+ "learning_rate": 9.72434065587337e-07,
1784
+ "loss": 0.7823,
1785
+ "step": 1170
1786
+ },
1787
+ {
1788
+ "epoch": 2.4028629856850716,
1789
+ "grad_norm": 0.9687849283218384,
1790
+ "learning_rate": 9.69438209697118e-07,
1791
+ "loss": 0.7754,
1792
+ "step": 1175
1793
+ },
1794
+ {
1795
+ "epoch": 2.4130879345603273,
1796
+ "grad_norm": 0.9111893773078918,
1797
+ "learning_rate": 9.662929547172574e-07,
1798
+ "loss": 0.7806,
1799
+ "step": 1180
1800
+ },
1801
+ {
1802
+ "epoch": 2.4233128834355826,
1803
+ "grad_norm": 1.0323760509490967,
1804
+ "learning_rate": 9.629993016276944e-07,
1805
+ "loss": 0.7801,
1806
+ "step": 1185
1807
+ },
1808
+ {
1809
+ "epoch": 2.4335378323108383,
1810
+ "grad_norm": 0.79954594373703,
1811
+ "learning_rate": 9.595582986361872e-07,
1812
+ "loss": 0.7781,
1813
+ "step": 1190
1814
+ },
1815
+ {
1816
+ "epoch": 2.443762781186094,
1817
+ "grad_norm": 0.7106928825378418,
1818
+ "learning_rate": 9.559710408447184e-07,
1819
+ "loss": 0.7788,
1820
+ "step": 1195
1821
+ },
1822
+ {
1823
+ "epoch": 2.4539877300613497,
1824
+ "grad_norm": 0.77292400598526,
1825
+ "learning_rate": 9.522386699009795e-07,
1826
+ "loss": 0.7827,
1827
+ "step": 1200
1828
+ },
1829
+ {
1830
+ "epoch": 2.4539877300613497,
1831
+ "eval_accuracy": 0.645,
1832
+ "eval_loss": 0.6828427314758301,
1833
+ "eval_macro_f1": 0.6440359919423964,
1834
+ "eval_precision": 0.6441481409802297,
1835
+ "eval_recall": 0.6439695264773649,
1836
+ "eval_runtime": 81.1775,
1837
+ "eval_samples_per_second": 1231.869,
1838
+ "eval_steps_per_second": 0.604,
1839
+ "step": 1200
1840
+ }
1841
+ ],
1842
+ "logging_steps": 5,
1843
+ "max_steps": 1956,
1844
+ "num_input_tokens_seen": 0,
1845
+ "num_train_epochs": 4,
1846
+ "save_steps": 100,
1847
+ "stateful_callbacks": {
1848
+ "EarlyStoppingCallback": {
1849
+ "args": {
1850
+ "early_stopping_patience": 3,
1851
+ "early_stopping_threshold": 0.0
1852
+ },
1853
+ "attributes": {
1854
+ "early_stopping_patience_counter": 0
1855
+ }
1856
+ },
1857
+ "TrainerControl": {
1858
+ "args": {
1859
+ "should_epoch_stop": false,
1860
+ "should_evaluate": false,
1861
+ "should_log": false,
1862
+ "should_save": true,
1863
+ "should_training_stop": false
1864
+ },
1865
+ "attributes": {}
1866
+ }
1867
+ },
1868
+ "total_flos": 3.229235653528781e+17,
1869
+ "train_batch_size": 1024,
1870
+ "trial_name": null,
1871
+ "trial_params": null
1872
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2212b57ced9fbe3464bd23d4ac0f4d8e75b4b021597f160058a4a19990d9f0d3
3
+ size 5841
graphcodebert-swa-from-epoch-1/checkpoint-1200/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1300/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.3,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": 0.3,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.3,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "absolute",
24
+ "problem_type": "single_label_classification",
25
+ "transformers_version": "4.56.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 50265
29
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1300/config_hyperparams.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_config": {
3
+ "model_name": "/kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1",
4
+ "num_epochs": 4,
5
+ "batch_size": 1024,
6
+ "learning_rate": 1e-06,
7
+ "max_length": 512,
8
+ "num_labels": 2,
9
+ "loss_type": "r-drop",
10
+ "focal_alpha": 1.0,
11
+ "focal_gamma": 2.0,
12
+ "r_drop_alpha": 10.0,
13
+ "infonce_temperature": 0.07,
14
+ "infonce_weight": 0.5,
15
+ "label_smoothing": 0.5,
16
+ "adversarial_epsilon": 0.5,
17
+ "use_swa": true,
18
+ "swa_start_epoch": 1,
19
+ "swa_lr": 1e-05,
20
+ "data_augmentation": true,
21
+ "aug_rename_prob": 0.8,
22
+ "aug_format_prob": 0.8,
23
+ "freeze_base": true,
24
+ "seed": 42,
25
+ "use_wandb": false,
26
+ "mixup_alpha": 1.0,
27
+ "low_pass_keep_ratio": 0.5,
28
+ "freq_consistency_weight": 0.5
29
+ },
30
+ "training_arguments": {
31
+ "output_dir": "graphcodebert-swa-from-epoch-1/",
32
+ "num_train_epochs": 4,
33
+ "per_device_train_batch_size": 1024,
34
+ "per_device_eval_batch_size": 2048,
35
+ "learning_rate": 1e-06,
36
+ "warmup_steps": 195,
37
+ "weight_decay": 0.1,
38
+ "logging_steps": 5,
39
+ "eval_steps": 100,
40
+ "save_steps": 100,
41
+ "metric_for_best_model": "macro_f1",
42
+ "greater_is_better": true,
43
+ "save_total_limit": 5,
44
+ "fp16": true,
45
+ "seed": 42
46
+ },
47
+ "training_state": {
48
+ "global_step": 1300,
49
+ "epoch": 2.658486707566462,
50
+ "best_metric": 0.6440359919423964,
51
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1200"
52
+ }
53
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1300/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1300/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:765035226f2c33b7f7c9c48302463744e4e2b9f073e8acc72d38881d5a154262
3
+ size 498612824
graphcodebert-swa-from-epoch-1/checkpoint-1300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a5156c308c91edf59dabbe0cb10a901ca02179d2cce567f64adedc884617a5
3
+ size 4741923
graphcodebert-swa-from-epoch-1/checkpoint-1300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ad1f5bd03c5b693ac65b5330d044d4256afb982243b5e689487a4d29ff7884
3
+ size 14709
graphcodebert-swa-from-epoch-1/checkpoint-1300/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:317914e1b0b7e57d42f0fa6759aa19d9a30f1d604cc5192b2404476b6f3f4a62
3
+ size 1383
graphcodebert-swa-from-epoch-1/checkpoint-1300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e896b20a477c5286da5432db1aa2eb0570278dea808738954c64294bfba404
3
+ size 1465
graphcodebert-swa-from-epoch-1/checkpoint-1300/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1300/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1300/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1300/trainer_state.json ADDED
@@ -0,0 +1,2024 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1200,
3
+ "best_metric": 0.6440359919423964,
4
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1200",
5
+ "epoch": 2.658486707566462,
6
+ "eval_steps": 100,
7
+ "global_step": 1300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010224948875255624,
14
+ "grad_norm": 2.4707133769989014,
15
+ "learning_rate": 2.0512820512820512e-08,
16
+ "loss": 0.8431,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.02044989775051125,
21
+ "grad_norm": 3.114851951599121,
22
+ "learning_rate": 4.615384615384615e-08,
23
+ "loss": 0.844,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.03067484662576687,
28
+ "grad_norm": 2.2256007194519043,
29
+ "learning_rate": 7.179487179487178e-08,
30
+ "loss": 0.847,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.0408997955010225,
35
+ "grad_norm": 2.5343081951141357,
36
+ "learning_rate": 9.743589743589743e-08,
37
+ "loss": 0.8492,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.05112474437627812,
42
+ "grad_norm": 3.1964163780212402,
43
+ "learning_rate": 1.2307692307692308e-07,
44
+ "loss": 0.8475,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.06134969325153374,
49
+ "grad_norm": 2.0466485023498535,
50
+ "learning_rate": 1.4871794871794872e-07,
51
+ "loss": 0.8445,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.07157464212678936,
56
+ "grad_norm": 2.164569139480591,
57
+ "learning_rate": 1.7435897435897435e-07,
58
+ "loss": 0.8452,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.081799591002045,
63
+ "grad_norm": 2.56343150138855,
64
+ "learning_rate": 2e-07,
65
+ "loss": 0.8473,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.09202453987730061,
70
+ "grad_norm": 2.5742437839508057,
71
+ "learning_rate": 2.2564102564102563e-07,
72
+ "loss": 0.848,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.10224948875255624,
77
+ "grad_norm": 2.587480306625366,
78
+ "learning_rate": 2.5128205128205126e-07,
79
+ "loss": 0.8409,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.11247443762781185,
84
+ "grad_norm": 2.5737764835357666,
85
+ "learning_rate": 2.7692307692307693e-07,
86
+ "loss": 0.8471,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.12269938650306748,
91
+ "grad_norm": 3.044358730316162,
92
+ "learning_rate": 3.0256410256410254e-07,
93
+ "loss": 0.8448,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.1329243353783231,
98
+ "grad_norm": 2.326373815536499,
99
+ "learning_rate": 3.282051282051282e-07,
100
+ "loss": 0.8517,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.14314928425357873,
105
+ "grad_norm": 2.267547607421875,
106
+ "learning_rate": 3.5384615384615386e-07,
107
+ "loss": 0.8387,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.15337423312883436,
112
+ "grad_norm": 2.609232187271118,
113
+ "learning_rate": 3.7948717948717947e-07,
114
+ "loss": 0.841,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.16359918200409,
119
+ "grad_norm": 2.9532523155212402,
120
+ "learning_rate": 4.0512820512820514e-07,
121
+ "loss": 0.8509,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.1738241308793456,
126
+ "grad_norm": 3.002154588699341,
127
+ "learning_rate": 4.307692307692308e-07,
128
+ "loss": 0.8482,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.18404907975460122,
133
+ "grad_norm": 2.701613187789917,
134
+ "learning_rate": 4.5641025641025636e-07,
135
+ "loss": 0.8422,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.19427402862985685,
140
+ "grad_norm": 2.7430365085601807,
141
+ "learning_rate": 4.82051282051282e-07,
142
+ "loss": 0.846,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.20449897750511248,
147
+ "grad_norm": 2.8101418018341064,
148
+ "learning_rate": 5.076923076923076e-07,
149
+ "loss": 0.8444,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.20449897750511248,
154
+ "eval_accuracy": 0.52033,
155
+ "eval_loss": 0.6922348141670227,
156
+ "eval_macro_f1": 0.4427650399783254,
157
+ "eval_precision": 0.6036606007378691,
158
+ "eval_recall": 0.5386742448919869,
159
+ "eval_runtime": 80.6812,
160
+ "eval_samples_per_second": 1239.446,
161
+ "eval_steps_per_second": 0.607,
162
+ "step": 100
163
+ },
164
+ {
165
+ "epoch": 0.2147239263803681,
166
+ "grad_norm": 2.5835089683532715,
167
+ "learning_rate": 5.333333333333333e-07,
168
+ "loss": 0.8437,
169
+ "step": 105
170
+ },
171
+ {
172
+ "epoch": 0.2249488752556237,
173
+ "grad_norm": 2.7237253189086914,
174
+ "learning_rate": 5.58974358974359e-07,
175
+ "loss": 0.8431,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 0.23517382413087934,
180
+ "grad_norm": 2.4648072719573975,
181
+ "learning_rate": 5.846153846153847e-07,
182
+ "loss": 0.8399,
183
+ "step": 115
184
+ },
185
+ {
186
+ "epoch": 0.24539877300613497,
187
+ "grad_norm": 2.7011852264404297,
188
+ "learning_rate": 6.102564102564103e-07,
189
+ "loss": 0.8409,
190
+ "step": 120
191
+ },
192
+ {
193
+ "epoch": 0.2556237218813906,
194
+ "grad_norm": 2.3170969486236572,
195
+ "learning_rate": 6.358974358974358e-07,
196
+ "loss": 0.8361,
197
+ "step": 125
198
+ },
199
+ {
200
+ "epoch": 0.2658486707566462,
201
+ "grad_norm": 2.517194986343384,
202
+ "learning_rate": 6.615384615384615e-07,
203
+ "loss": 0.839,
204
+ "step": 130
205
+ },
206
+ {
207
+ "epoch": 0.27607361963190186,
208
+ "grad_norm": 2.5092124938964844,
209
+ "learning_rate": 6.871794871794871e-07,
210
+ "loss": 0.8438,
211
+ "step": 135
212
+ },
213
+ {
214
+ "epoch": 0.28629856850715746,
215
+ "grad_norm": 2.3993237018585205,
216
+ "learning_rate": 7.128205128205128e-07,
217
+ "loss": 0.8349,
218
+ "step": 140
219
+ },
220
+ {
221
+ "epoch": 0.2965235173824131,
222
+ "grad_norm": 2.1388165950775146,
223
+ "learning_rate": 7.384615384615384e-07,
224
+ "loss": 0.8363,
225
+ "step": 145
226
+ },
227
+ {
228
+ "epoch": 0.3067484662576687,
229
+ "grad_norm": 1.8425891399383545,
230
+ "learning_rate": 7.64102564102564e-07,
231
+ "loss": 0.8325,
232
+ "step": 150
233
+ },
234
+ {
235
+ "epoch": 0.3169734151329243,
236
+ "grad_norm": 1.8665552139282227,
237
+ "learning_rate": 7.897435897435897e-07,
238
+ "loss": 0.835,
239
+ "step": 155
240
+ },
241
+ {
242
+ "epoch": 0.32719836400818,
243
+ "grad_norm": 1.8765455484390259,
244
+ "learning_rate": 8.153846153846154e-07,
245
+ "loss": 0.8328,
246
+ "step": 160
247
+ },
248
+ {
249
+ "epoch": 0.3374233128834356,
250
+ "grad_norm": 2.640779495239258,
251
+ "learning_rate": 8.41025641025641e-07,
252
+ "loss": 0.8388,
253
+ "step": 165
254
+ },
255
+ {
256
+ "epoch": 0.3476482617586912,
257
+ "grad_norm": 2.174116373062134,
258
+ "learning_rate": 8.666666666666667e-07,
259
+ "loss": 0.8336,
260
+ "step": 170
261
+ },
262
+ {
263
+ "epoch": 0.35787321063394684,
264
+ "grad_norm": 1.8411178588867188,
265
+ "learning_rate": 8.923076923076923e-07,
266
+ "loss": 0.8384,
267
+ "step": 175
268
+ },
269
+ {
270
+ "epoch": 0.36809815950920244,
271
+ "grad_norm": 2.3652143478393555,
272
+ "learning_rate": 9.179487179487179e-07,
273
+ "loss": 0.8318,
274
+ "step": 180
275
+ },
276
+ {
277
+ "epoch": 0.3783231083844581,
278
+ "grad_norm": 1.9870903491973877,
279
+ "learning_rate": 9.435897435897435e-07,
280
+ "loss": 0.8306,
281
+ "step": 185
282
+ },
283
+ {
284
+ "epoch": 0.3885480572597137,
285
+ "grad_norm": 2.458887815475464,
286
+ "learning_rate": 9.692307692307691e-07,
287
+ "loss": 0.8342,
288
+ "step": 190
289
+ },
290
+ {
291
+ "epoch": 0.3987730061349693,
292
+ "grad_norm": 1.9105890989303589,
293
+ "learning_rate": 9.948717948717949e-07,
294
+ "loss": 0.8301,
295
+ "step": 195
296
+ },
297
+ {
298
+ "epoch": 0.40899795501022496,
299
+ "grad_norm": 2.04896879196167,
300
+ "learning_rate": 9.999490793845076e-07,
301
+ "loss": 0.8291,
302
+ "step": 200
303
+ },
304
+ {
305
+ "epoch": 0.40899795501022496,
306
+ "eval_accuracy": 0.52697,
307
+ "eval_loss": 0.6913915872573853,
308
+ "eval_macro_f1": 0.4511625248903547,
309
+ "eval_precision": 0.6198512746424523,
310
+ "eval_recall": 0.5452618609595298,
311
+ "eval_runtime": 80.6395,
312
+ "eval_samples_per_second": 1240.088,
313
+ "eval_steps_per_second": 0.608,
314
+ "step": 200
315
+ },
316
+ {
317
+ "epoch": 0.41922290388548056,
318
+ "grad_norm": 2.394630193710327,
319
+ "learning_rate": 9.997422321595486e-07,
320
+ "loss": 0.8311,
321
+ "step": 205
322
+ },
323
+ {
324
+ "epoch": 0.4294478527607362,
325
+ "grad_norm": 1.7013665437698364,
326
+ "learning_rate": 9.993763415653074e-07,
327
+ "loss": 0.8264,
328
+ "step": 210
329
+ },
330
+ {
331
+ "epoch": 0.4396728016359918,
332
+ "grad_norm": 2.1158103942871094,
333
+ "learning_rate": 9.988515240467613e-07,
334
+ "loss": 0.8262,
335
+ "step": 215
336
+ },
337
+ {
338
+ "epoch": 0.4498977505112474,
339
+ "grad_norm": 1.5985370874404907,
340
+ "learning_rate": 9.981679466275095e-07,
341
+ "loss": 0.8296,
342
+ "step": 220
343
+ },
344
+ {
345
+ "epoch": 0.4601226993865031,
346
+ "grad_norm": 2.0426042079925537,
347
+ "learning_rate": 9.973258268566182e-07,
348
+ "loss": 0.8233,
349
+ "step": 225
350
+ },
351
+ {
352
+ "epoch": 0.4703476482617587,
353
+ "grad_norm": 1.7411834001541138,
354
+ "learning_rate": 9.963254327393853e-07,
355
+ "loss": 0.8269,
356
+ "step": 230
357
+ },
358
+ {
359
+ "epoch": 0.48057259713701433,
360
+ "grad_norm": 2.1182405948638916,
361
+ "learning_rate": 9.95167082652047e-07,
362
+ "loss": 0.8247,
363
+ "step": 235
364
+ },
365
+ {
366
+ "epoch": 0.49079754601226994,
367
+ "grad_norm": 2.0239953994750977,
368
+ "learning_rate": 9.938511452404547e-07,
369
+ "loss": 0.8308,
370
+ "step": 240
371
+ },
372
+ {
373
+ "epoch": 0.5010224948875256,
374
+ "grad_norm": 2.366060495376587,
375
+ "learning_rate": 9.923780393027534e-07,
376
+ "loss": 0.8205,
377
+ "step": 245
378
+ },
379
+ {
380
+ "epoch": 0.5112474437627812,
381
+ "grad_norm": 1.848169207572937,
382
+ "learning_rate": 9.907482336560982e-07,
383
+ "loss": 0.825,
384
+ "step": 250
385
+ },
386
+ {
387
+ "epoch": 0.5214723926380368,
388
+ "grad_norm": 1.8216668367385864,
389
+ "learning_rate": 9.889622469874535e-07,
390
+ "loss": 0.8271,
391
+ "step": 255
392
+ },
393
+ {
394
+ "epoch": 0.5316973415132924,
395
+ "grad_norm": 1.507730484008789,
396
+ "learning_rate": 9.8702064768852e-07,
397
+ "loss": 0.8147,
398
+ "step": 260
399
+ },
400
+ {
401
+ "epoch": 0.5419222903885481,
402
+ "grad_norm": 1.7608263492584229,
403
+ "learning_rate": 9.849240536748438e-07,
404
+ "loss": 0.8221,
405
+ "step": 265
406
+ },
407
+ {
408
+ "epoch": 0.5521472392638037,
409
+ "grad_norm": 2.203326940536499,
410
+ "learning_rate": 9.826731321891641e-07,
411
+ "loss": 0.8292,
412
+ "step": 270
413
+ },
414
+ {
415
+ "epoch": 0.5623721881390593,
416
+ "grad_norm": 1.9529740810394287,
417
+ "learning_rate": 9.802685995890632e-07,
418
+ "loss": 0.8228,
419
+ "step": 275
420
+ },
421
+ {
422
+ "epoch": 0.5725971370143149,
423
+ "grad_norm": 1.6214399337768555,
424
+ "learning_rate": 9.777112211189841e-07,
425
+ "loss": 0.8149,
426
+ "step": 280
427
+ },
428
+ {
429
+ "epoch": 0.5828220858895705,
430
+ "grad_norm": 2.07482647895813,
431
+ "learning_rate": 9.750018106666924e-07,
432
+ "loss": 0.8143,
433
+ "step": 285
434
+ },
435
+ {
436
+ "epoch": 0.5930470347648262,
437
+ "grad_norm": 1.7083203792572021,
438
+ "learning_rate": 9.721412305042538e-07,
439
+ "loss": 0.8188,
440
+ "step": 290
441
+ },
442
+ {
443
+ "epoch": 0.6032719836400818,
444
+ "grad_norm": 2.0022943019866943,
445
+ "learning_rate": 9.69130391013617e-07,
446
+ "loss": 0.8195,
447
+ "step": 295
448
+ },
449
+ {
450
+ "epoch": 0.6134969325153374,
451
+ "grad_norm": 1.5799461603164673,
452
+ "learning_rate": 9.659702503968834e-07,
453
+ "loss": 0.8146,
454
+ "step": 300
455
+ },
456
+ {
457
+ "epoch": 0.6134969325153374,
458
+ "eval_accuracy": 0.55052,
459
+ "eval_loss": 0.6896406412124634,
460
+ "eval_macro_f1": 0.49869783315905847,
461
+ "eval_precision": 0.6264643684302231,
462
+ "eval_recall": 0.5665461014402418,
463
+ "eval_runtime": 80.6145,
464
+ "eval_samples_per_second": 1240.472,
465
+ "eval_steps_per_second": 0.608,
466
+ "step": 300
467
+ },
468
+ {
469
+ "epoch": 0.623721881390593,
470
+ "grad_norm": 1.9373347759246826,
471
+ "learning_rate": 9.626618143713586e-07,
472
+ "loss": 0.8166,
473
+ "step": 305
474
+ },
475
+ {
476
+ "epoch": 0.6339468302658486,
477
+ "grad_norm": 1.6276922225952148,
478
+ "learning_rate": 9.592061358494813e-07,
479
+ "loss": 0.8176,
480
+ "step": 310
481
+ },
482
+ {
483
+ "epoch": 0.6441717791411042,
484
+ "grad_norm": 1.9373250007629395,
485
+ "learning_rate": 9.556043146037337e-07,
486
+ "loss": 0.8168,
487
+ "step": 315
488
+ },
489
+ {
490
+ "epoch": 0.65439672801636,
491
+ "grad_norm": 1.320465087890625,
492
+ "learning_rate": 9.518574969166391e-07,
493
+ "loss": 0.8101,
494
+ "step": 320
495
+ },
496
+ {
497
+ "epoch": 0.6646216768916156,
498
+ "grad_norm": 1.8596330881118774,
499
+ "learning_rate": 9.47966875215954e-07,
500
+ "loss": 0.8167,
501
+ "step": 325
502
+ },
503
+ {
504
+ "epoch": 0.6748466257668712,
505
+ "grad_norm": 1.304662823677063,
506
+ "learning_rate": 9.439336876951793e-07,
507
+ "loss": 0.815,
508
+ "step": 330
509
+ },
510
+ {
511
+ "epoch": 0.6850715746421268,
512
+ "grad_norm": 1.8063029050827026,
513
+ "learning_rate": 9.397592179195033e-07,
514
+ "loss": 0.8121,
515
+ "step": 335
516
+ },
517
+ {
518
+ "epoch": 0.6952965235173824,
519
+ "grad_norm": 1.7432739734649658,
520
+ "learning_rate": 9.354447944173059e-07,
521
+ "loss": 0.8104,
522
+ "step": 340
523
+ },
524
+ {
525
+ "epoch": 0.7055214723926381,
526
+ "grad_norm": 1.4523797035217285,
527
+ "learning_rate": 9.309917902573533e-07,
528
+ "loss": 0.8098,
529
+ "step": 345
530
+ },
531
+ {
532
+ "epoch": 0.7157464212678937,
533
+ "grad_norm": 1.681409478187561,
534
+ "learning_rate": 9.264016226118188e-07,
535
+ "loss": 0.8107,
536
+ "step": 350
537
+ },
538
+ {
539
+ "epoch": 0.7259713701431493,
540
+ "grad_norm": 1.5168694257736206,
541
+ "learning_rate": 9.216757523052652e-07,
542
+ "loss": 0.8085,
543
+ "step": 355
544
+ },
545
+ {
546
+ "epoch": 0.7361963190184049,
547
+ "grad_norm": 1.2200194597244263,
548
+ "learning_rate": 9.168156833497371e-07,
549
+ "loss": 0.8109,
550
+ "step": 360
551
+ },
552
+ {
553
+ "epoch": 0.7464212678936605,
554
+ "grad_norm": 1.2745580673217773,
555
+ "learning_rate": 9.118229624661078e-07,
556
+ "loss": 0.8096,
557
+ "step": 365
558
+ },
559
+ {
560
+ "epoch": 0.7566462167689162,
561
+ "grad_norm": 1.8339142799377441,
562
+ "learning_rate": 9.066991785918333e-07,
563
+ "loss": 0.808,
564
+ "step": 370
565
+ },
566
+ {
567
+ "epoch": 0.7668711656441718,
568
+ "grad_norm": 1.2315114736557007,
569
+ "learning_rate": 9.01445962375273e-07,
570
+ "loss": 0.805,
571
+ "step": 375
572
+ },
573
+ {
574
+ "epoch": 0.7770961145194274,
575
+ "grad_norm": 1.3081412315368652,
576
+ "learning_rate": 8.960649856567333e-07,
577
+ "loss": 0.8066,
578
+ "step": 380
579
+ },
580
+ {
581
+ "epoch": 0.787321063394683,
582
+ "grad_norm": 1.5145998001098633,
583
+ "learning_rate": 8.90557960936404e-07,
584
+ "loss": 0.8028,
585
+ "step": 385
586
+ },
587
+ {
588
+ "epoch": 0.7975460122699386,
589
+ "grad_norm": 1.5990959405899048,
590
+ "learning_rate": 8.84926640829353e-07,
591
+ "loss": 0.8035,
592
+ "step": 390
593
+ },
594
+ {
595
+ "epoch": 0.8077709611451943,
596
+ "grad_norm": 1.2120558023452759,
597
+ "learning_rate": 8.79172817507756e-07,
598
+ "loss": 0.802,
599
+ "step": 395
600
+ },
601
+ {
602
+ "epoch": 0.8179959100204499,
603
+ "grad_norm": 1.5799622535705566,
604
+ "learning_rate": 8.73298322130535e-07,
605
+ "loss": 0.8037,
606
+ "step": 400
607
+ },
608
+ {
609
+ "epoch": 0.8179959100204499,
610
+ "eval_accuracy": 0.58537,
611
+ "eval_loss": 0.6877263784408569,
612
+ "eval_macro_f1": 0.5630337315451738,
613
+ "eval_precision": 0.628845494567806,
614
+ "eval_recall": 0.5970616303474306,
615
+ "eval_runtime": 81.293,
616
+ "eval_samples_per_second": 1230.118,
617
+ "eval_steps_per_second": 0.603,
618
+ "step": 400
619
+ },
620
+ {
621
+ "epoch": 0.8282208588957055,
622
+ "grad_norm": 1.3475037813186646,
623
+ "learning_rate": 8.673050242605921e-07,
624
+ "loss": 0.8067,
625
+ "step": 405
626
+ },
627
+ {
628
+ "epoch": 0.8384458077709611,
629
+ "grad_norm": 1.2836309671401978,
630
+ "learning_rate": 8.611948312698179e-07,
631
+ "loss": 0.7996,
632
+ "step": 410
633
+ },
634
+ {
635
+ "epoch": 0.8486707566462167,
636
+ "grad_norm": 1.460316777229309,
637
+ "learning_rate": 8.5496968773207e-07,
638
+ "loss": 0.802,
639
+ "step": 415
640
+ },
641
+ {
642
+ "epoch": 0.8588957055214724,
643
+ "grad_norm": 1.33119797706604,
644
+ "learning_rate": 8.486315748043109e-07,
645
+ "loss": 0.798,
646
+ "step": 420
647
+ },
648
+ {
649
+ "epoch": 0.869120654396728,
650
+ "grad_norm": 1.9951454401016235,
651
+ "learning_rate": 8.42182509596102e-07,
652
+ "loss": 0.8013,
653
+ "step": 425
654
+ },
655
+ {
656
+ "epoch": 0.8793456032719836,
657
+ "grad_norm": 1.2590746879577637,
658
+ "learning_rate": 8.356245445276584e-07,
659
+ "loss": 0.7963,
660
+ "step": 430
661
+ },
662
+ {
663
+ "epoch": 0.8895705521472392,
664
+ "grad_norm": 1.1192667484283447,
665
+ "learning_rate": 8.28959766676663e-07,
666
+ "loss": 0.8004,
667
+ "step": 435
668
+ },
669
+ {
670
+ "epoch": 0.8997955010224948,
671
+ "grad_norm": 1.1180275678634644,
672
+ "learning_rate": 8.221902971140535e-07,
673
+ "loss": 0.8041,
674
+ "step": 440
675
+ },
676
+ {
677
+ "epoch": 0.9100204498977505,
678
+ "grad_norm": 1.1210858821868896,
679
+ "learning_rate": 8.153182902289897e-07,
680
+ "loss": 0.7991,
681
+ "step": 445
682
+ },
683
+ {
684
+ "epoch": 0.9202453987730062,
685
+ "grad_norm": 1.1266220808029175,
686
+ "learning_rate": 8.083459330432164e-07,
687
+ "loss": 0.8002,
688
+ "step": 450
689
+ },
690
+ {
691
+ "epoch": 0.9304703476482618,
692
+ "grad_norm": 1.0373694896697998,
693
+ "learning_rate": 8.012754445150434e-07,
694
+ "loss": 0.7974,
695
+ "step": 455
696
+ },
697
+ {
698
+ "epoch": 0.9406952965235174,
699
+ "grad_norm": 1.2223235368728638,
700
+ "learning_rate": 7.941090748331589e-07,
701
+ "loss": 0.8001,
702
+ "step": 460
703
+ },
704
+ {
705
+ "epoch": 0.950920245398773,
706
+ "grad_norm": 1.4549195766448975,
707
+ "learning_rate": 7.868491047005065e-07,
708
+ "loss": 0.7993,
709
+ "step": 465
710
+ },
711
+ {
712
+ "epoch": 0.9611451942740287,
713
+ "grad_norm": 1.3064852952957153,
714
+ "learning_rate": 7.794978446084483e-07,
715
+ "loss": 0.8006,
716
+ "step": 470
717
+ },
718
+ {
719
+ "epoch": 0.9713701431492843,
720
+ "grad_norm": 1.2408719062805176,
721
+ "learning_rate": 7.720576341014498e-07,
722
+ "loss": 0.7983,
723
+ "step": 475
724
+ },
725
+ {
726
+ "epoch": 0.9815950920245399,
727
+ "grad_norm": 1.2148370742797852,
728
+ "learning_rate": 7.645308410325187e-07,
729
+ "loss": 0.7959,
730
+ "step": 480
731
+ },
732
+ {
733
+ "epoch": 0.9918200408997955,
734
+ "grad_norm": 1.0927603244781494,
735
+ "learning_rate": 7.569198608096317e-07,
736
+ "loss": 0.7978,
737
+ "step": 485
738
+ },
739
+ {
740
+ "SWA": "started",
741
+ "epoch": 1.0,
742
+ "step": 489
743
+ },
744
+ {
745
+ "epoch": 1.0020449897750512,
746
+ "grad_norm": 1.245108723640442,
747
+ "learning_rate": 7.492271156333967e-07,
748
+ "loss": 0.7965,
749
+ "step": 490
750
+ },
751
+ {
752
+ "epoch": 1.0122699386503067,
753
+ "grad_norm": 1.3393553495407104,
754
+ "learning_rate": 7.414550537261828e-07,
755
+ "loss": 0.795,
756
+ "step": 495
757
+ },
758
+ {
759
+ "epoch": 1.0224948875255624,
760
+ "grad_norm": 1.2823072671890259,
761
+ "learning_rate": 7.336061485529738e-07,
762
+ "loss": 0.8014,
763
+ "step": 500
764
+ },
765
+ {
766
+ "epoch": 1.0224948875255624,
767
+ "eval_accuracy": 0.60723,
768
+ "eval_loss": 0.6864892244338989,
769
+ "eval_macro_f1": 0.5966241921587988,
770
+ "eval_precision": 0.6341761761282843,
771
+ "eval_recall": 0.6160142746967282,
772
+ "eval_runtime": 81.931,
773
+ "eval_samples_per_second": 1220.539,
774
+ "eval_steps_per_second": 0.598,
775
+ "step": 500
776
+ },
777
+ {
778
+ "epoch": 1.032719836400818,
779
+ "grad_norm": 1.1278107166290283,
780
+ "learning_rate": 7.256828980341846e-07,
781
+ "loss": 0.7977,
782
+ "step": 505
783
+ },
784
+ {
785
+ "epoch": 1.0429447852760736,
786
+ "grad_norm": 1.110093355178833,
787
+ "learning_rate": 7.176878237506965e-07,
788
+ "loss": 0.7954,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 1.0531697341513293,
793
+ "grad_norm": 1.2248748540878296,
794
+ "learning_rate": 7.096234701413617e-07,
795
+ "loss": 0.7957,
796
+ "step": 515
797
+ },
798
+ {
799
+ "epoch": 1.0633946830265848,
800
+ "grad_norm": 1.2420642375946045,
801
+ "learning_rate": 7.014924036932345e-07,
802
+ "loss": 0.7935,
803
+ "step": 520
804
+ },
805
+ {
806
+ "epoch": 1.0736196319018405,
807
+ "grad_norm": 1.0777639150619507,
808
+ "learning_rate": 6.932972121247831e-07,
809
+ "loss": 0.796,
810
+ "step": 525
811
+ },
812
+ {
813
+ "epoch": 1.0838445807770962,
814
+ "grad_norm": 1.3830324411392212,
815
+ "learning_rate": 6.850405035623481e-07,
816
+ "loss": 0.7929,
817
+ "step": 530
818
+ },
819
+ {
820
+ "epoch": 1.0940695296523517,
821
+ "grad_norm": 0.9407713413238525,
822
+ "learning_rate": 6.767249057101023e-07,
823
+ "loss": 0.7964,
824
+ "step": 535
825
+ },
826
+ {
827
+ "epoch": 1.1042944785276074,
828
+ "grad_norm": 1.1688194274902344,
829
+ "learning_rate": 6.683530650137832e-07,
830
+ "loss": 0.7944,
831
+ "step": 540
832
+ },
833
+ {
834
+ "epoch": 1.114519427402863,
835
+ "grad_norm": 0.9509923458099365,
836
+ "learning_rate": 6.599276458184588e-07,
837
+ "loss": 0.7912,
838
+ "step": 545
839
+ },
840
+ {
841
+ "epoch": 1.1247443762781186,
842
+ "grad_norm": 1.0683159828186035,
843
+ "learning_rate": 6.514513295205969e-07,
844
+ "loss": 0.7931,
845
+ "step": 550
846
+ },
847
+ {
848
+ "epoch": 1.1349693251533743,
849
+ "grad_norm": 0.9022642374038696,
850
+ "learning_rate": 6.429268137147104e-07,
851
+ "loss": 0.7945,
852
+ "step": 555
853
+ },
854
+ {
855
+ "epoch": 1.1451942740286298,
856
+ "grad_norm": 1.1609984636306763,
857
+ "learning_rate": 6.343568113348441e-07,
858
+ "loss": 0.7913,
859
+ "step": 560
860
+ },
861
+ {
862
+ "epoch": 1.1554192229038855,
863
+ "grad_norm": 1.2184994220733643,
864
+ "learning_rate": 6.257440497911815e-07,
865
+ "loss": 0.7919,
866
+ "step": 565
867
+ },
868
+ {
869
+ "epoch": 1.165644171779141,
870
+ "grad_norm": 1.0256582498550415,
871
+ "learning_rate": 6.170912701020454e-07,
872
+ "loss": 0.7912,
873
+ "step": 570
874
+ },
875
+ {
876
+ "epoch": 1.1758691206543967,
877
+ "grad_norm": 0.8725862503051758,
878
+ "learning_rate": 6.084012260215645e-07,
879
+ "loss": 0.7907,
880
+ "step": 575
881
+ },
882
+ {
883
+ "epoch": 1.1860940695296525,
884
+ "grad_norm": 1.5192348957061768,
885
+ "learning_rate": 5.996766831632912e-07,
886
+ "loss": 0.7913,
887
+ "step": 580
888
+ },
889
+ {
890
+ "epoch": 1.196319018404908,
891
+ "grad_norm": 1.109052062034607,
892
+ "learning_rate": 5.909204181200414e-07,
893
+ "loss": 0.795,
894
+ "step": 585
895
+ },
896
+ {
897
+ "epoch": 1.2065439672801637,
898
+ "grad_norm": 1.0413333177566528,
899
+ "learning_rate": 5.821352175802419e-07,
900
+ "loss": 0.7924,
901
+ "step": 590
902
+ },
903
+ {
904
+ "epoch": 1.2167689161554192,
905
+ "grad_norm": 0.8926281929016113,
906
+ "learning_rate": 5.733238774410647e-07,
907
+ "loss": 0.7921,
908
+ "step": 595
909
+ },
910
+ {
911
+ "epoch": 1.2269938650306749,
912
+ "grad_norm": 0.9231971502304077,
913
+ "learning_rate": 5.644892019186307e-07,
914
+ "loss": 0.7894,
915
+ "step": 600
916
+ },
917
+ {
918
+ "epoch": 1.2269938650306749,
919
+ "eval_accuracy": 0.62182,
920
+ "eval_loss": 0.6853985786437988,
921
+ "eval_macro_f1": 0.6195549574374046,
922
+ "eval_precision": 0.6317310781859349,
923
+ "eval_recall": 0.6267089641577176,
924
+ "eval_runtime": 81.4512,
925
+ "eval_samples_per_second": 1227.728,
926
+ "eval_steps_per_second": 0.602,
927
+ "step": 600
928
+ },
929
+ {
930
+ "epoch": 1.2372188139059306,
931
+ "grad_norm": 0.9845394492149353,
932
+ "learning_rate": 5.556340026555653e-07,
933
+ "loss": 0.7918,
934
+ "step": 605
935
+ },
936
+ {
937
+ "epoch": 1.247443762781186,
938
+ "grad_norm": 1.3759487867355347,
939
+ "learning_rate": 5.467610978261906e-07,
940
+ "loss": 0.7904,
941
+ "step": 610
942
+ },
943
+ {
944
+ "epoch": 1.2576687116564418,
945
+ "grad_norm": 1.1568200588226318,
946
+ "learning_rate": 5.378733112396398e-07,
947
+ "loss": 0.7923,
948
+ "step": 615
949
+ },
950
+ {
951
+ "epoch": 1.2678936605316973,
952
+ "grad_norm": 1.4351176023483276,
953
+ "learning_rate": 5.289734714411775e-07,
954
+ "loss": 0.7905,
955
+ "step": 620
956
+ },
957
+ {
958
+ "epoch": 1.278118609406953,
959
+ "grad_norm": 1.178076982498169,
960
+ "learning_rate": 5.200644108120121e-07,
961
+ "loss": 0.7947,
962
+ "step": 625
963
+ },
964
+ {
965
+ "epoch": 1.2883435582822087,
966
+ "grad_norm": 1.2398017644882202,
967
+ "learning_rate": 5.111489646678896e-07,
968
+ "loss": 0.796,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 1.2985685071574642,
973
+ "grad_norm": 1.1236284971237183,
974
+ "learning_rate": 5.022299703567508e-07,
975
+ "loss": 0.7895,
976
+ "step": 635
977
+ },
978
+ {
979
+ "epoch": 1.30879345603272,
980
+ "grad_norm": 1.0112528800964355,
981
+ "learning_rate": 4.933102663557439e-07,
982
+ "loss": 0.79,
983
+ "step": 640
984
+ },
985
+ {
986
+ "epoch": 1.3190184049079754,
987
+ "grad_norm": 1.3201746940612793,
988
+ "learning_rate": 4.843926913678757e-07,
989
+ "loss": 0.7897,
990
+ "step": 645
991
+ },
992
+ {
993
+ "epoch": 1.329243353783231,
994
+ "grad_norm": 0.969918429851532,
995
+ "learning_rate": 4.7548008341859384e-07,
996
+ "loss": 0.7912,
997
+ "step": 650
998
+ },
999
+ {
1000
+ "epoch": 1.3394683026584868,
1001
+ "grad_norm": 0.8914945125579834,
1002
+ "learning_rate": 4.665752789525812e-07,
1003
+ "loss": 0.7964,
1004
+ "step": 655
1005
+ },
1006
+ {
1007
+ "epoch": 1.3496932515337423,
1008
+ "grad_norm": 0.906989574432373,
1009
+ "learning_rate": 4.576811119310563e-07,
1010
+ "loss": 0.7924,
1011
+ "step": 660
1012
+ },
1013
+ {
1014
+ "epoch": 1.359918200408998,
1015
+ "grad_norm": 1.2423877716064453,
1016
+ "learning_rate": 4.488004129298618e-07,
1017
+ "loss": 0.7904,
1018
+ "step": 665
1019
+ },
1020
+ {
1021
+ "epoch": 1.3701431492842535,
1022
+ "grad_norm": 1.2455909252166748,
1023
+ "learning_rate": 4.3993600823863256e-07,
1024
+ "loss": 0.7875,
1025
+ "step": 670
1026
+ },
1027
+ {
1028
+ "epoch": 1.3803680981595092,
1029
+ "grad_norm": 1.4931528568267822,
1030
+ "learning_rate": 4.3109071896132574e-07,
1031
+ "loss": 0.7947,
1032
+ "step": 675
1033
+ },
1034
+ {
1035
+ "epoch": 1.390593047034765,
1036
+ "grad_norm": 1.0538350343704224,
1037
+ "learning_rate": 4.222673601184029e-07,
1038
+ "loss": 0.7886,
1039
+ "step": 680
1040
+ },
1041
+ {
1042
+ "epoch": 1.4008179959100204,
1043
+ "grad_norm": 0.9246828556060791,
1044
+ "learning_rate": 4.134687397509467e-07,
1045
+ "loss": 0.7884,
1046
+ "step": 685
1047
+ },
1048
+ {
1049
+ "epoch": 1.4110429447852761,
1050
+ "grad_norm": 1.0383715629577637,
1051
+ "learning_rate": 4.0469765802700033e-07,
1052
+ "loss": 0.7943,
1053
+ "step": 690
1054
+ },
1055
+ {
1056
+ "epoch": 1.4212678936605316,
1057
+ "grad_norm": 1.0180901288986206,
1058
+ "learning_rate": 3.9595690635041145e-07,
1059
+ "loss": 0.7895,
1060
+ "step": 695
1061
+ },
1062
+ {
1063
+ "epoch": 1.4314928425357873,
1064
+ "grad_norm": 0.9119181632995605,
1065
+ "learning_rate": 3.8724926647246536e-07,
1066
+ "loss": 0.7864,
1067
+ "step": 700
1068
+ },
1069
+ {
1070
+ "epoch": 1.4314928425357873,
1071
+ "eval_accuracy": 0.62357,
1072
+ "eval_loss": 0.6852650046348572,
1073
+ "eval_macro_f1": 0.6215147432652665,
1074
+ "eval_precision": 0.6330088346022082,
1075
+ "eval_recall": 0.628302383508456,
1076
+ "eval_runtime": 80.5998,
1077
+ "eval_samples_per_second": 1240.698,
1078
+ "eval_steps_per_second": 0.608,
1079
+ "step": 700
1080
+ },
1081
+ {
1082
+ "epoch": 1.441717791411043,
1083
+ "grad_norm": 0.8882152438163757,
1084
+ "learning_rate": 3.785775096065909e-07,
1085
+ "loss": 0.7858,
1086
+ "step": 705
1087
+ },
1088
+ {
1089
+ "epoch": 1.4519427402862985,
1090
+ "grad_norm": 1.5290203094482422,
1091
+ "learning_rate": 3.699443955464192e-07,
1092
+ "loss": 0.7837,
1093
+ "step": 710
1094
+ },
1095
+ {
1096
+ "epoch": 1.4621676891615543,
1097
+ "grad_norm": 0.881521463394165,
1098
+ "learning_rate": 3.613526717874774e-07,
1099
+ "loss": 0.7858,
1100
+ "step": 715
1101
+ },
1102
+ {
1103
+ "epoch": 1.4723926380368098,
1104
+ "grad_norm": 0.9955899119377136,
1105
+ "learning_rate": 3.5280507265279555e-07,
1106
+ "loss": 0.7907,
1107
+ "step": 720
1108
+ },
1109
+ {
1110
+ "epoch": 1.4826175869120655,
1111
+ "grad_norm": 1.3247544765472412,
1112
+ "learning_rate": 3.443043184227067e-07,
1113
+ "loss": 0.79,
1114
+ "step": 725
1115
+ },
1116
+ {
1117
+ "epoch": 1.4928425357873212,
1118
+ "grad_norm": 1.200223445892334,
1119
+ "learning_rate": 3.358531144691148e-07,
1120
+ "loss": 0.7874,
1121
+ "step": 730
1122
+ },
1123
+ {
1124
+ "epoch": 1.5030674846625767,
1125
+ "grad_norm": 0.9952226281166077,
1126
+ "learning_rate": 3.2745415039450867e-07,
1127
+ "loss": 0.7874,
1128
+ "step": 735
1129
+ },
1130
+ {
1131
+ "epoch": 1.5132924335378322,
1132
+ "grad_norm": 1.2515606880187988,
1133
+ "learning_rate": 3.19110099175993e-07,
1134
+ "loss": 0.789,
1135
+ "step": 740
1136
+ },
1137
+ {
1138
+ "epoch": 1.5235173824130879,
1139
+ "grad_norm": 0.8901408314704895,
1140
+ "learning_rate": 3.10823616314612e-07,
1141
+ "loss": 0.7853,
1142
+ "step": 745
1143
+ },
1144
+ {
1145
+ "epoch": 1.5337423312883436,
1146
+ "grad_norm": 1.0439373254776,
1147
+ "learning_rate": 3.0259733899023345e-07,
1148
+ "loss": 0.7899,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 1.5439672801635993,
1153
+ "grad_norm": 1.0658971071243286,
1154
+ "learning_rate": 2.944338852222643e-07,
1155
+ "loss": 0.7868,
1156
+ "step": 755
1157
+ },
1158
+ {
1159
+ "epoch": 1.5541922290388548,
1160
+ "grad_norm": 0.927455484867096,
1161
+ "learning_rate": 2.8633585303646413e-07,
1162
+ "loss": 0.7904,
1163
+ "step": 760
1164
+ },
1165
+ {
1166
+ "epoch": 1.5644171779141103,
1167
+ "grad_norm": 0.9637423753738403,
1168
+ "learning_rate": 2.783058196381214e-07,
1169
+ "loss": 0.7856,
1170
+ "step": 765
1171
+ },
1172
+ {
1173
+ "epoch": 1.574642126789366,
1174
+ "grad_norm": 1.396472692489624,
1175
+ "learning_rate": 2.7034634059185437e-07,
1176
+ "loss": 0.7903,
1177
+ "step": 770
1178
+ },
1179
+ {
1180
+ "epoch": 1.5848670756646217,
1181
+ "grad_norm": 0.7922792434692383,
1182
+ "learning_rate": 2.6245994900830257e-07,
1183
+ "loss": 0.7843,
1184
+ "step": 775
1185
+ },
1186
+ {
1187
+ "epoch": 1.5950920245398774,
1188
+ "grad_norm": 0.8896881341934204,
1189
+ "learning_rate": 2.546491547379619e-07,
1190
+ "loss": 0.787,
1191
+ "step": 780
1192
+ },
1193
+ {
1194
+ "epoch": 1.605316973415133,
1195
+ "grad_norm": 0.8732028007507324,
1196
+ "learning_rate": 2.469164435724212e-07,
1197
+ "loss": 0.7856,
1198
+ "step": 785
1199
+ },
1200
+ {
1201
+ "epoch": 1.6155419222903884,
1202
+ "grad_norm": 1.0021744966506958,
1203
+ "learning_rate": 2.3926427645325875e-07,
1204
+ "loss": 0.7867,
1205
+ "step": 790
1206
+ },
1207
+ {
1208
+ "epoch": 1.6257668711656441,
1209
+ "grad_norm": 1.1783545017242432,
1210
+ "learning_rate": 2.3169508868884453e-07,
1211
+ "loss": 0.7897,
1212
+ "step": 795
1213
+ },
1214
+ {
1215
+ "epoch": 1.6359918200408998,
1216
+ "grad_norm": 0.9119800329208374,
1217
+ "learning_rate": 2.2421128917930243e-07,
1218
+ "loss": 0.7845,
1219
+ "step": 800
1220
+ },
1221
+ {
1222
+ "epoch": 1.6359918200408998,
1223
+ "eval_accuracy": 0.62896,
1224
+ "eval_loss": 0.6847647428512573,
1225
+ "eval_macro_f1": 0.6281943240633717,
1226
+ "eval_precision": 0.6346364525627035,
1227
+ "eval_recall": 0.6323959922867678,
1228
+ "eval_runtime": 80.6105,
1229
+ "eval_samples_per_second": 1240.533,
1230
+ "eval_steps_per_second": 0.608,
1231
+ "step": 800
1232
+ },
1233
+ {
1234
+ "epoch": 1.6462167689161555,
1235
+ "grad_norm": 0.8903971314430237,
1236
+ "learning_rate": 2.1681525964987474e-07,
1237
+ "loss": 0.7824,
1238
+ "step": 805
1239
+ },
1240
+ {
1241
+ "epoch": 1.656441717791411,
1242
+ "grad_norm": 1.115395188331604,
1243
+ "learning_rate": 2.0950935389293656e-07,
1244
+ "loss": 0.7824,
1245
+ "step": 810
1246
+ },
1247
+ {
1248
+ "epoch": 1.6666666666666665,
1249
+ "grad_norm": 0.9636144638061523,
1250
+ "learning_rate": 2.022958970189001e-07,
1251
+ "loss": 0.7917,
1252
+ "step": 815
1253
+ },
1254
+ {
1255
+ "epoch": 1.6768916155419222,
1256
+ "grad_norm": 0.8787257075309753,
1257
+ "learning_rate": 1.9517718471624532e-07,
1258
+ "loss": 0.7869,
1259
+ "step": 820
1260
+ },
1261
+ {
1262
+ "epoch": 1.687116564417178,
1263
+ "grad_norm": 1.0157173871994019,
1264
+ "learning_rate": 1.88155482520916e-07,
1265
+ "loss": 0.7844,
1266
+ "step": 825
1267
+ },
1268
+ {
1269
+ "epoch": 1.6973415132924337,
1270
+ "grad_norm": 0.9504719972610474,
1271
+ "learning_rate": 1.812330250953107e-07,
1272
+ "loss": 0.7872,
1273
+ "step": 830
1274
+ },
1275
+ {
1276
+ "epoch": 1.7075664621676891,
1277
+ "grad_norm": 0.893625795841217,
1278
+ "learning_rate": 1.7441201551710016e-07,
1279
+ "loss": 0.7879,
1280
+ "step": 835
1281
+ },
1282
+ {
1283
+ "epoch": 1.7177914110429446,
1284
+ "grad_norm": 0.8460310101509094,
1285
+ "learning_rate": 1.6769462457809536e-07,
1286
+ "loss": 0.7853,
1287
+ "step": 840
1288
+ },
1289
+ {
1290
+ "epoch": 1.7280163599182004,
1291
+ "grad_norm": 0.9349818229675293,
1292
+ "learning_rate": 1.610829900933917e-07,
1293
+ "loss": 0.7862,
1294
+ "step": 845
1295
+ },
1296
+ {
1297
+ "epoch": 1.738241308793456,
1298
+ "grad_norm": 0.859866738319397,
1299
+ "learning_rate": 1.545792162210074e-07,
1300
+ "loss": 0.7836,
1301
+ "step": 850
1302
+ },
1303
+ {
1304
+ "epoch": 1.7484662576687118,
1305
+ "grad_norm": 1.0148438215255737,
1306
+ "learning_rate": 1.481853727922341e-07,
1307
+ "loss": 0.7859,
1308
+ "step": 855
1309
+ },
1310
+ {
1311
+ "epoch": 1.7586912065439673,
1312
+ "grad_norm": 0.8861204385757446,
1313
+ "learning_rate": 1.4190349465291035e-07,
1314
+ "loss": 0.7909,
1315
+ "step": 860
1316
+ },
1317
+ {
1318
+ "epoch": 1.7689161554192228,
1319
+ "grad_norm": 0.7679073214530945,
1320
+ "learning_rate": 1.3573558101583105e-07,
1321
+ "loss": 0.785,
1322
+ "step": 865
1323
+ },
1324
+ {
1325
+ "epoch": 1.7791411042944785,
1326
+ "grad_norm": 0.7364144325256348,
1327
+ "learning_rate": 1.2968359482449636e-07,
1328
+ "loss": 0.7824,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 1.7893660531697342,
1333
+ "grad_norm": 0.945924699306488,
1334
+ "learning_rate": 1.2374946212840288e-07,
1335
+ "loss": 0.7864,
1336
+ "step": 875
1337
+ },
1338
+ {
1339
+ "epoch": 1.79959100204499,
1340
+ "grad_norm": 1.1060514450073242,
1341
+ "learning_rate": 1.1793507147007714e-07,
1342
+ "loss": 0.7866,
1343
+ "step": 880
1344
+ },
1345
+ {
1346
+ "epoch": 1.8098159509202454,
1347
+ "grad_norm": 0.9230445623397827,
1348
+ "learning_rate": 1.1224227328404534e-07,
1349
+ "loss": 0.7895,
1350
+ "step": 885
1351
+ },
1352
+ {
1353
+ "epoch": 1.8200408997955009,
1354
+ "grad_norm": 0.9153196811676025,
1355
+ "learning_rate": 1.0667287930793151e-07,
1356
+ "loss": 0.7835,
1357
+ "step": 890
1358
+ },
1359
+ {
1360
+ "epoch": 1.8302658486707566,
1361
+ "grad_norm": 0.9513780474662781,
1362
+ "learning_rate": 1.0122866200586944e-07,
1363
+ "loss": 0.7846,
1364
+ "step": 895
1365
+ },
1366
+ {
1367
+ "epoch": 1.8404907975460123,
1368
+ "grad_norm": 0.8672247529029846,
1369
+ "learning_rate": 9.591135400441552e-08,
1370
+ "loss": 0.7839,
1371
+ "step": 900
1372
+ },
1373
+ {
1374
+ "epoch": 1.8404907975460123,
1375
+ "eval_accuracy": 0.63125,
1376
+ "eval_loss": 0.6845182776451111,
1377
+ "eval_macro_f1": 0.6309538076224105,
1378
+ "eval_precision": 0.6350446377333951,
1379
+ "eval_recall": 0.6339031903992685,
1380
+ "eval_runtime": 80.5646,
1381
+ "eval_samples_per_second": 1241.24,
1382
+ "eval_steps_per_second": 0.608,
1383
+ "step": 900
1384
+ },
1385
+ {
1386
+ "epoch": 1.850715746421268,
1387
+ "grad_norm": 1.2127219438552856,
1388
+ "learning_rate": 9.072264754113912e-08,
1389
+ "loss": 0.7876,
1390
+ "step": 905
1391
+ },
1392
+ {
1393
+ "epoch": 1.8609406952965235,
1394
+ "grad_norm": 0.875455379486084,
1395
+ "learning_rate": 8.566419392606544e-08,
1396
+ "loss": 0.787,
1397
+ "step": 910
1398
+ },
1399
+ {
1400
+ "epoch": 1.871165644171779,
1401
+ "grad_norm": 0.92503821849823,
1402
+ "learning_rate": 8.073760301614596e-08,
1403
+ "loss": 0.7834,
1404
+ "step": 915
1405
+ },
1406
+ {
1407
+ "epoch": 1.8813905930470347,
1408
+ "grad_norm": 1.1361068487167358,
1409
+ "learning_rate": 7.594444270291922e-08,
1410
+ "loss": 0.7821,
1411
+ "step": 920
1412
+ },
1413
+ {
1414
+ "epoch": 1.8916155419222904,
1415
+ "grad_norm": 1.1415101289749146,
1416
+ "learning_rate": 7.128623841352916e-08,
1417
+ "loss": 0.7877,
1418
+ "step": 925
1419
+ },
1420
+ {
1421
+ "epoch": 1.9018404907975461,
1422
+ "grad_norm": 0.9358757138252258,
1423
+ "learning_rate": 6.676447262525547e-08,
1424
+ "loss": 0.7867,
1425
+ "step": 930
1426
+ },
1427
+ {
1428
+ "epoch": 1.9120654396728016,
1429
+ "grad_norm": 0.912706732749939,
1430
+ "learning_rate": 6.238058439371479e-08,
1431
+ "loss": 0.7884,
1432
+ "step": 935
1433
+ },
1434
+ {
1435
+ "epoch": 1.9222903885480571,
1436
+ "grad_norm": 0.9449842572212219,
1437
+ "learning_rate": 5.813596889488009e-08,
1438
+ "loss": 0.7893,
1439
+ "step": 940
1440
+ },
1441
+ {
1442
+ "epoch": 1.9325153374233128,
1443
+ "grad_norm": 0.8449825048446655,
1444
+ "learning_rate": 5.403197698106432e-08,
1445
+ "loss": 0.7828,
1446
+ "step": 945
1447
+ },
1448
+ {
1449
+ "epoch": 1.9427402862985685,
1450
+ "grad_norm": 0.9307764768600464,
1451
+ "learning_rate": 5.0069914751010913e-08,
1452
+ "loss": 0.785,
1453
+ "step": 950
1454
+ },
1455
+ {
1456
+ "epoch": 1.9529652351738243,
1457
+ "grad_norm": 1.3704556226730347,
1458
+ "learning_rate": 4.625104313422673e-08,
1459
+ "loss": 0.7874,
1460
+ "step": 955
1461
+ },
1462
+ {
1463
+ "epoch": 1.9631901840490797,
1464
+ "grad_norm": 1.0163496732711792,
1465
+ "learning_rate": 4.257657748969046e-08,
1466
+ "loss": 0.7834,
1467
+ "step": 960
1468
+ },
1469
+ {
1470
+ "epoch": 1.9734151329243352,
1471
+ "grad_norm": 0.8112438321113586,
1472
+ "learning_rate": 3.904768721906304e-08,
1473
+ "loss": 0.7852,
1474
+ "step": 965
1475
+ },
1476
+ {
1477
+ "epoch": 1.983640081799591,
1478
+ "grad_norm": 0.885705828666687,
1479
+ "learning_rate": 3.566549539452529e-08,
1480
+ "loss": 0.7792,
1481
+ "step": 970
1482
+ },
1483
+ {
1484
+ "epoch": 1.9938650306748467,
1485
+ "grad_norm": 0.8692009449005127,
1486
+ "learning_rate": 3.243107840135878e-08,
1487
+ "loss": 0.7822,
1488
+ "step": 975
1489
+ },
1490
+ {
1491
+ "epoch": 2.0040899795501024,
1492
+ "grad_norm": 0.8909807205200195,
1493
+ "learning_rate": 2.9345465595385866e-08,
1494
+ "loss": 0.7826,
1495
+ "step": 980
1496
+ },
1497
+ {
1498
+ "epoch": 2.014314928425358,
1499
+ "grad_norm": 0.9065344333648682,
1500
+ "learning_rate": 2.6409638975375737e-08,
1501
+ "loss": 0.7849,
1502
+ "step": 985
1503
+ },
1504
+ {
1505
+ "epoch": 2.0245398773006134,
1506
+ "grad_norm": 0.8145809173583984,
1507
+ "learning_rate": 2.3624532870522962e-08,
1508
+ "loss": 0.7885,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 2.034764826175869,
1513
+ "grad_norm": 0.9461153149604797,
1514
+ "learning_rate": 2.0991033643096457e-08,
1515
+ "loss": 0.7853,
1516
+ "step": 995
1517
+ },
1518
+ {
1519
+ "epoch": 2.044989775051125,
1520
+ "grad_norm": 0.8470706343650818,
1521
+ "learning_rate": 1.8509979406353794e-08,
1522
+ "loss": 0.7881,
1523
+ "step": 1000
1524
+ },
1525
+ {
1526
+ "epoch": 2.044989775051125,
1527
+ "eval_accuracy": 0.63202,
1528
+ "eval_loss": 0.6844514012336731,
1529
+ "eval_macro_f1": 0.6318036560759084,
1530
+ "eval_precision": 0.6354113747156731,
1531
+ "eval_recall": 0.6344858797364747,
1532
+ "eval_runtime": 81.1838,
1533
+ "eval_samples_per_second": 1231.772,
1534
+ "eval_steps_per_second": 0.604,
1535
+ "step": 1000
1536
+ },
1537
+ {
1538
+ "epoch": 2.0552147239263805,
1539
+ "grad_norm": 0.8817445635795593,
1540
+ "learning_rate": 1.6182159757810897e-08,
1541
+ "loss": 0.7879,
1542
+ "step": 1005
1543
+ },
1544
+ {
1545
+ "epoch": 2.065439672801636,
1546
+ "grad_norm": 0.856109082698822,
1547
+ "learning_rate": 1.400831552795234e-08,
1548
+ "loss": 0.7868,
1549
+ "step": 1010
1550
+ },
1551
+ {
1552
+ "epoch": 2.0756646216768915,
1553
+ "grad_norm": 0.956066370010376,
1554
+ "learning_rate": 1.1989138544461375e-08,
1555
+ "loss": 0.7845,
1556
+ "step": 1015
1557
+ },
1558
+ {
1559
+ "epoch": 2.085889570552147,
1560
+ "grad_norm": 0.930978000164032,
1561
+ "learning_rate": 1.0125271412044666e-08,
1562
+ "loss": 0.7876,
1563
+ "step": 1020
1564
+ },
1565
+ {
1566
+ "epoch": 2.096114519427403,
1567
+ "grad_norm": 0.9799636602401733,
1568
+ "learning_rate": 8.417307307923615e-09,
1569
+ "loss": 0.7861,
1570
+ "step": 1025
1571
+ },
1572
+ {
1573
+ "epoch": 2.1063394683026586,
1574
+ "grad_norm": 0.9991019368171692,
1575
+ "learning_rate": 6.8657897930547435e-09,
1576
+ "loss": 0.7852,
1577
+ "step": 1030
1578
+ },
1579
+ {
1580
+ "epoch": 2.116564417177914,
1581
+ "grad_norm": 1.076750636100769,
1582
+ "learning_rate": 5.471212639141132e-09,
1583
+ "loss": 0.7789,
1584
+ "step": 1035
1585
+ },
1586
+ {
1587
+ "epoch": 2.1267893660531696,
1588
+ "grad_norm": 0.9805507063865662,
1589
+ "learning_rate": 4.23401967148912e-09,
1590
+ "loss": 0.7829,
1591
+ "step": 1040
1592
+ },
1593
+ {
1594
+ "epoch": 2.1370143149284253,
1595
+ "grad_norm": 0.7899750471115112,
1596
+ "learning_rate": 3.154604627760571e-09,
1597
+ "loss": 0.7839,
1598
+ "step": 1045
1599
+ },
1600
+ {
1601
+ "epoch": 2.147239263803681,
1602
+ "grad_norm": 1.1698967218399048,
1603
+ "learning_rate": 2.2333110326655526e-09,
1604
+ "loss": 0.7869,
1605
+ "step": 1050
1606
+ },
1607
+ {
1608
+ "epoch": 2.1574642126789367,
1609
+ "grad_norm": 0.9302964806556702,
1610
+ "learning_rate": 1.4704320886352873e-09,
1611
+ "loss": 0.7832,
1612
+ "step": 1055
1613
+ },
1614
+ {
1615
+ "epoch": 2.1676891615541924,
1616
+ "grad_norm": 1.057986855506897,
1617
+ "learning_rate": 8.662105825103517e-10,
1618
+ "loss": 0.7864,
1619
+ "step": 1060
1620
+ },
1621
+ {
1622
+ "epoch": 2.1779141104294477,
1623
+ "grad_norm": 1.0347933769226074,
1624
+ "learning_rate": 4.208388082733161e-10,
1625
+ "loss": 0.7822,
1626
+ "step": 1065
1627
+ },
1628
+ {
1629
+ "epoch": 2.1881390593047034,
1630
+ "grad_norm": 0.9827083945274353,
1631
+ "learning_rate": 1.3445850585130924e-10,
1632
+ "loss": 0.784,
1633
+ "step": 1070
1634
+ },
1635
+ {
1636
+ "epoch": 2.198364008179959,
1637
+ "grad_norm": 0.8463678956031799,
1638
+ "learning_rate": 7.160816007045767e-12,
1639
+ "loss": 0.7811,
1640
+ "step": 1075
1641
+ },
1642
+ {
1643
+ "epoch": 2.208588957055215,
1644
+ "grad_norm": 0.9141009449958801,
1645
+ "learning_rate": 9.999610137486667e-07,
1646
+ "loss": 0.7828,
1647
+ "step": 1080
1648
+ },
1649
+ {
1650
+ "epoch": 2.21881390593047,
1651
+ "grad_norm": 0.8992940783500671,
1652
+ "learning_rate": 9.997700753166407e-07,
1653
+ "loss": 0.7843,
1654
+ "step": 1085
1655
+ },
1656
+ {
1657
+ "epoch": 2.229038854805726,
1658
+ "grad_norm": 0.9198014140129089,
1659
+ "learning_rate": 9.99420084654225e-07,
1660
+ "loss": 0.7867,
1661
+ "step": 1090
1662
+ },
1663
+ {
1664
+ "epoch": 2.2392638036809815,
1665
+ "grad_norm": 0.841385006904602,
1666
+ "learning_rate": 9.98911153146231e-07,
1667
+ "loss": 0.7899,
1668
+ "step": 1095
1669
+ },
1670
+ {
1671
+ "epoch": 2.2494887525562373,
1672
+ "grad_norm": 0.9428244233131409,
1673
+ "learning_rate": 9.982434427605222e-07,
1674
+ "loss": 0.783,
1675
+ "step": 1100
1676
+ },
1677
+ {
1678
+ "epoch": 2.2494887525562373,
1679
+ "eval_accuracy": 0.63535,
1680
+ "eval_loss": 0.6841139197349548,
1681
+ "eval_macro_f1": 0.6353491904387377,
1682
+ "eval_precision": 0.6368108503242846,
1683
+ "eval_recall": 0.6367719631437929,
1684
+ "eval_runtime": 81.1976,
1685
+ "eval_samples_per_second": 1231.563,
1686
+ "eval_steps_per_second": 0.603,
1687
+ "step": 1100
1688
+ },
1689
+ {
1690
+ "epoch": 2.259713701431493,
1691
+ "grad_norm": 0.8474355936050415,
1692
+ "learning_rate": 9.974171659964687e-07,
1693
+ "loss": 0.7805,
1694
+ "step": 1105
1695
+ },
1696
+ {
1697
+ "epoch": 2.2699386503067487,
1698
+ "grad_norm": 0.8366284370422363,
1699
+ "learning_rate": 9.964325858173184e-07,
1700
+ "loss": 0.7821,
1701
+ "step": 1110
1702
+ },
1703
+ {
1704
+ "epoch": 2.280163599182004,
1705
+ "grad_norm": 1.102426290512085,
1706
+ "learning_rate": 9.952900155665089e-07,
1707
+ "loss": 0.7854,
1708
+ "step": 1115
1709
+ },
1710
+ {
1711
+ "epoch": 2.2903885480572597,
1712
+ "grad_norm": 0.8815932273864746,
1713
+ "learning_rate": 9.939898188679462e-07,
1714
+ "loss": 0.7835,
1715
+ "step": 1120
1716
+ },
1717
+ {
1718
+ "epoch": 2.3006134969325154,
1719
+ "grad_norm": 0.8016415238380432,
1720
+ "learning_rate": 9.925324095102806e-07,
1721
+ "loss": 0.7842,
1722
+ "step": 1125
1723
+ },
1724
+ {
1725
+ "epoch": 2.310838445807771,
1726
+ "grad_norm": 0.8805480599403381,
1727
+ "learning_rate": 9.909182513152177e-07,
1728
+ "loss": 0.7791,
1729
+ "step": 1130
1730
+ },
1731
+ {
1732
+ "epoch": 2.3210633946830264,
1733
+ "grad_norm": 0.9736661314964294,
1734
+ "learning_rate": 9.891478579899078e-07,
1735
+ "loss": 0.7825,
1736
+ "step": 1135
1737
+ },
1738
+ {
1739
+ "epoch": 2.331288343558282,
1740
+ "grad_norm": 0.8331109285354614,
1741
+ "learning_rate": 9.872217929634573e-07,
1742
+ "loss": 0.7852,
1743
+ "step": 1140
1744
+ },
1745
+ {
1746
+ "epoch": 2.341513292433538,
1747
+ "grad_norm": 0.8597177267074585,
1748
+ "learning_rate": 9.851406692076183e-07,
1749
+ "loss": 0.7817,
1750
+ "step": 1145
1751
+ },
1752
+ {
1753
+ "epoch": 2.3517382413087935,
1754
+ "grad_norm": 0.7928445339202881,
1755
+ "learning_rate": 9.829051490417071e-07,
1756
+ "loss": 0.7765,
1757
+ "step": 1150
1758
+ },
1759
+ {
1760
+ "epoch": 2.361963190184049,
1761
+ "grad_norm": 0.8488237857818604,
1762
+ "learning_rate": 9.80515943921824e-07,
1763
+ "loss": 0.7836,
1764
+ "step": 1155
1765
+ },
1766
+ {
1767
+ "epoch": 2.372188139059305,
1768
+ "grad_norm": 0.7608004212379456,
1769
+ "learning_rate": 9.77973814214429e-07,
1770
+ "loss": 0.7834,
1771
+ "step": 1160
1772
+ },
1773
+ {
1774
+ "epoch": 2.38241308793456,
1775
+ "grad_norm": 0.8542405962944031,
1776
+ "learning_rate": 9.752795689543563e-07,
1777
+ "loss": 0.7777,
1778
+ "step": 1165
1779
+ },
1780
+ {
1781
+ "epoch": 2.392638036809816,
1782
+ "grad_norm": 0.8797897100448608,
1783
+ "learning_rate": 9.72434065587337e-07,
1784
+ "loss": 0.7823,
1785
+ "step": 1170
1786
+ },
1787
+ {
1788
+ "epoch": 2.4028629856850716,
1789
+ "grad_norm": 0.9687849283218384,
1790
+ "learning_rate": 9.69438209697118e-07,
1791
+ "loss": 0.7754,
1792
+ "step": 1175
1793
+ },
1794
+ {
1795
+ "epoch": 2.4130879345603273,
1796
+ "grad_norm": 0.9111893773078918,
1797
+ "learning_rate": 9.662929547172574e-07,
1798
+ "loss": 0.7806,
1799
+ "step": 1180
1800
+ },
1801
+ {
1802
+ "epoch": 2.4233128834355826,
1803
+ "grad_norm": 1.0323760509490967,
1804
+ "learning_rate": 9.629993016276944e-07,
1805
+ "loss": 0.7801,
1806
+ "step": 1185
1807
+ },
1808
+ {
1809
+ "epoch": 2.4335378323108383,
1810
+ "grad_norm": 0.79954594373703,
1811
+ "learning_rate": 9.595582986361872e-07,
1812
+ "loss": 0.7781,
1813
+ "step": 1190
1814
+ },
1815
+ {
1816
+ "epoch": 2.443762781186094,
1817
+ "grad_norm": 0.7106928825378418,
1818
+ "learning_rate": 9.559710408447184e-07,
1819
+ "loss": 0.7788,
1820
+ "step": 1195
1821
+ },
1822
+ {
1823
+ "epoch": 2.4539877300613497,
1824
+ "grad_norm": 0.77292400598526,
1825
+ "learning_rate": 9.522386699009795e-07,
1826
+ "loss": 0.7827,
1827
+ "step": 1200
1828
+ },
1829
+ {
1830
+ "epoch": 2.4539877300613497,
1831
+ "eval_accuracy": 0.645,
1832
+ "eval_loss": 0.6828427314758301,
1833
+ "eval_macro_f1": 0.6440359919423964,
1834
+ "eval_precision": 0.6441481409802297,
1835
+ "eval_recall": 0.6439695264773649,
1836
+ "eval_runtime": 81.1775,
1837
+ "eval_samples_per_second": 1231.869,
1838
+ "eval_steps_per_second": 0.604,
1839
+ "step": 1200
1840
+ },
1841
+ {
1842
+ "epoch": 2.4642126789366054,
1843
+ "grad_norm": 0.8576335310935974,
1844
+ "learning_rate": 9.483623736350402e-07,
1845
+ "loss": 0.7765,
1846
+ "step": 1205
1847
+ },
1848
+ {
1849
+ "epoch": 2.474437627811861,
1850
+ "grad_norm": 0.7940819263458252,
1851
+ "learning_rate": 9.443433856813196e-07,
1852
+ "loss": 0.7744,
1853
+ "step": 1210
1854
+ },
1855
+ {
1856
+ "epoch": 2.4846625766871164,
1857
+ "grad_norm": 0.9138656854629517,
1858
+ "learning_rate": 9.401829850859823e-07,
1859
+ "loss": 0.779,
1860
+ "step": 1215
1861
+ },
1862
+ {
1863
+ "epoch": 2.494887525562372,
1864
+ "grad_norm": 0.7292961478233337,
1865
+ "learning_rate": 9.358824958998804e-07,
1866
+ "loss": 0.7741,
1867
+ "step": 1220
1868
+ },
1869
+ {
1870
+ "epoch": 2.505112474437628,
1871
+ "grad_norm": 0.8346101641654968,
1872
+ "learning_rate": 9.314432867571731e-07,
1873
+ "loss": 0.7769,
1874
+ "step": 1225
1875
+ },
1876
+ {
1877
+ "epoch": 2.5153374233128836,
1878
+ "grad_norm": 0.7433446645736694,
1879
+ "learning_rate": 9.268667704397576e-07,
1880
+ "loss": 0.7811,
1881
+ "step": 1230
1882
+ },
1883
+ {
1884
+ "epoch": 2.525562372188139,
1885
+ "grad_norm": 0.7142143845558167,
1886
+ "learning_rate": 9.22154403427651e-07,
1887
+ "loss": 0.7739,
1888
+ "step": 1235
1889
+ },
1890
+ {
1891
+ "epoch": 2.5357873210633946,
1892
+ "grad_norm": 0.8269698023796082,
1893
+ "learning_rate": 9.173076854354633e-07,
1894
+ "loss": 0.7751,
1895
+ "step": 1240
1896
+ },
1897
+ {
1898
+ "epoch": 2.5460122699386503,
1899
+ "grad_norm": 0.7005385160446167,
1900
+ "learning_rate": 9.123281589351127e-07,
1901
+ "loss": 0.7747,
1902
+ "step": 1245
1903
+ },
1904
+ {
1905
+ "epoch": 2.556237218813906,
1906
+ "grad_norm": 0.7422548532485962,
1907
+ "learning_rate": 9.072174086649326e-07,
1908
+ "loss": 0.7764,
1909
+ "step": 1250
1910
+ },
1911
+ {
1912
+ "epoch": 2.5664621676891617,
1913
+ "grad_norm": 0.7844764590263367,
1914
+ "learning_rate": 9.01977061125327e-07,
1915
+ "loss": 0.7751,
1916
+ "step": 1255
1917
+ },
1918
+ {
1919
+ "epoch": 2.5766871165644174,
1920
+ "grad_norm": 0.8993695378303528,
1921
+ "learning_rate": 8.966087840611356e-07,
1922
+ "loss": 0.7771,
1923
+ "step": 1260
1924
+ },
1925
+ {
1926
+ "epoch": 2.5869120654396727,
1927
+ "grad_norm": 0.7648841738700867,
1928
+ "learning_rate": 8.911142859308729e-07,
1929
+ "loss": 0.7771,
1930
+ "step": 1265
1931
+ },
1932
+ {
1933
+ "epoch": 2.5971370143149284,
1934
+ "grad_norm": 0.789523720741272,
1935
+ "learning_rate": 8.854953153630096e-07,
1936
+ "loss": 0.7732,
1937
+ "step": 1270
1938
+ },
1939
+ {
1940
+ "epoch": 2.607361963190184,
1941
+ "grad_norm": 0.7698408365249634,
1942
+ "learning_rate": 8.7975366059947e-07,
1943
+ "loss": 0.769,
1944
+ "step": 1275
1945
+ },
1946
+ {
1947
+ "epoch": 2.61758691206544,
1948
+ "grad_norm": 1.019235610961914,
1949
+ "learning_rate": 8.738911489265233e-07,
1950
+ "loss": 0.7768,
1951
+ "step": 1280
1952
+ },
1953
+ {
1954
+ "epoch": 2.627811860940695,
1955
+ "grad_norm": 0.8915722966194153,
1956
+ "learning_rate": 8.679096460932475e-07,
1957
+ "loss": 0.774,
1958
+ "step": 1285
1959
+ },
1960
+ {
1961
+ "epoch": 2.638036809815951,
1962
+ "grad_norm": 1.0551347732543945,
1963
+ "learning_rate": 8.618110557177536e-07,
1964
+ "loss": 0.7711,
1965
+ "step": 1290
1966
+ },
1967
+ {
1968
+ "epoch": 2.6482617586912065,
1969
+ "grad_norm": 0.7630209922790527,
1970
+ "learning_rate": 8.555973186813575e-07,
1971
+ "loss": 0.7724,
1972
+ "step": 1295
1973
+ },
1974
+ {
1975
+ "epoch": 2.658486707566462,
1976
+ "grad_norm": 0.6783341765403748,
1977
+ "learning_rate": 8.49270412510893e-07,
1978
+ "loss": 0.773,
1979
+ "step": 1300
1980
+ },
1981
+ {
1982
+ "epoch": 2.658486707566462,
1983
+ "eval_accuracy": 0.64275,
1984
+ "eval_loss": 0.6818840503692627,
1985
+ "eval_macro_f1": 0.6366805441223703,
1986
+ "eval_precision": 0.6443671237738225,
1987
+ "eval_recall": 0.6381477730287184,
1988
+ "eval_runtime": 81.1862,
1989
+ "eval_samples_per_second": 1231.736,
1990
+ "eval_steps_per_second": 0.604,
1991
+ "step": 1300
1992
+ }
1993
+ ],
1994
+ "logging_steps": 5,
1995
+ "max_steps": 1956,
1996
+ "num_input_tokens_seen": 0,
1997
+ "num_train_epochs": 4,
1998
+ "save_steps": 100,
1999
+ "stateful_callbacks": {
2000
+ "EarlyStoppingCallback": {
2001
+ "args": {
2002
+ "early_stopping_patience": 3,
2003
+ "early_stopping_threshold": 0.0
2004
+ },
2005
+ "attributes": {
2006
+ "early_stopping_patience_counter": 1
2007
+ }
2008
+ },
2009
+ "TrainerControl": {
2010
+ "args": {
2011
+ "should_epoch_stop": false,
2012
+ "should_evaluate": false,
2013
+ "should_log": false,
2014
+ "should_save": true,
2015
+ "should_training_stop": false
2016
+ },
2017
+ "attributes": {}
2018
+ }
2019
+ },
2020
+ "total_flos": 3.498661374217421e+17,
2021
+ "train_batch_size": 1024,
2022
+ "trial_name": null,
2023
+ "trial_params": null
2024
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2212b57ced9fbe3464bd23d4ac0f4d8e75b4b021597f160058a4a19990d9f0d3
3
+ size 5841
graphcodebert-swa-from-epoch-1/checkpoint-1300/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1400/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.3,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": 0.3,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.3,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "absolute",
24
+ "problem_type": "single_label_classification",
25
+ "transformers_version": "4.56.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 50265
29
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1400/config_hyperparams.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_config": {
3
+ "model_name": "/kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1",
4
+ "num_epochs": 4,
5
+ "batch_size": 1024,
6
+ "learning_rate": 1e-06,
7
+ "max_length": 512,
8
+ "num_labels": 2,
9
+ "loss_type": "r-drop",
10
+ "focal_alpha": 1.0,
11
+ "focal_gamma": 2.0,
12
+ "r_drop_alpha": 10.0,
13
+ "infonce_temperature": 0.07,
14
+ "infonce_weight": 0.5,
15
+ "label_smoothing": 0.5,
16
+ "adversarial_epsilon": 0.5,
17
+ "use_swa": true,
18
+ "swa_start_epoch": 1,
19
+ "swa_lr": 1e-05,
20
+ "data_augmentation": true,
21
+ "aug_rename_prob": 0.8,
22
+ "aug_format_prob": 0.8,
23
+ "freeze_base": true,
24
+ "seed": 42,
25
+ "use_wandb": false,
26
+ "mixup_alpha": 1.0,
27
+ "low_pass_keep_ratio": 0.5,
28
+ "freq_consistency_weight": 0.5
29
+ },
30
+ "training_arguments": {
31
+ "output_dir": "graphcodebert-swa-from-epoch-1/",
32
+ "num_train_epochs": 4,
33
+ "per_device_train_batch_size": 1024,
34
+ "per_device_eval_batch_size": 2048,
35
+ "learning_rate": 1e-06,
36
+ "warmup_steps": 195,
37
+ "weight_decay": 0.1,
38
+ "logging_steps": 5,
39
+ "eval_steps": 100,
40
+ "save_steps": 100,
41
+ "metric_for_best_model": "macro_f1",
42
+ "greater_is_better": true,
43
+ "save_total_limit": 5,
44
+ "fp16": true,
45
+ "seed": 42
46
+ },
47
+ "training_state": {
48
+ "global_step": 1400,
49
+ "epoch": 2.8629856850715747,
50
+ "best_metric": 0.6459359532496649,
51
+ "best_model_checkpoint": "graphcodebert-swa-from-epoch-1/checkpoint-1400"
52
+ }
53
+ }
graphcodebert-swa-from-epoch-1/checkpoint-1400/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
graphcodebert-swa-from-epoch-1/checkpoint-1400/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44ffaa0bb66889175547889af7dceab487d4d9d6631533c70e0d45526ad4d0b2
3
+ size 498612824
graphcodebert-swa-from-epoch-1/checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11155d039b15987237b966bb2fb0ef9114f7e74e2d27e7b3542c52d18d9eb0ae
3
+ size 4741923
graphcodebert-swa-from-epoch-1/checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6e5c9a07f3f291636965eaeac8f965e4d9c0c54af3e92df020bffab1af8436b
3
+ size 14773
graphcodebert-swa-from-epoch-1/checkpoint-1400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3b03c8c40e45be48b5956b82e1abb2c1ec5641f4fc77e6fdec0942a77964500
3
+ size 1383
graphcodebert-swa-from-epoch-1/checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f0b4305c8223bd5b30cf2e73eee2f4de35a3ce3d940ebead766a15d981b126e
3
+ size 1465