fabikru committed on
Commit
b0f2428
·
verified ·
1 Parent(s): 3f741c7

model_15M_small_ds_masking_0.1_predicted_hparamas

Browse files
Files changed (4) hide show
  1. README.md +11 -39
  2. config.json +5 -6
  3. model.safetensors +2 -2
  4. training_args.bin +1 -1
README.md CHANGED
@@ -16,8 +16,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.1351
20
- - Accuracy: 0.9525
21
 
22
  ## Model description
23
 
@@ -36,12 +36,10 @@ More information needed
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
- - learning_rate: 0.003303
40
  - train_batch_size: 256
41
  - eval_batch_size: 256
42
  - seed: 42
43
- - gradient_accumulation_steps: 16
44
- - total_train_batch_size: 4096
45
  - optimizer: Use OptimizerNames.SCHEDULE_FREE_ADAMW with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: constant
47
  - lr_scheduler_warmup_steps: 1000
@@ -52,42 +50,16 @@ The following hyperparameters were used during training:
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
54
  |:-------------:|:------:|:----:|:---------------:|:--------:|
55
- | No log | 0 | 0 | 4.7921 | 0.0008 |
56
- | No log | 0.0044 | 122 | 0.7441 | 0.7665 |
57
- | No log | 0.0087 | 244 | 0.4206 | 0.8615 |
58
- | No log | 0.0131 | 366 | 0.3098 | 0.8961 |
59
- | No log | 0.0175 | 488 | 0.2602 | 0.9122 |
60
- | 11.7806 | 0.0218 | 610 | 0.2386 | 0.9188 |
61
- | 11.7806 | 0.0262 | 732 | 0.2316 | 0.9214 |
62
- | 11.7806 | 0.0306 | 854 | 0.2172 | 0.9259 |
63
- | 11.7806 | 0.0349 | 976 | 0.2144 | 0.9264 |
64
- | 4.0915 | 0.0393 | 1098 | 0.2056 | 0.9295 |
65
- | 4.0915 | 0.0437 | 1220 | 0.1973 | 0.9327 |
66
- | 4.0915 | 0.0480 | 1342 | 0.1923 | 0.9341 |
67
- | 4.0915 | 0.0524 | 1464 | 0.1848 | 0.9366 |
68
- | 3.3969 | 0.0568 | 1586 | 0.1802 | 0.9380 |
69
- | 3.3969 | 0.0611 | 1708 | 0.1739 | 0.9403 |
70
- | 3.3969 | 0.0655 | 1830 | 0.1687 | 0.9415 |
71
- | 3.3969 | 0.0699 | 1952 | 0.1638 | 0.9434 |
72
- | 2.8823 | 0.0743 | 2074 | 0.1608 | 0.9444 |
73
- | 2.8823 | 0.0786 | 2196 | 0.1563 | 0.9459 |
74
- | 2.8823 | 0.0830 | 2318 | 0.1532 | 0.9467 |
75
- | 2.8823 | 0.0874 | 2440 | 0.1498 | 0.9480 |
76
- | 2.6025 | 0.0917 | 2562 | 0.1489 | 0.9482 |
77
- | 2.6025 | 0.0961 | 2684 | 0.1465 | 0.9491 |
78
- | 2.6025 | 0.1005 | 2806 | 0.1423 | 0.9503 |
79
- | 2.6025 | 0.1048 | 2928 | 0.1417 | 0.9505 |
80
- | 2.4119 | 0.1092 | 3050 | 0.1393 | 0.9512 |
81
- | 2.4119 | 0.1136 | 3172 | 0.1383 | 0.9515 |
82
- | 2.4119 | 0.1179 | 3294 | 0.1357 | 0.9527 |
83
- | 2.4119 | 0.1223 | 3416 | 0.1342 | 0.9533 |
84
- | 2.2796 | 0.1267 | 3538 | 0.1344 | 0.9531 |
85
- | 2.2796 | 0.1310 | 3660 | 0.1352 | 0.9529 |
86
 
87
 
88
  ### Framework versions
89
 
90
- - Transformers 4.50.1
91
- - Pytorch 2.8.0.dev20250325+cu128
92
- - Datasets 3.4.1
93
  - Tokenizers 0.21.1
 
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.3599
20
+ - Accuracy: 0.8744
21
 
22
  ## Model description
23
 
 
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
+ - learning_rate: 0.003439
40
  - train_batch_size: 256
41
  - eval_batch_size: 256
42
  - seed: 42
 
 
43
  - optimizer: Use OptimizerNames.SCHEDULE_FREE_ADAMW with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
44
  - lr_scheduler_type: constant
45
  - lr_scheduler_warmup_steps: 1000
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:------:|:----:|:---------------:|:--------:|
53
+ | No log | 0 | 0 | 4.6089 | 0.0017 |
54
+ | 0.4423 | 0.4302 | 1953 | 0.3749 | 0.8701 |
55
+ | 0.3952 | 0.8604 | 3906 | 0.3602 | 0.8745 |
56
+ | 0.4069 | 1.2905 | 5859 | 0.3640 | 0.8738 |
57
+ | 0.4012 | 1.7207 | 7812 | 0.3674 | 0.8711 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
 
60
  ### Framework versions
61
 
62
+ - Transformers 4.51.2
63
+ - Pytorch 2.8.0.dev20250410+cu128
64
+ - Datasets 3.5.0
65
  - Tokenizers 0.21.1
config.json CHANGED
@@ -17,10 +17,10 @@
17
  "global_attn_every_n_layers": 1,
18
  "global_rope_theta": 160000.0,
19
  "hidden_activation": "gelu",
20
- "hidden_size": 768,
21
  "initializer_cutoff_factor": 2.0,
22
  "initializer_range": 0.02,
23
- "intermediate_size": 1152,
24
  "local_attention": 128,
25
  "local_rope_theta": 10000.0,
26
  "max_position_embeddings": 502,
@@ -29,15 +29,14 @@
29
  "model_type": "modernbert",
30
  "norm_bias": false,
31
  "norm_eps": 1e-05,
32
- "num_attention_heads": 12,
33
- "num_hidden_layers": 22,
34
  "pad_token_id": 1,
35
- "reference_compile": true,
36
  "repad_logits_with_grad": false,
37
  "sep_token_id": 3,
38
  "sparse_pred_ignore_index": -100,
39
  "sparse_prediction": false,
40
  "torch_dtype": "float32",
41
- "transformers_version": "4.50.1",
42
  "vocab_size": 82
43
  }
 
17
  "global_attn_every_n_layers": 1,
18
  "global_rope_theta": 160000.0,
19
  "hidden_activation": "gelu",
20
+ "hidden_size": 384,
21
  "initializer_cutoff_factor": 2.0,
22
  "initializer_range": 0.02,
23
+ "intermediate_size": 576,
24
  "local_attention": 128,
25
  "local_rope_theta": 10000.0,
26
  "max_position_embeddings": 502,
 
29
  "model_type": "modernbert",
30
  "norm_bias": false,
31
  "norm_eps": 1e-05,
32
+ "num_attention_heads": 6,
33
+ "num_hidden_layers": 12,
34
  "pad_token_id": 1,
 
35
  "repad_logits_with_grad": false,
36
  "sep_token_id": 3,
37
  "sparse_pred_ignore_index": -100,
38
  "sparse_prediction": false,
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.51.2",
41
  "vocab_size": 82
42
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f1a8afcad2257ffdc80b29e011bb4b6938ec1392006e49822d69ab2a4e1f33c
3
- size 443955224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de3dea857f6e6cad9d8ee484718c303aaa6109a7bc51f33aa4e590c361b1b73e
3
+ size 60925776
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eda8fc4856bc15beae646fcc0308df8cba95eec480ebb8fbb26821042e287aa8
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a2b46f80daa977eea7bab37ba38e09ff666b2e9052ba40aad0a4a0380a48b6a
3
  size 5905