tasal9 committed (verified)
Commit 0d65f9c · 1 parent: 0e87fcf

Push retrained pashto-base-bloom (on base_pashto_clean) and update ModelCard

Files changed (43)
  1. .gitattributes +4 -0
  2. ModelCard.md +181 -0
  3. checkpoint-2829/config.json +32 -0
  4. checkpoint-2829/generation_config.json +7 -0
  5. checkpoint-2829/model.safetensors +3 -0
  6. checkpoint-2829/optimizer.pt +3 -0
  7. checkpoint-2829/rng_state.pth +3 -0
  8. checkpoint-2829/scheduler.pt +3 -0
  9. checkpoint-2829/special_tokens_map.json +30 -0
  10. checkpoint-2829/tokenizer.json +3 -0
  11. checkpoint-2829/tokenizer_config.json +48 -0
  12. checkpoint-2829/trainer_state.json +237 -0
  13. checkpoint-2829/training_args.bin +3 -0
  14. checkpoint-5658/config.json +32 -0
  15. checkpoint-5658/generation_config.json +7 -0
  16. checkpoint-5658/model.safetensors +3 -0
  17. checkpoint-5658/optimizer.pt +3 -0
  18. checkpoint-5658/rng_state.pth +3 -0
  19. checkpoint-5658/scheduler.pt +3 -0
  20. checkpoint-5658/special_tokens_map.json +30 -0
  21. checkpoint-5658/tokenizer.json +3 -0
  22. checkpoint-5658/tokenizer_config.json +48 -0
  23. checkpoint-5658/trainer_state.json +441 -0
  24. checkpoint-5658/training_args.bin +3 -0
  25. checkpoint-8487/config.json +32 -0
  26. checkpoint-8487/generation_config.json +7 -0
  27. checkpoint-8487/model.safetensors +3 -0
  28. checkpoint-8487/optimizer.pt +3 -0
  29. checkpoint-8487/rng_state.pth +3 -0
  30. checkpoint-8487/scheduler.pt +3 -0
  31. checkpoint-8487/special_tokens_map.json +30 -0
  32. checkpoint-8487/tokenizer.json +3 -0
  33. checkpoint-8487/tokenizer_config.json +48 -0
  34. checkpoint-8487/trainer_state.json +645 -0
  35. checkpoint-8487/training_args.bin +3 -0
  36. final_model/config.json +32 -0
  37. final_model/generation_config.json +7 -0
  38. final_model/model.safetensors +3 -0
  39. final_model/special_tokens_map.json +30 -0
  40. final_model/tokenizer.json +3 -0
  41. final_model/tokenizer_config.json +48 -0
  42. final_model/training_args.bin +3 -0
  43. logs/events.out.tfevents.1749891083.51d9157369b8.14130.0 +3 -0
.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-2829/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-5658/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-8487/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
ModelCard.md ADDED
@@ -0,0 +1,181 @@
+ ---
+ license: mit
+ language:
+ - ps  # Pashto
+ library_name: transformers
+ tags:
+ - text-generation
+ - pashto
+ - bloom
+ - zamai-bloom
+ datasets:
+ - tasal9/pashto_base_bloom
+ pipeline_tag: text-generation
+ widget:
+ - text: "پښتو ژبه"
+ ---
+
+ **Note on Dataset Identifiers:** The `datasets` metadata field of this model card lists `tasal9/pashto_base_bloom`. That identifier may refer to an earlier version or a different collection of Pashto data. The training run behind this model update (June 2025) exclusively used the locally processed `datasets/base_pashto_clean`, as described under "Training Details (Cleaned Base Model - June 2025)" below.
+
+ # ZamAI Bloom Pashto - checkpoint5207 (and Final Model)
+
+ This model card covers `checkpoint5207` and the final fine-tuned version of a Bloom model for Pashto text generation, developed under the ZamAI Bloom project.
+
+ ## Model Description
+
+ This model is a fine-tuned version of [bigscience/bloom-560m](https://huggingface.co/bigscience/bloom-560m) on a Pashto text corpus. The goal of this project was to create a language model proficient in generating coherent and contextually relevant Pashto text.
+
+ **Base Model:** `bigscience/bloom-560m`
+ **Fine-tuning Checkpoint:** `checkpoint5207`
+ **Final Model:** [tasal9/zamai-bloom-ps-final](https://huggingface.co/tasal9/zamai-bloom-ps-final)
+
+ ## Intended Uses & Limitations
+
+ ### Intended Uses
+
+ This model is intended for:
+ * Generating Pashto text.
+ * Assisting with Pashto-language content creation.
+ * Research in Pashto NLP.
+ * Educational purposes in Pashto language learning.
+
+ ### Limitations and Bias
+
+ * The model's performance depends on the quality and diversity of the training data; it may generate text that reflects biases present in that data.
+ * It may produce factually incorrect or nonsensical text, especially for complex topics or out-of-domain prompts.
+ * It may not be suitable for critical applications without further evaluation and mitigation of potential harms.
+ * Performance on specific Pashto dialects may vary with their representation in the training data.
+
+ ## How to use
+
+ You can use this model with the Hugging Face `transformers` library for text generation.
+
+ First, install the library:
+ ```bash
+ pip install transformers torch
+ ```
+
+ Then use the model in Python:
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "tasal9/zamai-bloom-ps-final"  # or a specific checkpoint identifier
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ prompt = "په پښتو ژبه کې یو شعر ولیکئ د پسرلي په اړه"  # "Write a poem in Pashto about spring"
+ inputs = tokenizer(prompt, return_tensors="pt")
+
+ # Generate text; adjust parameters as needed (max_length, num_beams, do_sample, top_k, top_p, ...).
+ outputs = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ print(generated_text)
+ ```
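
Beam search (as above) is deterministic and can be repetitive; a sampling-based call often gives more varied Pashto text. A minimal sketch, where every generation parameter value is an illustrative choice rather than an author-recommended setting:

```python
# Sampling-based generation; parameter values are illustrative, not official.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "tasal9/zamai-bloom-ps-final"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("پښتو ژبه", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=80,           # cap on newly generated tokens
    do_sample=True,              # sample instead of beam search
    top_p=0.9,                   # nucleus sampling
    temperature=0.8,
    repetition_penalty=1.2,      # discourages the repetition noted under Limitations
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```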
+
+ ## Training Data
+
+ Describe the dataset(s) used for fine-tuning.
+ * **Source:** [e.g., web-scraped data, specific Pashto corpora, data from `datasets/base_pashto/`]
+ * **Size:** [e.g., number of documents, tokens, GBs]
+ * **Preprocessing:** [e.g., cleaning steps, tokenization details]
+ * **Language Variety:** [e.g., predominant dialects, formal/informal text]
+
+ If your dataset is on the Hugging Face Hub, link to it.
+
+ ## Training Procedure
+
+ ### Preprocessing
+
+ The texts were tokenized using the `AutoTokenizer` associated with the base Bloom model, as sketched below.
+ [Add any other specific preprocessing steps you took.]
+
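
Since `prepare_base_dataset.py` is not part of this commit, the following is only a minimal sketch of a standard causal-LM preprocessing pipeline; the `"text"` column name, the `block_size`, and the assumption that `datasets/base_pashto_clean` loads as a single `Dataset` are all illustrative.

```python
# Hypothetical preprocessing sketch; see assumptions in the lead-in above.
from datasets import load_from_disk
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
block_size = 512  # assumed training context length

def tokenize(batch):
    return {"input_ids": tokenizer(batch["text"])["input_ids"]}

def group_texts(batch):
    # Concatenate all token ids, then split into fixed-size blocks
    # (the usual packing step for causal language modeling).
    concatenated = sum(batch["input_ids"], [])
    total = (len(concatenated) // block_size) * block_size
    blocks = [concatenated[i : i + block_size] for i in range(0, total, block_size)]
    return {"input_ids": blocks, "labels": [b[:] for b in blocks]}

ds = load_from_disk("datasets/base_pashto_clean")  # assumed: a Dataset with a "text" column
ds = ds.map(tokenize, batched=True, remove_columns=ds.column_names)
ds = ds.map(group_texts, batched=True)
```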
+ ### Fine-tuning
+
+ The model was fine-tuned using the Hugging Face `transformers` library with PyTorch; a minimal sketch of how these settings map onto `TrainingArguments` follows this list.
+ * **Training script:** [Link to your `train_base_model.py` if applicable]
+ * **Hyperparameters:**
+   * Learning rate: 2e-5
+   * Batch size: 4 (adjust for your GPU memory, e.g., 8 or 16)
+   * Number of epochs: 3 (adjust based on convergence and overfitting)
+   * Optimizer: AdamW
+   * Weight decay: 0.01
+   * Warmup steps: 500 (or a warmup ratio, e.g., 0.1)
+   * Gradient accumulation steps: 1 (increase if the achievable batch size is limited by memory)
+   * Seed: 42 (for reproducibility)
+ * **Infrastructure:**
+   * Hardware: [e.g., 1x NVIDIA A100 40GB, or specify your hardware]
+   * Training time: [e.g., X hours]
+
+ This model card refers to `checkpoint5207`, which was saved at step 5207 of the training process; the final model is the result after all training epochs/steps completed. The hyperparameters actually used for the June 2025 retraining run are listed under "Training Details (Cleaned Base Model - June 2025)" below.
+
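
The training script itself is not reproduced in this commit, so for orientation only, here is a minimal sketch of how the hyperparameters listed above map onto `transformers.TrainingArguments`; the output path is hypothetical, and AdamW is simply the `Trainer` default optimizer:

```python
from transformers import TrainingArguments

# Illustrative mapping of the hyperparameters listed above.
args = TrainingArguments(
    output_dir="models/pashto-bloom-base",  # hypothetical path
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    gradient_accumulation_steps=1,
    seed=42,
)
```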
+ ## Evaluation Results
+
+ Provide quantitative results if available (e.g., perplexity or BLEU scores on a held-out test set); a sketch for computing perplexity follows this list.
+ * **Test set:** [Describe your test set]
+ * **Metrics:** [e.g., perplexity, BLEU, ROUGE]
+ * **Results for checkpoint5207:**
+   * [Metric 1]: [Value]
+   * [Metric 2]: [Value]
+ * **Results for final model:**
+   * [Metric 1]: [Value]
+   * [Metric 2]: [Value]
+
+ Qualitative observations can also be included.
+
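
No official test set or scores are published for this model yet, so as a placeholder here is a minimal sketch of one common metric, perplexity from the average token loss over held-out text; the example sentence merely stands in for a real test set:

```python
# Perplexity sketch over a stand-in sample; replace `texts` with a held-out test set.
import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "tasal9/zamai-bloom-ps-final"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

texts = ["پښتو ژبه د افغانستان یوه ملي ژبه ده."]
losses = []
with torch.no_grad():
    for text in texts:
        enc = tokenizer(text, return_tensors="pt")
        # For causal LMs, passing labels=input_ids returns the mean token negative log-likelihood.
        out = model(**enc, labels=enc["input_ids"])
        losses.append(out.loss.item())

print("perplexity:", math.exp(sum(losses) / len(losses)))
```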
+ ## Model Card Contact
+
+ **Author:** Yaqoob Tasal
+ **Username:** tasal9
+ **Organization:** ZamAI
+ [GitHub: https://github.com/tasal9](https://github.com/tasal9)
+
+ ## Citation
+
+ If you use this model or its checkpoints, please consider citing:
+
+ ```bibtex
+ @misc{zamai_bloom_pashto_2025,
+   author       = {Yaqoob Tasal},
+   title        = {ZamAI Bloom Pashto - Fine-tuned Language Model},
+   year         = {2025},
+   publisher    = {Hugging Face},
+   journal      = {Hugging Face Model Hub},
+   howpublished = {\url{https://huggingface.co/tasal9/zamai-bloom-ps-final}}
+ }
+ ```
+
+ And the original Bloom model:
+ ```bibtex
+ @article{scao2022bloom,
+   title   = {BLOOM: A 176B-Parameter Open-Access Multilingual Language Model},
+   author  = {Le Scao, Teven and Fan, Angela and Akiki, Christopher and others},
+   journal = {arXiv preprint arXiv:2211.05100},
+   year    = {2022}
+ }
+ ```
+
+ ---
+
+ ## Training Details (Cleaned Base Model - June 2025)
+
+ This model version was trained from `bigscience/bloom-560m` using the `train_base_model.py` script.
+
+ - **Training Data:** The model was trained on a locally prepared dataset located at `datasets/base_pashto_clean`. This dataset was created with `prepare_base_dataset.py` and is derived from `pashto_data/base_model/cleaned_base_data.txt`, which primarily contains Pashto text from a bilingual Pashto-English glossary.
+ - **Training Objective:** To establish a foundational Pashto language model with improved coherence and fewer issues (e.g., repetition, off-language generation) than prior versions trained on noisier data.
+ - **Output Directory (during training):** `models/pashto-bloom-base-clean-colab`
+ - **Key Training Hyperparameters** (a sketch using these settings follows this list):
+   - Epochs: 3
+   - Per-Device Batch Size: 2
+   - Gradient Accumulation Steps: 4 (effective batch size 8)
+   - Learning Rate: 5e-5
+   - FP16 (Mixed Precision): True
+   - Optimizer: AdamW
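
Putting the pieces together, here is a minimal end-to-end sketch under these settings; it assumes a pre-tokenized dataset as in the preprocessing sketch earlier, and the actual `train_base_model.py` may differ:

```python
# Minimal training sketch using the hyperparameters listed above; assumptions noted inline.
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")
train_ds = load_from_disk("datasets/base_pashto_clean")  # assumed already tokenized

args = TrainingArguments(
    output_dir="models/pashto-bloom-base-clean-colab",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size 8
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,  # matches the cadence seen in the trainer_state.json files below
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
trainer.save_model("final_model")
```

The saved folder can then be pushed to the Hub, roughly as this commit does; the repo id below is an assumption inferred from the commit message:

```python
from huggingface_hub import HfApi

HfApi().upload_folder(
    folder_path="final_model",
    repo_id="tasal9/pashto-base-bloom",  # assumed target repository
    repo_type="model",
    commit_message="Push retrained pashto-base-bloom (on base_pashto_clean)",
)
```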
checkpoint-2829/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "bigscience/bloom-560m",
+   "apply_residual_connection_post_layernorm": false,
+   "architectures": [
+     "BloomForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "attention_softmax_in_fp32": true,
+   "bias_dropout_fusion": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_dropout": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "masked_softmax_fusion": true,
+   "model_type": "bloom",
+   "n_head": 16,
+   "n_inner": null,
+   "n_layer": 24,
+   "offset_alibi": 100,
+   "pad_token_id": 3,
+   "pretraining_tp": 1,
+   "skip_bias_add": true,
+   "skip_bias_add_qkv": false,
+   "slow_but_exact": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "unk_token_id": 0,
+   "use_cache": true,
+   "vocab_size": 250680
+ }
checkpoint-2829/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 3,
+   "transformers_version": "4.48.3"
+ }
checkpoint-2829/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5da99209f92544cc5bb67435831c564d4561d5a31cae6e4c3339f91a68579c48
+ size 2236073104
checkpoint-2829/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b225d39b8c1adbc720b566e3fb3fb3a1ad81ec75eeb9ce2f14b2a577bf46f97f
+ size 4472324430
checkpoint-2829/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
+ size 14244
checkpoint-2829/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3415c1b6fabdb38d6655042b70f1c42c937fba17970b0f944c772dba14a441d
+ size 1064
checkpoint-2829/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "eos_token": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "unk_token": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
+ }
checkpoint-2829/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
+ size 21763085
checkpoint-2829/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "1": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "2": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "3": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "merges_file": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "tokenizer_class": "BloomTokenizer",
+   "unk_token": "<unk>",
+   "vocab_file": null
+ }
checkpoint-2829/trainer_state.json ADDED
@@ -0,0 +1,237 @@
+ {
+   "best_metric": Infinity,
+   "best_model_checkpoint": null,
+   "epoch": 0.9997349589186324,
+   "eval_steps": 500,
+   "global_step": 2829,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03533881084901493, "grad_norm": 130.5043487548828, "learning_rate": 5.4181389870435815e-06, "loss": 5.2462, "step": 100},
+     {"epoch": 0.07067762169802987, "grad_norm": 121.2613754272461, "learning_rate": 1.1189634864546526e-05, "loss": 4.293, "step": 200},
+     {"epoch": 0.10601643254704479, "grad_norm": 28631.3515625, "learning_rate": 1.6961130742049473e-05, "loss": 4.2171, "step": 300},
+     {"epoch": 0.14135524339605973, "grad_norm": 2812505.5, "learning_rate": 2.220259128386337e-05, "loss": 6.2637, "step": 400},
+     {"epoch": 0.17669405424507464, "grad_norm": 416.1485290527344, "learning_rate": 2.7561837455830393e-05, "loss": 166.4242, "step": 500},
+     {"epoch": 0.21203286509408958, "grad_norm": 416.1195373535156, "learning_rate": 3.345111896348646e-05, "loss": 187.1228, "step": 600},
+     {"epoch": 0.24737167594310452, "grad_norm": 392.4357604980469, "learning_rate": 3.934040047114253e-05, "loss": 190.0541, "step": 700},
+     {"epoch": 0.28271048679211946, "grad_norm": 318.76153564453125, "learning_rate": 4.5229681978798585e-05, "loss": 150.7144, "step": 800},
+     {"epoch": 0.3180492976411344, "grad_norm": 407.822021484375, "learning_rate": 4.9875621890547264e-05, "loss": 138.6864, "step": 900},
+     {"epoch": 0.3533881084901493, "grad_norm": 382.1808166503906, "learning_rate": 4.922100026184866e-05, "loss": 131.8268, "step": 1000},
+     {"epoch": 0.38872691933916426, "grad_norm": 374.1636047363281, "learning_rate": 4.8566378633150036e-05, "loss": 118.8281, "step": 1100},
+     {"epoch": 0.42406573018817917, "grad_norm": 276.59124755859375, "learning_rate": 4.791175700445143e-05, "loss": 106.5782, "step": 1200},
+     {"epoch": 0.4594045410371941, "grad_norm": 285.5071105957031, "learning_rate": 4.7257135375752815e-05, "loss": 93.512, "step": 1300},
+     {"epoch": 0.49474335188620905, "grad_norm": 306.88946533203125, "learning_rate": 4.660251374705421e-05, "loss": 82.3966, "step": 1400},
+     {"epoch": 0.5300821627352239, "grad_norm": 259.3111877441406, "learning_rate": 4.594789211835559e-05, "loss": 82.1071, "step": 1500},
+     {"epoch": 0.5654209735842389, "grad_norm": 414.5076599121094, "learning_rate": 4.529327048965698e-05, "loss": 78.2349, "step": 1600},
+     {"epoch": 0.6007597844332538, "grad_norm": 249.02976989746094, "learning_rate": 4.463864886095837e-05, "loss": 65.9571, "step": 1700},
+     {"epoch": 0.6360985952822688, "grad_norm": 188.65126037597656, "learning_rate": 4.398402723225975e-05, "loss": 51.9814, "step": 1800},
+     {"epoch": 0.6714374061312837, "grad_norm": 208.6486358642578, "learning_rate": 4.3329405603561146e-05, "loss": 53.3651, "step": 1900},
+     {"epoch": 0.7067762169802986, "grad_norm": 349.70465087890625, "learning_rate": 4.267478397486253e-05, "loss": 58.8342, "step": 2000},
+     {"epoch": 0.7421150278293135, "grad_norm": 327.7672119140625, "learning_rate": 4.2020162346163925e-05, "loss": 75.0863, "step": 2100},
+     {"epoch": 0.7774538386783285, "grad_norm": 174.98016357421875, "learning_rate": 4.1365540717465304e-05, "loss": 81.5808, "step": 2200},
+     {"epoch": 0.8127926495273434, "grad_norm": 243.49595642089844, "learning_rate": 4.07109190887667e-05, "loss": 85.0316, "step": 2300},
+     {"epoch": 0.8481314603763583, "grad_norm": 193.5273895263672, "learning_rate": 4.005629746006808e-05, "loss": 88.9008, "step": 2400},
+     {"epoch": 0.8834702712253732, "grad_norm": 614.0827026367188, "learning_rate": 3.940167583136947e-05, "loss": 84.831, "step": 2500},
+     {"epoch": 0.9188090820743882, "grad_norm": 430.6079406738281, "learning_rate": 3.8747054202670855e-05, "loss": 71.7362, "step": 2600},
+     {"epoch": 0.9541478929234031, "grad_norm": 285.82318115234375, "learning_rate": 3.809243257397225e-05, "loss": 73.4898, "step": 2700},
+     {"epoch": 0.9894867037724181, "grad_norm": 328.9468078613281, "learning_rate": 3.7437810945273634e-05, "loss": 54.771, "step": 2800},
+     {"epoch": 0.9997349589186324, "eval_loss": NaN, "eval_runtime": 50.7142, "eval_samples_per_second": 49.611, "eval_steps_per_second": 24.806, "step": 2829}
+   ],
+   "logging_steps": 100,
+   "max_steps": 8487,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false},
+       "attributes": {}
+     }
+   },
+   "total_flos": 940609672003584.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
checkpoint-2829/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd754075914a8cc7aa6384a77012a824ab1c56880d743f05986a4069b1cd5fc5
+ size 5368
checkpoint-5658/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "bigscience/bloom-560m",
+   "apply_residual_connection_post_layernorm": false,
+   "architectures": [
+     "BloomForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "attention_softmax_in_fp32": true,
+   "bias_dropout_fusion": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_dropout": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "masked_softmax_fusion": true,
+   "model_type": "bloom",
+   "n_head": 16,
+   "n_inner": null,
+   "n_layer": 24,
+   "offset_alibi": 100,
+   "pad_token_id": 3,
+   "pretraining_tp": 1,
+   "skip_bias_add": true,
+   "skip_bias_add_qkv": false,
+   "slow_but_exact": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "unk_token_id": 0,
+   "use_cache": true,
+   "vocab_size": 250680
+ }
checkpoint-5658/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 3,
+   "transformers_version": "4.48.3"
+ }
checkpoint-5658/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:684787c615abc52dc81952c3975a1e7d4c07ac88fb56cdbd2e8bb74bd3578c56
+ size 2236073104
checkpoint-5658/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7713aac236be7294ccdc7091a6cc3d57dcef4ba927b71be99619d30ab22b4af7
+ size 4472324430
checkpoint-5658/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
+ size 14244
checkpoint-5658/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc70f87b398935da27e9c8e90d71bb6e5ed23c1ba54b99e2a8d5e44635ac52f3
+ size 1064
checkpoint-5658/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "eos_token": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "unk_token": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
+ }
checkpoint-5658/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
+ size 21763085
checkpoint-5658/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "1": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "2": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "3": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "merges_file": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "tokenizer_class": "BloomTokenizer",
+   "unk_token": "<unk>",
+   "vocab_file": null
+ }
checkpoint-5658/trainer_state.json ADDED
@@ -0,0 +1,441 @@
+ {
+   "best_metric": Infinity,
+   "best_model_checkpoint": null,
+   "epoch": 1.9997349589186324,
+   "eval_steps": 500,
+   "global_step": 5658,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03533881084901493, "grad_norm": 130.5043487548828, "learning_rate": 5.4181389870435815e-06, "loss": 5.2462, "step": 100},
+     {"epoch": 0.07067762169802987, "grad_norm": 121.2613754272461, "learning_rate": 1.1189634864546526e-05, "loss": 4.293, "step": 200},
+     {"epoch": 0.10601643254704479, "grad_norm": 28631.3515625, "learning_rate": 1.6961130742049473e-05, "loss": 4.2171, "step": 300},
+     {"epoch": 0.14135524339605973, "grad_norm": 2812505.5, "learning_rate": 2.220259128386337e-05, "loss": 6.2637, "step": 400},
+     {"epoch": 0.17669405424507464, "grad_norm": 416.1485290527344, "learning_rate": 2.7561837455830393e-05, "loss": 166.4242, "step": 500},
+     {"epoch": 0.21203286509408958, "grad_norm": 416.1195373535156, "learning_rate": 3.345111896348646e-05, "loss": 187.1228, "step": 600},
+     {"epoch": 0.24737167594310452, "grad_norm": 392.4357604980469, "learning_rate": 3.934040047114253e-05, "loss": 190.0541, "step": 700},
+     {"epoch": 0.28271048679211946, "grad_norm": 318.76153564453125, "learning_rate": 4.5229681978798585e-05, "loss": 150.7144, "step": 800},
+     {"epoch": 0.3180492976411344, "grad_norm": 407.822021484375, "learning_rate": 4.9875621890547264e-05, "loss": 138.6864, "step": 900},
+     {"epoch": 0.3533881084901493, "grad_norm": 382.1808166503906, "learning_rate": 4.922100026184866e-05, "loss": 131.8268, "step": 1000},
+     {"epoch": 0.38872691933916426, "grad_norm": 374.1636047363281, "learning_rate": 4.8566378633150036e-05, "loss": 118.8281, "step": 1100},
+     {"epoch": 0.42406573018817917, "grad_norm": 276.59124755859375, "learning_rate": 4.791175700445143e-05, "loss": 106.5782, "step": 1200},
+     {"epoch": 0.4594045410371941, "grad_norm": 285.5071105957031, "learning_rate": 4.7257135375752815e-05, "loss": 93.512, "step": 1300},
+     {"epoch": 0.49474335188620905, "grad_norm": 306.88946533203125, "learning_rate": 4.660251374705421e-05, "loss": 82.3966, "step": 1400},
+     {"epoch": 0.5300821627352239, "grad_norm": 259.3111877441406, "learning_rate": 4.594789211835559e-05, "loss": 82.1071, "step": 1500},
+     {"epoch": 0.5654209735842389, "grad_norm": 414.5076599121094, "learning_rate": 4.529327048965698e-05, "loss": 78.2349, "step": 1600},
+     {"epoch": 0.6007597844332538, "grad_norm": 249.02976989746094, "learning_rate": 4.463864886095837e-05, "loss": 65.9571, "step": 1700},
+     {"epoch": 0.6360985952822688, "grad_norm": 188.65126037597656, "learning_rate": 4.398402723225975e-05, "loss": 51.9814, "step": 1800},
+     {"epoch": 0.6714374061312837, "grad_norm": 208.6486358642578, "learning_rate": 4.3329405603561146e-05, "loss": 53.3651, "step": 1900},
+     {"epoch": 0.7067762169802986, "grad_norm": 349.70465087890625, "learning_rate": 4.267478397486253e-05, "loss": 58.8342, "step": 2000},
+     {"epoch": 0.7421150278293135, "grad_norm": 327.7672119140625, "learning_rate": 4.2020162346163925e-05, "loss": 75.0863, "step": 2100},
+     {"epoch": 0.7774538386783285, "grad_norm": 174.98016357421875, "learning_rate": 4.1365540717465304e-05, "loss": 81.5808, "step": 2200},
+     {"epoch": 0.8127926495273434, "grad_norm": 243.49595642089844, "learning_rate": 4.07109190887667e-05, "loss": 85.0316, "step": 2300},
+     {"epoch": 0.8481314603763583, "grad_norm": 193.5273895263672, "learning_rate": 4.005629746006808e-05, "loss": 88.9008, "step": 2400},
+     {"epoch": 0.8834702712253732, "grad_norm": 614.0827026367188, "learning_rate": 3.940167583136947e-05, "loss": 84.831, "step": 2500},
+     {"epoch": 0.9188090820743882, "grad_norm": 430.6079406738281, "learning_rate": 3.8747054202670855e-05, "loss": 71.7362, "step": 2600},
+     {"epoch": 0.9541478929234031, "grad_norm": 285.82318115234375, "learning_rate": 3.809243257397225e-05, "loss": 73.4898, "step": 2700},
+     {"epoch": 0.9894867037724181, "grad_norm": 328.9468078613281, "learning_rate": 3.7437810945273634e-05, "loss": 54.771, "step": 2800},
+     {"epoch": 0.9997349589186324, "eval_loss": NaN, "eval_runtime": 50.7142, "eval_samples_per_second": 49.611, "eval_steps_per_second": 24.806, "step": 2829},
+     {"epoch": 1.0250905557028005, "grad_norm": 308.40863037109375, "learning_rate": 3.678318931657502e-05, "loss": 51.9766, "step": 2900},
+     {"epoch": 1.0604293665518156, "grad_norm": 189.22195434570312, "learning_rate": 3.6128567687876407e-05, "loss": 44.7251, "step": 3000},
+     {"epoch": 1.0957681774008305, "grad_norm": 638.1983642578125, "learning_rate": 3.54739460591778e-05, "loss": 32.4377, "step": 3100},
+     {"epoch": 1.1311069882498455, "grad_norm": 273.1210632324219, "learning_rate": 3.4819324430479186e-05, "loss": 34.0672, "step": 3200},
+     {"epoch": 1.1664457990988604, "grad_norm": 509.01788330078125, "learning_rate": 3.416470280178057e-05, "loss": 35.6198, "step": 3300},
+     {"epoch": 1.2017846099478753, "grad_norm": 504.3890686035156, "learning_rate": 3.351008117308196e-05, "loss": 38.9046, "step": 3400},
+     {"epoch": 1.2371234207968902, "grad_norm": 236.5414581298828, "learning_rate": 3.285545954438335e-05, "loss": 36.6003, "step": 3500},
+     {"epoch": 1.272462231645905, "grad_norm": 220.0192108154297, "learning_rate": 3.220083791568474e-05, "loss": 45.3013, "step": 3600},
+     {"epoch": 1.30780104249492, "grad_norm": 343.69354248046875, "learning_rate": 3.154621628698612e-05, "loss": 47.3381, "step": 3700},
+     {"epoch": 1.343139853343935, "grad_norm": 286.2487487792969, "learning_rate": 3.089159465828751e-05, "loss": 56.1506, "step": 3800},
+     {"epoch": 1.3784786641929498, "grad_norm": 258.3203430175781, "learning_rate": 3.02369730295889e-05, "loss": 62.5717, "step": 3900},
+     {"epoch": 1.4138174750419648, "grad_norm": 291.66729736328125, "learning_rate": 2.9582351400890285e-05, "loss": 66.6682, "step": 4000},
+     {"epoch": 1.4491562858909797, "grad_norm": 287.23291015625, "learning_rate": 2.8927729772191674e-05, "loss": 73.0452, "step": 4100},
+     {"epoch": 1.4844950967399946, "grad_norm": 263.8997802734375, "learning_rate": 2.827310814349306e-05, "loss": 74.5742, "step": 4200},
+     {"epoch": 1.5198339075890095, "grad_norm": 725.5924072265625, "learning_rate": 2.761848651479445e-05, "loss": 75.3333, "step": 4300},
+     {"epoch": 1.5551727184380244, "grad_norm": 255.7268829345703, "learning_rate": 2.6963864886095836e-05, "loss": 71.4852, "step": 4400},
+     {"epoch": 1.5905115292870395, "grad_norm": 358.010498046875, "learning_rate": 2.6309243257397226e-05, "loss": 73.1212, "step": 4500},
+     {"epoch": 1.6258503401360545, "grad_norm": 382.2020568847656, "learning_rate": 2.565462162869861e-05, "loss": 77.8558, "step": 4600},
+     {"epoch": 1.6611891509850694, "grad_norm": 376.1300048828125, "learning_rate": 2.5e-05, "loss": 82.9754, "step": 4700},
+     {"epoch": 1.6965279618340843, "grad_norm": 270.1905822753906, "learning_rate": 2.4345378371301387e-05, "loss": 87.3897, "step": 4800},
+     {"epoch": 1.7318667726830992, "grad_norm": 263.4345397949219, "learning_rate": 2.3690756742602777e-05, "loss": 86.5547, "step": 4900},
+     {"epoch": 1.767205583532114, "grad_norm": 285.1637268066406, "learning_rate": 2.3036135113904163e-05, "loss": 90.7775, "step": 5000},
+     {"epoch": 1.8025443943811292, "grad_norm": 296.69482421875, "learning_rate": 2.2381513485205552e-05, "loss": 93.2774, "step": 5100},
+     {"epoch": 1.8378832052301441, "grad_norm": 319.2518005371094, "learning_rate": 2.172689185650694e-05, "loss": 92.5169, "step": 5200},
+     {"epoch": 1.873222016079159, "grad_norm": 354.99530029296875, "learning_rate": 2.1072270227808328e-05, "loss": 98.2755, "step": 5300},
+     {"epoch": 1.908560826928174, "grad_norm": 325.14776611328125, "learning_rate": 2.0417648599109714e-05, "loss": 97.8725, "step": 5400},
+     {"epoch": 1.9438996377771889, "grad_norm": 335.0797424316406, "learning_rate": 1.9763026970411104e-05, "loss": 99.2225, "step": 5500},
+     {"epoch": 1.9792384486262038, "grad_norm": 351.8290710449219, "learning_rate": 1.910840534171249e-05, "loss": 100.1383, "step": 5600},
+     {"epoch": 1.9997349589186324, "eval_loss": NaN, "eval_runtime": 51.2483, "eval_samples_per_second": 49.094, "eval_steps_per_second": 24.547, "step": 5658}
+   ],
+   "logging_steps": 100,
+   "max_steps": 8487,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false},
+       "attributes": {}
+     }
+   },
+   "total_flos": 1884306569183232.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
checkpoint-5658/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd754075914a8cc7aa6384a77012a824ab1c56880d743f05986a4069b1cd5fc5
+ size 5368
checkpoint-8487/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "bigscience/bloom-560m",
+   "apply_residual_connection_post_layernorm": false,
+   "architectures": [
+     "BloomForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "attention_softmax_in_fp32": true,
+   "bias_dropout_fusion": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_dropout": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "masked_softmax_fusion": true,
+   "model_type": "bloom",
+   "n_head": 16,
+   "n_inner": null,
+   "n_layer": 24,
+   "offset_alibi": 100,
+   "pad_token_id": 3,
+   "pretraining_tp": 1,
+   "skip_bias_add": true,
+   "skip_bias_add_qkv": false,
+   "slow_but_exact": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "unk_token_id": 0,
+   "use_cache": true,
+   "vocab_size": 250680
+ }
checkpoint-8487/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 3,
+   "transformers_version": "4.48.3"
+ }
checkpoint-8487/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c36d5ff990e34d7c8bcbf97484b8adf792d66f36af9917c3f54caac9cca43a3
+ size 2236073104
checkpoint-8487/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c579dd0782c64345c329cf6683ba3c19b524eb7b0232798fc32fa74f2929dd3
+ size 4472324430
checkpoint-8487/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
+ size 14244
checkpoint-8487/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15cb5f47cc10848063f533ef07ba2bb2229bb85a359eb996c350f893ab7840b0
+ size 1064
checkpoint-8487/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "eos_token": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "unk_token": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
+ }
checkpoint-8487/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
+ size 21763085
checkpoint-8487/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "1": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "2": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "3": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "merges_file": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "tokenizer_class": "BloomTokenizer",
+   "unk_token": "<unk>",
+   "vocab_file": null
+ }
checkpoint-8487/trainer_state.json ADDED
@@ -0,0 +1,645 @@
+ {
+   "best_metric": Infinity,
+   "best_model_checkpoint": null,
+   "epoch": 2.9997349589186326,
+   "eval_steps": 500,
+   "global_step": 8487,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03533881084901493, "grad_norm": 130.5043487548828, "learning_rate": 5.4181389870435815e-06, "loss": 5.2462, "step": 100},
+     {"epoch": 0.07067762169802987, "grad_norm": 121.2613754272461, "learning_rate": 1.1189634864546526e-05, "loss": 4.293, "step": 200},
+     {"epoch": 0.10601643254704479, "grad_norm": 28631.3515625, "learning_rate": 1.6961130742049473e-05, "loss": 4.2171, "step": 300},
+     {"epoch": 0.14135524339605973, "grad_norm": 2812505.5, "learning_rate": 2.220259128386337e-05, "loss": 6.2637, "step": 400},
+     {"epoch": 0.17669405424507464, "grad_norm": 416.1485290527344, "learning_rate": 2.7561837455830393e-05, "loss": 166.4242, "step": 500},
+     {"epoch": 0.21203286509408958, "grad_norm": 416.1195373535156, "learning_rate": 3.345111896348646e-05, "loss": 187.1228, "step": 600},
+     {"epoch": 0.24737167594310452, "grad_norm": 392.4357604980469, "learning_rate": 3.934040047114253e-05, "loss": 190.0541, "step": 700},
+     {"epoch": 0.28271048679211946, "grad_norm": 318.76153564453125, "learning_rate": 4.5229681978798585e-05, "loss": 150.7144, "step": 800},
+     {"epoch": 0.3180492976411344, "grad_norm": 407.822021484375, "learning_rate": 4.9875621890547264e-05, "loss": 138.6864, "step": 900},
+     {"epoch": 0.3533881084901493, "grad_norm": 382.1808166503906, "learning_rate": 4.922100026184866e-05, "loss": 131.8268, "step": 1000},
+     {"epoch": 0.38872691933916426, "grad_norm": 374.1636047363281, "learning_rate": 4.8566378633150036e-05, "loss": 118.8281, "step": 1100},
+     {"epoch": 0.42406573018817917, "grad_norm": 276.59124755859375, "learning_rate": 4.791175700445143e-05, "loss": 106.5782, "step": 1200},
+     {"epoch": 0.4594045410371941, "grad_norm": 285.5071105957031, "learning_rate": 4.7257135375752815e-05, "loss": 93.512, "step": 1300},
+     {"epoch": 0.49474335188620905, "grad_norm": 306.88946533203125, "learning_rate": 4.660251374705421e-05, "loss": 82.3966, "step": 1400},
+     {"epoch": 0.5300821627352239, "grad_norm": 259.3111877441406, "learning_rate": 4.594789211835559e-05, "loss": 82.1071, "step": 1500},
+     {"epoch": 0.5654209735842389, "grad_norm": 414.5076599121094, "learning_rate": 4.529327048965698e-05, "loss": 78.2349, "step": 1600},
+     {"epoch": 0.6007597844332538, "grad_norm": 249.02976989746094, "learning_rate": 4.463864886095837e-05, "loss": 65.9571, "step": 1700},
+     {"epoch": 0.6360985952822688, "grad_norm": 188.65126037597656, "learning_rate": 4.398402723225975e-05, "loss": 51.9814, "step": 1800},
+     {"epoch": 0.6714374061312837, "grad_norm": 208.6486358642578, "learning_rate": 4.3329405603561146e-05, "loss": 53.3651, "step": 1900},
+     {"epoch": 0.7067762169802986, "grad_norm": 349.70465087890625, "learning_rate": 4.267478397486253e-05, "loss": 58.8342, "step": 2000},
+     {"epoch": 0.7421150278293135, "grad_norm": 327.7672119140625, "learning_rate": 4.2020162346163925e-05, "loss": 75.0863, "step": 2100},
+     {"epoch": 0.7774538386783285, "grad_norm": 174.98016357421875, "learning_rate": 4.1365540717465304e-05, "loss": 81.5808, "step": 2200},
+     {"epoch": 0.8127926495273434, "grad_norm": 243.49595642089844, "learning_rate": 4.07109190887667e-05, "loss": 85.0316, "step": 2300},
+     {"epoch": 0.8481314603763583, "grad_norm": 193.5273895263672, "learning_rate": 4.005629746006808e-05, "loss": 88.9008, "step": 2400},
+     {"epoch": 0.8834702712253732, "grad_norm": 614.0827026367188, "learning_rate": 3.940167583136947e-05, "loss": 84.831, "step": 2500},
+     {"epoch": 0.9188090820743882, "grad_norm": 430.6079406738281, "learning_rate": 3.8747054202670855e-05, "loss": 71.7362, "step": 2600},
+     {"epoch": 0.9541478929234031, "grad_norm": 285.82318115234375, "learning_rate": 3.809243257397225e-05, "loss": 73.4898, "step": 2700},
+     {"epoch": 0.9894867037724181, "grad_norm": 328.9468078613281, "learning_rate": 3.7437810945273634e-05, "loss": 54.771, "step": 2800},
+     {"epoch": 0.9997349589186324, "eval_loss": NaN, "eval_runtime": 50.7142, "eval_samples_per_second": 49.611, "eval_steps_per_second": 24.806, "step": 2829},
+     {"epoch": 1.0250905557028005, "grad_norm": 308.40863037109375, "learning_rate": 3.678318931657502e-05, "loss": 51.9766, "step": 2900},
+     {"epoch": 1.0604293665518156, "grad_norm": 189.22195434570312, "learning_rate": 3.6128567687876407e-05, "loss": 44.7251, "step": 3000},
+     {"epoch": 1.0957681774008305, "grad_norm": 638.1983642578125, "learning_rate": 3.54739460591778e-05, "loss": 32.4377, "step": 3100},
+     {"epoch": 1.1311069882498455, "grad_norm": 273.1210632324219, "learning_rate": 3.4819324430479186e-05, "loss": 34.0672, "step": 3200},
+     {"epoch": 1.1664457990988604, "grad_norm": 509.01788330078125, "learning_rate": 3.416470280178057e-05, "loss": 35.6198, "step": 3300},
+     {"epoch": 1.2017846099478753, "grad_norm": 504.3890686035156, "learning_rate": 3.351008117308196e-05, "loss": 38.9046, "step": 3400},
+     {"epoch": 1.2371234207968902, "grad_norm": 236.5414581298828, "learning_rate": 3.285545954438335e-05, "loss": 36.6003, "step": 3500},
+     {"epoch": 1.272462231645905, "grad_norm": 220.0192108154297, "learning_rate": 3.220083791568474e-05, "loss": 45.3013, "step": 3600},
+     {"epoch": 1.30780104249492, "grad_norm": 343.69354248046875, "learning_rate": 3.154621628698612e-05, "loss": 47.3381, "step": 3700},
+     {"epoch": 1.343139853343935, "grad_norm": 286.2487487792969, "learning_rate": 3.089159465828751e-05, "loss": 56.1506, "step": 3800},
+     {"epoch": 1.3784786641929498, "grad_norm": 258.3203430175781, "learning_rate": 3.02369730295889e-05, "loss": 62.5717, "step": 3900},
+     {"epoch": 1.4138174750419648, "grad_norm": 291.66729736328125, "learning_rate": 2.9582351400890285e-05, "loss": 66.6682, "step": 4000},
+     {"epoch": 1.4491562858909797, "grad_norm": 287.23291015625, "learning_rate": 2.8927729772191674e-05, "loss": 73.0452, "step": 4100},
+     {"epoch": 1.4844950967399946, "grad_norm": 263.8997802734375, "learning_rate": 2.827310814349306e-05, "loss": 74.5742, "step": 4200},
+     {"epoch": 1.5198339075890095, "grad_norm": 725.5924072265625, "learning_rate": 2.761848651479445e-05, "loss": 75.3333, "step": 4300},
+     {"epoch": 1.5551727184380244, "grad_norm": 255.7268829345703, "learning_rate": 2.6963864886095836e-05, "loss": 71.4852, "step": 4400},
+     {"epoch": 1.5905115292870395, "grad_norm": 358.010498046875, "learning_rate": 2.6309243257397226e-05, "loss": 73.1212, "step": 4500},
+     {"epoch": 1.6258503401360545, "grad_norm": 382.2020568847656, "learning_rate": 2.565462162869861e-05, "loss": 77.8558, "step": 4600},
+     {"epoch": 1.6611891509850694, "grad_norm": 376.1300048828125, "learning_rate": 2.5e-05, "loss": 82.9754, "step": 4700},
+     {"epoch": 1.6965279618340843, "grad_norm": 270.1905822753906, "learning_rate": 2.4345378371301387e-05, "loss": 87.3897, "step": 4800},
+     {"epoch": 1.7318667726830992, "grad_norm": 263.4345397949219, "learning_rate": 2.3690756742602777e-05, "loss": 86.5547, "step": 4900},
+     {"epoch": 1.767205583532114, "grad_norm": 285.1637268066406, "learning_rate": 2.3036135113904163e-05, "loss": 90.7775, "step": 5000},
+     {"epoch": 1.8025443943811292, "grad_norm": 296.69482421875, "learning_rate": 2.2381513485205552e-05, "loss": 93.2774, "step": 5100},
+     {"epoch": 1.8378832052301441, "grad_norm": 319.2518005371094, "learning_rate": 2.172689185650694e-05, "loss": 92.5169, "step": 5200},
+     {"epoch": 1.873222016079159, "grad_norm": 354.99530029296875, "learning_rate": 2.1072270227808328e-05, "loss": 98.2755, "step": 5300},
+     {"epoch": 1.908560826928174, "grad_norm": 325.14776611328125, "learning_rate": 2.0417648599109714e-05, "loss": 97.8725, "step": 5400},
+     {"epoch": 1.9438996377771889, "grad_norm": 335.0797424316406, "learning_rate": 1.9763026970411104e-05, "loss": 99.2225, "step": 5500},
+     {"epoch": 1.9792384486262038, "grad_norm": 351.8290710449219, "learning_rate": 1.910840534171249e-05, "loss": 100.1383, "step": 5600},
+     {"epoch": 1.9997349589186324, "eval_loss": NaN, "eval_runtime": 51.2483, "eval_samples_per_second": 49.094, "eval_steps_per_second": 24.547, "step": 5658},
+     {"epoch": 2.014842300556586, "grad_norm": 339.35479736328125, "learning_rate": 1.845378371301388e-05, "loss": 101.0501, "step": 5700},
+     {"epoch": 2.050181111405601, "grad_norm": 337.0163269042969, "learning_rate": 1.7799162084315266e-05, "loss": 106.9505, "step": 5800},
+     {"epoch": 2.0855199222546164, "grad_norm": 295.1744689941406, "learning_rate": 1.714454045561665e-05, "loss": 103.8797, "step": 5900},
+     {"epoch": 2.1208587331036313, "grad_norm": 313.05487060546875, "learning_rate": 1.648991882691804e-05, "loss": 104.9367, "step": 6000},
+     {"epoch": 2.156197543952646, "grad_norm": 455.3573913574219, "learning_rate": 1.5835297198219427e-05, "loss": 108.2618, "step": 6100},
+     {"epoch": 2.191536354801661, "grad_norm": 413.2171325683594, "learning_rate": 1.5180675569520817e-05, "loss": 108.1512, "step": 6200},
+     {"epoch": 2.226875165650676, "grad_norm": 412.13140869140625, "learning_rate": 1.4526053940822206e-05, "loss": 111.6234, "step": 6300},
+     {"epoch": 2.262213976499691, "grad_norm": 637.2862548828125, "learning_rate": 1.3871432312123594e-05, "loss": 111.8395, "step": 6400},
+     {"epoch": 2.297552787348706, "grad_norm": 289.26751708984375, "learning_rate": 1.3216810683424982e-05, "loss": 113.3466, "step": 6500},
+     {"epoch": 2.3328915981977207, "grad_norm": 295.8336181640625, "learning_rate": 1.256218905472637e-05, "loss": 105.6647, "step": 6600},
+     {"epoch": 2.3682304090467357, "grad_norm": 263.1067810058594, "learning_rate": 1.1907567426027756e-05, "loss": 105.089, "step": 6700},
+     {"epoch": 2.4035692198957506, "grad_norm": 263.7178649902344, "learning_rate": 1.1252945797329144e-05, "loss": 106.2189, "step": 6800},
+     {"epoch": 2.4389080307447655, "grad_norm": 281.2179260253906, "learning_rate": 1.0598324168630532e-05, "loss": 98.9815, "step": 6900},
+     {"epoch": 2.4742468415937804, "grad_norm": 270.7872619628906, "learning_rate": 9.94370253993192e-06, "loss": 97.342, "step": 7000},
+     {"epoch": 2.5095856524427953, "grad_norm": 500.3921813964844, "learning_rate": 9.289080911233309e-06, "loss": 98.0968, "step": 7100},
+     {"epoch": 2.54492446329181, "grad_norm": 322.947021484375, "learning_rate": 8.634459282534697e-06, "loss": 98.3004, "step": 7200},
+     {"epoch": 2.580263274140825, "grad_norm": 253.83358764648438, "learning_rate": 7.979837653836083e-06, "loss": 97.1224, "step": 7300},
+     {"epoch": 2.61560208498984, "grad_norm": 770.7826538085938, "learning_rate": 7.3252160251374715e-06, "loss": 89.2073, "step": 7400},
+     {"epoch": 2.650940895838855, "grad_norm": 497.59124755859375, "learning_rate": 6.670594396438859e-06, "loss": 96.602, "step": 7500},
+     {"epoch": 2.68627970668787, "grad_norm": 257.338623046875, "learning_rate": 6.015972767740246e-06, "loss": 98.7739, "step": 7600},
+     {"epoch": 2.7216185175368848, "grad_norm": 276.27337646484375, "learning_rate": 5.361351139041634e-06, "loss": 93.0737, "step": 7700},
+     {"epoch": 2.7569573283858997, "grad_norm": 301.7321472167969, "learning_rate": 4.706729510343022e-06, "loss": 93.7306, "step": 7800},
+     {"epoch": 2.7922961392349146, "grad_norm": 265.5256652832031, "learning_rate": 4.05210788164441e-06, "loss": 92.7555, "step": 7900},
+     {"epoch": 2.8276349500839295, "grad_norm": 425.70501708984375, "learning_rate": 3.397486252945797e-06, "loss": 95.7403, "step": 8000},
+     {"epoch": 2.8629737609329444, "grad_norm": 254.2684326171875, "learning_rate": 2.7428646242471854e-06, "loss": 94.279, "step": 8100},
+     {"epoch": 2.8983125717819593, "grad_norm": 282.39202880859375, "learning_rate": 2.0882429955485732e-06, "loss": 96.5359, "step": 8200},
+     {"epoch": 2.9336513826309742, "grad_norm": 313.8695068359375, "learning_rate": 1.4336213668499608e-06, "loss": 97.4498, "step": 8300},
+     {"epoch": 2.968990193479989, "grad_norm": 234.9430389404297, "learning_rate": 7.789997381513486e-07, "loss": 95.4695, "step": 8400},
+     {"epoch": 2.9997349589186326, "eval_loss": NaN, "eval_runtime": 50.8651, "eval_samples_per_second": 49.464, "eval_steps_per_second": 24.732, "step": 8487}
+   ],
+   "logging_steps": 100,
+   "max_steps": 8487,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true},
+       "attributes": {}
+     }
+   },
+   "total_flos": 2826570500505600.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
+ }
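A note on the log above: training loss stays in the 90–113 range across epochs 2–3, while `eval_loss` is recorded as `NaN` at both epoch-boundary evaluations (steps 5658 and 8487), which usually points to an overflow or masking issue in the eval loop rather than a corrupt checkpoint. A minimal sketch for inspecting this state file offline, assuming the checkpoint layout pushed in this commit (`log_history` is the standard `transformers` Trainer schema):

```python
import json
import math

# trainer_state.json contains bare NaN literals; Python's json module
# parses these as float('nan') by default.
with open("checkpoint-8487/trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"last train loss: {train_logs[-1]['loss']} at step {train_logs[-1]['step']}")
for e in eval_logs:
    status = "NaN!" if math.isnan(e["eval_loss"]) else f"{e['eval_loss']:.4f}"
    print(f"step {e['step']}: eval_loss = {status}")
```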
checkpoint-8487/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd754075914a8cc7aa6384a77012a824ab1c56880d743f05986a4069b1cd5fc5
+ size 5368
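The `.bin` and `.safetensors` entries in this commit are Git LFS pointer files (spec version, SHA-256 oid, byte size), not the binaries themselves; a clone without `git lfs pull` yields only these three-line stubs. A hedged sketch for fetching a single weight file via `huggingface_hub` — the repo id is assumed from this commit's context, so substitute the actual repository name:

```python
from huggingface_hub import hf_hub_download

# Resolves the LFS pointer and downloads the real binary to the local cache.
# repo_id is an assumption based on this commit; adjust as needed.
local_path = hf_hub_download(
    repo_id="tasal9/pashto-base-bloom",
    filename="final_model/model.safetensors",
)
print(local_path)
```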
final_model/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "bigscience/bloom-560m",
+ "apply_residual_connection_post_layernorm": false,
+ "architectures": [
+ "BloomForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "attention_softmax_in_fp32": true,
+ "bias_dropout_fusion": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_dropout": 0.0,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "masked_softmax_fusion": true,
+ "model_type": "bloom",
+ "n_head": 16,
+ "n_inner": null,
+ "n_layer": 24,
+ "offset_alibi": 100,
+ "pad_token_id": 3,
+ "pretraining_tp": 1,
+ "skip_bias_add": true,
+ "skip_bias_add_qkv": false,
+ "slow_but_exact": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.48.3",
+ "unk_token_id": 0,
+ "use_cache": true,
+ "vocab_size": 250680
+ }
final_model/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 3,
+ "transformers_version": "4.48.3"
+ }
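Together, `config.json` and `generation_config.json` give `transformers` everything needed to reload the model with the correct special-token ids (bos=1, eos=2, pad=3). A minimal usage sketch, assuming `final_model/` has been pulled locally with its LFS weights (the prompt string is just an illustrative Pashto greeting):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "final_model"  # local path within this repository

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)
model.eval()

inputs = tokenizer("سلام", return_tensors="pt")  # example Pashto prompt
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```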
final_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c36d5ff990e34d7c8bcbf97484b8adf792d66f36af9917c3f54caac9cca43a3
+ size 2236073104
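The pointer size doubles as a sanity check: the config above declares `torch_dtype: float32` (4 bytes per parameter), so 2,236,073,104 bytes corresponds to roughly 559M parameters, consistent with the `bigscience/bloom-560m` base architecture; the small remainder is the safetensors header. A one-line check:

```python
size_bytes = 2_236_073_104  # from the LFS pointer above
print(f"~{size_bytes / 4 / 1e6:.0f}M float32 parameters")  # ~559M
```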
final_model/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
final_model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
+ size 21763085
final_model/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "merges_file": null,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "left",
+ "tokenizer_class": "BloomTokenizer",
+ "unk_token": "<unk>",
+ "vocab_file": null
+ }
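One setting worth flagging: `"padding_side": "left"`. For a decoder-only model like BLOOM, left padding keeps the last token of every row adjacent to the generated continuation, which is what batched `generate` expects; right padding would leave `<pad>` tokens between prompt and output. A small sketch under the same local-directory assumption:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("final_model")
assert tokenizer.padding_side == "left"  # set in tokenizer_config.json above

# Shorter rows are padded on the left, so every row ends on real content.
batch = tokenizer(["سلام", "سلام ورور"], padding=True, return_tensors="pt")
print(batch["input_ids"])  # pad id 3 appears at the start of the shorter row
```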
final_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd754075914a8cc7aa6384a77012a824ab1c56880d743f05986a4069b1cd5fc5
+ size 5368
logs/events.out.tfevents.1749891083.51d9157369b8.14130.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bde15f0588c68d6346d96578059866fa45524410d2d895af4899f4330158ab07
+ size 24135
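Finally, the pushed event file can be viewed with `tensorboard --logdir logs` or read programmatically. A sketch using TensorBoard's event accumulator; scalar tag names depend on how the Trainer logged them, so `"train/loss"` below is an assumption to be checked against the printed tag list:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

ea = EventAccumulator("logs")  # directory holding the events.out.tfevents file
ea.Reload()

print(ea.Tags()["scalars"])             # list the available scalar tags first
for event in ea.Scalars("train/loss"):  # assumed tag; pick one from the list above
    print(event.step, event.value)
```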