Đào Quốc Tuấn committed on
Upload folder using huggingface_hub
Browse files
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/config.json +39 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/generation_config.json +6 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/lr_scheduler.pt +3 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/merges.txt +0 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/model.safetensors +3 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/optimizer.pt +3 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/special_tokens_map.json +6 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/tokenizer.json +0 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/tokenizer_config.json +21 -0
- experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/vocab.json +0 -0
- experiments/sft_gpt2-120m/20251209_070038/sft_gpt2-120m.log +43 -0
- experiments/sft_gpt2-120m/20251209_070038/sft_gpt2-120m_metrics.jsonl +0 -0
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"embd_pdrop": 0.1,
|
| 10 |
+
"eos_token_id": 50256,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"pad_token_id": 50256,
|
| 21 |
+
"reorder_and_upcast_attn": false,
|
| 22 |
+
"resid_pdrop": 0.1,
|
| 23 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 24 |
+
"scale_attn_weights": true,
|
| 25 |
+
"summary_activation": null,
|
| 26 |
+
"summary_first_dropout": 0.1,
|
| 27 |
+
"summary_proj_to_labels": true,
|
| 28 |
+
"summary_type": "cls_index",
|
| 29 |
+
"summary_use_proj": true,
|
| 30 |
+
"task_specific_params": {
|
| 31 |
+
"text-generation": {
|
| 32 |
+
"do_sample": true,
|
| 33 |
+
"max_length": 50
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"transformers_version": "4.56.0",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.56.0"
|
| 6 |
+
}
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/lr_scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c35da3e5bbde9e797a5c5553120ff59acd38a9f4e960ebb6476bd15a03d1fb22
|
| 3 |
+
size 1483
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:162a58da14b121f4491e1ea2d1c28087c3567295adc65936ff5a899e69a2a2e6
|
| 3 |
+
size 497774208
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84960f14491685b331e700ed0df0b85ac091809ab265b536600a1d0d04cbda6b
|
| 3 |
+
size 995642763
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
experiments/sft_gpt2-120m/20251209_070038/checkpoints/epoch_16/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251209_070038/sft_gpt2-120m.log
CHANGED
|
@@ -854,3 +854,46 @@
|
|
| 854 |
2025-12-09 07:55:56,448 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 855 |
2025-12-09 07:55:57,754 - root - [32m[1mINFO[0m - Step 23169/28600 train rougeL: 0.979368932038835
|
| 856 |
2025-12-09 07:55:57,810 - root - [32m[1mINFO[0m - Step 23169/28600 loss: 0.017524462193250656, nll_loss: 0.017524462193250656
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
2025-12-09 07:55:56,448 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 855 |
2025-12-09 07:55:57,754 - root - [32m[1mINFO[0m - Step 23169/28600 train rougeL: 0.979368932038835
|
| 856 |
2025-12-09 07:55:57,810 - root - [32m[1mINFO[0m - Step 23169/28600 loss: 0.017524462193250656, nll_loss: 0.017524462193250656
|
| 857 |
+
2025-12-09 07:56:08,854 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 858 |
+
2025-12-09 07:56:11,919 - root - [32m[1mINFO[0m - Step 23297/28600 train rougeL: 0.9888107126313195
|
| 859 |
+
2025-12-09 07:56:11,982 - root - [32m[1mINFO[0m - Step 23297/28600 loss: 0.030149059370160103, nll_loss: 0.030149059370160103
|
| 860 |
+
2025-12-09 07:56:23,052 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 861 |
+
2025-12-09 07:56:25,326 - root - [32m[1mINFO[0m - Step 23425/28600 train rougeL: 0.9094240541000018
|
| 862 |
+
2025-12-09 07:56:25,389 - root - [32m[1mINFO[0m - Step 23425/28600 loss: 0.031148415058851242, nll_loss: 0.031148415058851242
|
| 863 |
+
2025-12-09 07:56:36,454 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 864 |
+
2025-12-09 07:56:37,184 - root - [32m[1mINFO[0m - Step 23553/28600 train rougeL: 1.0
|
| 865 |
+
2025-12-09 07:56:37,240 - root - [32m[1mINFO[0m - Step 23553/28600 loss: 0.04822136089205742, nll_loss: 0.04822136089205742
|
| 866 |
+
2025-12-09 07:56:48,299 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 867 |
+
2025-12-09 07:56:51,400 - root - [32m[1mINFO[0m - Step 23681/28600 train rougeL: 0.8622660301159286
|
| 868 |
+
2025-12-09 07:56:51,464 - root - [32m[1mINFO[0m - Step 23681/28600 loss: 0.034108687192201614, nll_loss: 0.034108687192201614
|
| 869 |
+
2025-12-09 07:57:02,536 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 870 |
+
2025-12-09 07:57:04,043 - root - [32m[1mINFO[0m - Step 23809/28600 train rougeL: 1.0
|
| 871 |
+
2025-12-09 07:57:04,100 - root - [32m[1mINFO[0m - Step 23809/28600 loss: 0.007645273581147194, nll_loss: 0.007645273581147194
|
| 872 |
+
2025-12-09 07:57:15,162 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 873 |
+
2025-12-09 07:57:18,286 - root - [32m[1mINFO[0m - Step 23937/28600 train rougeL: 0.9782634446813552
|
| 874 |
+
2025-12-09 07:57:18,349 - root - [32m[1mINFO[0m - Step 23937/28600 loss: 0.021661849692463875, nll_loss: 0.021661849692463875
|
| 875 |
+
2025-12-09 07:57:29,412 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 876 |
+
2025-12-09 07:57:31,099 - root - [32m[1mINFO[0m - Step 24065/28600 train rougeL: 0.9977272727272727
|
| 877 |
+
2025-12-09 07:57:31,155 - root - [32m[1mINFO[0m - Step 24065/28600 loss: 0.022607143968343735, nll_loss: 0.022607143968343735
|
| 878 |
+
2025-12-09 07:57:42,219 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 879 |
+
2025-12-09 07:57:45,239 - root - [32m[1mINFO[0m - Step 24193/28600 train rougeL: 0.9979804421768708
|
| 880 |
+
2025-12-09 07:57:45,302 - root - [32m[1mINFO[0m - Step 24193/28600 loss: 0.03091508336365223, nll_loss: 0.03091508336365223
|
| 881 |
+
2025-12-09 07:57:55,424 - root - [32m[1mINFO[0m - Epoch 17/20 finished
|
| 882 |
+
2025-12-09 07:57:55,453 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 883 |
+
2025-12-09 07:57:58,824 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 884 |
+
2025-12-09 07:58:01,735 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 885 |
+
2025-12-09 07:58:05,059 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 886 |
+
2025-12-09 07:58:07,981 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 887 |
+
2025-12-09 07:58:11,667 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 888 |
+
2025-12-09 07:58:14,930 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 889 |
+
2025-12-09 07:58:18,235 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 890 |
+
2025-12-09 07:58:21,610 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 891 |
+
2025-12-09 07:58:24,956 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 892 |
+
2025-12-09 07:58:28,280 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 893 |
+
2025-12-09 07:58:31,603 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 894 |
+
2025-12-09 07:58:34,971 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 895 |
+
2025-12-09 07:58:38,325 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 896 |
+
2025-12-09 07:58:41,470 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 897 |
+
2025-12-09 07:58:44,248 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 898 |
+
2025-12-09 07:58:46,987 - root - [32m[1mINFO[0m - Epoch 17/20 eval rougeL: 0.23964140298317876
|
| 899 |
+
2025-12-09 07:58:49,024 - root - [32m[1mINFO[0m - Early stopping triggered at epoch 17
|
experiments/sft_gpt2-120m/20251209_070038/sft_gpt2-120m_metrics.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|