KotshinZ committed · Commit ae28c65 · verified · 1 Parent(s): a13a531

Model save

PreTrainedRMTConfig.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import json
+ from transformers import PretrainedConfig
+
+ class PreTrainedRMTConfig(PretrainedConfig):
+     """
+     Configuration class for the Recurrent Memory Transformer.
+     """
+
+     model_type = "rmt"
+
+     # Add mapping info (links the config class to the model class)
+     auto_map = {
+         "AutoModelForCausalLM": "open_r1.rmt.RecurrentMemoryTransofomer.RecurrentMemoryTransformer"
+     }
+
+     def __init__(
+         self,
+         base_model_config=None,
+         is_memory_all=True,
+         max_n_segments=1,
+         input_seg_len=512,
+         output_seg_len=512,
+         align="left",
+         num_mem_tokens=10,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.base_model_config = base_model_config
+         self.is_memory_all = is_memory_all
+         self.max_n_segments = max_n_segments
+         self.input_seg_len = input_seg_len
+         self.output_seg_len = output_seg_len
+         self.align = align
+         self.num_mem_tokens = num_mem_tokens
+
+         if base_model_config is not None:
+             if not isinstance(base_model_config, dict):
+                 dict_config: dict = base_model_config.to_dict()
+             else:
+                 dict_config: dict = base_model_config
+
+             for key, value in dict_config.items():
+                 setattr(self, key, value)
+             self.base_model_type = dict_config.get("model_type")
+             if self.base_model_type is None:
+                 raise ValueError("model_type is not specified in base_model_config.")
+             PreTrainedRMTConfig.model_type = "rmt_" + self.base_model_type
+     """
+     def __repr__(self):
+         return f"PreTrainedRMTConfig(is_memory_all={self.is_memory_all}, max_n_segments={self.max_n_segments}, " \
+                f"input_seg_len={self.input_seg_len}, output_seg_len={self.output_seg_len}, " \
+                f"align='{self.align}', num_mem_tokens={self.num_mem_tokens})"
+     """
+
+ PreTrainedRMTConfig.register_for_auto_class()
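A minimal usage sketch for the config class above (not part of the commit; it assumes a local copy of `PreTrainedRMTConfig.py` is importable, and the parameter values mirror the `config.json` saved in this commit):

```python
# Hypothetical sketch: wrapping a GPT-2 config in PreTrainedRMTConfig.
from transformers import AutoConfig

from PreTrainedRMTConfig import PreTrainedRMTConfig  # assumes a local copy of this file

base_config = AutoConfig.from_pretrained("openai-community/gpt2")

# The config copies every field of the base config onto itself, records
# base_model_type, and rewrites model_type to "rmt_" + the base type.
rmt_config = PreTrainedRMTConfig(
    base_model_config=base_config,  # a PretrainedConfig or a plain dict
    max_n_segments=2,               # values mirroring config.json below
    input_seg_len=512,
    output_seg_len=512,
    num_mem_tokens=10,
)
print(rmt_config.base_model_type)      # -> "gpt2"
print(PreTrainedRMTConfig.model_type)  # -> "rmt_gpt2"
```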
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ base_model: KotshinZ/gpt2-RMT-7-mem512
+ library_name: transformers
+ model_name: gpt2-RMT-8-mem512
+ tags:
+ - generated_from_trainer
+ - trl
+ - sft
+ licence: license
+ ---
+
+ # Model Card for gpt2-RMT-8-mem512
+
+ This model is a fine-tuned version of [KotshinZ/gpt2-RMT-7-mem512](https://huggingface.co/KotshinZ/gpt2-RMT-7-mem512).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="KotshinZ/gpt2-RMT-8-mem512", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shin2021001-osaka-city-university/huggingface/runs/zgmj6bcp)
+
+
+ This model was trained with SFT.
+
+ ### Framework versions
+
+ - TRL: 0.15.2
+ - Transformers: 4.50.0.dev0
+ - Pytorch: 2.5.1
+ - Datasets: 3.3.2
+ - Tokenizers: 0.21.0
+
+ ## Citations
+
+
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title        = {{TRL: Transformer Reinforcement Learning}},
+     author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year         = 2020,
+     journal      = {GitHub repository},
+     publisher    = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
RecurrentMemoryTransofomer.py ADDED
@@ -0,0 +1,171 @@
+ import torch
+ from transformers import PreTrainedModel, AutoModelForCausalLM, AutoConfig
+ from transformers.models.auto.auto_factory import _BaseAutoModelClass
+ from open_r1.rmt.MemoryCell import MemoryCell
+ from open_r1.rmt.RecurrentWrapper import RecurrentWrapper
+ from open_r1.rmt.PreTrainedRMTConfig import PreTrainedRMTConfig
+
+
+ # @register_for_auto_class("AutoModelForCausalLM")
+ class RecurrentMemoryTransformer(PreTrainedModel):
+     """
+     Recurrent Memory Transformer model class.
+     A transformer that processes long contexts segment by segment and uses memory tokens to carry information across segments.
+     """
+
+     config_class = PreTrainedRMTConfig
+     auto_model_class = "AutoModelForCausalLM"
+
+     # Define the mapping so the Auto classes can find the right model
+     _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+     # Define AUTO_MAP (mapping from model name to class)
+     AUTO_MAP = {
+         "AutoModelForCausalLM": "RecurrentMemoryTransformer",
+     }
+
+     def __init__(self, config, base_model=None):
+         """
+         Initialization.
+
+         Parameters
+         ----------
+         config : PreTrainedRMTConfig
+             Model configuration.
+         base_model : PreTrainedModel, optional
+             Underlying transformer model.
+         """
+         super().__init__(config)
+
+         # If base_model is not given, build it automatically from the config
+         if base_model is None:
+             # Check the base model type
+             if not hasattr(config, "base_model_type"):
+                 raise ValueError("base_model_type is not set on the config; an RMT config requires a base model type.")
+             base_model_type = config.base_model_type
+
+             # Create the configuration for the base model
+             base_config = AutoConfig.from_pretrained(base_model_type)
+
+             # Build the base model config, excluding RMT-specific parameters
+             rmt_specific_params = ['model_type', 'is_memory_all', 'max_n_segments', 'input_seg_len',
+                                    'output_seg_len', 'align', 'num_mem_tokens', 'base_model_type']
+             for key, value in config.__dict__.items():
+                 if key not in rmt_specific_params and not key.startswith('_'):
+                     setattr(base_config, key, value)
+
+             # Instantiate the base model
+             base_model = AutoModelForCausalLM.from_config(base_config)
+
+         # Initialize the MemoryCell and the RecurrentWrapper
+         memory_cell = MemoryCell(base_model, config.num_mem_tokens)
+         self.recurrent_wrapper = RecurrentWrapper(
+             memory_cell=memory_cell,
+             is_memory_all=config.is_memory_all,
+             max_n_segments=config.max_n_segments,
+             input_seg_len=config.input_seg_len,
+             output_seg_len=config.output_seg_len,
+             align=config.align
+         )
+
+     def get_base_model(self):
+         """
+         Return the base model.
+         """
+         return self.recurrent_wrapper.memory_cell.model
+
+     def forward(self, input_ids=None, attention_mask=None, labels=None, labels_mask=None,
+                 inputs_embeds=None, output_attentions=None, output_hidden_states=None):
+         """
+         Forward pass of the model.
+
+         Parameters
+         ----------
+         input_ids : torch.Tensor, optional
+             Input tensor.
+         attention_mask : torch.Tensor, optional
+             Attention mask.
+         labels : torch.Tensor, optional
+             Label tensor.
+         labels_mask : torch.Tensor, optional
+             Label mask.
+         inputs_embeds : torch.Tensor, optional
+             Input embeddings.
+         output_attentions : bool, optional
+             Whether to output attention weights.
+         output_hidden_states : bool, optional
+             Whether to output hidden states.
+         """
+         forward_kwargs = {}
+         if input_ids is not None:
+             forward_kwargs["input_ids"] = input_ids
+         if labels is not None:
+             forward_kwargs["labels"] = labels
+         if attention_mask is not None:
+             forward_kwargs["attention_mask"] = attention_mask
+         if labels_mask is not None:
+             forward_kwargs["labels_mask"] = labels_mask
+         if inputs_embeds is not None:
+             forward_kwargs["inputs_embeds"] = inputs_embeds
+         if output_attentions is not None:
+             forward_kwargs["output_attentions"] = output_attentions
+         if output_hidden_states is not None:
+             forward_kwargs["output_hidden_states"] = output_hidden_states
+
+         # forward_kwargs.update(kwargs)
+
+         # Standard forward pass
+         out = self.recurrent_wrapper.forward(**forward_kwargs)
+         """
+         # Debug output removed (or comment out as needed)
+         # print(out["loss"])
+
+         # Divide the loss by the world size so it is not double-counted in distributed runs.
+         # Unnecessary if already handled, so this could also be controlled via an environment variable.
+         if torch.distributed.is_initialized() and "loss" in out and out["loss"] is not None:
+             # DeepSpeed may already handle this, so verification is needed
+             # Added temporarily for testing (adjust to the actual environment)
+             # world_size = torch.distributed.get_world_size()
+             # out["loss"] = out["loss"] / world_size
+             pass
+         """
+         return out
+
+     def generate(self, **kwargs):
+         """
+         Text generation.
+         """
+         return self.recurrent_wrapper.generate(**kwargs)
+
+     def generate_with_tokenizer(self, tokenizer, input_text, **kwargs):
+         """
+         Text generation using a tokenizer.
+         """
+         return self.recurrent_wrapper.generate_with_tokenizer(tokenizer, input_text, **kwargs)
+
+     def get_input_embeddings(self):
+         """
+         Return the input embeddings.
+         """
+         return self.get_base_model().get_input_embeddings()
+
+     def set_input_embeddings(self, embeddings):
+         """
+         Set the input embeddings.
+         """
+         self.get_base_model().set_input_embeddings(embeddings)
+
+     def get_output_embeddings(self):
+         """
+         Return the output embeddings.
+         """
+         return self.get_base_model().get_output_embeddings()
+
+     def resize_token_embeddings(self, new_num_tokens):
+         """
+         Resize the token embeddings.
+         """
+         self.get_base_model().resize_token_embeddings(new_num_tokens)
+         return self.get_input_embeddings()
+
+ RecurrentMemoryTransformer.register_for_auto_class("AutoModelForCausalLM")
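Both classes register themselves for the Auto API, so the checkpoint is presumably meant to be loaded with `trust_remote_code`. A hedged loading sketch (the repo id comes from the README; whether remote-code resolution succeeds with this commit's `config.json` is an assumption):

```python
# Hypothetical sketch, assuming the remote-code auto-class registration resolves.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "KotshinZ/gpt2-RMT-8-mem512", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("KotshinZ/gpt2-RMT-8-mem512")

# With max_n_segments=2 and input_seg_len=512 (see config.json below), inputs
# longer than one segment are split by the RecurrentWrapper, and
# num_mem_tokens=10 memory tokens carry state between segments instead of
# full cross-segment attention.
inputs = tokenizer("A long document ...", return_tensors="pt")
out = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
```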
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 5426859755962368.0,
+   "train_loss": 3.1510416666666665,
+   "train_runtime": 387.2622,
+   "train_samples": 19883,
+   "train_samples_per_second": 13.394,
+   "train_steps_per_second": 0.418
+ }
config.json ADDED
@@ -0,0 +1,143 @@
+ {
+   "activation_function": "gelu_new",
+   "align": "left",
+   "architectures": [
+     "RecurrentMemoryTransformer"
+   ],
+   "attn_pdrop": 0.1,
+   "base_model_config": {
+     "_attn_implementation_autoset": true,
+     "_name_or_path": "openai-community/gpt2",
+     "activation_function": "gelu_new",
+     "add_cross_attention": false,
+     "architectures": [
+       "GPT2LMHeadModel"
+     ],
+     "attn_pdrop": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 50256,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "embd_pdrop": 0.1,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 50256,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_epsilon": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "gpt2",
+     "n_ctx": 1024,
+     "n_embd": 768,
+     "n_head": 12,
+     "n_inner": null,
+     "n_layer": 12,
+     "n_positions": 1024,
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "reorder_and_upcast_attn": false,
+     "repetition_penalty": 1.0,
+     "resid_pdrop": 0.1,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_attn_by_inverse_layer_idx": false,
+     "scale_attn_weights": true,
+     "sep_token_id": null,
+     "summary_activation": null,
+     "summary_first_dropout": 0.1,
+     "summary_proj_to_labels": true,
+     "summary_type": "cls_index",
+     "summary_use_proj": true,
+     "suppress_tokens": null,
+     "task_specific_params": {
+       "text-generation": {
+         "do_sample": true,
+         "max_length": 50
+       }
+     },
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": false,
+     "vocab_size": 50257
+   },
+   "base_model_type": "gpt2",
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "initializer_range": 0.02,
+   "input_seg_len": 512,
+   "is_memory_all": false,
+   "layer_norm_epsilon": 1e-05,
+   "max_n_segments": 2,
+   "memory_size": 512,
+   "model_type": "rmt_gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "num_mem_tokens": 10,
+   "output_seg_len": 512,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.50.0.dev0",
+   "use_cache": false,
+   "vocab_size": 50257
+ }
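Note that `config.json` stores the GPT-2 settings twice: nested under `base_model_config` and flattened onto the top level, a side effect of the `setattr` loop in `PreTrainedRMTConfig.__init__`. A sketch of rebuilding the config from this file (hypothetical; assumes a local copy of the class):

```python
import json

from PreTrainedRMTConfig import PreTrainedRMTConfig  # local copy from this commit

# Rebuild the config from the saved JSON; keys that are not explicit
# parameters land on the instance via PretrainedConfig's generic kwargs.
with open("config.json") as f:
    raw = json.load(f)

cfg = PreTrainedRMTConfig(**raw)
print(cfg.base_model_type)  # -> "gpt2"
print(cfg.max_n_segments)   # -> 2
print(cfg.num_mem_tokens)   # -> 10
```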
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43aa3154a3a0507399f2b63925dbc848879161df546b22ae0e149c43b6ed5434
+ size 248915448
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|endoftext|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 5426859755962368.0,
+   "train_loss": 3.1510416666666665,
+   "train_runtime": 387.2622,
+   "train_samples": 19883,
+   "train_samples_per_second": 13.394,
+   "train_steps_per_second": 0.418
+ }
trainer_state.json ADDED
@@ -0,0 +1,178 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9969230769230769,
+   "eval_steps": 100,
+   "global_step": 162,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.06153846153846154,
+       "grad_norm": 0.39951827848719723,
+       "learning_rate": 1.1764705882352942e-05,
+       "loss": 3.1582,
+       "mean_token_accuracy": 0.400910259783268,
+       "step": 10
+     },
+     {
+       "epoch": 0.12307692307692308,
+       "grad_norm": 0.3995616499609791,
+       "learning_rate": 1.9978883431348845e-05,
+       "loss": 3.1668,
+       "mean_token_accuracy": 0.39937195032835004,
+       "step": 20
+     },
+     {
+       "epoch": 0.18461538461538463,
+       "grad_norm": 0.4102168990934247,
+       "learning_rate": 1.9605953553832987e-05,
+       "loss": 3.1387,
+       "mean_token_accuracy": 0.4016988441348076,
+       "step": 30
+     },
+     {
+       "epoch": 0.24615384615384617,
+       "grad_norm": 0.40391230141561146,
+       "learning_rate": 1.8783859964390466e-05,
+       "loss": 3.1551,
+       "mean_token_accuracy": 0.3992693811655045,
+       "step": 40
+     },
+     {
+       "epoch": 0.3076923076923077,
+       "grad_norm": 0.3961653949779693,
+       "learning_rate": 1.755104284557221e-05,
+       "loss": 3.1457,
+       "mean_token_accuracy": 0.40172072798013686,
+       "step": 50
+     },
+     {
+       "epoch": 0.36923076923076925,
+       "grad_norm": 0.41939623098641426,
+       "learning_rate": 1.5965147355676344e-05,
+       "loss": 3.1492,
+       "mean_token_accuracy": 0.4015924736857414,
+       "step": 60
+     },
+     {
+       "epoch": 0.4307692307692308,
+       "grad_norm": 0.3945808975159374,
+       "learning_rate": 1.4100328205214161e-05,
+       "loss": 3.1387,
+       "mean_token_accuracy": 0.4018391355872154,
+       "step": 70
+     },
+     {
+       "epoch": 0.49230769230769234,
+       "grad_norm": 0.37735963614306256,
+       "learning_rate": 1.204378226506365e-05,
+       "loss": 3.1488,
+       "mean_token_accuracy": 0.4004943951964378,
+       "step": 80
+     },
+     {
+       "epoch": 0.5538461538461539,
+       "grad_norm": 0.4114554888279876,
+       "learning_rate": 9.891671337699603e-06,
+       "loss": 3.1559,
+       "mean_token_accuracy": 0.4003040686249733,
+       "step": 90
+     },
+     {
+       "epoch": 0.6153846153846154,
+       "grad_norm": 0.41103454513473137,
+       "learning_rate": 7.74462573818606e-06,
+       "loss": 3.1543,
+       "mean_token_accuracy": 0.40160409063100816,
+       "step": 100
+     },
+     {
+       "epoch": 0.6153846153846154,
+       "eval_runtime": 0.4543,
+       "eval_samples_per_second": 52.823,
+       "eval_steps_per_second": 4.402,
+       "step": 100
+     },
+     {
+       "epoch": 0.676923076923077,
+       "grad_norm": 0.38569131648987776,
+       "learning_rate": 5.7030389324864845e-06,
+       "loss": 3.1824,
+       "mean_token_accuracy": 0.397557806968689,
+       "step": 110
+     },
+     {
+       "epoch": 0.7384615384615385,
+       "grad_norm": 0.4255553103146562,
+       "learning_rate": 3.862373250574626e-06,
+       "loss": 3.1828,
+       "mean_token_accuracy": 0.3972348183393478,
+       "step": 120
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 0.432279104045337,
+       "learning_rate": 2.308696173983711e-06,
+       "loss": 3.1387,
+       "mean_token_accuracy": 0.40221917182207106,
+       "step": 130
+     },
+     {
+       "epoch": 0.8615384615384616,
+       "grad_norm": 0.4143971545965478,
+       "learning_rate": 1.1146559160270875e-06,
+       "loss": 3.1297,
+       "mean_token_accuracy": 0.4037084937095642,
+       "step": 140
+     },
+     {
+       "epoch": 0.9230769230769231,
+       "grad_norm": 0.41227809966322443,
+       "learning_rate": 3.360844720863765e-07,
+       "loss": 3.1383,
+       "mean_token_accuracy": 0.40221751779317855,
+       "step": 150
+     },
+     {
+       "epoch": 0.9846153846153847,
+       "grad_norm": 0.4212135535860477,
+       "learning_rate": 9.38697756023288e-09,
+       "loss": 3.1223,
+       "mean_token_accuracy": 0.4041751459240913,
+       "step": 160
+     },
+     {
+       "epoch": 0.9969230769230769,
+       "mean_token_accuracy": 0.3931814655661583,
+       "step": 162,
+       "total_flos": 5426859755962368.0,
+       "train_loss": 3.1510416666666665,
+       "train_runtime": 387.2622,
+       "train_samples_per_second": 13.394,
+       "train_steps_per_second": 0.418
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 162,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": false,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 5426859755962368.0,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64aa7be91907887688fd3f87d1956ad836561b1853152b26a33b9fff2b072cbd
+ size 7352
vocab.json ADDED
The diff for this file is too large to render. See raw diff