JonusNattapong committed
Commit 4c7ef82 · verified · 1 Parent(s): a6342ea

End of training
README.md ADDED
@@ -0,0 +1,57 @@
+ ---
+ library_name: transformers
+ license: mit
+ base_model: JonusNattapong/gptoss-mini-thaichat
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: gptoss-mini-reasoning
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # gptoss-mini-reasoning
+
+ This model is a fine-tuned version of [JonusNattapong/gptoss-mini-thaichat](https://huggingface.co/JonusNattapong/gptoss-mini-thaichat) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 2
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 16
+ - optimizer: ADAMW_TORCH_FUSED with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 200
+ - num_epochs: 3
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - Transformers 4.57.0.dev0
+ - Pytorch 2.8.0+cu126
+ - Datasets 4.0.0
+ - Tokenizers 0.22.0
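
For context, here is a minimal loading sketch. The repo id `JonusNattapong/gptoss-mini-reasoning` is assumed from the model-index name above, and `trust_remote_code=True` is needed because the repository ships its own configuration and modeling files.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed repo id, taken from the model-index name; adjust if the Hub path differs.
repo_id = "JonusNattapong/gptoss-mini-reasoning"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
# trust_remote_code is required because the repo defines a custom config/model class.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("สวัสดีครับ", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```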
configuration_gptoss_mini.py ADDED
@@ -0,0 +1,56 @@
+ from transformers import PretrainedConfig
+
+
+ class GPTMiniConfig(PretrainedConfig):
+     model_type = "gptoss-mini"
+
+     attribute_map = {
+         "num_experts": "num_experts",
+         "top_k": "top_k",
+         "num_hidden_layers": "num_layers"
+     }
+
+     def __init__(
+         self,
+         vocab_size=50000,
+         hidden_size=768,
+         num_layers=6,
+         num_heads=8,
+         num_experts=4,
+         top_k=2,
+         max_position_embeddings=512,
+         intermediate_size=3072,
+         eos_token_id=None,
+         bos_token_id=None,
+         pad_token_id=None,
+         **kwargs
+     ):
+         if top_k > num_experts:
+             raise ValueError(
+                 f"top_k ({top_k}) cannot be greater than num_experts ({num_experts})"
+             )
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs
+         )
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.num_experts = num_experts
+         self.top_k = top_k
+         self.max_position_embeddings = max_position_embeddings
+         self.intermediate_size = intermediate_size
+
+         self.num_hidden_layers = num_layers
+
+     def to_dict(self):
+         output = super().to_dict()
+         output["num_experts"] = self.num_experts
+         output["top_k"] = self.top_k
+         output["num_hidden_layers"] = self.num_layers
+         return output
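
As a quick illustration of the config contract above (a sketch that assumes `configuration_gptoss_mini.py` is importable from the working directory), instantiation mirrors `num_layers` into `num_hidden_layers` and rejects `top_k` values larger than `num_experts`:

```python
from configuration_gptoss_mini import GPTMiniConfig  # assumes the file is on the Python path

config = GPTMiniConfig()            # defaults: 6 layers, 4 experts, top_k=2
print(config.num_hidden_layers)     # 6, resolved via attribute_map to num_layers
print(config.to_dict()["top_k"])    # 2, re-exported by the custom to_dict()

# top_k larger than num_experts fails fast at construction time.
try:
    GPTMiniConfig(num_experts=4, top_k=8)
except ValueError as err:
    print(err)                      # "top_k (8) cannot be greater than num_experts (4)"
```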
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 2,
+   "do_sample": true,
+   "eos_token_id": [
+     3,
+     2
+   ],
+   "max_length": 512,
+   "pad_token_id": 0,
+   "temperature": 0.7,
+   "top_p": 0.9,
+   "transformers_version": "4.57.0.dev0"
+ }
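
The defaults above (sampling with temperature 0.7 and top-p 0.9, up to 512 tokens) are picked up automatically by `generate()`. The sketch below shows the equivalent settings made explicit; the repo id is again assumed from the model-index name.

```python
from transformers import GenerationConfig

# Assumed repo id; the JSON above is what from_pretrained() would download.
gen_config = GenerationConfig.from_pretrained("JonusNattapong/gptoss-mini-reasoning")
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)  # True 0.7 0.9

# Equivalent explicit arguments to generate(), assuming `model` and `inputs`
# from the loading example in the README:
# outputs = model.generate(**inputs, do_sample=True, temperature=0.7, top_p=0.9, max_length=512)
```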
modeling_gptoss_mini.py ADDED
@@ -0,0 +1,161 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from transformers import PreTrainedModel, GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+ from .configuration_gptoss_mini import GPTMiniConfig
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, d, eps=1e-6):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(d))
+         self.eps = eps
+
+     def forward(self, x):
+         norm = x.norm(dim=-1, keepdim=True) * (1.0 / math.sqrt(x.size(-1)))
+         return self.weight * x / (norm + self.eps)
+
+
+ class SwiGLU(nn.Module):
+     def __init__(self, d_model, d_ff):
+         super().__init__()
+         self.w1 = nn.Linear(d_model, d_ff)
+         self.w2 = nn.Linear(d_model, d_ff)
+
+     def forward(self, x):
+         return F.silu(self.w1(x)) * self.w2(x)
+
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, config: GPTMiniConfig):
+         super().__init__()
+         self.qkv = nn.Linear(config.hidden_size, 3 * config.hidden_size)
+         self.o_proj = nn.Linear(config.hidden_size, config.hidden_size)
+         self.num_heads = config.num_heads
+         self.head_dim = config.hidden_size // config.num_heads
+
+     def forward(self, x):
+         B, T, C = x.shape
+         qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim)
+         q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
+
+         # Move heads ahead of the sequence dimension so attention mixes
+         # information across positions: (B, T, H, D) -> (B, H, T, D).
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+         attn = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+         # Causal mask: each position may only attend to itself and earlier positions.
+         causal = torch.triu(torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1)
+         attn = attn.masked_fill(causal, float("-inf"))
+         attn = F.softmax(attn, dim=-1)
+         out = attn @ v
+         out = out.transpose(1, 2).reshape(B, T, C)
+         return self.o_proj(out)
+
+
+ class MoE(nn.Module):
+     def __init__(self, config: GPTMiniConfig):
+         super().__init__()
+         if config.top_k > config.num_experts:
+             raise ValueError(
+                 f"top_k ({config.top_k}) cannot be greater than num_experts ({config.num_experts})"
+             )
+
+         self.experts = nn.ModuleList(
+             [SwiGLU(config.hidden_size, config.intermediate_size) for _ in range(config.num_experts)]
+         )
+         self.gate = nn.Linear(config.hidden_size, config.num_experts)
+         self.top_k = config.top_k
+         self.num_experts = config.num_experts
+
+     def forward(self, x):
+         B, T, C = x.shape
+         scores = F.softmax(self.gate(x), dim=-1)
+
+         current_top_k = min(self.top_k, self.num_experts)
+         topk_scores, topk_idx = torch.topk(scores, current_top_k, dim=-1)
+
+         expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
+         topk_idx_expanded = topk_idx.unsqueeze(-1).expand(-1, -1, -1, C)
+         selected_expert_outputs = torch.gather(expert_outputs, dim=2, index=topk_idx_expanded)
+         topk_scores_expanded = topk_scores.unsqueeze(-1).expand(-1, -1, -1, C)
+         weighted_expert_outputs = selected_expert_outputs * topk_scores_expanded
+         output = torch.sum(weighted_expert_outputs, dim=2)
+
+         return output
+
+
+ class Block(nn.Module):
+     def __init__(self, config: GPTMiniConfig):
+         super().__init__()
+         self.ln1 = RMSNorm(config.hidden_size)
+         self.attn = MultiHeadAttention(config)
+         self.ln2 = RMSNorm(config.hidden_size)
+         self.moe = MoE(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln1(x))
+         x = x + self.moe(self.ln2(x))
+         return x
+
+
+ class GPTMiniForCausalLM(PreTrainedModel, GenerationMixin):
+     config_class = GPTMiniConfig
+
+     def __init__(self, config: GPTMiniConfig):
+         super().__init__(config)
+         self.embed = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.pos_embed = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+         self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
+         self.ln_f = RMSNorm(config.hidden_size)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embed
+
+     def set_input_embeddings(self, new_embeddings):
+         self.embed = new_embeddings
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def tie_weights(self):
+         self._tie_or_clone_weights(self.lm_head, self.embed)
+
+     def forward(
+         self,
+         input_ids,
+         labels=None,
+         attention_mask=None,
+         token_type_ids=None,
+         past_key_values=None,
+         use_cache: bool = False,
+         cache_position=None,
+         **kwargs
+     ):
+         B, T = input_ids.shape
+         pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
+         x = self.embed(input_ids) + self.pos_embed(pos)
+
+         for block in self.blocks:
+             x = block(x)
+
+         x = self.ln_f(x)
+         logits = self.lm_head(x)
+
+         loss = None
+         if labels is not None:
+             # Standard causal-LM objective: each position predicts the next token.
+             shift_logits = logits[:, :-1, :].contiguous()
+             shift_labels = labels[:, 1:].contiguous()
+             loss = F.cross_entropy(
+                 shift_logits.view(-1, shift_logits.size(-1)),
+                 shift_labels.view(-1),
+                 ignore_index=-100
+             )
+
+         return CausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=logits,
+             past_key_values=past_key_values if use_cache else None,
+             hidden_states=None,
+             attentions=None,
+         )
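
To close the loop, a small local smoke test of the classes added above. It assumes both files sit in a local package named `gptoss_mini` (so the relative import inside `modeling_gptoss_mini.py` resolves) and registers them with the Auto factories under the `gptoss-mini` model type.

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Assumed layout: both files live in a local package, e.g. gptoss_mini/ with an
# __init__.py, so the relative import in modeling_gptoss_mini.py works.
from gptoss_mini.configuration_gptoss_mini import GPTMiniConfig
from gptoss_mini.modeling_gptoss_mini import GPTMiniForCausalLM

# Register the custom classes so AutoConfig/AutoModelForCausalLM can resolve
# the "gptoss-mini" model_type without trust_remote_code.
AutoConfig.register("gptoss-mini", GPTMiniConfig)
AutoModelForCausalLM.register(GPTMiniConfig, GPTMiniForCausalLM)

config = GPTMiniConfig()                      # defaults: 6 layers, 4 experts, top_k=2
model = GPTMiniForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))
out = model(input_ids, labels=input_ids)
print(out.logits.shape)                       # torch.Size([1, 16, 50000])
print(out.loss)                               # scalar LM loss on the random batch
```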