aframson committed on
Commit
39c6477
·
1 Parent(s): 1224b7d
Files changed (1) hide show
  1. model.py +47 -8
model.py CHANGED
@@ -107,16 +107,55 @@ class Block(nn.Module):
107
  x = x + self.mlp(self.ln_2(x))
108
  return x
109
 
 
110
  @dataclass
111
  class GPTConfig(PretrainedConfig):
112
- block_size: int = 1024
113
- vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
114
- n_layer: int = 12
115
- n_head: int = 12
116
- n_embd: int = 768
117
- dropout: float = 0.0
118
- bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
119
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  class GPT(PreTrainedModel):
121
 
122
  def __init__(self, config):
 
107
  x = x + self.mlp(self.ln_2(x))
108
  return x
109
 
110
+
111
# NOTE(review): @dataclass does not regenerate __init__ here (one is defined
# below), but it does inject __repr__/__eq__ into this class, shadowing the
# PretrainedConfig versions — confirm that is intended, otherwise drop it.
@dataclass
class GPTConfig(PretrainedConfig):
    """Configuration for the GPT model (Hugging Face ``PretrainedConfig`` subclass).

    Bundles architecture hyperparameters (``n_layer``, ``n_head``, ``n_embd``,
    ``block_size``, ``vocab_size``, ``dropout``, ``bias``) together with
    training/runtime settings (``batch_size``, ``max_iters``, ``eval_interval``,
    ``learning_rate``, ``device``). Extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    def __init__(self,
                 model_type="GPT",
                 # BUG FIX: the original used a mutable dict literal as the
                 # default for `auto_map`; that single dict was shared by every
                 # instance, so mutating one config's auto_map silently changed
                 # all of them. Use None as the sentinel and build a fresh dict
                 # per instance below.
                 auto_map=None,
                 hidden_size=4,
                 num_attention_heads=2,
                 num_hidden_layers=2,
                 hidden_dropout_prob=0.1,
                 batch_size=60,
                 max_iters=200,
                 eval_interval=100,
                 learning_rate=0.001,
                 device="cpu",
                 block_size: int = 1024,
                 vocab_size: int = 50304,  # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
                 n_layer: int = 12,
                 n_head: int = 12,
                 n_embd: int = 768,
                 dropout: float = 0.0,
                 bias: bool = True,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
                 **kwargs
                 ) -> None:
        super().__init__(**kwargs)
        if auto_map is None:
            # Maps the HF Auto* factories to the classes defined in model.py,
            # so `trust_remote_code=True` loading can resolve them.
            auto_map = {
                "AutoConfig": "model.GPTConfig",
                "AutoModel": "model.GPT",
                "AutoModelForCausalLM": "model.GPT",
                "AutoModelForQuestionAnswering": "model.GPT"
            }
        self.model_type = model_type
        self.auto_map = auto_map
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_dropout_prob = hidden_dropout_prob
        self.block_size = block_size
        self.batch_size = batch_size
        self.max_iters = max_iters
        self.eval_interval = eval_interval
        self.learning_rate = learning_rate
        self.device = device
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias
159
  class GPT(PreTrainedModel):
160
 
161
  def __init__(self, config):