sds
Browse files
model.py
CHANGED
|
@@ -107,16 +107,55 @@ class Block(nn.Module):
|
|
| 107 |
x = x + self.mlp(self.ln_2(x))
|
| 108 |
return x
|
| 109 |
|
|
|
|
| 110 |
@dataclass
|
| 111 |
class GPTConfig(PretrainedConfig):
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
class GPT(PreTrainedModel):
|
| 121 |
|
| 122 |
def __init__(self, config):
|
|
|
|
| 107 |
x = x + self.mlp(self.ln_2(x))
|
| 108 |
return x
|
| 109 |
|
| 110 |
+
|
| 111 |
class GPTConfig(PretrainedConfig):
    """Configuration class for the GPT model.

    Combines transformers-style settings (``hidden_size``,
    ``num_attention_heads``, ...) with nanoGPT-style settings
    (``n_layer``, ``n_head``, ``n_embd``, ...) and a few training
    hyper-parameters (``batch_size``, ``max_iters``, ...).

    NOTE(review): the original code decorated this class with
    ``@dataclass`` even though it has no annotated class-level fields and
    defines ``__init__`` by hand. ``dataclass`` would still inject
    ``__repr__``/``__eq__``, shadowing the ones inherited from
    ``PretrainedConfig`` that transformers uses to render/compare
    configs. The decorator is removed here as a vestige of the original
    nanoGPT dataclass-based config.
    """

    def __init__(self,
                 model_type="GPT",
                 auto_map=None,
                 hidden_size=4,
                 num_attention_heads=2,
                 num_hidden_layers=2,
                 hidden_dropout_prob=0.1,
                 batch_size=60,
                 max_iters=200,
                 eval_interval=100,
                 learning_rate=0.001,
                 device="cpu",
                 block_size: int = 1024,
                 vocab_size: int = 50304,  # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
                 n_layer: int = 12,
                 n_head: int = 12,
                 n_embd: int = 768,
                 dropout: float = 0.0,
                 bias: bool = True,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
                 **kwargs) -> None:
        """Build the config.

        Args:
            model_type: Model identifier string stored on the instance.
            auto_map: Mapping from transformers Auto* class names to the
                local implementation classes. Defaults to the standard
                ``model.GPTConfig`` / ``model.GPT`` mapping when ``None``.
            hidden_size / num_attention_heads / num_hidden_layers /
                hidden_dropout_prob: transformers-style model dimensions.
            batch_size / max_iters / eval_interval / learning_rate /
                device: training loop hyper-parameters.
            block_size: Maximum sequence length (context window).
            vocab_size: Token vocabulary size.
            n_layer / n_head / n_embd: nanoGPT-style model dimensions.
            dropout: Dropout probability.
            bias: Whether Linear/LayerNorm layers carry a bias term.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.
        """
        super().__init__(**kwargs)
        # Fix: the original used a mutable dict literal as the default
        # value for ``auto_map``, which is shared across every instance
        # created with the default. Build a fresh dict per instance.
        if auto_map is None:
            auto_map = {
                "AutoConfig": "model.GPTConfig",
                "AutoModel": "model.GPT",
                "AutoModelForCausalLM": "model.GPT",
                "AutoModelForQuestionAnswering": "model.GPT",
            }
        self.model_type = model_type
        self.auto_map = auto_map
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_dropout_prob = hidden_dropout_prob
        self.block_size = block_size
        self.batch_size = batch_size
        self.max_iters = max_iters
        self.eval_interval = eval_interval
        self.learning_rate = learning_rate
        self.device = device
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias
|
| 158 |
+
|
| 159 |
class GPT(PreTrainedModel):
|
| 160 |
|
| 161 |
def __init__(self, config):
|