Spaces:
Paused
Paused
Update train.py
Browse files
train.py
CHANGED
|
@@ -36,7 +36,7 @@ class Config:
|
|
| 36 |
self.INSTRUCT_FINETUNE_BOOL = False
|
| 37 |
|
| 38 |
# Training steps and warmup
|
| 39 |
-
self.FACTOR = 12 ** 3 //
|
| 40 |
self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
|
| 41 |
self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
|
| 42 |
|
|
@@ -160,11 +160,11 @@ def create_model(tokenizer):
|
|
| 160 |
vocab_size=tokenizer.vocab_size,
|
| 161 |
hidden_size=config.FACTOR,
|
| 162 |
intermediate_size=config.FACTOR * 4,
|
| 163 |
-
num_hidden_layers=
|
| 164 |
-
num_attention_heads=
|
| 165 |
max_position_embeddings=config.MAX_SEQ_LENGTH,
|
| 166 |
rms_norm_eps=1e-5,
|
| 167 |
-
initializer_range=
|
| 168 |
use_cache=True,
|
| 169 |
pad_token_id=tokenizer.pad_token_id,
|
| 170 |
bos_token_id=tokenizer.bos_token_id,
|
|
|
|
| 36 |
self.INSTRUCT_FINETUNE_BOOL = False
|
| 37 |
|
| 38 |
# Training steps and warmup
|
| 39 |
+
self.FACTOR = 12 ** 3 // 2
|
| 40 |
self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
|
| 41 |
self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
|
| 42 |
|
|
|
|
| 160 |
vocab_size=tokenizer.vocab_size,
|
| 161 |
hidden_size=config.FACTOR,
|
| 162 |
intermediate_size=config.FACTOR * 4,
|
| 163 |
+
num_hidden_layers=config.FACTOR // 2 ** 4,
|
| 164 |
+
num_attention_heads=config.FACTOR // 2 ** 5,
|
| 165 |
max_position_embeddings=config.MAX_SEQ_LENGTH,
|
| 166 |
rms_norm_eps=1e-5,
|
| 167 |
+
initializer_range=2e-2,
|
| 168 |
use_cache=True,
|
| 169 |
pad_token_id=tokenizer.pad_token_id,
|
| 170 |
bos_token_id=tokenizer.bos_token_id,
|