attention_mask not needed for training (#642)

Browse files

* attention_mask not needed for training
* specifically don't use attention mask for phi
* use a different check for phi
* small fixes since phi removed some values from their config
src/axolotl/models/phi/modeling_mixformer_sequential.py — CHANGED

@@ -711,12 +711,8 @@ class ParallelBlock(nn.Module):
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
         self.block_idx = block_idx

-        self.mixer = MHA(config  [removed line truncated in page extraction — original call continued past "config"]
-        [removed line lost in page extraction]
-        if mlp_cls == "fused_mlp":
-            self.mlp = FusedMLP(config=config, **mlp)
-        else:
-            self.mlp = MLP(config=config, **mlp)
+        self.mixer = MHA(config, layer_idx=block_idx)
+        self.mlp = MLP(config)

     def forward(
         self,
src/axolotl/utils/data.py — CHANGED

@@ -76,7 +76,7 @@ def prepare_dataset(cfg, tokenizer):

     with zero_first(is_main_process()):
         train_dataset, eval_dataset = process_datasets_for_packing(
-            cfg, train_dataset, eval_dataset
+            cfg, train_dataset, eval_dataset, tokenizer
         )
     if cfg.max_steps:
         total_num_steps = min(
src/axolotl/utils/trainer.py — CHANGED

[NOTE: original indentation was lost in page extraction; conventional Python indentation restored below — verify against the repository.]

@@ -397,7 +397,7 @@ def disable_datasets_caching():
     set_caching_enabled(True)


-def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
         train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())

@@ -414,6 +414,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             eval_dataset = eval_dataset.map(
                 add_position_ids, num_proc=os.cpu_count()
             )
+
+        # Phi doesn't want the attention_mask feature when training
+        if "CodeGenTokenizer" in tokenizer.__class__.__name__:
+            train_dataset = train_dataset.remove_columns("attention_mask")
+            if eval_dataset:
+                eval_dataset = eval_dataset.remove_columns("attention_mask")
+
     return train_dataset, eval_dataset