Commit
·
a5b83ec
1
Parent(s):
1da34bf
Update modeling_transnormer.py
Browse files
- modeling_transnormer.py +0 -40
modeling_transnormer.py
CHANGED
|
@@ -734,43 +734,6 @@ class TransnormerModel(TransnormerPreTrainedModel):
|
|
| 734 |
slope_rate = slope_rates[idx]
|
| 735 |
slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
|
| 736 |
mask = linear_attn_mask
|
| 737 |
-
|
| 738 |
-
# if self.gradient_checkpointing and self.training:
|
| 739 |
-
|
| 740 |
-
# def create_custom_forward(module):
|
| 741 |
-
# def custom_forward(*inputs):
|
| 742 |
-
# # None for past_key_value
|
| 743 |
-
# return module(*inputs, output_attentions, None)
|
| 744 |
-
|
| 745 |
-
# return custom_forward
|
| 746 |
-
|
| 747 |
-
# # layer_outputs = torch.utils.checkpoint.checkpoint(
|
| 748 |
-
# # create_custom_forward(layer),
|
| 749 |
-
# # hidden_states,
|
| 750 |
-
# # mask,
|
| 751 |
-
# # linear_attn_padding_mask,
|
| 752 |
-
# # None,
|
| 753 |
-
# # )
|
| 754 |
-
# layer_outputs = torch.utils.checkpoint.checkpoint(
|
| 755 |
-
# create_custom_forward(layer),
|
| 756 |
-
# hidden_states,
|
| 757 |
-
# mask,
|
| 758 |
-
# linear_attn_padding_mask,
|
| 759 |
-
# None,
|
| 760 |
-
# output_attentions,
|
| 761 |
-
# use_cache,
|
| 762 |
-
# slope_rate,
|
| 763 |
-
# )
|
| 764 |
-
# else:
|
| 765 |
-
# layer_outputs = layer(
|
| 766 |
-
# hidden_states,
|
| 767 |
-
# attn_mask=mask,
|
| 768 |
-
# attn_padding_mask=linear_attn_padding_mask,
|
| 769 |
-
# past_key_value=past_key_value,
|
| 770 |
-
# output_attentions=output_attentions,
|
| 771 |
-
# use_cache=use_cache,
|
| 772 |
-
# slope_rate=slope_rate,
|
| 773 |
-
# )
|
| 774 |
|
| 775 |
layer_outputs = layer(
|
| 776 |
hidden_states,
|
|
@@ -789,9 +752,6 @@ class TransnormerModel(TransnormerPreTrainedModel):
|
|
| 789 |
|
| 790 |
if output_attentions:
|
| 791 |
all_self_attns += (layer_outputs[1],)
|
| 792 |
-
|
| 793 |
-
# if idx == 0:
|
| 794 |
-
# break
|
| 795 |
|
| 796 |
hidden_states = self.final_norm(hidden_states)
|
| 797 |
|
|
|
|
| 734 |
slope_rate = slope_rates[idx]
|
| 735 |
slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
|
| 736 |
mask = linear_attn_mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
|
| 738 |
layer_outputs = layer(
|
| 739 |
hidden_states,
|
|
|
|
| 752 |
|
| 753 |
if output_attentions:
|
| 754 |
all_self_attns += (layer_outputs[1],)
|
|
|
|
|
|
|
|
|
|
| 755 |
|
| 756 |
hidden_states = self.final_norm(hidden_states)
|
| 757 |
|