OliverPerrin committed on
Commit 2dcb4b5 · 1 Parent(s): f2e0099

Chore: Update pre-commit hooks and fix formatting

.pre-commit-config.yaml CHANGED
@@ -1,13 +1,13 @@
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.1.11
+  rev: v0.14.7
   hooks:
   - id: ruff
     args: [ --fix ]
   - id: ruff-format

 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v1.8.0
+  rev: v1.19.0
   hooks:
   - id: mypy
     additional_dependencies: [types-requests, types-PyYAML]
src/models/encoder.py CHANGED
@@ -160,9 +160,9 @@ class TransformerEncoder(nn.Module):
         Build a 3D attention mask (batch, seq, seq) from input_ids and pad_token_id.
         True indicates valid positions; False indicates masked (pad).
         """
-        assert (
-            self.pad_token_id is not None
-        ), "pad_token_id must be set to build padding mask from ids."
+        assert self.pad_token_id is not None, (
+            "pad_token_id must be set to build padding mask from ids."
+        )
         # mask shape: (batch, seq) where True = token kept (non-pad)
         pad_mask = input_ids != self.pad_token_id
         # Convert to (batch, seq_q, seq_k) by outer product broadcasting
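For readers following the mask logic in this hunk: the (batch, seq) pad mask is broadcast into a (batch, seq_q, seq_k) attention mask. Below is a minimal standalone sketch of one plausible reading of the "outer product broadcasting" comment; the pad_token_id and input_ids values are made up for illustration, not taken from the repository.

import torch

# Assumed example inputs: batch of 2 sequences, pad_token_id taken to be 0.
pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0], [3, 0, 0, 0]])

# (batch, seq): True where the token is kept (non-pad).
pad_mask = input_ids != pad_token_id

# Outer-product broadcast to (batch, seq_q, seq_k): a (query, key) pair is
# valid only if both positions are non-pad.
attn_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(1)

print(attn_mask.shape)  # torch.Size([2, 4, 4])
print(attn_mask[1])     # only the top-left 1x1 block is True for the second sequence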
src/models/heads.py CHANGED
@@ -97,12 +97,12 @@ class LMHead(nn.Module):
 
         if tie_embedding is not None:
             # Validate sizes
-            assert (
-                tie_embedding.num_embeddings == vocab_size
-            ), "vocab size mismatch for weight tying"
-            assert (
-                tie_embedding.embedding_dim == d_model
-            ), "embedding dim must match d_model for weight tying"
+            assert tie_embedding.num_embeddings == vocab_size, (
+                "vocab size mismatch for weight tying"
+            )
+            assert tie_embedding.embedding_dim == d_model, (
+                "embedding dim must match d_model for weight tying"
+            )
             # Tie weights: point the projection weight to the embedding weight Tensor
             # Remove the existing projection parameter in favor of the embedding weight
             # This keeps the same Parameter object, so updates affect both modules.
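For context on the weight-tying comments in this hunk: assigning the embedding's weight to the output projection makes both modules share one Parameter object, so an update made through either module is seen by both. A minimal sketch with illustrative sizes and module names (vocab_size, d_model, and proj are placeholders here, not the repository's LMHead internals):

import torch.nn as nn

vocab_size, d_model = 100, 16  # illustrative sizes

embedding = nn.Embedding(vocab_size, d_model)
proj = nn.Linear(d_model, vocab_size, bias=False)

# Validate sizes before tying, mirroring the asserts in the diff.
assert embedding.num_embeddings == vocab_size
assert embedding.embedding_dim == d_model

# Tie weights: the projection's weight now is the embedding weight Parameter.
proj.weight = embedding.weight

# Same Parameter object, so gradients and optimizer updates affect both modules.
print(proj.weight is embedding.weight)  # True
print(proj.weight.shape)                # torch.Size([100, 16])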
tests/test_models/test_decoder.py CHANGED
@@ -64,9 +64,9 @@ def test_decoder_layer_causal_mask_blocks_future():
     B, H, Tq, Tk = self_attn.shape
     for i in range(Tq):
         for j in range(i + 1, Tk):
-            assert torch.allclose(
-                self_attn[:, :, i, j], torch.zeros(B, H)
-            ), f"Found nonzero attention to future position {j} from query {i}"
+            assert torch.allclose(self_attn[:, :, i, j], torch.zeros(B, H)), (
+                f"Found nonzero attention to future position {j} from query {i}"
+            )
 
 
 def test_decoder_stack_and_greedy_decode_shapes():
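The test in this hunk asserts that attention from any query position to any strictly later key position is exactly zero. A small self-contained sketch of that property using a toy masked softmax; the decoder's actual attention implementation lives in the repository and is not reproduced here.

import torch
import torch.nn.functional as F

B, H, T = 2, 4, 5  # toy batch size, head count, sequence length
scores = torch.randn(B, H, T, T)

# Causal mask: query position i may only attend to key positions j <= i.
causal = torch.tril(torch.ones(T, T)).bool()
scores = scores.masked_fill(~causal, float("-inf"))
attn = F.softmax(scores, dim=-1)

# Every entry above the diagonal is exactly zero after the softmax,
# which is the same property the test above checks on self_attn.
for i in range(T):
    for j in range(i + 1, T):
        assert torch.allclose(attn[:, :, i, j], torch.zeros(B, H))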