Add type annotations across codebase and configure pyright
Configure pyright in basic mode for static type checking. Add type
annotations to all function signatures (~76 additions across 18 files),
bringing coverage from ~65% to 100%.
Key structural changes to achieve clean type checking without
suppressions:
- Add PAWNCLM.get_block() typed accessor to avoid ModuleList type
erasure (single type-narrowing point for all adapter consumers)
- Replace None with nn.Identity() in adapter ModuleLists (bottleneck,
hybrid) so entries are always valid Modules
- Separate trainer's unwrapped _model from potentially-compiled model
to preserve concrete PAWNCLM type
- Define GenerativeModel Protocol for eval suite's duck-typed model
parameter instead of bare nn.Module
- Add class-level type declarations for registered buffers and
submodules (rope_cos, causal_mask, TransformerBlock attrs)
Result: 0 pyright errors, 1 type: ignore (in get_block), all tests pass.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- pawn/adapters/bottleneck.py +19 -22
- pawn/adapters/film.py +7 -5
- pawn/adapters/hybrid.py +17 -12
- pawn/adapters/lora.py +7 -6
- pawn/adapters/sparse.py +10 -5
- pawn/data.py +4 -3
- pawn/eval_suite/corpus.py +9 -8
- pawn/eval_suite/diagnostics.py +3 -2
- pawn/eval_suite/generation.py +41 -11
- pawn/eval_suite/lichess.py +2 -1
- pawn/eval_suite/probes.py +6 -6
- pawn/eval_suite/viz.py +11 -5
- pawn/eval_suite/worker.py +36 -26
- pawn/lichess_data.py +2 -2
- pawn/logging.py +4 -4
- pawn/model.py +21 -2
- pawn/trainer.py +11 -14
- pyproject.toml +8 -0
|
@@ -80,18 +80,18 @@ class BottleneckCLM(nn.Module):
|
|
| 80 |
for p in backbone.parameters():
|
| 81 |
p.requires_grad = False
|
| 82 |
|
| 83 |
-
# Create adapter modules (
|
| 84 |
self.attn_adapters = nn.ModuleList()
|
| 85 |
self.ffn_adapters = nn.ModuleList()
|
| 86 |
for i in range(n_layers):
|
| 87 |
if i in self._attn_set:
|
| 88 |
self.attn_adapters.append(BottleneckAdapter(cfg.d_model, bottleneck_dim))
|
| 89 |
else:
|
| 90 |
-
self.attn_adapters.append(
|
| 91 |
if i in self._ffn_set:
|
| 92 |
self.ffn_adapters.append(BottleneckAdapter(cfg.d_model, bottleneck_dim))
|
| 93 |
else:
|
| 94 |
-
self.ffn_adapters.append(
|
| 95 |
|
| 96 |
@property
|
| 97 |
def cfg(self) -> CLMConfig:
|
|
@@ -106,16 +106,15 @@ class BottleneckCLM(nn.Module):
|
|
| 106 |
rope_cos = bb.rope_cos[:, :, :T, :]
|
| 107 |
rope_sin = bb.rope_sin[:, :, :T, :]
|
| 108 |
|
| 109 |
-
for i
|
|
|
|
| 110 |
# Attention sublayer + adapter
|
| 111 |
x = x + block.attn(block.attn_norm(x), rope_cos, rope_sin, None)
|
| 112 |
-
|
| 113 |
-
x = self.attn_adapters[i](x)
|
| 114 |
|
| 115 |
# FFN sublayer + adapter
|
| 116 |
x = x + block.ffn(block.ffn_norm(x))
|
| 117 |
-
|
| 118 |
-
x = self.ffn_adapters[i](x)
|
| 119 |
|
| 120 |
return bb.final_norm(x)
|
| 121 |
|
|
@@ -143,14 +142,13 @@ class BottleneckCLM(nn.Module):
|
|
| 143 |
rope_cos = bb.rope_cos[:, :, :T, :]
|
| 144 |
rope_sin = bb.rope_sin[:, :, :T, :]
|
| 145 |
|
| 146 |
-
for i
|
|
|
|
| 147 |
x = x + block.attn(block.attn_norm(x), rope_cos, rope_sin, mask)
|
| 148 |
-
|
| 149 |
-
x = self.attn_adapters[i](x)
|
| 150 |
|
| 151 |
x = x + block.ffn(block.ffn_norm(x))
|
| 152 |
-
|
| 153 |
-
x = self.ffn_adapters[i](x)
|
| 154 |
|
| 155 |
x = bb.final_norm(x)
|
| 156 |
return self.project_head(x)
|
|
@@ -174,20 +172,19 @@ class BottleneckCLM(nn.Module):
|
|
| 174 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 175 |
|
| 176 |
new_kv_cache = []
|
| 177 |
-
for i
|
|
|
|
| 178 |
# KV-cache forward for attention
|
| 179 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 180 |
attn_out, new_cache = block.attn.forward_kv(
|
| 181 |
block.attn_norm(x), rope_cos, rope_sin, layer_cache,
|
| 182 |
)
|
| 183 |
x = x + attn_out
|
| 184 |
-
|
| 185 |
-
x = self.attn_adapters[i](x)
|
| 186 |
new_kv_cache.append(new_cache)
|
| 187 |
|
| 188 |
x = x + block.ffn(block.ffn_norm(x))
|
| 189 |
-
|
| 190 |
-
x = self.ffn_adapters[i](x)
|
| 191 |
|
| 192 |
x = bb.final_norm(x[:, -1:, :])
|
| 193 |
logits = bb.lm_head(x)
|
|
@@ -218,12 +215,12 @@ class BottleneckCLM(nn.Module):
|
|
| 218 |
"""Per-layer adapter weight norms for monitoring."""
|
| 219 |
report = {}
|
| 220 |
for i in range(len(self.backbone.layers)):
|
| 221 |
-
|
| 222 |
-
|
| 223 |
report[f"adapter/layer{i}.attn.down"] = a.down.weight.data.norm().item()
|
| 224 |
report[f"adapter/layer{i}.attn.up"] = a.up.weight.data.norm().item()
|
| 225 |
-
|
| 226 |
-
|
| 227 |
report[f"adapter/layer{i}.ffn.down"] = a.down.weight.data.norm().item()
|
| 228 |
report[f"adapter/layer{i}.ffn.up"] = a.up.weight.data.norm().item()
|
| 229 |
return report
|
|
|
|
| 80 |
for p in backbone.parameters():
|
| 81 |
p.requires_grad = False
|
| 82 |
|
| 83 |
+
# Create adapter modules (Identity for non-adapted layers)
|
| 84 |
self.attn_adapters = nn.ModuleList()
|
| 85 |
self.ffn_adapters = nn.ModuleList()
|
| 86 |
for i in range(n_layers):
|
| 87 |
if i in self._attn_set:
|
| 88 |
self.attn_adapters.append(BottleneckAdapter(cfg.d_model, bottleneck_dim))
|
| 89 |
else:
|
| 90 |
+
self.attn_adapters.append(nn.Identity())
|
| 91 |
if i in self._ffn_set:
|
| 92 |
self.ffn_adapters.append(BottleneckAdapter(cfg.d_model, bottleneck_dim))
|
| 93 |
else:
|
| 94 |
+
self.ffn_adapters.append(nn.Identity())
|
| 95 |
|
| 96 |
@property
|
| 97 |
def cfg(self) -> CLMConfig:
|
|
|
|
| 106 |
rope_cos = bb.rope_cos[:, :, :T, :]
|
| 107 |
rope_sin = bb.rope_sin[:, :, :T, :]
|
| 108 |
|
| 109 |
+
for i in range(len(bb.layers)):
|
| 110 |
+
block = bb.get_block(i)
|
| 111 |
# Attention sublayer + adapter
|
| 112 |
x = x + block.attn(block.attn_norm(x), rope_cos, rope_sin, None)
|
| 113 |
+
x = self.attn_adapters[i](x)
|
|
|
|
| 114 |
|
| 115 |
# FFN sublayer + adapter
|
| 116 |
x = x + block.ffn(block.ffn_norm(x))
|
| 117 |
+
x = self.ffn_adapters[i](x)
|
|
|
|
| 118 |
|
| 119 |
return bb.final_norm(x)
|
| 120 |
|
|
|
|
| 142 |
rope_cos = bb.rope_cos[:, :, :T, :]
|
| 143 |
rope_sin = bb.rope_sin[:, :, :T, :]
|
| 144 |
|
| 145 |
+
for i in range(len(bb.layers)):
|
| 146 |
+
block = bb.get_block(i)
|
| 147 |
x = x + block.attn(block.attn_norm(x), rope_cos, rope_sin, mask)
|
| 148 |
+
x = self.attn_adapters[i](x)
|
|
|
|
| 149 |
|
| 150 |
x = x + block.ffn(block.ffn_norm(x))
|
| 151 |
+
x = self.ffn_adapters[i](x)
|
|
|
|
| 152 |
|
| 153 |
x = bb.final_norm(x)
|
| 154 |
return self.project_head(x)
|
|
|
|
| 172 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 173 |
|
| 174 |
new_kv_cache = []
|
| 175 |
+
for i in range(len(bb.layers)):
|
| 176 |
+
block = bb.get_block(i)
|
| 177 |
# KV-cache forward for attention
|
| 178 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 179 |
attn_out, new_cache = block.attn.forward_kv(
|
| 180 |
block.attn_norm(x), rope_cos, rope_sin, layer_cache,
|
| 181 |
)
|
| 182 |
x = x + attn_out
|
| 183 |
+
x = self.attn_adapters[i](x)
|
|
|
|
| 184 |
new_kv_cache.append(new_cache)
|
| 185 |
|
| 186 |
x = x + block.ffn(block.ffn_norm(x))
|
| 187 |
+
x = self.ffn_adapters[i](x)
|
|
|
|
| 188 |
|
| 189 |
x = bb.final_norm(x[:, -1:, :])
|
| 190 |
logits = bb.lm_head(x)
|
|
|
|
| 215 |
"""Per-layer adapter weight norms for monitoring."""
|
| 216 |
report = {}
|
| 217 |
for i in range(len(self.backbone.layers)):
|
| 218 |
+
a = self.attn_adapters[i]
|
| 219 |
+
if isinstance(a, BottleneckAdapter):
|
| 220 |
report[f"adapter/layer{i}.attn.down"] = a.down.weight.data.norm().item()
|
| 221 |
report[f"adapter/layer{i}.attn.up"] = a.up.weight.data.norm().item()
|
| 222 |
+
a = self.ffn_adapters[i]
|
| 223 |
+
if isinstance(a, BottleneckAdapter):
|
| 224 |
report[f"adapter/layer{i}.ffn.down"] = a.down.weight.data.norm().item()
|
| 225 |
report[f"adapter/layer{i}.ffn.up"] = a.up.weight.data.norm().item()
|
| 226 |
return report
|
|
@@ -140,10 +140,11 @@ class FiLMCLM(nn.Module):
|
|
| 140 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 141 |
|
| 142 |
new_kv_cache = []
|
| 143 |
-
for i
|
|
|
|
| 144 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 145 |
-
x, new_cache =
|
| 146 |
-
x =
|
| 147 |
new_kv_cache.append(new_cache)
|
| 148 |
|
| 149 |
x = bb.final_norm(x[:, -1:, :])
|
|
@@ -182,8 +183,9 @@ class FiLMCLM(nn.Module):
|
|
| 182 |
"""Per-layer FiLM deviation from identity, for monitoring."""
|
| 183 |
report = {}
|
| 184 |
for i, film in enumerate(self.hidden_films):
|
| 185 |
-
|
| 186 |
-
|
|
|
|
| 187 |
if self.output_film is not None:
|
| 188 |
report["output/gamma_dev"] = (self.output_film.gamma - 1.0).norm().item()
|
| 189 |
report["output/beta_norm"] = self.output_film.beta.norm().item()
|
|
|
|
| 140 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 141 |
|
| 142 |
new_kv_cache = []
|
| 143 |
+
for i in range(len(bb.layers)):
|
| 144 |
+
block = bb.get_block(i)
|
| 145 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 146 |
+
x, new_cache = block.forward_kv(x, rope_cos, rope_sin, layer_cache)
|
| 147 |
+
x = self.hidden_films[i](x)
|
| 148 |
new_kv_cache.append(new_cache)
|
| 149 |
|
| 150 |
x = bb.final_norm(x[:, -1:, :])
|
|
|
|
| 183 |
"""Per-layer FiLM deviation from identity, for monitoring."""
|
| 184 |
report = {}
|
| 185 |
for i, film in enumerate(self.hidden_films):
|
| 186 |
+
if isinstance(film, FiLMLayer):
|
| 187 |
+
report[f"hidden_{i}/gamma_dev"] = (film.gamma - 1.0).norm().item()
|
| 188 |
+
report[f"hidden_{i}/beta_norm"] = film.beta.norm().item()
|
| 189 |
if self.output_film is not None:
|
| 190 |
report["output/gamma_dev"] = (self.output_film.gamma - 1.0).norm().item()
|
| 191 |
report["output/beta_norm"] = self.output_film.beta.norm().item()
|
|
@@ -65,9 +65,11 @@ class HybridCLM(nn.Module):
|
|
| 65 |
p.requires_grad = False
|
| 66 |
|
| 67 |
# Inject LoRA
|
| 68 |
-
for layer_idx
|
| 69 |
if layer_idx not in self.lora_layer_set:
|
| 70 |
continue
|
|
|
|
|
|
|
| 71 |
attn: Attention = block.attn
|
| 72 |
for proj_name in self.attn_targets:
|
| 73 |
original = getattr(attn, proj_name)
|
|
@@ -78,14 +80,14 @@ class HybridCLM(nn.Module):
|
|
| 78 |
original = getattr(ffn, proj_name)
|
| 79 |
setattr(ffn, proj_name, LoRALinear(original, lora_rank, self.lora_alpha))
|
| 80 |
|
| 81 |
-
# Create FiLM layers (
|
| 82 |
if use_film:
|
| 83 |
self.hidden_films = nn.ModuleList()
|
| 84 |
for i in range(n_layers):
|
| 85 |
if i in self.film_layer_set:
|
| 86 |
self.hidden_films.append(FiLMLayer(cfg.d_model))
|
| 87 |
else:
|
| 88 |
-
self.hidden_films.append(
|
| 89 |
else:
|
| 90 |
self.hidden_films = None
|
| 91 |
|
|
@@ -109,7 +111,7 @@ class HybridCLM(nn.Module):
|
|
| 109 |
|
| 110 |
for i, layer in enumerate(bb.layers):
|
| 111 |
x = layer(x, rope_cos, rope_sin, None) # LoRA happens inside
|
| 112 |
-
if self.hidden_films is not None
|
| 113 |
x = self.hidden_films[i](x)
|
| 114 |
|
| 115 |
return bb.final_norm(x)
|
|
@@ -143,7 +145,7 @@ class HybridCLM(nn.Module):
|
|
| 143 |
|
| 144 |
for i, layer in enumerate(bb.layers):
|
| 145 |
x = layer(x, rope_cos, rope_sin, mask)
|
| 146 |
-
if self.hidden_films is not None
|
| 147 |
x = self.hidden_films[i](x)
|
| 148 |
|
| 149 |
x = bb.final_norm(x)
|
|
@@ -168,10 +170,11 @@ class HybridCLM(nn.Module):
|
|
| 168 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 169 |
|
| 170 |
new_kv_cache = []
|
| 171 |
-
for i
|
|
|
|
| 172 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 173 |
-
x, new_cache =
|
| 174 |
-
if self.hidden_films is not None
|
| 175 |
x = self.hidden_films[i](x)
|
| 176 |
new_kv_cache.append(new_cache)
|
| 177 |
|
|
@@ -187,7 +190,8 @@ class HybridCLM(nn.Module):
|
|
| 187 |
def lora_parameters(self) -> list[nn.Parameter]:
|
| 188 |
"""Return only LoRA A/B parameters."""
|
| 189 |
params = []
|
| 190 |
-
for
|
|
|
|
| 191 |
for proj_name in self.attn_targets:
|
| 192 |
module = getattr(block.attn, proj_name)
|
| 193 |
if isinstance(module, LoRALinear):
|
|
@@ -206,7 +210,7 @@ class HybridCLM(nn.Module):
|
|
| 206 |
params = []
|
| 207 |
if self.hidden_films is not None:
|
| 208 |
for film in self.hidden_films:
|
| 209 |
-
if film
|
| 210 |
params.extend(film.parameters())
|
| 211 |
if self.output_film is not None:
|
| 212 |
params.extend(self.output_film.parameters())
|
|
@@ -233,7 +237,8 @@ class HybridCLM(nn.Module):
|
|
| 233 |
report = {}
|
| 234 |
|
| 235 |
# LoRA norms
|
| 236 |
-
for layer_idx
|
|
|
|
| 237 |
for proj_name in self.attn_targets:
|
| 238 |
module = getattr(block.attn, proj_name)
|
| 239 |
if isinstance(module, LoRALinear):
|
|
@@ -247,7 +252,7 @@ class HybridCLM(nn.Module):
|
|
| 247 |
# FiLM norms
|
| 248 |
if self.hidden_films is not None:
|
| 249 |
for i, film in enumerate(self.hidden_films):
|
| 250 |
-
if film
|
| 251 |
report[f"film/hidden_{i}/gamma_dev"] = (film.gamma - 1.0).norm().item()
|
| 252 |
report[f"film/hidden_{i}/beta_norm"] = film.beta.norm().item()
|
| 253 |
if self.output_film is not None:
|
|
|
|
| 65 |
p.requires_grad = False
|
| 66 |
|
| 67 |
# Inject LoRA
|
| 68 |
+
for layer_idx in range(n_layers):
|
| 69 |
if layer_idx not in self.lora_layer_set:
|
| 70 |
continue
|
| 71 |
+
block = backbone.get_block(layer_idx)
|
| 72 |
+
|
| 73 |
attn: Attention = block.attn
|
| 74 |
for proj_name in self.attn_targets:
|
| 75 |
original = getattr(attn, proj_name)
|
|
|
|
| 80 |
original = getattr(ffn, proj_name)
|
| 81 |
setattr(ffn, proj_name, LoRALinear(original, lora_rank, self.lora_alpha))
|
| 82 |
|
| 83 |
+
# Create FiLM layers (Identity for non-adapted layers)
|
| 84 |
if use_film:
|
| 85 |
self.hidden_films = nn.ModuleList()
|
| 86 |
for i in range(n_layers):
|
| 87 |
if i in self.film_layer_set:
|
| 88 |
self.hidden_films.append(FiLMLayer(cfg.d_model))
|
| 89 |
else:
|
| 90 |
+
self.hidden_films.append(nn.Identity())
|
| 91 |
else:
|
| 92 |
self.hidden_films = None
|
| 93 |
|
|
|
|
| 111 |
|
| 112 |
for i, layer in enumerate(bb.layers):
|
| 113 |
x = layer(x, rope_cos, rope_sin, None) # LoRA happens inside
|
| 114 |
+
if self.hidden_films is not None:
|
| 115 |
x = self.hidden_films[i](x)
|
| 116 |
|
| 117 |
return bb.final_norm(x)
|
|
|
|
| 145 |
|
| 146 |
for i, layer in enumerate(bb.layers):
|
| 147 |
x = layer(x, rope_cos, rope_sin, mask)
|
| 148 |
+
if self.hidden_films is not None:
|
| 149 |
x = self.hidden_films[i](x)
|
| 150 |
|
| 151 |
x = bb.final_norm(x)
|
|
|
|
| 170 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 171 |
|
| 172 |
new_kv_cache = []
|
| 173 |
+
for i in range(len(bb.layers)):
|
| 174 |
+
block = bb.get_block(i)
|
| 175 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 176 |
+
x, new_cache = block.forward_kv(x, rope_cos, rope_sin, layer_cache)
|
| 177 |
+
if self.hidden_films is not None:
|
| 178 |
x = self.hidden_films[i](x)
|
| 179 |
new_kv_cache.append(new_cache)
|
| 180 |
|
|
|
|
| 190 |
def lora_parameters(self) -> list[nn.Parameter]:
|
| 191 |
"""Return only LoRA A/B parameters."""
|
| 192 |
params = []
|
| 193 |
+
for layer_idx in range(len(self.backbone.layers)):
|
| 194 |
+
block = self.backbone.get_block(layer_idx)
|
| 195 |
for proj_name in self.attn_targets:
|
| 196 |
module = getattr(block.attn, proj_name)
|
| 197 |
if isinstance(module, LoRALinear):
|
|
|
|
| 210 |
params = []
|
| 211 |
if self.hidden_films is not None:
|
| 212 |
for film in self.hidden_films:
|
| 213 |
+
if isinstance(film, FiLMLayer):
|
| 214 |
params.extend(film.parameters())
|
| 215 |
if self.output_film is not None:
|
| 216 |
params.extend(self.output_film.parameters())
|
|
|
|
| 237 |
report = {}
|
| 238 |
|
| 239 |
# LoRA norms
|
| 240 |
+
for layer_idx in range(len(self.backbone.layers)):
|
| 241 |
+
block = self.backbone.get_block(layer_idx)
|
| 242 |
for proj_name in self.attn_targets:
|
| 243 |
module = getattr(block.attn, proj_name)
|
| 244 |
if isinstance(module, LoRALinear):
|
|
|
|
| 252 |
# FiLM norms
|
| 253 |
if self.hidden_films is not None:
|
| 254 |
for i, film in enumerate(self.hidden_films):
|
| 255 |
+
if isinstance(film, FiLMLayer):
|
| 256 |
report[f"film/hidden_{i}/gamma_dev"] = (film.gamma - 1.0).norm().item()
|
| 257 |
report[f"film/hidden_{i}/beta_norm"] = film.beta.norm().item()
|
| 258 |
if self.output_film is not None:
|
|
@@ -90,9 +90,10 @@ class LoRACLM(nn.Module):
|
|
| 90 |
p.requires_grad = False
|
| 91 |
|
| 92 |
# Inject LoRA into selected layers
|
| 93 |
-
for layer_idx
|
| 94 |
if layer_idx not in self.adapted_layers:
|
| 95 |
continue
|
|
|
|
| 96 |
|
| 97 |
attn: Attention = block.attn
|
| 98 |
for proj_name in self.attn_targets:
|
|
@@ -182,9 +183,9 @@ class LoRACLM(nn.Module):
|
|
| 182 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 183 |
|
| 184 |
new_kv_cache = []
|
| 185 |
-
for i
|
| 186 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 187 |
-
x, new_cache =
|
| 188 |
new_kv_cache.append(new_cache)
|
| 189 |
|
| 190 |
x = bb.final_norm(x[:, -1:, :])
|
|
@@ -215,8 +216,8 @@ class LoRACLM(nn.Module):
|
|
| 215 |
def lora_weight_report(self) -> dict[str, float]:
|
| 216 |
"""Per-layer LoRA weight norms for monitoring."""
|
| 217 |
report = {}
|
| 218 |
-
for layer_idx
|
| 219 |
-
attn =
|
| 220 |
for proj_name in self.attn_targets:
|
| 221 |
module = getattr(attn, proj_name)
|
| 222 |
if isinstance(module, LoRALinear):
|
|
@@ -224,7 +225,7 @@ class LoRACLM(nn.Module):
|
|
| 224 |
report[f"layer{layer_idx}.{proj_name}.B"] = module.lora_B.data.norm().item()
|
| 225 |
|
| 226 |
if self.adapt_ffn:
|
| 227 |
-
ffn =
|
| 228 |
for proj_name in _FFN_TARGETS:
|
| 229 |
module = getattr(ffn, proj_name)
|
| 230 |
if isinstance(module, LoRALinear):
|
|
|
|
| 90 |
p.requires_grad = False
|
| 91 |
|
| 92 |
# Inject LoRA into selected layers
|
| 93 |
+
for layer_idx in range(len(backbone.layers)):
|
| 94 |
if layer_idx not in self.adapted_layers:
|
| 95 |
continue
|
| 96 |
+
block = backbone.get_block(layer_idx)
|
| 97 |
|
| 98 |
attn: Attention = block.attn
|
| 99 |
for proj_name in self.attn_targets:
|
|
|
|
| 183 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 184 |
|
| 185 |
new_kv_cache = []
|
| 186 |
+
for i in range(len(bb.layers)):
|
| 187 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 188 |
+
x, new_cache = bb.get_block(i).forward_kv(x, rope_cos, rope_sin, layer_cache)
|
| 189 |
new_kv_cache.append(new_cache)
|
| 190 |
|
| 191 |
x = bb.final_norm(x[:, -1:, :])
|
|
|
|
| 216 |
def lora_weight_report(self) -> dict[str, float]:
|
| 217 |
"""Per-layer LoRA weight norms for monitoring."""
|
| 218 |
report = {}
|
| 219 |
+
for layer_idx in range(len(self.backbone.layers)):
|
| 220 |
+
attn = self.backbone.get_block(layer_idx).attn
|
| 221 |
for proj_name in self.attn_targets:
|
| 222 |
module = getattr(attn, proj_name)
|
| 223 |
if isinstance(module, LoRALinear):
|
|
|
|
| 225 |
report[f"layer{layer_idx}.{proj_name}.B"] = module.lora_B.data.norm().item()
|
| 226 |
|
| 227 |
if self.adapt_ffn:
|
| 228 |
+
ffn = self.backbone.get_block(layer_idx).ffn
|
| 229 |
for proj_name in _FFN_TARGETS:
|
| 230 |
module = getattr(ffn, proj_name)
|
| 231 |
if isinstance(module, LoRALinear):
|
|
@@ -24,6 +24,8 @@ class SparseLinear(nn.Module):
|
|
| 24 |
output = F.linear(x, W_frozen + delta * mask, bias)
|
| 25 |
"""
|
| 26 |
|
|
|
|
|
|
|
| 27 |
def __init__(self, frozen_linear: nn.Linear, mask: torch.Tensor):
|
| 28 |
super().__init__()
|
| 29 |
self.frozen = frozen_linear
|
|
@@ -82,9 +84,10 @@ class SparseCLM(nn.Module):
|
|
| 82 |
gen = torch.Generator().manual_seed(seed)
|
| 83 |
|
| 84 |
# Inject sparse adapters
|
| 85 |
-
for layer_idx
|
| 86 |
if layer_idx not in self.adapted_layers:
|
| 87 |
continue
|
|
|
|
| 88 |
|
| 89 |
attn: Attention = block.attn
|
| 90 |
for proj_name in self.attn_targets:
|
|
@@ -166,9 +169,9 @@ class SparseCLM(nn.Module):
|
|
| 166 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 167 |
|
| 168 |
new_kv_cache = []
|
| 169 |
-
for i
|
| 170 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 171 |
-
x, new_cache =
|
| 172 |
new_kv_cache.append(new_cache)
|
| 173 |
|
| 174 |
x = bb.final_norm(x[:, -1:, :])
|
|
@@ -184,7 +187,8 @@ class SparseCLM(nn.Module):
|
|
| 184 |
def n_active_params(self) -> int:
|
| 185 |
"""Count of actually active (masked-in) parameters."""
|
| 186 |
total = 0
|
| 187 |
-
for
|
|
|
|
| 188 |
for proj_name in self.attn_targets:
|
| 189 |
module = getattr(block.attn, proj_name)
|
| 190 |
if isinstance(module, SparseLinear):
|
|
@@ -215,7 +219,8 @@ class SparseCLM(nn.Module):
|
|
| 215 |
def sparse_weight_report(self) -> dict[str, float]:
|
| 216 |
"""Per-layer sparse delta norms for monitoring."""
|
| 217 |
report = {}
|
| 218 |
-
for layer_idx
|
|
|
|
| 219 |
for proj_name in self.attn_targets:
|
| 220 |
module = getattr(block.attn, proj_name)
|
| 221 |
if isinstance(module, SparseLinear):
|
|
|
|
| 24 |
output = F.linear(x, W_frozen + delta * mask, bias)
|
| 25 |
"""
|
| 26 |
|
| 27 |
+
mask: torch.Tensor
|
| 28 |
+
|
| 29 |
def __init__(self, frozen_linear: nn.Linear, mask: torch.Tensor):
|
| 30 |
super().__init__()
|
| 31 |
self.frozen = frozen_linear
|
|
|
|
| 84 |
gen = torch.Generator().manual_seed(seed)
|
| 85 |
|
| 86 |
# Inject sparse adapters
|
| 87 |
+
for layer_idx in range(len(backbone.layers)):
|
| 88 |
if layer_idx not in self.adapted_layers:
|
| 89 |
continue
|
| 90 |
+
block = backbone.get_block(layer_idx)
|
| 91 |
|
| 92 |
attn: Attention = block.attn
|
| 93 |
for proj_name in self.attn_targets:
|
|
|
|
| 169 |
rope_sin = bb.rope_sin[:, :, :T_new, :]
|
| 170 |
|
| 171 |
new_kv_cache = []
|
| 172 |
+
for i in range(len(bb.layers)):
|
| 173 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 174 |
+
x, new_cache = bb.get_block(i).forward_kv(x, rope_cos, rope_sin, layer_cache)
|
| 175 |
new_kv_cache.append(new_cache)
|
| 176 |
|
| 177 |
x = bb.final_norm(x[:, -1:, :])
|
|
|
|
| 187 |
def n_active_params(self) -> int:
|
| 188 |
"""Count of actually active (masked-in) parameters."""
|
| 189 |
total = 0
|
| 190 |
+
for layer_idx in range(len(self.backbone.layers)):
|
| 191 |
+
block = self.backbone.get_block(layer_idx)
|
| 192 |
for proj_name in self.attn_targets:
|
| 193 |
module = getattr(block.attn, proj_name)
|
| 194 |
if isinstance(module, SparseLinear):
|
|
|
|
| 219 |
def sparse_weight_report(self) -> dict[str, float]:
|
| 220 |
"""Per-layer sparse delta norms for monitoring."""
|
| 221 |
report = {}
|
| 222 |
+
for layer_idx in range(len(self.backbone.layers)):
|
| 223 |
+
block = self.backbone.get_block(layer_idx)
|
| 224 |
for proj_name in self.attn_targets:
|
| 225 |
module = getattr(block.attn, proj_name)
|
| 226 |
if isinstance(module, SparseLinear):
|
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import os
|
| 4 |
import threading
|
| 5 |
import time
|
|
|
|
| 6 |
|
| 7 |
import numpy as np
|
| 8 |
import torch
|
|
@@ -19,7 +20,7 @@ from pawn.config import (
|
|
| 19 |
)
|
| 20 |
|
| 21 |
|
| 22 |
-
_positions_cache: dict = {}
|
| 23 |
|
| 24 |
|
| 25 |
|
|
@@ -130,10 +131,10 @@ class CLMDataset(torch.utils.data.IterableDataset):
|
|
| 130 |
self._start_step = 0
|
| 131 |
self._main_pid = os.getpid()
|
| 132 |
|
| 133 |
-
def set_start_step(self, step: int):
|
| 134 |
self._start_step = step
|
| 135 |
|
| 136 |
-
def __iter__(self):
|
| 137 |
worker_info = torch.utils.data.get_worker_info()
|
| 138 |
worker_id = worker_info.id if worker_info else 0
|
| 139 |
num_workers = worker_info.num_workers if worker_info else 1
|
|
|
|
| 3 |
import os
|
| 4 |
import threading
|
| 5 |
import time
|
| 6 |
+
from collections.abc import Iterator
|
| 7 |
|
| 8 |
import numpy as np
|
| 9 |
import torch
|
|
|
|
| 20 |
)
|
| 21 |
|
| 22 |
|
| 23 |
+
_positions_cache: dict[tuple[str, int], torch.Tensor] = {}
|
| 24 |
|
| 25 |
|
| 26 |
|
|
|
|
| 131 |
self._start_step = 0
|
| 132 |
self._main_pid = os.getpid()
|
| 133 |
|
| 134 |
+
def set_start_step(self, step: int) -> None:
|
| 135 |
self._start_step = step
|
| 136 |
|
| 137 |
+
def __iter__(self) -> Iterator[dict[str, torch.Tensor]]:
|
| 138 |
worker_info = torch.utils.data.get_worker_info()
|
| 139 |
worker_id = worker_info.id if worker_info else 0
|
| 140 |
num_workers = worker_info.num_workers if worker_info else 1
|
|
@@ -10,6 +10,7 @@ Storage layout:
|
|
| 10 |
|
| 11 |
import json
|
| 12 |
import time
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
|
| 15 |
import numpy as np
|
|
@@ -32,7 +33,7 @@ def _popcount_u64(arr: np.ndarray) -> np.ndarray:
|
|
| 32 |
return result
|
| 33 |
|
| 34 |
|
| 35 |
-
def _count_legal_moves(move_ids, game_lengths):
|
| 36 |
"""Legal move count per ply via bit-packed grids + promo mask."""
|
| 37 |
grid, promo_mask = engine.compute_legal_move_masks(move_ids, game_lengths)
|
| 38 |
grid_counts = np.zeros(grid.shape[:2], dtype=np.uint32)
|
|
@@ -384,7 +385,7 @@ _PHASES = [("ply_1_20", 0, 20), ("ply_21_80", 20, 80),
|
|
| 384 |
("ply_81_150", 80, 150), ("ply_150_plus", 150, 9999)]
|
| 385 |
|
| 386 |
|
| 387 |
-
def _iter_position_parts(corpus: dict):
|
| 388 |
"""Yield each position parquet part as an eager DataFrame."""
|
| 389 |
from pathlib import Path
|
| 390 |
# Find corpus dir from the LazyFrame's file path
|
|
@@ -395,7 +396,7 @@ def _iter_position_parts(corpus: dict):
|
|
| 395 |
yield pl.read_parquet(f)
|
| 396 |
|
| 397 |
|
| 398 |
-
def _new_accumulator() -> dict:
|
| 399 |
return {
|
| 400 |
"n": 0, "sum_k": 0.0, "sum_k_sq": 0.0, "k_min": 999, "k_max": 0,
|
| 401 |
"sum_inv_k": 0.0, "sum_inv_k_sq": 0.0,
|
|
@@ -408,7 +409,7 @@ def _new_accumulator() -> dict:
|
|
| 408 |
}
|
| 409 |
|
| 410 |
|
| 411 |
-
def _accumulate(acc: dict, df: pl.DataFrame):
|
| 412 |
"""Accumulate stats from one chunk (already filtered to k > 0)."""
|
| 413 |
k = df["k"].to_numpy().astype(np.float64)
|
| 414 |
ply = df["ply"].to_numpy()
|
|
@@ -455,7 +456,7 @@ def _accumulate(acc: dict, df: pl.DataFrame):
|
|
| 455 |
del k, ply, chk, inv_k, ln_k, top5
|
| 456 |
|
| 457 |
|
| 458 |
-
def _finalize_k_stats(acc):
|
| 459 |
N = acc["n"]
|
| 460 |
mean = acc["sum_k"] / N
|
| 461 |
var = acc["sum_k_sq"] / N - mean ** 2
|
|
@@ -466,13 +467,13 @@ def _finalize_k_stats(acc):
|
|
| 466 |
"min": acc["k_min"], "max": acc["k_max"]}
|
| 467 |
|
| 468 |
|
| 469 |
-
def _finalize_k_hist(acc):
|
| 470 |
h = acc["k_hist"]
|
| 471 |
nz = h > 0
|
| 472 |
return {"values": np.arange(300)[nz].tolist(), "counts": h[nz].tolist(), "total": acc["n"]}
|
| 473 |
|
| 474 |
|
| 475 |
-
def _finalize_phases(acc):
|
| 476 |
result = {}
|
| 477 |
for name, _, _ in _PHASES:
|
| 478 |
c = acc[f"{name}_n"]
|
|
@@ -486,7 +487,7 @@ def _finalize_phases(acc):
|
|
| 486 |
return result
|
| 487 |
|
| 488 |
|
| 489 |
-
def _finalize_checks(acc):
|
| 490 |
N = acc["n"]
|
| 491 |
result = {}
|
| 492 |
for label in ("chk", "nochk"):
|
|
|
|
| 10 |
|
| 11 |
import json
|
| 12 |
import time
|
| 13 |
+
from collections.abc import Iterator
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
import numpy as np
|
|
|
|
| 33 |
return result
|
| 34 |
|
| 35 |
|
| 36 |
+
def _count_legal_moves(move_ids: np.ndarray, game_lengths: np.ndarray) -> np.ndarray:
|
| 37 |
"""Legal move count per ply via bit-packed grids + promo mask."""
|
| 38 |
grid, promo_mask = engine.compute_legal_move_masks(move_ids, game_lengths)
|
| 39 |
grid_counts = np.zeros(grid.shape[:2], dtype=np.uint32)
|
|
|
|
| 385 |
("ply_81_150", 80, 150), ("ply_150_plus", 150, 9999)]
|
| 386 |
|
| 387 |
|
| 388 |
+
def _iter_position_parts(corpus: dict) -> Iterator[pl.DataFrame]:
|
| 389 |
"""Yield each position parquet part as an eager DataFrame."""
|
| 390 |
from pathlib import Path
|
| 391 |
# Find corpus dir from the LazyFrame's file path
|
|
|
|
| 396 |
yield pl.read_parquet(f)
|
| 397 |
|
| 398 |
|
| 399 |
+
def _new_accumulator() -> dict[str, int | float | np.ndarray]:
|
| 400 |
return {
|
| 401 |
"n": 0, "sum_k": 0.0, "sum_k_sq": 0.0, "k_min": 999, "k_max": 0,
|
| 402 |
"sum_inv_k": 0.0, "sum_inv_k_sq": 0.0,
|
|
|
|
| 409 |
}
|
| 410 |
|
| 411 |
|
| 412 |
+
def _accumulate(acc: dict, df: pl.DataFrame) -> None:
|
| 413 |
"""Accumulate stats from one chunk (already filtered to k > 0)."""
|
| 414 |
k = df["k"].to_numpy().astype(np.float64)
|
| 415 |
ply = df["ply"].to_numpy()
|
|
|
|
| 456 |
del k, ply, chk, inv_k, ln_k, top5
|
| 457 |
|
| 458 |
|
| 459 |
+
def _finalize_k_stats(acc: dict) -> dict[str, float | int]:
|
| 460 |
N = acc["n"]
|
| 461 |
mean = acc["sum_k"] / N
|
| 462 |
var = acc["sum_k_sq"] / N - mean ** 2
|
|
|
|
| 467 |
"min": acc["k_min"], "max": acc["k_max"]}
|
| 468 |
|
| 469 |
|
| 470 |
+
def _finalize_k_hist(acc: dict) -> dict:
|
| 471 |
h = acc["k_hist"]
|
| 472 |
nz = h > 0
|
| 473 |
return {"values": np.arange(300)[nz].tolist(), "counts": h[nz].tolist(), "total": acc["n"]}
|
| 474 |
|
| 475 |
|
| 476 |
+
def _finalize_phases(acc: dict) -> dict:
|
| 477 |
result = {}
|
| 478 |
for name, _, _ in _PHASES:
|
| 479 |
c = acc[f"{name}_n"]
|
|
|
|
| 487 |
return result
|
| 488 |
|
| 489 |
|
| 490 |
+
def _finalize_checks(acc: dict) -> dict:
|
| 491 |
N = acc["n"]
|
| 492 |
result = {}
|
| 493 |
for label in ("chk", "nochk"):
|
|
@@ -2,6 +2,7 @@
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
|
|
|
| 5 |
import torch.nn.functional as F
|
| 6 |
|
| 7 |
import chess_engine as engine
|
|
@@ -39,7 +40,7 @@ def extract_diagnostic_positions(
|
|
| 39 |
corpus: dict,
|
| 40 |
min_per_category: int = 2000,
|
| 41 |
max_per_category: int = 5000,
|
| 42 |
-
) -> dict:
|
| 43 |
"""Extract diagnostic positions from corpus.
|
| 44 |
|
| 45 |
Returns dict[category_name] -> list of dicts with:
|
|
@@ -140,7 +141,7 @@ def _term_code_to_outcome_name(tc: int, gl: int) -> str:
|
|
| 140 |
|
| 141 |
@torch.no_grad()
|
| 142 |
def evaluate_diagnostic_positions(
|
| 143 |
-
model,
|
| 144 |
positions: dict,
|
| 145 |
corpus: dict,
|
| 146 |
device: str,
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
import torch.nn.functional as F
|
| 7 |
|
| 8 |
import chess_engine as engine
|
|
|
|
| 40 |
corpus: dict,
|
| 41 |
min_per_category: int = 2000,
|
| 42 |
max_per_category: int = 5000,
|
| 43 |
+
) -> dict[str, list[dict]]:
|
| 44 |
"""Extract diagnostic positions from corpus.
|
| 45 |
|
| 46 |
Returns dict[category_name] -> list of dicts with:
|
|
|
|
| 141 |
|
| 142 |
@torch.no_grad()
|
| 143 |
def evaluate_diagnostic_positions(
|
| 144 |
+
model: nn.Module,
|
| 145 |
positions: dict,
|
| 146 |
corpus: dict,
|
| 147 |
device: str,
|
|
@@ -1,9 +1,13 @@
|
|
| 1 |
"""Autoregressive generation for outcome token signal tests (§6)."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import gc
|
|
|
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
|
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
|
| 9 |
import chess_engine as engine
|
|
@@ -12,13 +16,32 @@ from pawn.config import PAD_TOKEN, WHITE_CHECKMATES, PLY_LIMIT, CLMConfig
|
|
| 12 |
from pawn.data import _map_termination_to_outcome
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# ---------------------------------------------------------------------------
|
| 16 |
# Core autoregressive generation
|
| 17 |
# ---------------------------------------------------------------------------
|
| 18 |
|
| 19 |
|
| 20 |
def autoregressive_generate(
|
| 21 |
-
model,
|
| 22 |
outcome_token: int,
|
| 23 |
n_games: int,
|
| 24 |
device: str,
|
|
@@ -28,7 +51,7 @@ def autoregressive_generate(
|
|
| 28 |
max_seq_len: int = 256,
|
| 29 |
temperature: float = 1.0,
|
| 30 |
batch_size: int = 64,
|
| 31 |
-
) -> dict:
|
| 32 |
"""Generate games autoregressively from a trained PAWN.
|
| 33 |
|
| 34 |
Args:
|
|
@@ -74,9 +97,16 @@ def autoregressive_generate(
|
|
| 74 |
|
| 75 |
@torch.no_grad()
|
| 76 |
def _generate_batch(
|
| 77 |
-
model
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""Generate a batch of games using batch Rust engine for state management."""
|
| 81 |
cfg_vocab_size = CLMConfig.vocab_size # 4278
|
| 82 |
max_move_positions = max_seq_len - 1 # position 0 is outcome token
|
|
@@ -264,7 +294,7 @@ OUTCOME_TOKENS = {
|
|
| 264 |
|
| 265 |
|
| 266 |
def outcome_signal_test(
|
| 267 |
-
model,
|
| 268 |
device: str,
|
| 269 |
n_per_outcome: int = 1000,
|
| 270 |
mask_conditions: tuple[bool, ...] = (False, True),
|
|
@@ -354,7 +384,7 @@ def _analyze_generated_games(gen: dict, conditioned_outcome: str) -> dict:
|
|
| 354 |
|
| 355 |
|
| 356 |
def prefix_continuation_test(
|
| 357 |
-
model,
|
| 358 |
corpus: dict,
|
| 359 |
device: str,
|
| 360 |
n_per_bucket: int = 200,
|
|
@@ -475,7 +505,7 @@ def prefix_continuation_test(
|
|
| 475 |
return results
|
| 476 |
|
| 477 |
|
| 478 |
-
def _outcome_mask(term_codes, game_lengths, outcome_name):
|
| 479 |
"""Create a boolean mask for games matching the given outcome."""
|
| 480 |
if outcome_name == "WHITE_CHECKMATES":
|
| 481 |
return (term_codes == 0) & (game_lengths % 2 == 1)
|
|
@@ -503,7 +533,7 @@ POISONING_PAIRS = [
|
|
| 503 |
|
| 504 |
|
| 505 |
def poisoned_prefix_test(
|
| 506 |
-
model,
|
| 507 |
corpus: dict,
|
| 508 |
device: str,
|
| 509 |
n_per_pair: int = 500,
|
|
@@ -566,7 +596,7 @@ def poisoned_prefix_test(
|
|
| 566 |
|
| 567 |
|
| 568 |
def impossible_task_test(
|
| 569 |
-
model,
|
| 570 |
corpus: dict,
|
| 571 |
device: str,
|
| 572 |
n_per_scenario: int = 200,
|
|
@@ -652,7 +682,7 @@ def impossible_task_test(
|
|
| 652 |
|
| 653 |
|
| 654 |
def improbable_task_test(
|
| 655 |
-
model,
|
| 656 |
corpus: dict,
|
| 657 |
device: str,
|
| 658 |
n_per_scenario: int = 200,
|
|
|
|
| 1 |
"""Autoregressive generation for outcome token signal tests (§6)."""
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
import gc
|
| 6 |
+
from typing import Protocol
|
| 7 |
|
| 8 |
import numpy as np
|
| 9 |
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
import torch.nn.functional as F
|
| 12 |
|
| 13 |
import chess_engine as engine
|
|
|
|
| 16 |
from pawn.data import _map_termination_to_outcome
|
| 17 |
|
| 18 |
|
| 19 |
+
class GenerativeModel(Protocol):
|
| 20 |
+
"""Structural type for models usable in autoregressive generation."""
|
| 21 |
+
|
| 22 |
+
def eval(self) -> nn.Module: ...
|
| 23 |
+
|
| 24 |
+
def __call__(
|
| 25 |
+
self,
|
| 26 |
+
input_ids: torch.Tensor,
|
| 27 |
+
attention_mask: torch.Tensor,
|
| 28 |
+
hidden_only: bool = ...,
|
| 29 |
+
) -> tuple[torch.Tensor, list[torch.Tensor]]: ...
|
| 30 |
+
|
| 31 |
+
def forward_generate(
|
| 32 |
+
self,
|
| 33 |
+
input_ids: torch.Tensor,
|
| 34 |
+
kv_cache: list[tuple[torch.Tensor, torch.Tensor]] | None = ...,
|
| 35 |
+
) -> tuple[torch.Tensor, list[tuple[torch.Tensor, torch.Tensor]]]: ...
|
| 36 |
+
|
| 37 |
+
|
| 38 |
# ---------------------------------------------------------------------------
|
| 39 |
# Core autoregressive generation
|
| 40 |
# ---------------------------------------------------------------------------
|
| 41 |
|
| 42 |
|
| 43 |
def autoregressive_generate(
|
| 44 |
+
model: GenerativeModel,
|
| 45 |
outcome_token: int,
|
| 46 |
n_games: int,
|
| 47 |
device: str,
|
|
|
|
| 51 |
max_seq_len: int = 256,
|
| 52 |
temperature: float = 1.0,
|
| 53 |
batch_size: int = 64,
|
| 54 |
+
) -> dict[str, np.ndarray]:
|
| 55 |
"""Generate games autoregressively from a trained PAWN.
|
| 56 |
|
| 57 |
Args:
|
|
|
|
| 97 |
|
| 98 |
@torch.no_grad()
|
| 99 |
def _generate_batch(
|
| 100 |
+
model: GenerativeModel,
|
| 101 |
+
outcome_token: int,
|
| 102 |
+
n_games: int,
|
| 103 |
+
device: str,
|
| 104 |
+
mask_illegal: bool,
|
| 105 |
+
prefix_moves: np.ndarray | None,
|
| 106 |
+
prefix_lengths: np.ndarray | None,
|
| 107 |
+
max_seq_len: int,
|
| 108 |
+
temperature: float,
|
| 109 |
+
) -> dict[str, np.ndarray]:
|
| 110 |
"""Generate a batch of games using batch Rust engine for state management."""
|
| 111 |
cfg_vocab_size = CLMConfig.vocab_size # 4278
|
| 112 |
max_move_positions = max_seq_len - 1 # position 0 is outcome token
|
|
|
|
| 294 |
|
| 295 |
|
| 296 |
def outcome_signal_test(
|
| 297 |
+
model: GenerativeModel,
|
| 298 |
device: str,
|
| 299 |
n_per_outcome: int = 1000,
|
| 300 |
mask_conditions: tuple[bool, ...] = (False, True),
|
|
|
|
| 384 |
|
| 385 |
|
| 386 |
def prefix_continuation_test(
|
| 387 |
+
model: GenerativeModel,
|
| 388 |
corpus: dict,
|
| 389 |
device: str,
|
| 390 |
n_per_bucket: int = 200,
|
|
|
|
| 505 |
return results
|
| 506 |
|
| 507 |
|
| 508 |
+
def _outcome_mask(term_codes: np.ndarray, game_lengths: np.ndarray, outcome_name: str) -> np.ndarray:
|
| 509 |
"""Create a boolean mask for games matching the given outcome."""
|
| 510 |
if outcome_name == "WHITE_CHECKMATES":
|
| 511 |
return (term_codes == 0) & (game_lengths % 2 == 1)
|
|
|
|
| 533 |
|
| 534 |
|
| 535 |
def poisoned_prefix_test(
|
| 536 |
+
model: GenerativeModel,
|
| 537 |
corpus: dict,
|
| 538 |
device: str,
|
| 539 |
n_per_pair: int = 500,
|
|
|
|
| 596 |
|
| 597 |
|
| 598 |
def impossible_task_test(
|
| 599 |
+
model: GenerativeModel,
|
| 600 |
corpus: dict,
|
| 601 |
device: str,
|
| 602 |
n_per_scenario: int = 200,
|
|
|
|
| 682 |
|
| 683 |
|
| 684 |
def improbable_task_test(
|
| 685 |
+
model: GenerativeModel,
|
| 686 |
corpus: dict,
|
| 687 |
device: str,
|
| 688 |
n_per_scenario: int = 200,
|
|
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
|
|
|
| 8 |
|
| 9 |
import chess_engine as engine
|
| 10 |
|
|
@@ -119,7 +120,7 @@ def _extract_elos_from_pgn(pgn_path: Path, max_games: int) -> list[tuple[int, in
|
|
| 119 |
|
| 120 |
@torch.no_grad()
|
| 121 |
def evaluate_on_lichess(
|
| 122 |
-
model,
|
| 123 |
lichess_data: dict,
|
| 124 |
device: str,
|
| 125 |
max_seq_len: int = 256,
|
|
|
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
|
| 10 |
import chess_engine as engine
|
| 11 |
|
|
|
|
| 120 |
|
| 121 |
@torch.no_grad()
|
| 122 |
def evaluate_on_lichess(
|
| 123 |
+
model: nn.Module,
|
| 124 |
lichess_data: dict,
|
| 125 |
device: str,
|
| 126 |
max_seq_len: int = 256,
|
|
@@ -295,7 +295,7 @@ def _extract_targets(
|
|
| 295 |
# ---------------------------------------------------------------------------
|
| 296 |
|
| 297 |
|
| 298 |
-
def _compute_loss(logits, targets, loss_type, n_outputs):
|
| 299 |
if loss_type == "ce":
|
| 300 |
return F.cross_entropy(logits, targets)
|
| 301 |
elif loss_type == "ce_per_square":
|
|
@@ -308,7 +308,7 @@ def _compute_loss(logits, targets, loss_type, n_outputs):
|
|
| 308 |
raise ValueError(f"Unknown loss type: {loss_type}")
|
| 309 |
|
| 310 |
|
| 311 |
-
def _compute_accuracy(logits, targets, loss_type, n_outputs):
|
| 312 |
if loss_type == "ce":
|
| 313 |
preds = logits.argmax(dim=-1)
|
| 314 |
return (preds == targets).float().mean().item()
|
|
@@ -328,7 +328,7 @@ def _compute_accuracy(logits, targets, loss_type, n_outputs):
|
|
| 328 |
raise ValueError(f"Unknown loss type: {loss_type}")
|
| 329 |
|
| 330 |
|
| 331 |
-
def _compute_mae(logits, targets):
|
| 332 |
"""Mean absolute error for regression probes."""
|
| 333 |
return (logits - targets).abs().mean().item()
|
| 334 |
|
|
@@ -427,7 +427,7 @@ def train_single_probe(
|
|
| 427 |
)
|
| 428 |
|
| 429 |
|
| 430 |
-
def _eval_in_batches(probe, h, t, loss_type, n_outputs, device, batch_size):
|
| 431 |
"""Accuracy in mini-batches."""
|
| 432 |
total_correct = 0.0
|
| 433 |
total = 0
|
|
@@ -441,7 +441,7 @@ def _eval_in_batches(probe, h, t, loss_type, n_outputs, device, batch_size):
|
|
| 441 |
return total_correct / total if total > 0 else 0.0
|
| 442 |
|
| 443 |
|
| 444 |
-
def _eval_loss_in_batches(probe, h, t, loss_type, n_outputs, device, batch_size):
|
| 445 |
"""Loss in mini-batches (returns scalar)."""
|
| 446 |
total_loss = 0.0
|
| 447 |
total = 0
|
|
@@ -455,7 +455,7 @@ def _eval_loss_in_batches(probe, h, t, loss_type, n_outputs, device, batch_size)
|
|
| 455 |
return total_loss / total if total > 0 else 0.0
|
| 456 |
|
| 457 |
|
| 458 |
-
def _eval_mae_in_batches(probe, h, t, device, batch_size):
|
| 459 |
"""MAE in mini-batches."""
|
| 460 |
total_ae = 0.0
|
| 461 |
total = 0
|
|
|
|
| 295 |
# ---------------------------------------------------------------------------
|
| 296 |
|
| 297 |
|
| 298 |
+
def _compute_loss(logits: torch.Tensor, targets: torch.Tensor, loss_type: str, n_outputs: int) -> torch.Tensor:
|
| 299 |
if loss_type == "ce":
|
| 300 |
return F.cross_entropy(logits, targets)
|
| 301 |
elif loss_type == "ce_per_square":
|
|
|
|
| 308 |
raise ValueError(f"Unknown loss type: {loss_type}")
|
| 309 |
|
| 310 |
|
| 311 |
+
def _compute_accuracy(logits: torch.Tensor, targets: torch.Tensor, loss_type: str, n_outputs: int) -> float:
|
| 312 |
if loss_type == "ce":
|
| 313 |
preds = logits.argmax(dim=-1)
|
| 314 |
return (preds == targets).float().mean().item()
|
|
|
|
| 328 |
raise ValueError(f"Unknown loss type: {loss_type}")
|
| 329 |
|
| 330 |
|
| 331 |
+
def _compute_mae(logits: torch.Tensor, targets: torch.Tensor) -> float:
|
| 332 |
"""Mean absolute error for regression probes."""
|
| 333 |
return (logits - targets).abs().mean().item()
|
| 334 |
|
|
|
|
| 427 |
)
|
| 428 |
|
| 429 |
|
| 430 |
+
def _eval_in_batches(probe: LinearProbe, h: torch.Tensor, t: torch.Tensor, loss_type: str, n_outputs: int, device: str, batch_size: int) -> float:
|
| 431 |
"""Accuracy in mini-batches."""
|
| 432 |
total_correct = 0.0
|
| 433 |
total = 0
|
|
|
|
| 441 |
return total_correct / total if total > 0 else 0.0
|
| 442 |
|
| 443 |
|
| 444 |
+
def _eval_loss_in_batches(probe: LinearProbe, h: torch.Tensor, t: torch.Tensor, loss_type: str, n_outputs: int, device: str, batch_size: int) -> float:
|
| 445 |
"""Loss in mini-batches (returns scalar)."""
|
| 446 |
total_loss = 0.0
|
| 447 |
total = 0
|
|
|
|
| 455 |
return total_loss / total if total > 0 else 0.0
|
| 456 |
|
| 457 |
|
| 458 |
+
def _eval_mae_in_batches(probe: LinearProbe, h: torch.Tensor, t: torch.Tensor, device: str, batch_size: int) -> float:
|
| 459 |
"""MAE in mini-batches."""
|
| 460 |
total_ae = 0.0
|
| 461 |
total = 0
|
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import numpy as np
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
import matplotlib.ticker as mticker
|
|
|
|
| 6 |
import seaborn as sns
|
| 7 |
|
| 8 |
# Consistent style
|
|
@@ -26,11 +27,12 @@ GRID_PAWN_BASELINES = {
|
|
| 26 |
# ---------------------------------------------------------------------------
|
| 27 |
|
| 28 |
|
| 29 |
-
def plot_game_length_distribution(stats: dict, ax=None) -> plt.Figure:
|
| 30 |
"""Histogram of game lengths."""
|
| 31 |
fig = None
|
| 32 |
if ax is None:
|
| 33 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
|
|
|
| 34 |
counts = stats["game_length"]["histogram_counts"]
|
| 35 |
edges = stats["game_length"]["histogram_edges"]
|
| 36 |
centers = [(edges[i] + edges[i + 1]) / 2 for i in range(len(counts))]
|
|
@@ -44,11 +46,12 @@ def plot_game_length_distribution(stats: dict, ax=None) -> plt.Figure:
|
|
| 44 |
return fig or ax.figure
|
| 45 |
|
| 46 |
|
| 47 |
-
def plot_legal_move_distribution(bounds: dict, ax=None) -> plt.Figure:
|
| 48 |
"""Histogram of legal move counts (K) from pre-computed histogram data."""
|
| 49 |
fig = None
|
| 50 |
if ax is None:
|
| 51 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
|
|
|
| 52 |
k_hist = bounds["k_histogram"]
|
| 53 |
k_vals = np.array(k_hist["values"])
|
| 54 |
k_counts = np.array(k_hist["counts"], dtype=np.float64)
|
|
@@ -64,11 +67,12 @@ def plot_legal_move_distribution(bounds: dict, ax=None) -> plt.Figure:
|
|
| 64 |
return fig or ax.figure
|
| 65 |
|
| 66 |
|
| 67 |
-
def plot_outcome_rates(stats: dict, ax=None) -> plt.Figure:
|
| 68 |
"""Bar chart of outcome base rates."""
|
| 69 |
fig = None
|
| 70 |
if ax is None:
|
| 71 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
|
|
|
| 72 |
rates = stats["outcome_rates"]
|
| 73 |
names = list(rates.keys())
|
| 74 |
values = [rates[n] * 100 for n in names]
|
|
@@ -82,11 +86,12 @@ def plot_outcome_rates(stats: dict, ax=None) -> plt.Figure:
|
|
| 82 |
return fig or ax.figure
|
| 83 |
|
| 84 |
|
| 85 |
-
def plot_k_by_phase(bounds: dict, ax=None) -> plt.Figure:
|
| 86 |
"""E[1/K] by game phase."""
|
| 87 |
fig = None
|
| 88 |
if ax is None:
|
| 89 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
|
|
|
| 90 |
phase_data = bounds["phase_bounds"]
|
| 91 |
names = list(phase_data.keys())
|
| 92 |
e_inv_k = [phase_data[n]["e_1_over_k"] * 100 for n in names]
|
|
@@ -104,11 +109,12 @@ def plot_k_by_phase(bounds: dict, ax=None) -> plt.Figure:
|
|
| 104 |
return fig or ax.figure
|
| 105 |
|
| 106 |
|
| 107 |
-
def plot_prefix_histogram(sanity: dict, ax=None) -> plt.Figure:
|
| 108 |
"""Histogram of common prefix lengths."""
|
| 109 |
fig = None
|
| 110 |
if ax is None:
|
| 111 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
|
|
|
| 112 |
hist = sanity["prefix_length_histogram"]
|
| 113 |
ks = sorted(hist.keys())
|
| 114 |
vs = [hist[k] for k in ks]
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
import matplotlib.ticker as mticker
|
| 6 |
+
from matplotlib.axes import Axes
|
| 7 |
import seaborn as sns
|
| 8 |
|
| 9 |
# Consistent style
|
|
|
|
| 27 |
# ---------------------------------------------------------------------------
|
| 28 |
|
| 29 |
|
| 30 |
+
def plot_game_length_distribution(stats: dict, ax: Axes | None = None) -> plt.Figure:
|
| 31 |
"""Histogram of game lengths."""
|
| 32 |
fig = None
|
| 33 |
if ax is None:
|
| 34 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
| 35 |
+
assert ax is not None
|
| 36 |
counts = stats["game_length"]["histogram_counts"]
|
| 37 |
edges = stats["game_length"]["histogram_edges"]
|
| 38 |
centers = [(edges[i] + edges[i + 1]) / 2 for i in range(len(counts))]
|
|
|
|
| 46 |
return fig or ax.figure
|
| 47 |
|
| 48 |
|
| 49 |
+
def plot_legal_move_distribution(bounds: dict, ax: Axes | None = None) -> plt.Figure:
|
| 50 |
"""Histogram of legal move counts (K) from pre-computed histogram data."""
|
| 51 |
fig = None
|
| 52 |
if ax is None:
|
| 53 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
| 54 |
+
assert ax is not None
|
| 55 |
k_hist = bounds["k_histogram"]
|
| 56 |
k_vals = np.array(k_hist["values"])
|
| 57 |
k_counts = np.array(k_hist["counts"], dtype=np.float64)
|
|
|
|
| 67 |
return fig or ax.figure
|
| 68 |
|
| 69 |
|
| 70 |
+
def plot_outcome_rates(stats: dict, ax: Axes | None = None) -> plt.Figure:
|
| 71 |
"""Bar chart of outcome base rates."""
|
| 72 |
fig = None
|
| 73 |
if ax is None:
|
| 74 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
| 75 |
+
assert ax is not None
|
| 76 |
rates = stats["outcome_rates"]
|
| 77 |
names = list(rates.keys())
|
| 78 |
values = [rates[n] * 100 for n in names]
|
|
|
|
| 86 |
return fig or ax.figure
|
| 87 |
|
| 88 |
|
| 89 |
+
def plot_k_by_phase(bounds: dict, ax: Axes | None = None) -> plt.Figure:
|
| 90 |
"""E[1/K] by game phase."""
|
| 91 |
fig = None
|
| 92 |
if ax is None:
|
| 93 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
| 94 |
+
assert ax is not None
|
| 95 |
phase_data = bounds["phase_bounds"]
|
| 96 |
names = list(phase_data.keys())
|
| 97 |
e_inv_k = [phase_data[n]["e_1_over_k"] * 100 for n in names]
|
|
|
|
| 109 |
return fig or ax.figure
|
| 110 |
|
| 111 |
|
| 112 |
+
def plot_prefix_histogram(sanity: dict, ax: Axes | None = None) -> plt.Figure:
|
| 113 |
"""Histogram of common prefix lengths."""
|
| 114 |
fig = None
|
| 115 |
if ax is None:
|
| 116 |
fig, ax = plt.subplots(figsize=FIGSIZE)
|
| 117 |
+
assert ax is not None
|
| 118 |
hist = sanity["prefix_length_histogram"]
|
| 119 |
ks = sorted(hist.keys())
|
| 120 |
vs = [hist[k] for k in ks]
|
|
@@ -20,7 +20,12 @@ from __future__ import annotations
|
|
| 20 |
|
| 21 |
import gc
|
| 22 |
import multiprocessing as mp
|
|
|
|
| 23 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# Use "spawn" so the child gets a clean process with no inherited GPU state.
|
| 26 |
_ctx = mp.get_context("spawn")
|
|
@@ -31,11 +36,11 @@ _ctx = mp.get_context("spawn")
|
|
| 31 |
# ---------------------------------------------------------------------------
|
| 32 |
|
| 33 |
|
| 34 |
-
def _worker_entry(fn, args, kwargs):
|
| 35 |
return fn(*args, **kwargs)
|
| 36 |
|
| 37 |
|
| 38 |
-
def run_in_worker(fn, *args, timeout: float | None = None, **kwargs):
|
| 39 |
"""Run fn(*args, **kwargs) in an isolated worker process.
|
| 40 |
|
| 41 |
On KeyboardInterrupt, the worker is terminated and the interrupt is
|
|
@@ -55,7 +60,7 @@ def run_in_worker(fn, *args, timeout: float | None = None, **kwargs):
|
|
| 55 |
# ---------------------------------------------------------------------------
|
| 56 |
|
| 57 |
|
| 58 |
-
def _load_model(checkpoint_path: str, device: str):
|
| 59 |
"""Load and freeze a PAWNCLM checkpoint. Runs inside worker processes."""
|
| 60 |
import torch
|
| 61 |
from pawn.config import CLMConfig
|
|
@@ -83,8 +88,8 @@ def _load_corpus(corpus_dir: str) -> dict:
|
|
| 83 |
# ---------------------------------------------------------------------------
|
| 84 |
|
| 85 |
|
| 86 |
-
def _probes_worker(checkpoint_path, device, n_train, n_val
|
| 87 |
-
seed_train, seed_val):
|
| 88 |
from pawn.eval_suite.probes import extract_probe_data, train_all_probes
|
| 89 |
model = _load_model(checkpoint_path, device)
|
| 90 |
train_data = extract_probe_data(n_train, max_ply=256, seed=seed_train)
|
|
@@ -94,7 +99,7 @@ def _probes_worker(checkpoint_path, device, n_train, n_val, n_epochs,
|
|
| 94 |
|
| 95 |
|
| 96 |
def run_probes(
|
| 97 |
-
checkpoint_path,
|
| 98 |
device: str,
|
| 99 |
n_train: int = 5_000,
|
| 100 |
n_val: int = 1_000,
|
|
@@ -110,7 +115,8 @@ def run_probes(
|
|
| 110 |
)
|
| 111 |
|
| 112 |
|
| 113 |
-
def _signal_test_worker(checkpoint_path, device
|
|
|
|
| 114 |
from pawn.eval_suite.generation import outcome_signal_test
|
| 115 |
model = _load_model(checkpoint_path, device)
|
| 116 |
return outcome_signal_test(model, device, n_per_outcome=n_per_outcome,
|
|
@@ -118,7 +124,7 @@ def _signal_test_worker(checkpoint_path, device, n_per_outcome, mask_conditions)
|
|
| 118 |
|
| 119 |
|
| 120 |
def run_outcome_signal_test(
|
| 121 |
-
checkpoint_path,
|
| 122 |
device: str,
|
| 123 |
n_per_outcome: int = 1000,
|
| 124 |
mask_conditions: tuple[bool, ...] = (False, True),
|
|
@@ -130,8 +136,9 @@ def run_outcome_signal_test(
|
|
| 130 |
)
|
| 131 |
|
| 132 |
|
| 133 |
-
def _prefix_continuation_worker(checkpoint_path, corpus_dir, device,
|
| 134 |
-
n_per_bucket
|
|
|
|
| 135 |
from pawn.eval_suite.generation import prefix_continuation_test
|
| 136 |
model = _load_model(checkpoint_path, device)
|
| 137 |
corpus = _load_corpus(corpus_dir)
|
|
@@ -142,8 +149,8 @@ def _prefix_continuation_worker(checkpoint_path, corpus_dir, device,
|
|
| 142 |
|
| 143 |
|
| 144 |
def run_prefix_continuation_test(
|
| 145 |
-
checkpoint_path,
|
| 146 |
-
corpus_dir,
|
| 147 |
device: str,
|
| 148 |
n_per_bucket: int = 200,
|
| 149 |
prefix_pcts: tuple[float, ...] = (0.1, 0.5, 0.9),
|
|
@@ -157,8 +164,8 @@ def run_prefix_continuation_test(
|
|
| 157 |
)
|
| 158 |
|
| 159 |
|
| 160 |
-
def _poisoned_prefix_worker(checkpoint_path, corpus_dir, device,
|
| 161 |
-
n_per_pair, prefix_pct):
|
| 162 |
from pawn.eval_suite.generation import poisoned_prefix_test
|
| 163 |
model = _load_model(checkpoint_path, device)
|
| 164 |
corpus = _load_corpus(corpus_dir)
|
|
@@ -167,8 +174,8 @@ def _poisoned_prefix_worker(checkpoint_path, corpus_dir, device,
|
|
| 167 |
|
| 168 |
|
| 169 |
def run_poisoned_prefix_test(
|
| 170 |
-
checkpoint_path,
|
| 171 |
-
corpus_dir,
|
| 172 |
device: str,
|
| 173 |
n_per_pair: int = 500,
|
| 174 |
prefix_pct: float = 0.5,
|
|
@@ -180,7 +187,8 @@ def run_poisoned_prefix_test(
|
|
| 180 |
)
|
| 181 |
|
| 182 |
|
| 183 |
-
def _impossible_task_worker(checkpoint_path, corpus_dir
|
|
|
|
| 184 |
from pawn.eval_suite.generation import impossible_task_test
|
| 185 |
model = _load_model(checkpoint_path, device)
|
| 186 |
corpus = _load_corpus(corpus_dir)
|
|
@@ -188,8 +196,8 @@ def _impossible_task_worker(checkpoint_path, corpus_dir, device, n_per_scenario)
|
|
| 188 |
|
| 189 |
|
| 190 |
def run_impossible_task_test(
|
| 191 |
-
checkpoint_path,
|
| 192 |
-
corpus_dir,
|
| 193 |
device: str,
|
| 194 |
n_per_scenario: int = 200,
|
| 195 |
) -> dict:
|
|
@@ -200,7 +208,8 @@ def run_impossible_task_test(
|
|
| 200 |
)
|
| 201 |
|
| 202 |
|
| 203 |
-
def _improbable_task_worker(checkpoint_path, corpus_dir
|
|
|
|
| 204 |
from pawn.eval_suite.generation import improbable_task_test
|
| 205 |
model = _load_model(checkpoint_path, device)
|
| 206 |
corpus = _load_corpus(corpus_dir)
|
|
@@ -208,8 +217,8 @@ def _improbable_task_worker(checkpoint_path, corpus_dir, device, n_per_scenario)
|
|
| 208 |
|
| 209 |
|
| 210 |
def run_improbable_task_test(
|
| 211 |
-
checkpoint_path,
|
| 212 |
-
corpus_dir,
|
| 213 |
device: str,
|
| 214 |
n_per_scenario: int = 200,
|
| 215 |
) -> dict:
|
|
@@ -220,8 +229,9 @@ def run_improbable_task_test(
|
|
| 220 |
)
|
| 221 |
|
| 222 |
|
| 223 |
-
def _diagnostic_worker(checkpoint_path, corpus_dir, device
|
| 224 |
-
|
|
|
|
| 225 |
from pawn.eval_suite.diagnostics import (
|
| 226 |
extract_diagnostic_positions, evaluate_diagnostic_positions,
|
| 227 |
)
|
|
@@ -240,8 +250,8 @@ def _diagnostic_worker(checkpoint_path, corpus_dir, device, min_per_category,
|
|
| 240 |
|
| 241 |
|
| 242 |
def run_diagnostic_eval(
|
| 243 |
-
checkpoint_path,
|
| 244 |
-
corpus_dir,
|
| 245 |
device: str,
|
| 246 |
min_per_category: int = 2000,
|
| 247 |
max_per_category: int = 5000,
|
|
|
|
| 20 |
|
| 21 |
import gc
|
| 22 |
import multiprocessing as mp
|
| 23 |
+
from collections.abc import Callable
|
| 24 |
from pathlib import Path
|
| 25 |
+
from typing import Any, TYPE_CHECKING
|
| 26 |
+
|
| 27 |
+
if TYPE_CHECKING:
|
| 28 |
+
from pawn.model import PAWNCLM
|
| 29 |
|
| 30 |
# Use "spawn" so the child gets a clean process with no inherited GPU state.
|
| 31 |
_ctx = mp.get_context("spawn")
|
|
|
|
| 36 |
# ---------------------------------------------------------------------------
|
| 37 |
|
| 38 |
|
| 39 |
+
def _worker_entry(fn: Callable[..., Any], args: tuple, kwargs: dict) -> Any:
|
| 40 |
return fn(*args, **kwargs)
|
| 41 |
|
| 42 |
|
| 43 |
+
def run_in_worker(fn: Callable[..., Any], *args: Any, timeout: float | None = None, **kwargs: Any) -> Any:
|
| 44 |
"""Run fn(*args, **kwargs) in an isolated worker process.
|
| 45 |
|
| 46 |
On KeyboardInterrupt, the worker is terminated and the interrupt is
|
|
|
|
| 60 |
# ---------------------------------------------------------------------------
|
| 61 |
|
| 62 |
|
| 63 |
+
def _load_model(checkpoint_path: str, device: str) -> PAWNCLM:
|
| 64 |
"""Load and freeze a PAWNCLM checkpoint. Runs inside worker processes."""
|
| 65 |
import torch
|
| 66 |
from pawn.config import CLMConfig
|
|
|
|
| 88 |
# ---------------------------------------------------------------------------
|
| 89 |
|
| 90 |
|
| 91 |
+
def _probes_worker(checkpoint_path: str, device: str, n_train: int, n_val: int,
|
| 92 |
+
n_epochs: int, seed_train: int, seed_val: int) -> dict:
|
| 93 |
from pawn.eval_suite.probes import extract_probe_data, train_all_probes
|
| 94 |
model = _load_model(checkpoint_path, device)
|
| 95 |
train_data = extract_probe_data(n_train, max_ply=256, seed=seed_train)
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
def run_probes(
|
| 102 |
+
checkpoint_path: str | Path,
|
| 103 |
device: str,
|
| 104 |
n_train: int = 5_000,
|
| 105 |
n_val: int = 1_000,
|
|
|
|
| 115 |
)
|
| 116 |
|
| 117 |
|
| 118 |
+
def _signal_test_worker(checkpoint_path: str, device: str, n_per_outcome: int,
|
| 119 |
+
mask_conditions: list[bool]) -> dict:
|
| 120 |
from pawn.eval_suite.generation import outcome_signal_test
|
| 121 |
model = _load_model(checkpoint_path, device)
|
| 122 |
return outcome_signal_test(model, device, n_per_outcome=n_per_outcome,
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def run_outcome_signal_test(
|
| 127 |
+
checkpoint_path: str | Path,
|
| 128 |
device: str,
|
| 129 |
n_per_outcome: int = 1000,
|
| 130 |
mask_conditions: tuple[bool, ...] = (False, True),
|
|
|
|
| 136 |
)
|
| 137 |
|
| 138 |
|
| 139 |
+
def _prefix_continuation_worker(checkpoint_path: str, corpus_dir: str, device: str,
|
| 140 |
+
n_per_bucket: int, prefix_pcts: list[float],
|
| 141 |
+
absolute_plies: list[int]) -> dict:
|
| 142 |
from pawn.eval_suite.generation import prefix_continuation_test
|
| 143 |
model = _load_model(checkpoint_path, device)
|
| 144 |
corpus = _load_corpus(corpus_dir)
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
def run_prefix_continuation_test(
|
| 152 |
+
checkpoint_path: str | Path,
|
| 153 |
+
corpus_dir: str | Path,
|
| 154 |
device: str,
|
| 155 |
n_per_bucket: int = 200,
|
| 156 |
prefix_pcts: tuple[float, ...] = (0.1, 0.5, 0.9),
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
|
| 167 |
+
def _poisoned_prefix_worker(checkpoint_path: str, corpus_dir: str, device: str,
|
| 168 |
+
n_per_pair: int, prefix_pct: float) -> dict:
|
| 169 |
from pawn.eval_suite.generation import poisoned_prefix_test
|
| 170 |
model = _load_model(checkpoint_path, device)
|
| 171 |
corpus = _load_corpus(corpus_dir)
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
def run_poisoned_prefix_test(
|
| 177 |
+
checkpoint_path: str | Path,
|
| 178 |
+
corpus_dir: str | Path,
|
| 179 |
device: str,
|
| 180 |
n_per_pair: int = 500,
|
| 181 |
prefix_pct: float = 0.5,
|
|
|
|
| 187 |
)
|
| 188 |
|
| 189 |
|
| 190 |
+
def _impossible_task_worker(checkpoint_path: str, corpus_dir: str, device: str,
|
| 191 |
+
n_per_scenario: int) -> dict:
|
| 192 |
from pawn.eval_suite.generation import impossible_task_test
|
| 193 |
model = _load_model(checkpoint_path, device)
|
| 194 |
corpus = _load_corpus(corpus_dir)
|
|
|
|
| 196 |
|
| 197 |
|
| 198 |
def run_impossible_task_test(
|
| 199 |
+
checkpoint_path: str | Path,
|
| 200 |
+
corpus_dir: str | Path,
|
| 201 |
device: str,
|
| 202 |
n_per_scenario: int = 200,
|
| 203 |
) -> dict:
|
|
|
|
| 208 |
)
|
| 209 |
|
| 210 |
|
| 211 |
+
def _improbable_task_worker(checkpoint_path: str, corpus_dir: str, device: str,
|
| 212 |
+
n_per_scenario: int) -> dict:
|
| 213 |
from pawn.eval_suite.generation import improbable_task_test
|
| 214 |
model = _load_model(checkpoint_path, device)
|
| 215 |
corpus = _load_corpus(corpus_dir)
|
|
|
|
| 217 |
|
| 218 |
|
| 219 |
def run_improbable_task_test(
|
| 220 |
+
checkpoint_path: str | Path,
|
| 221 |
+
corpus_dir: str | Path,
|
| 222 |
device: str,
|
| 223 |
n_per_scenario: int = 200,
|
| 224 |
) -> dict:
|
|
|
|
| 229 |
)
|
| 230 |
|
| 231 |
|
| 232 |
+
def _diagnostic_worker(checkpoint_path: str, corpus_dir: str, device: str,
|
| 233 |
+
min_per_category: int, max_per_category: int,
|
| 234 |
+
n_samples: int, batch_size: int) -> dict:
|
| 235 |
from pawn.eval_suite.diagnostics import (
|
| 236 |
extract_diagnostic_positions, evaluate_diagnostic_positions,
|
| 237 |
)
|
|
|
|
| 250 |
|
| 251 |
|
| 252 |
def run_diagnostic_eval(
|
| 253 |
+
checkpoint_path: str | Path,
|
| 254 |
+
corpus_dir: str | Path,
|
| 255 |
device: str,
|
| 256 |
min_per_category: int = 2000,
|
| 257 |
max_per_category: int = 5000,
|
|
@@ -308,10 +308,10 @@ class LichessDataset(torch.utils.data.Dataset):
|
|
| 308 |
self.game_lengths = torch.from_numpy(np.array(self.game_lengths)).share_memory_()
|
| 309 |
return self
|
| 310 |
|
| 311 |
-
def __len__(self):
|
| 312 |
return len(self.input_ids)
|
| 313 |
|
| 314 |
-
def __getitem__(self, idx):
|
| 315 |
return {
|
| 316 |
"input_ids": self.input_ids[idx],
|
| 317 |
"targets": self.targets[idx],
|
|
|
|
| 308 |
self.game_lengths = torch.from_numpy(np.array(self.game_lengths)).share_memory_()
|
| 309 |
return self
|
| 310 |
|
| 311 |
+
def __len__(self) -> int:
|
| 312 |
return len(self.input_ids)
|
| 313 |
|
| 314 |
+
def __getitem__(self, idx: int) -> dict[str, torch.Tensor | int]:
|
| 315 |
return {
|
| 316 |
"input_ids": self.input_ids[idx],
|
| 317 |
"targets": self.targets[idx],
|
|
@@ -72,7 +72,7 @@ class MetricsLogger:
|
|
| 72 |
record_type: Record type (train, eval, batch, etc.)
|
| 73 |
include_resources: Whether to include memory/CPU stats
|
| 74 |
"""
|
| 75 |
-
record = {"type": record_type}
|
| 76 |
|
| 77 |
if step is not None:
|
| 78 |
record["step"] = step
|
|
@@ -119,14 +119,14 @@ class MetricsLogger:
|
|
| 119 |
def close(self) -> None:
|
| 120 |
self._file.close()
|
| 121 |
|
| 122 |
-
def __enter__(self):
|
| 123 |
return self
|
| 124 |
|
| 125 |
-
def __exit__(self, *args):
|
| 126 |
self.close()
|
| 127 |
|
| 128 |
|
| 129 |
-
def _sanitize(obj):
|
| 130 |
"""Replace NaN/Inf with None for valid JSON."""
|
| 131 |
if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)):
|
| 132 |
return None
|
|
|
|
| 72 |
record_type: Record type (train, eval, batch, etc.)
|
| 73 |
include_resources: Whether to include memory/CPU stats
|
| 74 |
"""
|
| 75 |
+
record: dict[str, object] = {"type": record_type}
|
| 76 |
|
| 77 |
if step is not None:
|
| 78 |
record["step"] = step
|
|
|
|
| 119 |
def close(self) -> None:
|
| 120 |
self._file.close()
|
| 121 |
|
| 122 |
+
def __enter__(self) -> "MetricsLogger":
|
| 123 |
return self
|
| 124 |
|
| 125 |
+
def __exit__(self, *args: object) -> None:
|
| 126 |
self.close()
|
| 127 |
|
| 128 |
|
| 129 |
+
def _sanitize(obj: object) -> object:
|
| 130 |
"""Replace NaN/Inf with None for valid JSON."""
|
| 131 |
if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)):
|
| 132 |
return None
|
|
@@ -179,6 +179,11 @@ class SwiGLUFFN(nn.Module):
|
|
| 179 |
|
| 180 |
|
| 181 |
class TransformerBlock(nn.Module):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
def __init__(self, cfg: CLMConfig):
|
| 183 |
super().__init__()
|
| 184 |
self.attn_norm = RMSNorm(cfg.d_model)
|
|
@@ -220,6 +225,8 @@ class CLMEmbedding(nn.Module):
|
|
| 220 |
PAD and outcome tokens use standalone embeddings.
|
| 221 |
"""
|
| 222 |
|
|
|
|
|
|
|
| 223 |
def __init__(self, cfg: CLMConfig):
|
| 224 |
super().__init__()
|
| 225 |
self.d_model = cfg.d_model
|
|
@@ -273,6 +280,14 @@ class PAWNCLM(nn.Module):
|
|
| 273 |
full vocabulary. No factored output head, no grid, no BCE.
|
| 274 |
"""
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
def __init__(self, cfg: CLMConfig):
|
| 277 |
super().__init__()
|
| 278 |
self.cfg = cfg
|
|
@@ -300,6 +315,10 @@ class PAWNCLM(nn.Module):
|
|
| 300 |
|
| 301 |
self._init_weights()
|
| 302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
def _init_weights(self):
|
| 304 |
for p in self.parameters():
|
| 305 |
if p.dim() > 1:
|
|
@@ -425,9 +444,9 @@ class PAWNCLM(nn.Module):
|
|
| 425 |
rope_sin = self.rope_sin[:, :, :T_new, :]
|
| 426 |
|
| 427 |
new_kv_cache = []
|
| 428 |
-
for i
|
| 429 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 430 |
-
x, new_cache =
|
| 431 |
new_kv_cache.append(new_cache)
|
| 432 |
|
| 433 |
x = self.final_norm(x[:, -1:, :])
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
class TransformerBlock(nn.Module):
|
| 182 |
+
attn_norm: RMSNorm
|
| 183 |
+
attn: Attention
|
| 184 |
+
ffn_norm: RMSNorm
|
| 185 |
+
ffn: SwiGLUFFN
|
| 186 |
+
|
| 187 |
def __init__(self, cfg: CLMConfig):
|
| 188 |
super().__init__()
|
| 189 |
self.attn_norm = RMSNorm(cfg.d_model)
|
|
|
|
| 225 |
PAD and outcome tokens use standalone embeddings.
|
| 226 |
"""
|
| 227 |
|
| 228 |
+
decomp_table: torch.Tensor
|
| 229 |
+
|
| 230 |
def __init__(self, cfg: CLMConfig):
|
| 231 |
super().__init__()
|
| 232 |
self.d_model = cfg.d_model
|
|
|
|
| 280 |
full vocabulary. No factored output head, no grid, no BCE.
|
| 281 |
"""
|
| 282 |
|
| 283 |
+
rope_cos: torch.Tensor
|
| 284 |
+
rope_sin: torch.Tensor
|
| 285 |
+
causal_mask: torch.Tensor
|
| 286 |
+
embed: CLMEmbedding
|
| 287 |
+
layers: nn.ModuleList
|
| 288 |
+
final_norm: RMSNorm
|
| 289 |
+
lm_head: nn.Linear
|
| 290 |
+
|
| 291 |
def __init__(self, cfg: CLMConfig):
|
| 292 |
super().__init__()
|
| 293 |
self.cfg = cfg
|
|
|
|
| 315 |
|
| 316 |
self._init_weights()
|
| 317 |
|
| 318 |
+
def get_block(self, i: int) -> TransformerBlock:
|
| 319 |
+
"""Typed accessor for transformer layers (avoids ModuleList type erasure)."""
|
| 320 |
+
return self.layers[i] # type: ignore[return-value]
|
| 321 |
+
|
| 322 |
def _init_weights(self):
|
| 323 |
for p in self.parameters():
|
| 324 |
if p.dim() > 1:
|
|
|
|
| 444 |
rope_sin = self.rope_sin[:, :, :T_new, :]
|
| 445 |
|
| 446 |
new_kv_cache = []
|
| 447 |
+
for i in range(len(self.layers)):
|
| 448 |
layer_cache = kv_cache[i] if kv_cache is not None else None
|
| 449 |
+
x, new_cache = self.get_block(i).forward_kv(x, rope_cos, rope_sin, layer_cache)
|
| 450 |
new_kv_cache.append(new_cache)
|
| 451 |
|
| 452 |
x = self.final_norm(x[:, -1:, :])
|
|
@@ -38,12 +38,12 @@ class CosineWithWarmup:
|
|
| 38 |
self._step = 0
|
| 39 |
self._apply_lr(0)
|
| 40 |
|
| 41 |
-
def _apply_lr(self, step: int):
|
| 42 |
lr_scale = self._compute_lr_scale(step)
|
| 43 |
for pg, base_lr in zip(self.optimizer.param_groups, self.base_lrs, strict=True):
|
| 44 |
pg["lr"] = base_lr * lr_scale
|
| 45 |
|
| 46 |
-
def step(self):
|
| 47 |
self._step += 1
|
| 48 |
self._apply_lr(self._step)
|
| 49 |
|
|
@@ -59,10 +59,10 @@ class CosineWithWarmup:
|
|
| 59 |
def get_lr(self) -> float:
|
| 60 |
return self.optimizer.param_groups[0]["lr"]
|
| 61 |
|
| 62 |
-
def state_dict(self):
|
| 63 |
return {"step": self._step}
|
| 64 |
|
| 65 |
-
def load_state_dict(self, state):
|
| 66 |
self._step = state["step"]
|
| 67 |
self._apply_lr(self._step)
|
| 68 |
|
|
@@ -208,8 +208,9 @@ class CLMTrainer:
|
|
| 208 |
self._jsonl_path = os.path.join(self.run_dir, "metrics.jsonl")
|
| 209 |
self._jsonl_file = None
|
| 210 |
|
| 211 |
-
self.
|
| 212 |
-
|
|
|
|
| 213 |
print(f"Model parameters: {param_count:,}")
|
| 214 |
print(f"Run directory: {self.run_dir}")
|
| 215 |
|
|
@@ -345,7 +346,7 @@ class CLMTrainer:
|
|
| 345 |
|
| 346 |
def optimizer_step(self) -> float:
|
| 347 |
self.scaler.unscale_(self.optimizer)
|
| 348 |
-
grad_norm = _get_grad_norm(self.
|
| 349 |
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.max_grad_norm)
|
| 350 |
self.scaler.step(self.optimizer)
|
| 351 |
self.scaler.update()
|
|
@@ -354,7 +355,7 @@ class CLMTrainer:
|
|
| 354 |
return grad_norm
|
| 355 |
|
| 356 |
def _eager_model(self) -> PAWNCLM:
|
| 357 |
-
return self.
|
| 358 |
|
| 359 |
@torch.no_grad()
|
| 360 |
def evaluate(self) -> dict[str, float]:
|
|
@@ -535,9 +536,7 @@ class CLMTrainer:
|
|
| 535 |
if dirname:
|
| 536 |
os.makedirs(dirname, exist_ok=True)
|
| 537 |
|
| 538 |
-
model = self.
|
| 539 |
-
if hasattr(model, "_orig_mod"):
|
| 540 |
-
model = model._orig_mod
|
| 541 |
|
| 542 |
torch.save(
|
| 543 |
{
|
|
@@ -561,9 +560,7 @@ class CLMTrainer:
|
|
| 561 |
ckpt = torch.load(path, map_location=self.device, weights_only=False)
|
| 562 |
self.global_step = ckpt["global_step"]
|
| 563 |
|
| 564 |
-
model = self.
|
| 565 |
-
if hasattr(model, "_orig_mod"):
|
| 566 |
-
model = model._orig_mod
|
| 567 |
|
| 568 |
model.load_state_dict(ckpt["model_state_dict"])
|
| 569 |
self.optimizer.load_state_dict(ckpt["optimizer_state_dict"])
|
|
|
|
| 38 |
self._step = 0
|
| 39 |
self._apply_lr(0)
|
| 40 |
|
| 41 |
+
def _apply_lr(self, step: int) -> None:
|
| 42 |
lr_scale = self._compute_lr_scale(step)
|
| 43 |
for pg, base_lr in zip(self.optimizer.param_groups, self.base_lrs, strict=True):
|
| 44 |
pg["lr"] = base_lr * lr_scale
|
| 45 |
|
| 46 |
+
def step(self) -> None:
|
| 47 |
self._step += 1
|
| 48 |
self._apply_lr(self._step)
|
| 49 |
|
|
|
|
| 59 |
def get_lr(self) -> float:
|
| 60 |
return self.optimizer.param_groups[0]["lr"]
|
| 61 |
|
| 62 |
+
def state_dict(self) -> dict[str, int]:
|
| 63 |
return {"step": self._step}
|
| 64 |
|
| 65 |
+
def load_state_dict(self, state: dict[str, int]) -> None:
|
| 66 |
self._step = state["step"]
|
| 67 |
self._apply_lr(self._step)
|
| 68 |
|
|
|
|
| 208 |
self._jsonl_path = os.path.join(self.run_dir, "metrics.jsonl")
|
| 209 |
self._jsonl_file = None
|
| 210 |
|
| 211 |
+
self._model = PAWNCLM(model_cfg).to(self.device)
|
| 212 |
+
self.model = self._model
|
| 213 |
+
param_count = sum(p.numel() for p in self._model.parameters())
|
| 214 |
print(f"Model parameters: {param_count:,}")
|
| 215 |
print(f"Run directory: {self.run_dir}")
|
| 216 |
|
|
|
|
| 346 |
|
| 347 |
def optimizer_step(self) -> float:
|
| 348 |
self.scaler.unscale_(self.optimizer)
|
| 349 |
+
grad_norm = _get_grad_norm(self._model)
|
| 350 |
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.max_grad_norm)
|
| 351 |
self.scaler.step(self.optimizer)
|
| 352 |
self.scaler.update()
|
|
|
|
| 355 |
return grad_norm
|
| 356 |
|
| 357 |
def _eager_model(self) -> PAWNCLM:
|
| 358 |
+
return self._model
|
| 359 |
|
| 360 |
@torch.no_grad()
|
| 361 |
def evaluate(self) -> dict[str, float]:
|
|
|
|
| 536 |
if dirname:
|
| 537 |
os.makedirs(dirname, exist_ok=True)
|
| 538 |
|
| 539 |
+
model: PAWNCLM = self._eager_model()
|
|
|
|
|
|
|
| 540 |
|
| 541 |
torch.save(
|
| 542 |
{
|
|
|
|
| 560 |
ckpt = torch.load(path, map_location=self.device, weights_only=False)
|
| 561 |
self.global_step = ckpt["global_step"]
|
| 562 |
|
| 563 |
+
model: PAWNCLM = self._eager_model()
|
|
|
|
|
|
|
| 564 |
|
| 565 |
model.load_state_dict(ckpt["model_state_dict"])
|
| 566 |
self.optimizer.load_state_dict(ckpt["optimizer_state_dict"])
|
|
@@ -57,5 +57,13 @@ name = "pytorch-cu128"
|
|
| 57 |
url = "https://download.pytorch.org/whl/cu128"
|
| 58 |
explicit = true
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
[tool.pytest.ini_options]
|
| 61 |
testpaths = ["tests"]
|
|
|
|
| 57 |
url = "https://download.pytorch.org/whl/cu128"
|
| 58 |
explicit = true
|
| 59 |
|
| 60 |
+
[tool.pyright]
|
| 61 |
+
pythonVersion = "3.10"
|
| 62 |
+
typeCheckingMode = "basic"
|
| 63 |
+
reportMissingTypeStubs = false
|
| 64 |
+
reportPrivateImportUsage = false
|
| 65 |
+
reportMissingImports = "warning"
|
| 66 |
+
include = ["pawn"]
|
| 67 |
+
|
| 68 |
[tool.pytest.ini_options]
|
| 69 |
testpaths = ["tests"]
|