Add config + custom code for Query2SAE
- config.json (+9 -9)
- configuration_query2sae.py (+16 -0)
- model_query2sae.py (+37 -0)
config.json
CHANGED (@@ -1,11 +1,11 @@; the file's previous contents were replaced and are not shown here)

{
  "model_type": "query2sae",
  "backbone_name": "gpt2",
  "head_hidden_dim": 128,
  "sae_dim": 1024,
  "architectures": ["Query2SAEModel"],
  "auto_map": {
    "AutoConfig": "configuration_query2sae.Query2SAEConfig",
    "AutoModel": "model_query2sae.Query2SAEModel"
  }
}
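The auto_map above is what lets the stock Auto classes resolve the custom classes when the repo is loaded with trust_remote_code=True. A minimal loading sketch; the repo id "your-username/query2sae" is a placeholder, and it assumes the trained weights are also present in the repo:

from transformers import AutoConfig, AutoModel

# "your-username/query2sae" is a placeholder; use the real Hub repo id.
repo_id = "your-username/query2sae"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.sae_dim)  # 1024, straight from config.json above

# Resolves to Query2SAEModel via auto_map; needs weights in the repo.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)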
configuration_query2sae.py
ADDED (@@ -0,0 +1,16 @@)

from transformers import PretrainedConfig


class Query2SAEConfig(PretrainedConfig):
    model_type = "query2sae"

    def __init__(
        self,
        backbone_name: str = "gpt2",
        head_hidden_dim: int = 128,
        sae_dim: int = 1024,  # <-- set this to YOUR real SAE dim
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.backbone_name = backbone_name
        self.head_hidden_dim = int(head_hidden_dim)
        self.sae_dim = int(sae_dim)
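A quick local round-trip to check that the custom fields serialize; the ./query2sae directory is a placeholder:

from configuration_query2sae import Query2SAEConfig

cfg = Query2SAEConfig(backbone_name="gpt2", head_hidden_dim=128, sae_dim=1024)
cfg.save_pretrained("./query2sae")  # writes a config.json like the one above
reloaded = Query2SAEConfig.from_pretrained("./query2sae")
assert reloaded.sae_dim == 1024 and reloaded.model_type == "query2sae"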
model_query2sae.py
ADDED (@@ -0,0 +1,37 @@)

import torch
import torch.nn as nn
from transformers import PreTrainedModel, GPT2Config, GPT2Model

try:
    # Relative import: required when transformers loads this file as Hub
    # custom code (trust_remote_code=True).
    from .configuration_query2sae import Query2SAEConfig
except ImportError:
    # Plain import: lets the file also be used as a local module.
    from configuration_query2sae import Query2SAEConfig


class Query2SAEModel(PreTrainedModel):
    """
    HF-compatible wrapper for your Query2SAE:
      - GPT-2 backbone is frozen
      - MLP head maps hidden -> SAE space
    """

    config_class = Query2SAEConfig
    base_model_prefix = "query2sae"

    def __init__(self, config: Query2SAEConfig):
        super().__init__(config)
        # Build the GPT-2 backbone from its architecture config only; the
        # actual weights are restored by from_pretrained via the state_dict.
        gpt2_cfg = GPT2Config.from_pretrained(config.backbone_name)
        self.backbone = GPT2Model(gpt2_cfg)

        # Freeze the backbone; only the head is trainable.
        for p in self.backbone.parameters():
            p.requires_grad = False

        self.head = nn.Sequential(
            nn.Linear(self.backbone.config.hidden_size, config.head_hidden_dim),
            nn.ReLU(),
            nn.Linear(config.head_hidden_dim, config.sae_dim),
        )

        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        with torch.no_grad():
            out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
            # Pool the final position; this assumes unpadded or left-padded
            # inputs, since with right padding position -1 is a pad token.
            last_hidden = out.last_hidden_state[:, -1, :]
        # Keep the head outside no_grad so it can still receive gradients.
        logits = self.head(last_hidden)
        return {"logits": logits}
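A shape-check sketch of the forward pass. The backbone is randomly initialized until a trained checkpoint is loaded, and GPT-2 needs a pad token plus left padding so the last-token pooling sees a real token:

import torch
from transformers import GPT2TokenizerFast

from configuration_query2sae import Query2SAEConfig
from model_query2sae import Query2SAEModel

tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token
tok.padding_side = "left"      # keeps position -1 a real token under padding

model = Query2SAEModel(Query2SAEConfig(sae_dim=1024)).eval()
batch = tok(["what is a sparse autoencoder?"], return_tensors="pt", padding=True)
out = model(**batch)
print(out["logits"].shape)  # torch.Size([1, 1024])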