Cactus-Compute
/

gemma4-e2b-grouped-k192-router

+"""gemma4_hf.py — thin HF wrapper that exposes the same interface as gemma4.py.
+Replaces our custom (broken) gemma4.py with transformers' validated implementation.
+Exposes:
+  - N_LAYERS, HIDDEN_SIZE, INTERMEDIATE, INTERMEDIATE_WIDE, DOUBLE_WIDE_START, DEVICE, DTYPE
+  - load_gemma4() -> (model, tokenizer)  where model behaves like our custom one:
+      * model(input_ids) -> logits tensor (not CausalLMOutputWithPast)
+      * model.layers is a proxy for the underlying ModuleList of decoder layers
+      * all nn.Module APIs (parameters, state_dict, etc.) work
+"""
+import os
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, AutoTokenizer
# Use the GPU when torch can see one, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Precision requested for the model weights at load time.
DTYPE = torch.bfloat16

# Architecture constants (match google/gemma-4-e2b-it)
N_LAYERS = 35
HIDDEN_SIZE = 1536
VOCAB_SIZE = 262144
INTERMEDIATE = 6144          # layers 0-14
INTERMEDIATE_WIDE = 12288    # layers 15-34
DOUBLE_WIDE_START = 15       # first layer index that uses INTERMEDIATE_WIDE

# Checkpoint id; the GEMMA4_HF_REPO environment variable overrides the default.
HF_REPO = os.environ.get("GEMMA4_HF_REPO", "google/gemma-4-e2b-it")
class HFGemma4(nn.Module):
    """Adapt an HF Gemma-4 model to our custom-model interface.

    Wraps the model returned by transformers (e.g. HF's
    Gemma4ForConditionalGeneration) so that:

    * calling the wrapper returns the raw logits tensor instead of the HF
      output struct, and
    * ``.layers`` exposes the decoder-layer ModuleList that
      rung6_moe_g4.py's install_moe / Taylor hooks operate on.

    Args:
        inner: The already-loaded HF model. It must expose its backbone as
            ``inner.model`` and return an object with a ``.logits`` attribute
            when called with ``input_ids=...``.
    """

    def __init__(self, inner: nn.Module):
        super().__init__()
        # Assigning a Module registers it as a sub-module, so parameters(),
        # state_dict(), eval(), to(), ... all reach the wrapped model.
        self.inner = inner

    @property
    def layers(self) -> nn.ModuleList:
        """Return the decoder layers of the wrapped model (live, not a copy).

        The conditional-generation layout nests the text stack under
        ``inner.model.language_model``. If the wrapped model has no such
        sub-module, fall back to ``inner.model.layers`` instead of raising
        AttributeError.
        """
        backbone = self.inner.model
        # nn.Module raises AttributeError for unknown names, so getattr's
        # default kicks in when there is no `language_model` sub-module.
        text_model = getattr(backbone, "language_model", backbone)
        return text_model.layers

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Run the wrapped model on ``input_ids`` and return only its logits."""
        return self.inner(input_ids=input_ids).logits
def load_gemma4(device=None):
    """Load HF Gemma-4 E2B-IT, wrapped in HFGemma4 for rung6 compatibility.

    Args:
        device: Forwarded to ``from_pretrained`` as ``device_map``. ``None``
            (the default) selects the module-level ``DEVICE``.

    Returns:
        A ``(model, tokenizer)`` tuple: the ``HFGemma4`` wrapper in eval mode
        and the tokenizer loaded from the same ``HF_REPO``.
    """
    if device is None:
        device = DEVICE
    print(f"Loading HF {HF_REPO} (dtype={DTYPE})...")
    inner = AutoModelForCausalLM.from_pretrained(HF_REPO, dtype=DTYPE, device_map=device)
    # eval() recurses into sub-modules, so a single call on the wrapper also
    # switches `inner` to eval mode.
    model = HFGemma4(inner).eval()
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(HF_REPO)
    return model, tokenizer