upload weights
- added_tokens.json +9 -0
- chat_template.jinja +1 -0
- config.json +26 -0
- generation_config.json +4 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- modeling_gpt4dev.py +314 -0
- special_tokens_map.json +75 -0
- tokenizer.json +0 -0
- tokenizer_config.json +86 -0
- vocab.json +0 -0
added_tokens.json
ADDED
@@ -0,0 +1,9 @@
{
  "<|call|>": 50262,
  "<|channel|>": 50260,
  "<|constrain|>": 50263,
  "<|end|>": 50258,
  "<|message|>": 50259,
  "<|return|>": 50261,
  "<|start|>": 50257
}
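These ID assignments (50257-50263 appended after the GPT-2 base vocabulary) can be sanity-checked once the tokenizer is loaded. A minimal sketch; "path/to/gpt4dev" is a placeholder for the local checkout or hub repo id:

# Sketch: verify the added-token IDs match added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/gpt4dev")  # placeholder path
for token, expected_id in {"<|start|>": 50257, "<|end|>": 50258, "<|message|>": 50259}.items():
    assert tok.convert_tokens_to_ids(token) == expected_id, token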
chat_template.jinja
ADDED
@@ -0,0 +1 @@
{% for m in messages %}{% if m['role'] == 'assistant' %}<|start|>assistant<|channel|>final<|message|>{{ m['content'] }}<|end|>{% elif m['role'] == 'developer' %}<|start|>developer<|message|>{{ m['content'] }}<|end|>{% else %}<|start|>{{ m['role'] }}<|message|>{{ m['content'] }}<|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start|>assistant<|channel|>final<|message|>{% endif %}
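The template wraps every turn in <|start|>role<|message|>...<|end|>, and assistant turns (including the generation prompt) additionally carry <|channel|>final; the <|return|>, <|call|>, and <|constrain|> tokens are registered but not emitted by this template. A sketch of rendering it through the standard tokenizer API, with illustrative messages and a placeholder path:

# Sketch: render the chat template above via the tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/gpt4dev")  # placeholder path
messages = [
    {"role": "developer", "content": "Answer briefly."},
    {"role": "user", "content": "Hi"},
]
text = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# text is a single string:
# "<|start|>developer<|message|>Answer briefly.<|end|>" +
# "<|start|>user<|message|>Hi<|end|>" +
# "<|start|>assistant<|channel|>final<|message|>"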
config.json
ADDED
@@ -0,0 +1,26 @@
{
  "architectures": [
    "GPT4DevForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "modeling_gpt4dev.GPT4DevConfig",
    "AutoModel": "modeling_gpt4dev.GPT4DevForCausalLM",
    "AutoModelForCausalLM": "modeling_gpt4dev.GPT4DevForCausalLM"
  },
  "compat_prefill_tokens": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 1024,
  "model_type": "gpt4dev",
  "multi_query": true,
  "num_attention_heads": 16,
  "num_hidden_layers": 12,
  "qkv_bias": true,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.52.4",
  "vocab_size": 50264
}
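Because auto_map points the Auto classes at modeling_gpt4dev.py in this repo, loading goes through the remote-code path. A minimal sketch with a placeholder path:

# Sketch: the custom auto_map requires trust_remote_code=True.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/gpt4dev", trust_remote_code=True)
print(model.config.num_attention_heads)  # 16 query heads sharing one KV head (multi_query)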
generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.52.4"
}
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:afcba36e53071eaef2acdf78b948f73b1ba4cacf5989dce6f475a13b4cd9cf6f
size 709224088
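The entry above is a Git LFS pointer, not the weights themselves; the ~709 MB safetensors blob is fetched from LFS on clone or download. Once pulled, the tensors can be inspected without loading them into memory. A sketch assuming the safetensors package:

# Sketch: list a few tensor names and shapes from the LFS-resolved file.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())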
modeling_gpt4dev.py
ADDED
@@ -0,0 +1,314 @@
import math, torch, torch.nn as nn, torch.nn.functional as F
from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from typing import Optional, Tuple, List


class GPT4DevConfig(PretrainedConfig):
    model_type = "gpt4dev"
    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=1024,
        rope_theta=10000.0,
        qkv_bias=True,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        multi_query=True,
        architectures=None,
        tie_word_embeddings=False,
        compat_prefill_tokens: int = 0,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            max_position_embeddings=max_position_embeddings,
            rope_theta=rope_theta,
            qkv_bias=qkv_bias,
            layer_norm_epsilon=layer_norm_epsilon,
            initializer_range=initializer_range,
            multi_query=multi_query,
            architectures=architectures,
            tie_word_embeddings=tie_word_embeddings,
            compat_prefill_tokens=compat_prefill_tokens,
            **kwargs,
        )


def rope_cache(seq_len, dim, theta, device, dtype=torch.float32):
    # Note: kept float32 to match training-time math used in early checkpoints.
    # Casting the complex result to a real dtype keeps only the cosine part
    # (see apply_rope below), which mirrors the legacy behavior.
    inv = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
    t = torch.arange(seq_len, device=device, dtype=torch.float32)
    freqs = torch.outer(t, inv)
    return torch.polar(torch.ones_like(freqs), freqs).to(dtype)


def apply_rope(x, rope):
    # x: (..., D) with D even; rope: (T, D/2). In legacy math this can be float (cos-only)
    xc = torch.view_as_complex(x.to(torch.float32).reshape(*x.shape[:-1], -1, 2))
    yc = xc * rope.to(xc.dtype)
    y = torch.view_as_real(yc).reshape(*x.shape[:-1], -1)
    return y.to(x.dtype)


class MQA(nn.Module):
    def __init__(self, config: GPT4DevConfig):
        super().__init__()
        h, d = config.num_attention_heads, config.hidden_size // config.num_attention_heads
        self.h, self.d = h, d
        # Multi-query attention: h query heads, one shared K/V head of size d.
        self.qkv = nn.Linear(config.hidden_size, h * d + 2 * d, bias=config.qkv_bias)
        self.out = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        rope: torch.Tensor,
        past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        B, T, _ = x.shape
        qkv = self.qkv(x)
        q, kv = qkv.split(self.h * self.d, dim=-1)
        k_new, v_new = kv.split(self.d, dim=-1)  # (B, T, d)

        # queries to head dim; apply RoPE
        q = q.view(B, T, self.h, self.d).transpose(1, 2)  # (B, h, T, d)
        q = apply_rope(q, rope)

        # rotate new k
        k_new = apply_rope(k_new.unsqueeze(1), rope).squeeze(1)  # (B, T, d)

        # concat cache
        if past_kv is not None and past_kv[0] is not None:
            k_cat = torch.cat([past_kv[0], k_new], dim=1)
            v_cat = torch.cat([past_kv[1], v_new], dim=1)
        else:
            k_cat, v_cat = k_new, v_new

        # expand KV
        k_exp = k_cat.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B, h, S, d)
        v_exp = v_cat.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B, h, S, d)

        B, h, T, d = q.shape
        S = k_exp.size(2)
        past_len = S - T
        attn = torch.matmul(q, k_exp.transpose(-2, -1)) / math.sqrt(d)

        # Offset-aware causal mask
        idx_t = torch.arange(T, device=q.device)[:, None]
        idx_s = torch.arange(S, device=q.device)[None, :]
        mask = idx_s > idx_t + past_len
        attn = attn.masked_fill(mask.unsqueeze(0).unsqueeze(0), float('-inf'))

        attn = F.softmax(attn, dim=-1)
        y = torch.matmul(attn, v_exp)
        y = y.transpose(1, 2).reshape(B, T, -1)
        return self.out(y), (k_cat, v_cat)

    def forward_compat(self, x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
        B, T, _ = x.shape
        qkv = self.qkv(x)
        q, kv = qkv.split(self.h * self.d, dim=-1)
        k, v = kv.split(self.d, dim=-1)
        q = q.view(B, T, self.h, self.d).transpose(1, 2)  # (B,h,T,d)
        k = k.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B,h,T,d)
        v = v.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B,h,T,d)
        q = apply_rope(q, rope)
        k = apply_rope(k, rope)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.out(y.transpose(1, 2).reshape(B, T, -1))


class SwiGLU(nn.Module):
    def __init__(self, hidden_dim, intermediate_dim):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, intermediate_dim * 2, bias=True)
        self.w2 = nn.Linear(intermediate_dim, hidden_dim, bias=False)
    def forward(self, x):
        x_g, x_v = self.w1(x).chunk(2, dim=-1)
        return self.w2(F.silu(x_g) * x_v)


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.attn = MQA(config) if config.multi_query else nn.MultiheadAttention(
            config.hidden_size, config.num_attention_heads, bias=config.qkv_bias, batch_first=True)
        self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = SwiGLU(config.hidden_size, config.intermediate_size)
        self.gradient_checkpointing = False

    def forward(
        self,
        x: torch.Tensor,
        rope: torch.Tensor,
        past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_checkpoint: bool = False,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        def custom_forward(x_, rope_):
            a, new_kv = self.attn(self.ln1(x_), rope_, past_kv)
            x_ = x_ + a
            x_ = x_ + self.mlp(self.ln2(x_))
            return x_, new_kv
        if use_checkpoint and self.training:
            y, new_kv = torch.utils.checkpoint.checkpoint(custom_forward, x, rope, use_reentrant=False)
            return y, new_kv
        else:
            return custom_forward(x, rope)

    def forward_compat(self, x: torch.Tensor, rope: torch.Tensor, use_checkpoint: bool = False) -> torch.Tensor:
        def custom_forward(x_, rope_):
            a = self.attn.forward_compat(self.ln1(x_), rope_)
            x_ = x_ + a
            x_ = x_ + self.mlp(self.ln2(x_))
            return x_
        if use_checkpoint and self.training:
            return torch.utils.checkpoint.checkpoint(custom_forward, x, rope, use_reentrant=False)
        else:
            return custom_forward(x, rope)


class GPT4DevPreTrained(PreTrainedModel):
    config_class = GPT4DevConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Block"]

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear) and module.bias is not None:
                nn.init.zeros_(module.bias)


class GPT4DevForCausalLM(GPT4DevPreTrained, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)
        self.embed = nn.Embedding(config.vocab_size, config.hidden_size)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.rope_cache = None
        self.post_init()

    # embeddings tie helpers
    def get_input_embeddings(self):
        return self.embed
    def set_input_embeddings(self, new_embeddings):
        self.embed = new_embeddings
        if getattr(self.config, "tie_word_embeddings", True) and self.get_output_embeddings() is not None:
            with torch.no_grad():
                self.get_output_embeddings().weight = self.embed.weight
    def get_output_embeddings(self):
        return self.head
    def set_output_embeddings(self, new_lm_head):
        self.head = new_lm_head
    def tie_weights(self):
        if getattr(self.config, "tie_word_embeddings", True):
            self.head.weight = self.embed.weight

    # generation helpers (legacy tuple KV-cache)
    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, past_key_values=None, **kwargs):
        # Until compat_prefill_tokens, avoid slicing and ignore cache to mirror legacy behavior
        cutoff = int(getattr(self.config, "compat_prefill_tokens", 0) or 0)
        if past_key_values is not None and input_ids is not None and input_ids.size(1) < cutoff:
            past_key_values = None  # drop cache, process full prefix
        elif past_key_values is not None:
            # normal cached decode path
            input_ids = input_ids[:, -1:]
        if attention_mask is not None and attention_mask.dim() == 2 and torch.all(attention_mask == 1):
            attention_mask = None
        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values, "use_cache": True}

    def _reorder_cache(self, past_key_values, beam_idx):
        if isinstance(past_key_values, (tuple, list)):
            reordered = []
            for k, v in past_key_values:
                if k is None or v is None:
                    reordered.append((k, v))
                else:
                    reordered.append((k.index_select(0, beam_idx), v.index_select(0, beam_idx)))
            return tuple(reordered)
        return past_key_values

    # RoPE utilities (kept float32 behavior to mirror training)
    def _rope_slice(self, past_len: int, T: int, device, dtype):
        if self.rope_cache is None or self.rope_cache.device != device:
            self.rope_cache = rope_cache(
                self.config.max_position_embeddings,
                self.config.hidden_size // self.config.num_attention_heads,
                self.config.rope_theta, device, dtype=torch.float32
            )
        need = past_len + T
        if need > self.rope_cache.size(0):
            # Rebuilds at the same max_position_embeddings; positions beyond
            # the configured maximum are not supported by this cache.
            self.rope_cache = rope_cache(
                self.config.max_position_embeddings,
                self.config.hidden_size // self.config.num_attention_heads,
                self.config.rope_theta, device, dtype=torch.float32
            )
        return self.rope_cache[past_len: past_len + T]

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, Block):
            module.gradient_checkpointing = value

    def forward(
        self,
        input_ids,
        labels=None,
        attention_mask=None,
        past_key_values=None,
        use_cache=None,
        **kwargs,
    ):
        B, T = input_ids.shape
        x = self.embed(input_ids)

        past = past_key_values
        use_cache = True if (use_cache is None) else use_cache
        new_past: List[Tuple[torch.Tensor, torch.Tensor]] = [] if use_cache else None

        past_len = 0
        if past is not None and isinstance(past, (tuple, list)) and past and past[0] is not None:
            past_len = past[0][0].size(1)

        rope = self._rope_slice(past_len, T, x.device, x.dtype)
        for i, blk in enumerate(self.blocks):
            pkv = None if past is None else (past[i] if i < len(past) else None)
            x, new_kv = blk(x, rope, past_kv=pkv, use_checkpoint=(self.is_gradient_checkpointing and self.training))
            if use_cache and new_past is not None:
                new_past.append(new_kv)

        logits = self.head(self.ln_f(x))

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=tuple(new_past) if use_cache else None,
        )


GPT4DevConfig.auto_map = {
    "AutoConfig": "modeling_gpt4dev.GPT4DevConfig",
    "AutoModel": "modeling_gpt4dev.GPT4DevForCausalLM",
    "AutoModelForCausalLM": "modeling_gpt4dev.GPT4DevForCausalLM",
}
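Putting the pieces together, generation works through the standard Auto classes plus the chat template; the model uses a legacy tuple KV-cache via its own prepare_inputs_for_generation. A sketch with placeholder paths and an illustrative prompt:

# Sketch: end-to-end generation with the custom architecture above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/gpt4dev")  # placeholder path
model = AutoModelForCausalLM.from_pretrained("path/to/gpt4dev", trust_remote_code=True).eval()

prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}], add_generation_prompt=True, tokenize=False
)
ids = tok(prompt, return_tensors="pt").input_ids
with torch.no_grad():
    out = model.generate(ids, max_new_tokens=32, eos_token_id=tok.eos_token_id)
print(tok.decode(out[0][ids.size(1):]))  # newly generated tokens only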
special_tokens_map.json
ADDED
@@ -0,0 +1,75 @@
{
  "additional_special_tokens": [
    {
      "content": "<|start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|message|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|channel|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|return|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|call|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|constrain|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    }
  ],
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|end|>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,86 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "50256": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50257": {
      "content": "<|start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50258": {
      "content": "<|end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50259": {
      "content": "<|message|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50260": {
      "content": "<|channel|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50261": {
      "content": "<|return|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50262": {
      "content": "<|call|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50263": {
      "content": "<|constrain|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|start|>",
    "<|end|>",
    "<|message|>",
    "<|channel|>",
    "<|return|>",
    "<|call|>",
    "<|constrain|>"
  ],
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|end|>",
  "extra_special_tokens": {},
  "model_max_length": 8192,
  "pad_token": "<|end|>",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
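Two details in this config are worth noting: pad and eos share <|end|>, and model_max_length is 8192 even though the model's max_position_embeddings is 1024, so callers should truncate to the model limit themselves. A quick check, with a placeholder path:

# Sketch: confirm pad/eos aliasing and the tokenizer-side length limit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/gpt4dev")  # placeholder path
assert tok.pad_token == tok.eos_token == "<|end|>"
print(tok.model_max_length)  # 8192; the model itself supports 1024 positions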
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.