Upload Shivik-M2 with merges.txt (clean)
- .gitattributes +2 -0
- .ipynb_checkpoints/tokenization_shivik_m1-checkpoint.py +143 -0
- README.md +19 -0
- UPLOADED_TOKENIZER_HELPER.txt +1 -0
- build_tokenizer_fast.py +39 -0
- config.json +11 -0
- generation_config.json +5 -0
- load_and_test.py +18 -0
- merges.txt +0 -0
- migrate_weights_m1_to_m2.py +71 -0
- model.safetensors +3 -0
- model.safetensors.bak +3 -0
- model_card.md +41 -0
- modeling_shivik_m1.py +201 -0
- modeling_shivik_m2.py +191 -0
- shivik-tokenizer-v120k/special_tokens_map.json +16 -0
- shivik-tokenizer-v120k/tokenizer.json +0 -0
- shivik-tokenizer-v120k/tokenizer_config.json +20 -0
- shivik-tokenizer-v200k/special_tokens_map.json +0 -0
- shivik-tokenizer-v200k/token_ids.json +0 -0
- shivik-tokenizer-v200k/tokenizer.json +3 -0
- shivik-tokenizer-v200k/tokenizer_config.json +0 -0
- shivik-tokenizer-v200k/tokenizer_metadata.json +20 -0
- special_tokens_map.json +26 -0
- tokenization_shivik_m1.py +181 -0
- tokenization_shivik_m1.py.bak +175 -0
- tokenization_shivik_m1_fast.py +9 -0
- tokenizer.json +0 -0
- tokenizer/special_tokens_map.json +23 -0
- tokenizer/tokenizer.json +345 -0
- tokenizer/tokenizer_metadata.json +6 -0
- tokenizer/vocab.json +0 -0
- tokenizer_config.json +6 -0
- tokenizer_fast.json +0 -0
- train_aries.py +76 -0
- upload_to_hf.py +18 -0
- vocab.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors.bak filter=lfs diff=lfs merge=lfs -text
+shivik-tokenizer-v200k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/tokenization_shivik_m1-checkpoint.py
ADDED
@@ -0,0 +1,143 @@
+import json, re, os
+from transformers import PreTrainedTokenizer
+
+class ShivikM1Tokenizer(PreTrainedTokenizer):
+
+    vocab_files_names = {
+        "vocab_file": "vocab.json",
+        "merges_file": "merges.txt",
+    }
+
+    def __init__(self, vocab_file=None, merges_file=None, **kwargs):
+
+        # --------------------------------------------------------------
+        # 1) Resolve real paths when HF passes only folder
+        # --------------------------------------------------------------
+        if vocab_file is None or not os.path.isfile(vocab_file):
+            vocab_file = os.path.join(kwargs.get("pretrained_model_name_or_path", ""), "vocab.json")
+        if merges_file is None or not os.path.isfile(merges_file):
+            merges_file = os.path.join(kwargs.get("pretrained_model_name_or_path", ""), "merges.txt")
+
+        if not os.path.isfile(vocab_file):
+            raise FileNotFoundError(f"Cannot find vocab.json at {vocab_file}")
+        if not os.path.isfile(merges_file):
+            raise FileNotFoundError(f"Cannot find merges.txt at {merges_file}")
+
+        # --------------------------------------------------------------
+        # 2) Load vocab + merges
+        # --------------------------------------------------------------
+        with open(vocab_file, "r", encoding="utf-8") as f:
+            self.encoder = json.load(f)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        merges = []
+        with open(merges_file, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                merges.append(tuple(line.split()))
+
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+        # Robust pattern
+        self.pat = re.compile(r"\S+")
+
+        self.vocab_file = vocab_file
+        self.merges_file = merges_file
+
+        # set default specials
+        kwargs.setdefault("unk_token", "<unk>")
+        kwargs.setdefault("pad_token", "<pad_000000>")
+        kwargs.setdefault("bos_token", "<think>")
+        kwargs.setdefault("eos_token", "</think>")
+
+        super().__init__(**kwargs)
+
+    # --------------------------------------------------------------
+    # Standard GPT BPE tokenization
+    # --------------------------------------------------------------
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder)
+
+    def get_pairs(self, word):
+        pairs = set()
+        prev = word[0]
+        for ch in word[1:]:
+            pairs.add((prev, ch))
+            prev = ch
+        return pairs
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+
+        word = tuple(token) + ("</w>",)
+        pairs = self.get_pairs(word)
+
+        if not pairs:
+            result = token + "</w>"
+            self.cache[token] = result
+            return result
+
+        while True:
+            bigram = min(pairs, key=lambda p: self.bpe_ranks.get(p, 1e10))
+            if bigram not in self.bpe_ranks:
+                break
+
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                new_word.extend(word[i:j])
+                i = j
+                if word[i:i+2] == bigram:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            word = tuple(new_word)
+            pairs = self.get_pairs(word)
+
+        result = " ".join(word)
+        self.cache[token] = result
+        return result
+
+    # --------------------------------------------------------------
+    # Final tokenization functions
+    # --------------------------------------------------------------
+    def _tokenize(self, text, **kwargs):
+        tokens = []
+        for word in re.findall(self.pat, text):
+            bpe_res = self.bpe(word)
+            tokens.extend(bpe_res.split(" "))
+        return tokens
+
+    def tokenize(self, text, **kwargs):
+        return self._tokenize(text)
+
+    def _convert_token_to_id(self, token):
+        return self.encoder.get(token, self.encoder["<unk>"])
+
+    def _convert_id_to_token(self, idx):
+        return self.decoder.get(idx, "<unk>")
+
+    def convert_tokens_to_string(self, tokens):
+        return " ".join(tokens).replace("</w>", "")
+
+    def build_inputs_with_special_tokens(self, ids_0, ids_1=None):
+        return list(ids_0) if ids_1 is None else list(ids_0) + list(ids_1)
+
+    def decode(self, ids, **kwargs):
+        return self.convert_tokens_to_string([self._convert_id_to_token(i) for i in ids])
README.md
ADDED
@@ -0,0 +1,19 @@
+# ziadrone / shivik-m2-aries
+
+✅ **Shivik-M2 (Aries infusion)** — 1.1B reasoning-capable causal LM
+This repository contains:
+- model.safetensors (M2 weights)
+- tokenizer files (vocab.json, merges.txt, tokenizer.json)
+- `modeling_shivik_m2.py` (custom model class)
+- `tokenization_shivik_m1.py` (custom HF-compatible Python tokenizer)
+- helper scripts: `build_tokenizer_fast.py`, `train_aries.py`
+
+## Quick usage (after `pip install transformers safetensors tokenizers`)
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tok = AutoTokenizer.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True, use_fast=False)
+model = AutoModelForCausalLM.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True)
+text = "Hello <think> explain step by step </think>"
+enc = tok(text, return_tensors='pt')
+out = model(**enc)
+```
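A hedged follow-on sketch (not part of the README): since the model class mixes in `GenerationMixin`, greedy decoding should work roughly as below. Treat it as an assumption, because the custom forward only consumes `input_ids`, `past_key_values`, and `use_cache`.

```py
# Assumed usage, continuing from the README snippet above.
import torch

with torch.no_grad():
    out_ids = model.generate(**enc, max_new_tokens=64, do_sample=False)
print(tok.decode(out_ids[0].tolist()))
```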
UPLOADED_TOKENIZER_HELPER.txt
ADDED
@@ -0,0 +1 @@
+Uploaded tokenizer helper path (for reference): /mnt/data/tokenization_shivik_m1.py\n
build_tokenizer_fast.py
ADDED
@@ -0,0 +1,39 @@
+# build_tokenizer_fast.py
+# Builds a tokenizers (Rust) BPE tokenizer from vocab.json + merges.txt and saves tokenizer.json
+import json, sys
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
+from tokenizers.processors import TemplateProcessing
+from pathlib import Path
+
+REPO = Path("/workspace/shivik-m2")
+vocab_file = REPO / "vocab.json"
+merges_file = REPO / "merges.txt"
+out_file = REPO / "tokenizers_bpe.json"
+
+if not vocab_file.exists() or not merges_file.exists():
+    raise SystemExit("vocab.json or merges.txt missing in " + str(REPO))
+
+print("Loading vocab + merges...")
+with open(vocab_file, "r", encoding="utf-8") as f:
+    vocab = json.load(f)
+merges = [line.rstrip("\n") for line in open(merges_file, "r", encoding="utf-8") if line.strip() and not line.startswith("#")]
+
+# Build BPE model from explicit vocab+merges
+model = models.BPE(vocab=vocab, merges=merges)
+
+tokenizer = Tokenizer(model)
+# simple pre-tokenizer / decoder for GPT style
+tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+tokenizer.decoder = decoders.ByteLevel()
+# Set post-processor to keep things simple (no added special tokens)
+tokenizer.post_processor = TemplateProcessing(
+    single="$A",
+    pair="$A $B",
+    special_tokens=[]
+)
+
+print("Saving tokenizer to", out_file)
+tokenizer.save(str(out_file))
+print("Done. You can move tokenizers_bpe.json -> tokenizer.json or upload as-is.")
+print("\nUploaded helper file path (for reference):")
+print("/mnt/data/tokenization_shivik_m1.py")
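A minimal sketch (not in the commit) of consuming the file this script writes, assuming `tokenizers` is installed and the script above has produced `/workspace/shivik-m2/tokenizers_bpe.json`:

```py
from tokenizers import Tokenizer

tok = Tokenizer.from_file("/workspace/shivik-m2/tokenizers_bpe.json")
enc = tok.encode("Hello <think> explain step by step </think>")
print(enc.tokens)           # pieces produced by the Whitespace pre-tokenizer
print(tok.decode(enc.ids))  # round-trip through the ByteLevel decoder
```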
config.json
ADDED
@@ -0,0 +1,11 @@
+{
+  "model_type": "shivik_m1",
+  "vocab_size": 49152,
+  "d_model": 2048,
+  "n_layers": 24,
+  "num_heads": 16,
+  "kv_heads": 4,
+  "rotary_dim": 128,
+  "context_length": 4096,
+  "use_cache": true
+}
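A quick sanity check (illustrative, not in the repo) of what these fields imply for the attention layout:

```py
import json

cfg = json.load(open("config.json"))
head_dim = cfg["d_model"] // cfg["num_heads"]   # 2048 // 16 = 128
groups = cfg["num_heads"] // cfg["kv_heads"]    # 4 query heads share each KV head
assert head_dim % 2 == 0, "RoPE pairs even/odd channels, so head_dim must be even"
print(head_dim, groups)
```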
generation_config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "max_length": 2048,
+  "do_sample": false,
+  "eos_token_id": null
+}
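Note that `eos_token_id: null` means `generate()` has no stop token and runs to `max_length`. A small sketch (assumed repo id from the README) of inspecting the settings:

```py
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("ziadrone/shivik-m2-aries")
print(gen_cfg.max_length, gen_cfg.do_sample, gen_cfg.eos_token_id)  # 2048 False None
```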
load_and_test.py
ADDED
@@ -0,0 +1,18 @@
+
+# load_and_test.py - quick load test
+import sys, os
+sys.path.insert(0, os.getcwd())
+from tokenization_shivik_m1 import ShivikM1Tokenizer
+from modeling_shivik_m2 import ShivikM2Config, ShivikM2ForCausalLM
+
+repo = "/workspace/shivik-m2"
+tok = ShivikM1Tokenizer.from_pretrained(repo, local_files_only=True)
+print("Tokenizer loaded ✓ vocab_size =", tok.vocab_size)
+cfg = ShivikM2Config()
+model = ShivikM2ForCausalLM(cfg)
+print("Model instance created ✓")
+# test forward with random IDs
+import torch
+x = torch.randint(0, tok.vocab_size, (2, 8))
+out = model(x)
+print("Forward OK, logits shape:", out.logits.shape)
merges.txt
ADDED
The diff for this file is too large to render.
migrate_weights_m1_to_m2.py
ADDED
@@ -0,0 +1,71 @@
+
+# migrate_weights_m1_to_m2.py
+import os, sys, torch
+from safetensors.torch import load_file as load_safetensors, save_file as save_safetensors
+from modeling_shivik_m2 import ShivikM2Config, ShivikM2ForCausalLM
+
+SRC = "/workspace/shivik-m1-v3.1-fp16/model.safetensors"
+DST_DIR = "/workspace/shivik-m2"
+DST = os.path.join(DST_DIR, "model.safetensors")
+
+def safe_load(path):
+    if path.endswith(".safetensors"):
+        try:
+            sd = load_safetensors(path)
+            print("Loaded safetensors:", path)
+            # convert to torch tensors
+            return {k: torch.tensor(v) if not isinstance(v, torch.Tensor) else v for k, v in sd.items()}
+        except Exception as e:
+            print("safetensors load failed:", e)
+            raise
+    else:
+        return torch.load(path, map_location="cpu")
+
+print("Loading source state dict:", SRC)
+src_sd = safe_load(SRC)
+
+# instantiate new model
+cfg = ShivikM2Config()
+model = ShivikM2ForCausalLM(cfg).eval()
+new_sd = model.state_dict()
+
+print("Mapping compatible tensors (exact shape match) from source -> new model...")
+copied = []
+skipped = []
+for k_new, v_new in new_sd.items():
+    # attempt to find exact name in src_sd
+    if k_new in src_sd and src_sd[k_new].shape == v_new.shape:
+        new_sd[k_new] = src_sd[k_new].clone()
+        copied.append(k_new)
+    else:
+        # try some heuristics for common renames: embed, lm_head, norm weights
+        alt_keys = [
+            k_new.replace("model.", ""),
+            k_new.replace("model.", "shivik_m1_v3.model."),
+            k_new.replace("lm_head.weight", "embed.weight"),
+            k_new.replace("model.embed.weight", "model.embed.weight"),
+        ]
+        found = False
+        for alt in alt_keys:
+            if alt in src_sd and src_sd[alt].shape == v_new.shape:
+                new_sd[k_new] = src_sd[alt].clone()
+                copied.append((k_new, alt))
+                found = True
+                break
+        if not found:
+            skipped.append(k_new)
+
+print(f"Copied {len(copied)} tensors, skipped {len(skipped)} tensors.")
+print("Skipped (sample 20):", skipped[:20])
+
+# save new_sd as safetensors (if possible), else torch.save
+try:
+    # safetensors expects numpy arrays; convert
+    from safetensors.torch import save_file
+    out = {k: v.cpu() for k, v in new_sd.items()}
+    save_file(out, DST)
+    print("Saved migrated safetensors to", DST)
+except Exception as e:
+    print("safetensors save failed, falling back to torch.save:", e)
+    torch.save(new_sd, DST.replace(".safetensors", ".pt"))
+    print("Saved as torch .pt to", DST.replace(".safetensors", ".pt"))
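An illustrative follow-up (not part of the commit): after migration, confirm that the name-and-shape matches really carry the source values. Paths mirror the script above.

```py
import torch
from safetensors.torch import load_file

src = load_file("/workspace/shivik-m1-v3.1-fp16/model.safetensors")
dst = load_file("/workspace/shivik-m2/model.safetensors")
shared = [k for k in dst if k in src and src[k].shape == dst[k].shape]
print(len(shared), "tensors match by name+shape; exact copy:",
      all(torch.equal(src[k], dst[k]) for k in shared))
```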
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35697f70767428363b9b367d666b0ed114081d5f5bd8e1d1c80227e227687729
+size 4850737576
model.safetensors.bak
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35697f70767428363b9b367d666b0ed114081d5f5bd8e1d1c80227e227687729
+size 4850737576
model_card.md
ADDED
@@ -0,0 +1,41 @@
+---
+language: en
+license: apache-2.0
+tags:
+- causal-lm
+- reasoning
+- aries
+- shivik
+- instruction-following
+- safetensors
+library_name: "transformers"
+---
+# Shivik-M2 Aries (ziadrone/shivik-m2-aries)
+
+**Model type:** Causal LM (1.1B) with Aries reasoning tokens infused.
+
+## Description
+This model is an M2 architecture (GQA-style attention) derived from Shivik-M1 weights and reworked to support reasoning tokens. It includes custom special tokens for multi-step reasoning:
+```
+<think>...</think> <step>...</step> <path>...</path> <graph>...</graph>
+<score>...</score> <final>...</final> <context>...</context>
+<analysis>...</analysis> <answer>...</answer> <evaluate>...</evaluate>
+```
+
+## How to use
+- Use `trust_remote_code=True` when loading because model/tokenizer classes are custom.
+- Example:
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tok = AutoTokenizer.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True, use_fast=False)
+model = AutoModelForCausalLM.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True).to("cuda")
+prompt = "Hello <think> explain step by step </think>"
+enc = tok(prompt, return_tensors="pt").to("cuda")
+out = model(**enc)
+```
+## Intended uses & limitations
+- Intended for research: reasoning experiments, RAG orchestration, TOT/ToT.
+- NOT recommended for direct production use without safety review.
+
+## Paper / credits
+Model and tokenizer created by ziadrone (Shivik). See repo for training recipe and license.
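A back-of-envelope count (illustrative) supporting the card's 1.1B figure, using the M2 shapes from `modeling_shivik_m2.py`:

```py
d, Hkv, hd, L, V = 2048, 4, 128, 24, 49152
attn = d*d + d*(2*Hkv*hd) + d*d        # q_proj + kv_proj + out, no biases
mlp = 3 * d * int(d * 8 / 3)           # SwiGLU w1, w2, w3
total = L * (attn + mlp) + V * d       # blocks + tied embedding/lm_head
print(f"{total/1e9:.2f}B")             # ~1.16B, i.e. the 1.1B class
```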
modeling_shivik_m1.py
ADDED
@@ -0,0 +1,201 @@
+# modeling_shivik_m1.py (PATCHED)
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+class ShivikM1V3Config(PretrainedConfig):
+    # keep model_type stable so HF knows what this is
+    model_type = "shivik_m1"
+
+    def __init__(
+        self,
+        vocab_size=49156,
+        d_model=2048,
+        n_layers=24,
+        num_heads=16,
+        rotary_dim=128,
+        context_length=4096,
+        # legacy / generation-friendly aliases (kept in config for compatibility)
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # core params
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layers = n_layers
+        self.num_heads = num_heads
+        self.rotary_dim = rotary_dim
+        self.context_length = context_length
+
+        # Generation compatibility fields (Transformers internals expect these)
+        # Keep several aliases so both old and new code find a supported name
+        self.num_hidden_layers = kwargs.get("num_hidden_layers", n_layers)
+        self.num_layers = kwargs.get("num_layers", n_layers)
+        self.n_layer = kwargs.get("n_layer", n_layers)
+        self.layer_types = kwargs.get("layer_types", ["full_attention"] * n_layers)
+        self.num_kv_shared_layers = kwargs.get("num_kv_shared_layers", 0)
+        self.use_cache = kwargs.get("use_cache", True)
+
+class RMSNorm(nn.Module):
+    def __init__(self, d, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(d))
+    def forward(self, x):
+        norm = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(norm + self.eps) * self.weight
+
+def apply_rope(x, cos, sin):
+    # x: (..., seq_len, head_dim)
+    # cos/sin: seq_len x (rotary_dim/2) (as created below)
+    D = x.shape[-1]
+    x1 = x[..., 0::2]
+    x2 = x[..., 1::2]
+    # x1/x2 shape: (..., seq_len, D/2)
+    xr = torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
+    return xr.reshape(x.shape)
+
+class Attention(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.head_dim = cfg.d_model // cfg.num_heads
+        self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.out = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+    def split_heads(self, x):
+        B, T, C = x.shape
+        return x.view(B, T, self.cfg.num_heads, self.head_dim).transpose(1, 2)
+    def forward(self, x, cos, sin, mask, past=None):
+        B, T, C = x.shape
+        qkv = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
+        q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v)
+        rd = self.cfg.rotary_dim
+        if rd > 0:
+            # cos/sin currently shape: (T, rd/2)
+            # Expand cos/sin to match q[..., :rd] shape if necessary via unsqueeze:
+            # q[..., :rd] has shape (B, heads, T, rd)
+            # our cos/sin are (T, rd/2) but apply_rope uses splitting into even/odd so current shapes work if broadcasted.
+            q_rot = apply_rope(q[..., :rd], cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0))
+            k_rot = apply_rope(k[..., :rd], cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0))
+            q = torch.cat([q_rot, q[..., rd:]], dim=-1)
+            k = torch.cat([k_rot, k[..., rd:]], dim=-1)
+        if past is not None:
+            pk, pv = past
+            if pk is not None:
+                k = torch.cat([pk, k], dim=2)
+            if pv is not None:
+                v = torch.cat([pv, v], dim=2)
+        present = (k, v)
+        dk = q.shape[-1]
+        # attention scores: (B, heads, T, T')
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)
+        # mask: shape (1,1,T,T) broadcastable to (B,heads,T,T)
+        scores = scores.masked_fill(~mask, float("-inf"))
+        att = torch.softmax(scores, dim=-1)
+        out = torch.matmul(att, v).transpose(1, 2).reshape(B, T, C)
+        return self.out(out), present
+
+class SwiGLU(nn.Module):
+    def __init__(self, d):
+        super().__init__()
+        self.w1 = nn.Linear(d, 4 * d, bias=False)
+        self.w2 = nn.Linear(d, 4 * d, bias=False)
+        self.w3 = nn.Linear(4 * d, d, bias=False)
+    def forward(self, x):
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+
+class Block(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.norm1 = RMSNorm(cfg.d_model)
+        self.att = Attention(cfg)
+        self.norm2 = RMSNorm(cfg.d_model)
+        self.mlp = SwiGLU(cfg.d_model)
+    def forward(self, x, cos, sin, mask, past=None):
+        h, present = self.att(self.norm1(x), cos, sin, mask, past)
+        x = x + h
+        x = x + self.mlp(self.norm2(x))
+        return x, present
+
+class ShivikM1V3Model(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.embed = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        # position embedding (kept as parameter)
+        self.pos = nn.Parameter(torch.zeros(1, cfg.context_length, cfg.d_model))
+        mask = torch.tril(torch.ones(cfg.context_length, cfg.context_length)).bool()
+        self.register_buffer("mask", mask.unsqueeze(0).unsqueeze(0))
+        t = torch.arange(cfg.context_length)
+        # rotary frequencies: create half-dim angles (matching even/odd packing)
+        freqs = 1.0 / (10000 ** (torch.arange(0, cfg.rotary_dim, 2) / cfg.rotary_dim))
+        angles = torch.einsum("i,j->ij", t.float(), freqs.float())  # (T, rd/2)
+        # register cos/sin as (T, rd/2) and cast later by loading code if needed
+        self.register_buffer("cos", angles.cos())
+        self.register_buffer("sin", angles.sin())
+        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.norm = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        # tie weights
+        self.lm_head.weight = self.embed.weight
+
+    def forward(self, input_ids, past_kvs=None, use_cache=False, **kwargs):
+        """
+        Returns CausalLMOutputWithCrossAttentions to be compatible with .generate().
+        past_kvs (or past_key_values) should be iterable of (k, v) tuples per layer or None.
+        """
+        B, T = input_ids.shape
+        x = self.embed(input_ids) + self.pos[:, :T]
+        mask = self.mask[:, :, :T, :T]  # (1,1,T,T) -> broadcast to (B,heads,T,T)
+        cos = self.cos[:T]  # shape (T, rd/2)
+        sin = self.sin[:T]  # shape (T, rd/2)
+
+        # Normalize past format: accept tuple/list named past_key_values or past_kvs
+        if past_kvs is None:
+            past_kvs = [None] * len(self.blocks)
+        presents = []
+        for block, p in zip(self.blocks, past_kvs):
+            x, kv = block(x, cos, sin, mask, p)
+            presents.append(kv)
+
+        x = self.norm(x)
+        logits = self.lm_head(x)
+
+        # convert presents -> tuple-of-tuples for past_key_values expected shape
+        past_key_values = None
+        if use_cache:
+            # each present is (k, v); make them into tuples
+            past_key_values = tuple((p[0], p[1]) if p is not None else (None, None) for p in presents)
+
+        return CausalLMOutputWithCrossAttentions(
+            logits=logits,
+            past_key_values=past_key_values,
+            hidden_states=None,
+            attentions=None,
+            cross_attentions=None,
+        )
+
+class ShivikM1V3ForCausalLM(PreTrainedModel, GenerationMixin):
+    config_class = ShivikM1V3Config
+    base_model_prefix = "shivik_m1_v3"
+
+    def __init__(self, config):
+        super().__init__(config)
+        # allow both config.n_layers and config.num_hidden_layers to drive model depth
+        # ensure config fields are in sync
+        n = getattr(config, "n_layers", None) or getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers", None) or getattr(config, "num_layers", None) or config.n_layers
+        # normalize config for downstream code
+        config.n_layers = int(n)
+        config.num_hidden_layers = int(n)
+        config.num_layers = int(n)
+        config.n_layer = int(n)
+        self.model = ShivikM1V3Model(config)
+
+    def forward(self, input_ids=None, past_key_values=None, **kwargs):
+        # pass through; ShivikM1V3Model returns a proper ModelOutput
+        return self.model(input_ids, past_key_values, use_cache=kwargs.get("use_cache", False))
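A small standalone check (assumed, not in the file) that the even/odd RoPE packing used by `apply_rope` is a pure rotation, so it never changes a position's norm:

```py
import torch

T, rd = 8, 16
freqs = 1.0 / (10000 ** (torch.arange(0, rd, 2) / rd))
angles = torch.outer(torch.arange(T).float(), freqs)   # (T, rd/2), as in the model
cos, sin = angles.cos(), angles.sin()

x = torch.randn(1, 1, T, rd)                           # (B, heads, T, rotary_dim)
x1, x2 = x[..., 0::2], x[..., 1::2]
xr = torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1).reshape(x.shape)
# each even/odd channel pair is rotated by a position-dependent angle
assert torch.allclose(x.norm(dim=-1), xr.norm(dim=-1), atol=1e-5)
```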
modeling_shivik_m2.py
ADDED
@@ -0,0 +1,191 @@
+
+# modeling_shivik_m2.py
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+class ShivikM2Config(PretrainedConfig):
+    model_type = "shivik_m2"
+    def __init__(self, vocab_size=49152, d_model=2048, n_layers=24, num_heads=16, kv_heads=4, rotary_dim=2048, context_length=4096, **kwargs):
+        super().__init__(**kwargs)
+        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+        assert num_heads % kv_heads == 0, "num_heads must be divisible by kv_heads"
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layers = n_layers
+        self.num_heads = num_heads
+        self.kv_heads = kv_heads
+        self.rotary_dim = rotary_dim
+        self.context_length = context_length
+        # generation compat
+        self.use_cache = kwargs.get("use_cache", True)
+        self.num_hidden_layers = kwargs.get("num_hidden_layers", n_layers)
+
+# RMSNorm
+class RMSNorm(nn.Module):
+    def __init__(self, d, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(d))
+    def forward(self, x):
+        norm = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(norm + self.eps)
+        return x * self.weight
+
+# RoPE helpers: precompute complex cos/sin via cis (returns complex-like cos+isin stored as two tensors)
+def precompute_freqs_cis(dim, seq_len, base=10000.0, device='cpu', dtype=torch.float32):
+    half = dim // 2
+    inv_freq = 1.0 / (base ** (torch.arange(0, half, dtype=dtype) / float(half)))
+    t = torch.arange(seq_len, dtype=dtype)
+    freqs = torch.outer(t, inv_freq)  # (seq_len, half)
+    cos = torch.cos(freqs).to(device)
+    sin = torch.sin(freqs).to(device)
+    return cos, sin
+
+def apply_rope_tensor(x, cos, sin):
+    # x: (B, heads, T, head_dim)
+    # we assume head_dim is even
+    x1 = x[..., 0::2]
+    x2 = x[..., 1::2]
+    cos = cos.unsqueeze(0).unsqueeze(0)  # (1,1,T,half)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    xr0 = x1 * cos - x2 * sin
+    xr1 = x1 * sin + x2 * cos
+    xr = torch.stack([xr0, xr1], dim=-1)
+    return xr.reshape_as(x)
+
+# GQA attention
+class GQAAttention(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.num_heads = cfg.num_heads
+        self.kv_heads = cfg.kv_heads
+        self.head_dim = cfg.d_model // cfg.num_heads
+        assert self.head_dim % 2 == 0, "head_dim must be even for RoPE"
+        self.rep = self.num_heads // self.kv_heads
+        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        kv_dim = self.kv_heads * self.head_dim
+        self.kv_proj = nn.Linear(cfg.d_model, 2 * kv_dim, bias=False)
+        self.out = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+    def split_heads(self, x, heads):
+        B, T, C = x.shape
+        return x.view(B, T, heads, C // heads).transpose(1, 2)  # (B, heads, T, head_dim)
+    def forward(self, x, cos, sin, att_mask, past=None):
+        B, T, C = x.shape
+        q = self.q_proj(x)
+        kv = self.kv_proj(x)
+        k, v = kv.chunk(2, dim=-1)
+        q = self.split_heads(q, self.num_heads)  # (B, Hq, T, hd)
+        k = self.split_heads(k, self.kv_heads)   # (B, Hk, T, hd)
+        v = self.split_heads(v, self.kv_heads)
+        # apply RoPE to full head_dim (head_dim even)
+        if cos is not None and sin is not None:
+            # cos/sin shapes: (T, head_dim/2) for full head_dim per head
+            # Apply on q per head, and on k per kv_head (works because head_dim is same)
+            q_rot = apply_rope_tensor(q, cos, sin)
+            k_rot = apply_rope_tensor(k, cos, sin)
+            q = q_rot
+            k = k_rot
+        # past handling: past expected as (pk, pv) per layer where pk shape (B, Hk, Tpast, hd)
+        if past is not None:
+            pk, pv = past
+            if pk is not None:
+                k = torch.cat([pk, k], dim=2)
+            if pv is not None:
+                v = torch.cat([pv, v], dim=2)
+        present = (k, v)
+        # expand k/v to q-heads
+        if self.rep > 1:
+            # repeat_interleave across head dim
+            k = k.unsqueeze(2).repeat(1, 1, self.rep, 1, 1).view(B, self.num_heads, -1, self.head_dim)
+            v = v.unsqueeze(2).repeat(1, 1, self.rep, 1, 1).view(B, self.num_heads, -1, self.head_dim)
+        dk = q.shape[-1]
+        # q @ k^T => (B, H, Tq, Tk)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)
+        # att_mask shape must broadcast to (B,1,Tq,Tk) or (1,1,Tq,Tk)
+        scores = scores.masked_fill(~att_mask, torch.finfo(scores.dtype).min)
+        att = torch.softmax(scores, dim=-1)
+        out = torch.matmul(att, v)
+        out = out.transpose(1, 2).reshape(B, T, C)
+        return self.out(out), present
+
+# SwiGLU MLP with 2.667x expansion
+class SwiGLUMLP(nn.Module):
+    def __init__(self, d_model):
+        super().__init__()
+        hidden = int(d_model * 8 / 3)  # ~2.667x
+        self.w1 = nn.Linear(d_model, hidden, bias=False)
+        self.w2 = nn.Linear(d_model, hidden, bias=False)
+        self.w3 = nn.Linear(hidden, d_model, bias=False)
+    def forward(self, x):
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+
+# Transformer Block (pre-norm)
+class Block(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.norm1 = RMSNorm(cfg.d_model)
+        self.att = GQAAttention(cfg)
+        self.norm2 = RMSNorm(cfg.d_model)
+        self.mlp = SwiGLUMLP(cfg.d_model)
+    def forward(self, x, cos, sin, att_mask, past=None):
+        h, present = self.att(self.norm1(x), cos, sin, att_mask, past)
+        x = x + h
+        x = x + self.mlp(self.norm2(x))
+        return x, present
+
+# Full model
+class ShivikM2Model(nn.Module):
+    def __init__(self, cfg: ShivikM2Config):
+        super().__init__()
+        self.cfg = cfg
+        self.embed = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        # precompute RoPE cos/sin for context_length and head_dim/2
+        cos, sin = precompute_freqs_cis(cfg.d_model // cfg.num_heads, cfg.context_length)
+        # We'll store per-head cos/sin later on forward if needed
+        self.register_buffer("cos", cos)  # shape (T, head_dim/2)
+        self.register_buffer("sin", sin)
+        self.register_buffer("att_mask", torch.tril(torch.ones(cfg.context_length, cfg.context_length)).bool().unsqueeze(0).unsqueeze(0))
+        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.norm = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        # tie weights at end by copying in from outside if needed
+
+    def forward(self, input_ids, past_key_values=None, use_cache=False):
+        B, T = input_ids.shape
+        x = self.embed(input_ids)
+        att_mask = self.att_mask[:, :, :T, :T].to(x.device)
+        cos = self.cos[:T].to(x.device)
+        sin = self.sin[:T].to(x.device)
+        if past_key_values is None:
+            past_key_values = [None] * len(self.blocks)
+        presents = []
+        for block, p in zip(self.blocks, past_key_values):
+            x, present = block(x, cos, sin, att_mask, p)
+            presents.append(present)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        past_key_values_out = None
+        if use_cache:
+            past_key_values_out = tuple((p[0], p[1]) if p is not None else (None, None) for p in presents)
+        return CausalLMOutputWithCrossAttentions(logits=logits, past_key_values=past_key_values_out, hidden_states=None, attentions=None, cross_attentions=None)
+
+class ShivikM2ForCausalLM(PreTrainedModel, GenerationMixin):
+    config_class = ShivikM2Config
+    base_model_prefix = "shivik_m2"
+    def __init__(self, config: ShivikM2Config):
+        PreTrainedModel.__init__(self, config)
+        # normalize n_layers fields
+        n = int(getattr(config, "n_layers", config.num_hidden_layers))
+        config.n_layers = n
+        config.num_hidden_layers = n
+        self.model = ShivikM2Model(config)
+        # tie lm_head weight to embedding
+        self.model.lm_head.weight = self.model.embed.weight
+    def forward(self, input_ids=None, past_key_values=None, **kwargs):
+        return self.model(input_ids, past_key_values, use_cache=kwargs.get("use_cache", False))
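A back-of-envelope sketch (illustrative) of why the GQA layout matters: with `num_heads=16` and `kv_heads=4`, the cache stores 4 KV heads instead of 16, a 4x saving.

```py
layers, kv_heads, head_dim, ctx = 24, 4, 128, 4096
bytes_per = 2  # fp16
kv_cache = layers * 2 * kv_heads * head_dim * ctx * bytes_per  # K and V per layer
print(f"{kv_cache / 2**20:.0f} MiB per sequence vs {4 * kv_cache / 2**20:.0f} MiB for full MHA")
```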
shivik-tokenizer-v120k/special_tokens_map.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "special_tokens": [
+    "<unk>",
+    "<pad>",
+    "<bos>",
+    "<eos>",
+    "<think>",
+    "<context>",
+    "<answer>",
+    "<end>",
+    "<thought_step>",
+    "<thought_branch>",
+    "<thought_end>",
+    "<thought_vote>"
+  ]
+}
shivik-tokenizer-v120k/tokenizer.json
ADDED
The diff for this file is too large to render.
shivik-tokenizer-v120k/tokenizer_config.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "vocab_size": 120000,
+  "special_tokens": [
+    "<unk>",
+    "<pad>",
+    "<bos>",
+    "<eos>",
+    "<think>",
+    "<context>",
+    "<answer>",
+    "<end>",
+    "<thought_step>",
+    "<thought_branch>",
+    "<thought_end>",
+    "<thought_vote>"
+  ],
+  "model": "BPE",
+  "training_samples": 2300000,
+  "training_time_minutes": 17.45083087682724
+}
shivik-tokenizer-v200k/special_tokens_map.json
ADDED
The diff for this file is too large to render.
shivik-tokenizer-v200k/token_ids.json
ADDED
The diff for this file is too large to render.
shivik-tokenizer-v200k/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df201412b2416c3076b005ed2cc217aeba2615391bb727d4d89fefa03a2dedf3
+size 20886503
shivik-tokenizer-v200k/tokenizer_config.json
ADDED
The diff for this file is too large to render.
shivik-tokenizer-v200k/tokenizer_metadata.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "total_vocab_size": 100000,
+  "base_vocab_size": 100000,
+  "special_tokens_count": 93406,
+  "training_samples": 2300000,
+  "training_time_minutes": 17.01,
+  "model": "BPE",
+  "categories": {
+    "reasoning_core": 6,
+    "tot_branching": 2100,
+    "reasoning_steps": 15000,
+    "voting": 1300,
+    "path_tracking": 15000,
+    "reward_policy": 15000,
+    "multi_agent": 15000,
+    "semantic": 10000,
+    "execution": 10000,
+    "summary": 10000
+  }
+}
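A quick consistency check (illustrative): the per-category counts above sum exactly to `special_tokens_count`.

```py
cats = {"reasoning_core": 6, "tot_branching": 2100, "reasoning_steps": 15000,
        "voting": 1300, "path_tracking": 15000, "reward_policy": 15000,
        "multi_agent": 15000, "semantic": 10000, "execution": 10000, "summary": 10000}
assert sum(cats.values()) == 93406
```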
special_tokens_map.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "unk_token": "<unk>",
+  "pad_token": "<pad_000000>",
+  "bos_token": "<think>",
+  "eos_token": "</think>",
+  "additional_special_tokens": [
+    "<step>",
+    "</step>",
+    "<path>",
+    "</path>",
+    "<graph>",
+    "</graph>",
+    "<score>",
+    "</score>",
+    "<final>",
+    "</final>",
+    "<context>",
+    "</context>",
+    "<analysis>",
+    "</analysis>",
+    "<answer>",
+    "</answer>",
+    "<evaluate>",
+    "</evaluate>"
+  ]
+}
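An illustrative check (not in the commit) that each reasoning tag maps to a single vocabulary id rather than being split by BPE; assumes a local checkout at `/workspace/shivik-m2`:

```py
import json
from tokenization_shivik_m1 import ShivikM1Tokenizer

tok = ShivikM1Tokenizer.from_pretrained("/workspace/shivik-m2", local_files_only=True)
specials = json.load(open("special_tokens_map.json"))
for tag in specials["additional_special_tokens"]:
    print(tag, tok.convert_tokens_to_ids(tag))  # expect a real id, not the <unk> id
```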
tokenization_shivik_m1.py
ADDED
@@ -0,0 +1,181 @@
+import json
+import re
+import os
+from transformers import PreTrainedTokenizer
+
+
+class ShivikM1Tokenizer(PreTrainedTokenizer):
+    """
+    Clean HF-compatible Python BPE tokenizer.
+    """
+
+    vocab_files_names = {
+        "vocab_file": "vocab.json",
+        "merges_file": "merges.txt",
+    }
+
+    def __init__(self, vocab_file, merges_file, **kwargs):
+        # -------------------------
+        # Validate paths
+        # -------------------------
+        if vocab_file is None or not os.path.exists(vocab_file):
+            raise FileNotFoundError(f"vocab_file missing: {vocab_file}")
+
+        if merges_file is None or not os.path.exists(merges_file):
+            raise FileNotFoundError(f"merges_file missing: {merges_file}")
+
+        # -------------------------
+        # Load vocab + decoder
+        # -------------------------
+        with open(vocab_file, "r", encoding="utf-8") as f:
+            self.encoder = json.load(f)
+
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        # -------------------------
+        # Load merges
+        # -------------------------
+        merges = []
+        with open(merges_file, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                parts = tuple(line.split())
+                if len(parts) == 2:
+                    merges.append(parts)
+
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+        # -------------------------
+        # Regex (HF-required)
+        # -------------------------
+        self.pat = re.compile(r"\S+")
+
+        # Store file paths
+        self.vocab_file = vocab_file
+        self.merges_file = merges_file
+
+        # -------------------------
+        # Default special tokens
+        # -------------------------
+        kwargs.setdefault("unk_token", "<unk>")
+        kwargs.setdefault("pad_token", "<pad_000000>")
+        kwargs.setdefault("bos_token", "<think>")
+        kwargs.setdefault("eos_token", "</think>")
+
+        super().__init__(**kwargs)
+
+    # -----------------------------------------------------------
+    # TOKENIZER REQUIRED API
+    # -----------------------------------------------------------
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder)
+
+    # -----------------------------------------------------------
+    # BPE IMPLEMENTATION
+    # -----------------------------------------------------------
+    def get_pairs(self, word):
+        pairs = set()
+        prev = word[0]
+        for ch in word[1:]:
+            pairs.add((prev, ch))
+            prev = ch
+        return pairs
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+
+        word = tuple(token) + ("</w>",)
+        pairs = self.get_pairs(word)
+
+        if not pairs:
+            result = token + "</w>"
+            self.cache[token] = result
+            return result
+
+        while True:
+            bigram = min(pairs, key=lambda p: self.bpe_ranks.get(p, float("inf")))
+
+            if bigram not in self.bpe_ranks:
+                break
+
+            first, second = bigram
+            new_word = []
+            i = 0
+
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                new_word.extend(word[i:j])
+                i = j
+
+                if word[i:i+2] == bigram:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+
+            word = tuple(new_word)
+            pairs = self.get_pairs(word)
+
+        result = " ".join(word)
+        self.cache[token] = result
+        return result
+
+    # -----------------------------------------------------------
+    # Tokenization
+    # -----------------------------------------------------------
+    def _tokenize(self, text, **kwargs):
+        tokens = []
+        for word in re.findall(self.pat, text):
+            pieces = self.bpe(word).split(" ")
+            tokens.extend(pieces)
+        return tokens
+
+    def tokenize(self, text, **kwargs):
+        # Ignore HF-only kwargs safely
+        return self._tokenize(text)
+
+    # -----------------------------------------------------------
+    # Token ↔ ID
+    # -----------------------------------------------------------
+    def _convert_token_to_id(self, token):
+        return self.encoder.get(token, self.encoder.get("<unk>", 0))
+
+    def _convert_id_to_token(self, idx):
+        return self.decoder.get(idx, "<unk>")
+
+    def convert_tokens_to_string(self, tokens):
+        return " ".join(tokens).replace("</w>", "")
+
+    # -----------------------------------------------------------
+    # HF Special Token Helpers
+    # -----------------------------------------------------------
+    def build_inputs_with_special_tokens(self, ids_0, ids_1=None):
+        return list(ids_0) if ids_1 is None else list(ids_0) + list(ids_1)
+
+    def num_special_tokens_to_add(self, pair=False):
+        return 0
+
+    def get_special_tokens_mask(self, ids_0, ids_1=None, already_has_special_tokens=False):
+        special = set(self.all_special_ids)
+        if ids_1 is None:
+            return [1 if t in special else 0 for t in ids_0]
+        all_ids = ids_0 + ids_1
+        return [1 if t in special else 0 for t in all_ids]
+
+    def decode(self, ids, **kwargs):
+        toks = [self._convert_id_to_token(int(i)) for i in ids]
+        return self.convert_tokens_to_string(toks)
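A worked sketch (hypothetical merges, local files assumed) of what the `bpe()` loop does: each word becomes a character tuple ending in `</w>`, and the lowest-ranked adjacent pair is merged each round until no ranked pair remains.

```py
tok = ShivikM1Tokenizer("vocab.json", "merges.txt")
# If merges.txt ranks ("h", "e") before ("l", "l"), "hello" collapses stepwise:
# ('h','e','l','l','o','</w>') -> ('he','l','l','o','</w>') -> ('he','ll','o','</w>') ...
print(tok.bpe("hello"))              # space-joined BPE pieces, e.g. "he ll o</w>"
print(tok._tokenize("hello world"))  # per-word BPE over the \S+ pattern
```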
tokenization_shivik_m1.py.bak
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import re

from transformers import PreTrainedTokenizer


class ShivikM1Tokenizer(PreTrainedTokenizer):
    """
    HuggingFace-compatible custom BPE tokenizer.
    """

    vocab_files_names = {
        "vocab_file": "vocab.json",
        "merges_file": "merges.txt"
    }

    def __init__(self, vocab_file: str, merges_file: str, **kwargs):
        # -------------------------
        # Load vocab
        # -------------------------
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}

        # -------------------------
        # Load merges
        # -------------------------
        merges = []
        with open(merges_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("#") or not line.strip():
                    continue
                merges.append(tuple(line.strip().split()))
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        # Whitespace-delimited pre-tokenization: every non-space run is a word.
        self.pat = re.compile(r"\S+")

        self.vocab_file = vocab_file
        self.merges_file = merges_file

        # Default special tokens
        kwargs.setdefault("unk_token", "<unk>")
        kwargs.setdefault("pad_token", "<pad_000000>")
        kwargs.setdefault("bos_token", "<think>")
        kwargs.setdefault("eos_token", "</think>")

        # Call the parent constructor only after the vocab is loaded;
        # recent transformers releases call get_vocab() during __init__.
        super().__init__(**kwargs)

    # ============================
    # HF Required Methods
    # ============================

    def get_vocab(self):
        return dict(self.encoder)

    @property
    def vocab_size(self):
        return len(self.encoder)

    # ============================
    # BPE Implementation
    # ============================

    def get_pairs(self, word):
        """Return the set of adjacent symbol pairs in `word`."""
        pairs = set()
        prev = word[0]
        for char in word[1:]:
            pairs.add((prev, char))
            prev = char
        return pairs

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]

        word = tuple(token) + ("</w>",)
        pairs = self.get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Merge the lowest-ranked (earliest-learned) pair first.
            bigram = min(pairs, key=lambda x: self.bpe_ranks.get(x, float("inf")))
            if bigram not in self.bpe_ranks:
                break

            first, second = bigram
            new_word = []
            i = 0

            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break

                new_word.extend(word[i:j])
                i = j

                if word[i:i + 2] == bigram:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1

            word = tuple(new_word)
            if len(word) == 1:
                # Fully merged; get_pairs() would return an empty set and
                # min() would raise, so stop here.
                break
            pairs = self.get_pairs(word)

        word_str = " ".join(word)
        self.cache[token] = word_str
        return word_str

    def _tokenize(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            bpe_tokens.extend(self.bpe(token).split(" "))
        return bpe_tokens

    # ============================
    # Token <-> ID Mapping
    # ============================

    def _convert_token_to_id(self, token):
        return self.encoder.get(token, self.encoder.get("<unk>", 0))

    def _convert_id_to_token(self, idx):
        return self.decoder.get(idx, "<unk>")

    def convert_tokens_to_string(self, tokens):
        # "</w>" marks the end of a word, so join subwords directly and turn
        # each end-of-word marker back into a space.
        return "".join(tokens).replace("</w>", " ").strip()

    # ============================
    # HuggingFace Compatibility
    # ============================

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        HF expects two args; we do not auto-insert BOS/EOS.
        """
        if token_ids_1 is None:
            return list(token_ids_0)
        return list(token_ids_0) + list(token_ids_1)

    def num_special_tokens_to_add(self, pair=False):
        return 0

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Required by HF. Marks special tokens = 1, others = 0.
        """
        if already_has_special_tokens:
            special = set(self.all_special_ids)
            return [1 if t in special else 0 for t in token_ids_0]

        if token_ids_1 is None:
            return [0] * len(token_ids_0)

        combined = list(token_ids_0) + list(token_ids_1)
        special = set(self.all_special_ids)
        return [1 if t in special else 0 for t in combined]

    # Optional but helpful; kwargs (e.g. skip_special_tokens) are accepted
    # for API compatibility but ignored.
    def decode(self, token_ids, **kwargs):
        tokens = [self._convert_id_to_token(int(i)) for i in token_ids]
        return self.convert_tokens_to_string(tokens)

    def tokenize(self, text):
        return self._tokenize(text)
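A quick smoke test for the slow tokenizer above — a minimal sketch, assuming a local checkout of this repo so trust_remote_code can import the class (the sample sentence is illustrative); this mirrors how train_aries.py below loads it:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True, use_fast=False)
ids = tok("hello world")["input_ids"]
print(tok.decode(ids))  # should round-trip, modulo whitespace handling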
tokenization_shivik_m1_fast.py
ADDED
@@ -0,0 +1,9 @@
from transformers import PreTrainedTokenizerFast


class ShivikM1TokenizerFast(PreTrainedTokenizerFast):
    """
    Custom fast tokenizer for Shivik-M1 models.
    Uses tokenizer.json + merges + vocab from HuggingFace repo.
    """
    model_input_names = ["input_ids", "attention_mask"]
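PreTrainedTokenizerFast can be pointed straight at a serialized tokenizers file, so the subclass above needs no extra code — a minimal sketch, assuming tokenizer_fast.json in this commit is such a file (its diff is too large to render here):

from tokenization_shivik_m1_fast import ShivikM1TokenizerFast

tok = ShivikM1TokenizerFast(tokenizer_file="tokenizer_fast.json")
print(tok("hello world").input_ids)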
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
{
  "unk_token": "<unk>",
  "additional_special_tokens": [
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|end|>",
    "<pad>",
    "<think>",
    "</think>",
    "<context>",
    "</context>",
    "<answer>",
    "</answer>",
    "<end>",
    "<instruction>",
    "<tool>",
    "<tool_input>",
    "<tool_output>",
    "<safety>",
    "<e>"
  ]
}
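After loading, these entries surface on the tokenizer object as additional_special_tokens, and each tag resolves to a single id — a minimal check sketch, assuming this map is the one picked up at load time (it sits in the tokenizer/ subfolder here) and a local checkout of the repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)
assert "<think>" in tok.additional_special_tokens
print(tok.convert_tokens_to_ids("<think>"))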
tokenizer/tokenizer.json
ADDED
@@ -0,0 +1,345 @@
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {"id": 0, "content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 1, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 2, "content": "<bos>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 3, "content": "<eos>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 4, "content": "<|system|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 5, "content": "<|user|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 6, "content": "<|assistant|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 7, "content": "<|end|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 8, "content": "<think>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 9, "content": "</think>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 10, "content": "<context>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 11, "content": "</context>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 12, "content": "<answer>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 13, "content": "</answer>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 14, "content": "<instruction>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 15, "content": "<tool>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 16, "content": "<tool_input>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 17, "content": "<tool_output>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 18, "content": "<safety>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 19, "content": "<e>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 20, "content": "<branch>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 21, "content": "</branch>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 22, "content": "<select>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 23, "content": "</select>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 24, "content": "<evaluate>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 25, "content": "</evaluate>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 26, "content": "<confidence>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 27, "content": "</confidence>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 28, "content": "<merge>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 29, "content": "</merge>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 30, "content": "<path_1>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 31, "content": "<path_2>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
    {"id": 32, "content": "<path_3>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false}
  ],
  "normalizer": {"type": "Sequence", "normalizers": [{"type": "NFC"}, {"type": "Lowercase"}]},
  "pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
  "post_processor": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
  "decoder": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
  "model": {
    "type": "BPE",
    "dropout": null,
    "unk_token": null,
    "continuing_subword_prefix": null,
    "end_of_word_suffix": null,
    "fuse_unk": false,
    "byte_fallback": false,
    "ignore_merges": false,
    "vocab": {},
    "merges": []
  }
}
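Note that, as committed, the "model" section above carries an empty "vocab" and "merges"; the actual vocabulary ships separately in tokenizer/vocab.json. A minimal load sketch with the tokenizers library, which therefore only sees the added tokens:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer/tokenizer.json")
print(tok.get_vocab_size())  # 33 here: just the added tokens, since "vocab" is empty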
tokenizer/tokenizer_metadata.json
ADDED
@@ -0,0 +1,6 @@
{
  "vocab_size": 200000,
  "training_time_minutes": 144.34882674217224,
  "timestamp": 1763844808.7371492,
  "missing_tokens": []
}
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "tokenizer_class": "ShivikM1Tokenizer",
  "vocab_file": "vocab.json",
  "merges_file": "merges.txt",
  "do_lower_case": false
}
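tokenizer_class names the custom slow tokenizer, so loading from the Hub still needs trust_remote_code=True to import its module. For local work the class can also be constructed directly from the two files named here — a minimal sketch:

from tokenization_shivik_m1 import ShivikM1Tokenizer

tok = ShivikM1Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")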
tokenizer_fast.json
ADDED
The diff for this file is too large to render.
See raw diff
train_aries.py
ADDED
@@ -0,0 +1,76 @@
# train_aries.py
# Skeleton training pipeline for:
#  - SFT (supervised fine-tuning)
#  - hooks to plug GRPO/TRL reward models (placeholders provided)
#
# Usage:
#  export HF_TOKEN="hf_xxx"
#  python train_aries.py --data /path/to/data.jsonl --output_dir /path/to/out --epochs 3 --batch 2

import argparse

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

def load_tokenizer_and_model(repo_or_local):
    tok = AutoTokenizer.from_pretrained(repo_or_local, trust_remote_code=True, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(repo_or_local, trust_remote_code=True)
    return tok, model

def prepare_dataset(path, tok, max_length=512):
    # expects jsonl with {"prompt": "...", "response": "..."}
    ds = load_dataset('json', data_files={'train': str(path)}, split='train')
    def map_fn(x):
        text = x.get('prompt', '') + '\n' + x.get('response', '')
        return tok(text, truncation=True, max_length=max_length)
    ds = ds.map(map_fn, batched=False)
    ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    return ds

def main():
    p = argparse.ArgumentParser()
    p.add_argument('--data', required=True)
    p.add_argument('--repo', default='.', help='local folder or HF repo id')
    p.add_argument('--output_dir', default='./out')
    p.add_argument('--epochs', type=int, default=1)
    p.add_argument('--batch', type=int, default=2)
    args = p.parse_args()

    tok, model = load_tokenizer_and_model(args.repo)
    ds = prepare_dataset(args.data, tok)

    # Pick exactly one mixed-precision mode: bf16 where the GPU supports it,
    # fp16 otherwise. (Enabling both at once makes TrainingArguments raise.)
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch,
        num_train_epochs=args.epochs,
        bf16=use_bf16,
        fp16=torch.cuda.is_available() and not use_bf16,
        logging_steps=10,
        save_strategy='epoch',
        push_to_hub=False
    )

    # Basic SFT trainer. The collator pads each batch and copies input_ids
    # into labels (mlm=False), which causal-LM training needs to produce a loss.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        tokenizer=tok,
        data_collator=DataCollatorForLanguageModeling(tok, mlm=False)
    )
    trainer.train()

    # === Hooks: attach GRPO/TRL ===
    # After SFT completes, you may want to:
    #  1) Initialize reward model and KTO/GRPO loop (placeholder)
    #  2) Use `trl`'s PPOTrainer or custom GRPO trainer
    # Example (pseudo):
    #  from trl import PPOTrainer
    #  reward_fn = lambda queries, generations: compute_rewards(queries, generations, reward_model)
    #  ppo_trainer = PPOTrainer(...)
    #  ppo_trainer.train()

    print("Done SFT. Model checkpoint in", args.output_dir)

if __name__ == '__main__':
    main()
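prepare_dataset above documents the expected JSONL schema: one record per line with "prompt" and "response" fields, for example (content illustrative):

{"prompt": "Explain byte-pair encoding in one sentence.", "response": "BPE builds a subword vocabulary by repeatedly merging the most frequent adjacent symbol pairs."}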
upload_to_hf.py
ADDED
@@ -0,0 +1,18 @@
# upload_to_hf.py
# Usage: export HF_TOKEN='hf_xxx' ; python upload_to_hf.py --repo_id username/repo
import os, argparse
from huggingface_hub import create_repo, upload_folder

p = argparse.ArgumentParser()
p.add_argument('--repo_id', required=True)
p.add_argument('--folder', default='.')
args = p.parse_args()

token = os.environ.get('HF_TOKEN')
if not token:
    raise SystemExit('Please set HF_TOKEN in environment.')

create_repo(repo_id=args.repo_id, token=token, exist_ok=True)
print('Uploading folder', args.folder, 'to', args.repo_id)
upload_folder(folder_path=args.folder, repo_id=args.repo_id, token=token)
print('Done.')
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff