ziadrone committed
Commit 054c77e · verified · 1 Parent(s): c8f105b

Upload Shivik-M2 with merges.txt (clean)

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors.bak filter=lfs diff=lfs merge=lfs -text
+ shivik-tokenizer-v200k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/tokenization_shivik_m1-checkpoint.py ADDED
@@ -0,0 +1,143 @@
+ import json, re, os
+ from transformers import PreTrainedTokenizer
+
+ class ShivikM1Tokenizer(PreTrainedTokenizer):
+
+     vocab_files_names = {
+         "vocab_file": "vocab.json",
+         "merges_file": "merges.txt",
+     }
+
+     def __init__(self, vocab_file=None, merges_file=None, **kwargs):
+
+         # --------------------------------------------------------------
+         # 1) Resolve real paths when HF passes only a folder
+         # --------------------------------------------------------------
+         if vocab_file is None or not os.path.isfile(vocab_file):
+             vocab_file = os.path.join(kwargs.get("pretrained_model_name_or_path", ""), "vocab.json")
+         if merges_file is None or not os.path.isfile(merges_file):
+             merges_file = os.path.join(kwargs.get("pretrained_model_name_or_path", ""), "merges.txt")
+
+         if not os.path.isfile(vocab_file):
+             raise FileNotFoundError(f"Cannot find vocab.json at {vocab_file}")
+         if not os.path.isfile(merges_file):
+             raise FileNotFoundError(f"Cannot find merges.txt at {merges_file}")
+
+         # --------------------------------------------------------------
+         # 2) Load vocab + merges
+         # --------------------------------------------------------------
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             self.encoder = json.load(f)
+         self.decoder = {v: k for k, v in self.encoder.items()}
+
+         merges = []
+         with open(merges_file, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line or line.startswith("#"):
+                     continue
+                 merges.append(tuple(line.split()))
+
+         self.bpe_ranks = dict(zip(merges, range(len(merges))))
+         self.cache = {}
+
+         # Robust pattern
+         self.pat = re.compile(r"\S+")
+
+         self.vocab_file = vocab_file
+         self.merges_file = merges_file
+
+         # set default specials
+         kwargs.setdefault("unk_token", "<unk>")
+         kwargs.setdefault("pad_token", "<pad_000000>")
+         kwargs.setdefault("bos_token", "<think>")
+         kwargs.setdefault("eos_token", "</think>")
+
+         super().__init__(**kwargs)
+
+     # --------------------------------------------------------------
+     # Standard GPT BPE tokenization
+     # --------------------------------------------------------------
+     @property
+     def vocab_size(self):
+         return len(self.encoder)
+
+     def get_vocab(self):
+         return dict(self.encoder)
+
+     def get_pairs(self, word):
+         pairs = set()
+         prev = word[0]
+         for ch in word[1:]:
+             pairs.add((prev, ch))
+             prev = ch
+         return pairs
+
+     def bpe(self, token):
+         if token in self.cache:
+             return self.cache[token]
+
+         word = tuple(token) + ("</w>",)
+         pairs = self.get_pairs(word)
+
+         if not pairs:
+             result = token + "</w>"
+             self.cache[token] = result
+             return result
+
+         while True:
+             bigram = min(pairs, key=lambda p: self.bpe_ranks.get(p, 1e10))
+             if bigram not in self.bpe_ranks:
+                 break
+
+             first, second = bigram
+             new_word = []
+             i = 0
+             while i < len(word):
+                 try:
+                     j = word.index(first, i)
+                 except ValueError:
+                     new_word.extend(word[i:])
+                     break
+                 new_word.extend(word[i:j])
+                 i = j
+                 if word[i:i+2] == bigram:
+                     new_word.append(first + second)
+                     i += 2
+                 else:
+                     new_word.append(word[i])
+                     i += 1
+             word = tuple(new_word)
+             pairs = self.get_pairs(word)
+
+         result = " ".join(word)
+         self.cache[token] = result
+         return result
+
+     # --------------------------------------------------------------
+     # Final tokenization functions
+     # --------------------------------------------------------------
+     def _tokenize(self, text, **kwargs):
+         tokens = []
+         for word in re.findall(self.pat, text):
+             bpe_res = self.bpe(word)
+             tokens.extend(bpe_res.split(" "))
+         return tokens
+
+     def tokenize(self, text, **kwargs):
+         return self._tokenize(text)
+
+     def _convert_token_to_id(self, token):
+         return self.encoder.get(token, self.encoder["<unk>"])
+
+     def _convert_id_to_token(self, idx):
+         return self.decoder.get(idx, "<unk>")
+
+     def convert_tokens_to_string(self, tokens):
+         return " ".join(tokens).replace("</w>", "")
+
+     def build_inputs_with_special_tokens(self, ids_0, ids_1=None):
+         return list(ids_0) if ids_1 is None else list(ids_0) + list(ids_1)
+
+     def decode(self, ids, **kwargs):
+         return self.convert_tokens_to_string([self._convert_id_to_token(i) for i in ids])
README.md ADDED
@@ -0,0 +1,19 @@
+ # ziadrone / shivik-m2-aries
+
+ ✅ **Shivik-M2 (Aries infusion)** — 1.1B reasoning-capable causal LM
+ This repository contains:
+ - `model.safetensors` (M2 weights)
+ - tokenizer files (`vocab.json`, `merges.txt`, `tokenizer.json`)
+ - `modeling_shivik_m2.py` (custom model class)
+ - `tokenization_shivik_m1.py` (custom HF-compatible Python tokenizer)
+ - helper scripts: `build_tokenizer_fast.py`, `train_aries.py`
+
+ ## Quick usage (after `pip install transformers safetensors tokenizers`)
+ ```py
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ tok = AutoTokenizer.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True, use_fast=False)
+ model = AutoModelForCausalLM.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True)
+ text = "Hello <think> explain step by step </think>"
+ enc = tok(text, return_tensors='pt')
+ out = model(**enc)
+ ```
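For generation rather than a single forward pass, a minimal sketch continuing from the snippet above (assumes the custom classes' `GenerationMixin` wiring and KV cache behave as intended; `max_new_tokens=64` is illustrative):

```py
out_ids = model.generate(**enc, max_new_tokens=64, do_sample=False)
print(tok.decode(out_ids[0]))
```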
UPLOADED_TOKENIZER_HELPER.txt ADDED
@@ -0,0 +1 @@
+ Uploaded tokenizer helper path (for reference): /mnt/data/tokenization_shivik_m1.py
build_tokenizer_fast.py ADDED
@@ -0,0 +1,39 @@
+ # build_tokenizer_fast.py
+ # Builds a tokenizers (Rust) BPE tokenizer from vocab.json + merges.txt and saves tokenizer.json
+ import json, sys
+ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
+ from tokenizers.processors import TemplateProcessing
+ from pathlib import Path
+
+ REPO = Path("/workspace/shivik-m2")
+ vocab_file = REPO / "vocab.json"
+ merges_file = REPO / "merges.txt"
+ out_file = REPO / "tokenizers_bpe.json"
+
+ if not vocab_file.exists() or not merges_file.exists():
+     raise SystemExit("vocab.json or merges.txt missing in " + str(REPO))
+
+ print("Loading vocab + merges...")
+ with open(vocab_file, "r", encoding="utf-8") as f:
+     vocab = json.load(f)
+ # tokenizers' BPE expects merge pairs as (left, right) tuples, not raw "a b" lines
+ merges = [tuple(line.split()) for line in open(merges_file, "r", encoding="utf-8") if line.strip() and not line.startswith("#")]
+
+ # Build BPE model from explicit vocab+merges
+ model = models.BPE(vocab=vocab, merges=merges)
+
+ tokenizer = Tokenizer(model)
+ # simple pre-tokenizer / decoder for GPT style
+ tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+ tokenizer.decoder = decoders.ByteLevel()
+ # Set post-processor to keep things simple (no added special tokens)
+ tokenizer.post_processor = TemplateProcessing(
+     single="$A",
+     pair="$A $B",
+     special_tokens=[]
+ )
+
+ print("Saving tokenizer to", out_file)
+ tokenizer.save(str(out_file))
+ print("Done. You can move tokenizers_bpe.json -> tokenizer.json or upload as-is.")
+ print("\nUploaded helper file path (for reference):")
+ print("/mnt/data/tokenization_shivik_m1.py")
config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "model_type": "shivik_m1",
+   "vocab_size": 49152,
+   "d_model": 2048,
+   "n_layers": 24,
+   "num_heads": 16,
+   "kv_heads": 4,
+   "rotary_dim": 128,
+   "context_length": 4096,
+   "use_cache": true
+ }
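The config implies the following derived attention shapes; a sketch of the arithmetic, assuming the GQA layout in `modeling_shivik_m2.py`:

```py
d_model, num_heads, kv_heads = 2048, 16, 4
head_dim = d_model // num_heads   # 128, matches rotary_dim
rep = num_heads // kv_heads       # each KV head serves 4 query heads
kv_dim = kv_heads * head_dim      # 512-dim K and V projections
print(head_dim, rep, kv_dim)      # 128 4 512
```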
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "max_length": 2048,
+   "do_sample": false,
+   "eos_token_id": null
+ }
load_and_test.py ADDED
@@ -0,0 +1,18 @@
+
+ # load_and_test.py - quick load test
+ import sys, os
+ sys.path.insert(0, os.getcwd())
+ from tokenization_shivik_m1 import ShivikM1Tokenizer
+ from modeling_shivik_m2 import ShivikM2Config, ShivikM2ForCausalLM
+
+ repo = "/workspace/shivik-m2"
+ tok = ShivikM1Tokenizer.from_pretrained(repo, local_files_only=True)
+ print("Tokenizer loaded ✓ vocab_size =", tok.vocab_size)
+ cfg = ShivikM2Config()
+ model = ShivikM2ForCausalLM(cfg)
+ print("Model instance created ✓")
+ # test forward with random IDs
+ import torch
+ x = torch.randint(0, tok.vocab_size, (2, 8))
+ out = model(x)
+ print("Forward OK, logits shape:", out.logits.shape)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
migrate_weights_m1_to_m2.py ADDED
@@ -0,0 +1,71 @@
+
+ # migrate_weights_m1_to_m2.py
+ import os, sys, torch
+ from safetensors.torch import load_file as load_safetensors, save_file as save_safetensors
+ from modeling_shivik_m2 import ShivikM2Config, ShivikM2ForCausalLM
+
+ SRC = "/workspace/shivik-m1-v3.1-fp16/model.safetensors"
+ DST_DIR = "/workspace/shivik-m2"
+ DST = os.path.join(DST_DIR, "model.safetensors")
+
+ def safe_load(path):
+     if path.endswith(".safetensors"):
+         try:
+             sd = load_safetensors(path)
+             print("Loaded safetensors:", path)
+             # load_file already returns torch tensors; convert defensively just in case
+             return {k: torch.tensor(v) if not isinstance(v, torch.Tensor) else v for k, v in sd.items()}
+         except Exception as e:
+             print("safetensors load failed:", e)
+             raise
+     else:
+         return torch.load(path, map_location="cpu")
+
+ print("Loading source state dict:", SRC)
+ src_sd = safe_load(SRC)
+
+ # instantiate new model
+ cfg = ShivikM2Config()
+ model = ShivikM2ForCausalLM(cfg).eval()
+ new_sd = model.state_dict()
+
+ print("Mapping compatible tensors (exact shape match) from source -> new model...")
+ copied = []
+ skipped = []
+ for k_new, v_new in new_sd.items():
+     # attempt to find exact name in src_sd
+     if k_new in src_sd and src_sd[k_new].shape == v_new.shape:
+         new_sd[k_new] = src_sd[k_new].clone()
+         copied.append(k_new)
+     else:
+         # try some heuristics for common renames: embed, lm_head, norm weights
+         alt_keys = [
+             k_new.replace("model.", ""),
+             k_new.replace("model.", "shivik_m1_v3.model."),
+             k_new.replace("lm_head.weight", "embed.weight"),
+         ]
+         found = False
+         for alt in alt_keys:
+             if alt in src_sd and src_sd[alt].shape == v_new.shape:
+                 new_sd[k_new] = src_sd[alt].clone()
+                 copied.append((k_new, alt))
+                 found = True
+                 break
+         if not found:
+             skipped.append(k_new)
+
+ print(f"Copied {len(copied)} tensors, skipped {len(skipped)} tensors.")
+ print("Skipped (sample 20):", skipped[:20])
+
+ # save new_sd as safetensors (if possible), else torch.save
+ try:
+     # safetensors wants CPU, contiguous tensors
+     out = {k: v.cpu().contiguous() for k, v in new_sd.items()}
+     save_safetensors(out, DST)
+     print("Saved migrated safetensors to", DST)
+ except Exception as e:
+     print("safetensors save failed, falling back to torch.save:", e)
+     torch.save(new_sd, DST.replace(".safetensors", ".pt"))
+     print("Saved as torch .pt to", DST.replace(".safetensors", ".pt"))
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35697f70767428363b9b367d666b0ed114081d5f5bd8e1d1c80227e227687729
+ size 4850737576
model.safetensors.bak ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35697f70767428363b9b367d666b0ed114081d5f5bd8e1d1c80227e227687729
+ size 4850737576
model_card.md ADDED
@@ -0,0 +1,41 @@
+ ---
+ language: en
+ license: apache-2.0
+ tags:
+ - causal-lm
+ - reasoning
+ - aries
+ - shivik
+ - instruction-following
+ - safetensors
+ library_name: "transformers"
+ ---
+ # Shivik-M2 Aries (ziadrone/shivik-m2-aries)
+
+ **Model type:** Causal LM (1.1B) with Aries reasoning tokens infused.
+
+ ## Description
+ This model is an M2 architecture (GQA-style attention) derived from Shivik-M1 weights and reworked to support reasoning tokens. It includes custom special tokens for multi-step reasoning:
+ ```
+ <think>...</think> <step>...</step> <path>...</path> <graph>...</graph>
+ <score>...</score> <final>...</final> <context>...</context>
+ <analysis>...</analysis> <answer>...</answer> <evaluate>...</evaluate>
+ ```
+
+ ## How to use
+ - Use `trust_remote_code=True` when loading, because the model/tokenizer classes are custom.
+ - Example:
+ ```py
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ tok = AutoTokenizer.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True, use_fast=False)
+ model = AutoModelForCausalLM.from_pretrained("ziadrone/shivik-m2-aries", trust_remote_code=True).to("cuda")
+ prompt = "Hello <think> explain step by step </think>"
+ enc = tok(prompt, return_tensors="pt").to("cuda")
+ out = model(**enc)
+ ```
+ ## Intended uses & limitations
+ - Intended for research: reasoning experiments, RAG orchestration, Tree-of-Thought (ToT).
+ - NOT recommended for direct production use without safety review.
+
+ ## Paper / credits
+ Model and tokenizer created by ziadrone (Shivik). See repo for training recipe and license.
modeling_shivik_m1.py ADDED
@@ -0,0 +1,201 @@
+ # modeling_shivik_m1.py (PATCHED)
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel, PretrainedConfig
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+ class ShivikM1V3Config(PretrainedConfig):
+     # keep model_type stable so HF knows what this is
+     model_type = "shivik_m1"
+
+     def __init__(
+         self,
+         vocab_size=49156,
+         d_model=2048,
+         n_layers=24,
+         num_heads=16,
+         rotary_dim=128,
+         context_length=4096,
+         # legacy / generation-friendly aliases (kept in config for compatibility)
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         # core params
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.n_layers = n_layers
+         self.num_heads = num_heads
+         self.rotary_dim = rotary_dim
+         self.context_length = context_length
+
+         # Generation compatibility fields (Transformers internals expect these)
+         # Keep several aliases so both old and new code find a supported name
+         self.num_hidden_layers = kwargs.get("num_hidden_layers", n_layers)
+         self.num_layers = kwargs.get("num_layers", n_layers)
+         self.n_layer = kwargs.get("n_layer", n_layers)
+         self.layer_types = kwargs.get("layer_types", ["full_attention"] * n_layers)
+         self.num_kv_shared_layers = kwargs.get("num_kv_shared_layers", 0)
+         self.use_cache = kwargs.get("use_cache", True)
+
+ class RMSNorm(nn.Module):
+     def __init__(self, d, eps=1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(d))
+     def forward(self, x):
+         norm = x.pow(2).mean(-1, keepdim=True)
+         return x * torch.rsqrt(norm + self.eps) * self.weight
+
+ def apply_rope(x, cos, sin):
+     # x: (..., seq_len, head_dim)
+     # cos/sin: (seq_len, rotary_dim/2) (as created below)
+     x1 = x[..., 0::2]
+     x2 = x[..., 1::2]
+     # x1/x2 shape: (..., seq_len, head_dim/2)
+     xr = torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
+     return xr.reshape(x.shape)
+
+ class Attention(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.cfg = cfg
+         self.head_dim = cfg.d_model // cfg.num_heads
+         self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+         self.out = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+     def split_heads(self, x):
+         B, T, C = x.shape
+         return x.view(B, T, self.cfg.num_heads, self.head_dim).transpose(1, 2)
+     def forward(self, x, cos, sin, mask, past=None):
+         B, T, C = x.shape
+         qkv = self.qkv(x)
+         q, k, v = qkv.chunk(3, dim=-1)
+         q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v)
+         rd = self.cfg.rotary_dim
+         if rd > 0:
+             # cos/sin shape: (T, rd/2); unsqueeze twice so they broadcast against
+             # q[..., :rd] of shape (B, heads, T, rd) after the even/odd split
+             q_rot = apply_rope(q[..., :rd], cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0))
+             k_rot = apply_rope(k[..., :rd], cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0))
+             q = torch.cat([q_rot, q[..., rd:]], dim=-1)
+             k = torch.cat([k_rot, k[..., rd:]], dim=-1)
+         if past is not None:
+             pk, pv = past
+             if pk is not None:
+                 k = torch.cat([pk, k], dim=2)
+             if pv is not None:
+                 v = torch.cat([pv, v], dim=2)
+         present = (k, v)
+         dk = q.shape[-1]
+         # attention scores: (B, heads, T, T')
+         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)
+         # mask: (1,1,T,T) broadcastable to (B,heads,T,T); note this assumes
+         # no past (T' == T) -- with a cache the mask would need T' columns
+         scores = scores.masked_fill(~mask, float("-inf"))
+         att = torch.softmax(scores, dim=-1)
+         out = torch.matmul(att, v).transpose(1, 2).reshape(B, T, C)
+         return self.out(out), present
+
+ class SwiGLU(nn.Module):
+     def __init__(self, d):
+         super().__init__()
+         self.w1 = nn.Linear(d, 4 * d, bias=False)
+         self.w2 = nn.Linear(d, 4 * d, bias=False)
+         self.w3 = nn.Linear(4 * d, d, bias=False)
+     def forward(self, x):
+         return self.w3(F.silu(self.w1(x)) * self.w2(x))
+
+ class Block(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.norm1 = RMSNorm(cfg.d_model)
+         self.att = Attention(cfg)
+         self.norm2 = RMSNorm(cfg.d_model)
+         self.mlp = SwiGLU(cfg.d_model)
+     def forward(self, x, cos, sin, mask, past=None):
+         h, present = self.att(self.norm1(x), cos, sin, mask, past)
+         x = x + h
+         x = x + self.mlp(self.norm2(x))
+         return x, present
+
+ class ShivikM1V3Model(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.cfg = cfg
+         self.embed = nn.Embedding(cfg.vocab_size, cfg.d_model)
+         # position embedding (kept as parameter)
+         self.pos = nn.Parameter(torch.zeros(1, cfg.context_length, cfg.d_model))
+         mask = torch.tril(torch.ones(cfg.context_length, cfg.context_length)).bool()
+         self.register_buffer("mask", mask.unsqueeze(0).unsqueeze(0))
+         t = torch.arange(cfg.context_length)
+         # rotary frequencies: create half-dim angles (matching even/odd packing)
+         freqs = 1.0 / (10000 ** (torch.arange(0, cfg.rotary_dim, 2) / cfg.rotary_dim))
+         angles = torch.einsum("i,j->ij", t.float(), freqs.float())  # (T, rd/2)
+         # register cos/sin as (T, rd/2); loading code can cast dtype if needed
+         self.register_buffer("cos", angles.cos())
+         self.register_buffer("sin", angles.sin())
+         self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+         self.norm = RMSNorm(cfg.d_model)
+         self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+         # tie weights
+         self.lm_head.weight = self.embed.weight
+
+     def forward(self, input_ids, past_kvs=None, use_cache=False, **kwargs):
+         """
+         Returns CausalLMOutputWithCrossAttentions to be compatible with .generate().
+         past_kvs (or past_key_values) should be an iterable of (k, v) tuples per layer, or None.
+         """
+         B, T = input_ids.shape
+         x = self.embed(input_ids) + self.pos[:, :T]
+         mask = self.mask[:, :, :T, :T]  # (1,1,T,T) -> broadcast to (B,heads,T,T)
+         cos = self.cos[:T]  # shape (T, rd/2)
+         sin = self.sin[:T]  # shape (T, rd/2)
+
+         # Normalize past format: accept tuple/list named past_key_values or past_kvs
+         if past_kvs is None:
+             past_kvs = [None] * len(self.blocks)
+         presents = []
+         for block, p in zip(self.blocks, past_kvs):
+             x, kv = block(x, cos, sin, mask, p)
+             presents.append(kv)
+
+         x = self.norm(x)
+         logits = self.lm_head(x)
+
+         # convert presents -> tuple-of-tuples for the expected past_key_values shape
+         past_key_values = None
+         if use_cache:
+             # each present is (k, v); make them into tuples
+             past_key_values = tuple((p[0], p[1]) if p is not None else (None, None) for p in presents)
+
+         return CausalLMOutputWithCrossAttentions(
+             logits=logits,
+             past_key_values=past_key_values,
+             hidden_states=None,
+             attentions=None,
+             cross_attentions=None,
+         )
+
+ class ShivikM1V3ForCausalLM(PreTrainedModel, GenerationMixin):
+     config_class = ShivikM1V3Config
+     base_model_prefix = "shivik_m1_v3"
+
+     def __init__(self, config):
+         super().__init__(config)
+         # allow any of the layer-count aliases to drive model depth,
+         # then keep the config fields in sync for downstream code
+         n = (getattr(config, "n_layers", None)
+              or getattr(config, "n_layer", None)
+              or getattr(config, "num_hidden_layers", None)
+              or getattr(config, "num_layers", None))
+         config.n_layers = int(n)
+         config.num_hidden_layers = int(n)
+         config.num_layers = int(n)
+         config.n_layer = int(n)
+         self.model = ShivikM1V3Model(config)
+
+     def forward(self, input_ids=None, past_key_values=None, **kwargs):
+         # pass through; ShivikM1V3Model returns a proper ModelOutput
+         return self.model(input_ids, past_key_values, use_cache=kwargs.get("use_cache", False))
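A minimal shape check for the M1 classes above; a sketch, with small `n_layers` and `context_length` chosen only to keep it fast:

```py
import torch
from modeling_shivik_m1 import ShivikM1V3Config, ShivikM1V3Model

cfg = ShivikM1V3Config(n_layers=2, context_length=32)
m = ShivikM1V3Model(cfg).eval()
ids = torch.randint(0, cfg.vocab_size, (1, 8))
out = m(ids, use_cache=True)
print(out.logits.shape)          # torch.Size([1, 8, 49156])
print(len(out.past_key_values))  # one (k, v) pair per layer
```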
modeling_shivik_m2.py ADDED
@@ -0,0 +1,191 @@
+
+ # modeling_shivik_m2.py
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel, PretrainedConfig
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+ class ShivikM2Config(PretrainedConfig):
+     model_type = "shivik_m2"
+     def __init__(self, vocab_size=49152, d_model=2048, n_layers=24, num_heads=16, kv_heads=4, rotary_dim=2048, context_length=4096, **kwargs):
+         super().__init__(**kwargs)
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+         assert num_heads % kv_heads == 0, "num_heads must be divisible by kv_heads"
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.n_layers = n_layers
+         self.num_heads = num_heads
+         self.kv_heads = kv_heads
+         self.rotary_dim = rotary_dim
+         self.context_length = context_length
+         # generation compat
+         self.use_cache = kwargs.get("use_cache", True)
+         self.num_hidden_layers = kwargs.get("num_hidden_layers", n_layers)
+
+ # RMSNorm
+ class RMSNorm(nn.Module):
+     def __init__(self, d, eps=1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(d))
+     def forward(self, x):
+         norm = x.pow(2).mean(-1, keepdim=True)
+         x = x * torch.rsqrt(norm + self.eps)
+         return x * self.weight
+
+ # RoPE helpers: precompute the cos/sin ("cis") tables as two real tensors
+ def precompute_freqs_cis(dim, seq_len, base=10000.0, device='cpu', dtype=torch.float32):
+     half = dim // 2
+     inv_freq = 1.0 / (base ** (torch.arange(0, half, dtype=dtype) / float(half)))
+     t = torch.arange(seq_len, dtype=dtype)
+     freqs = torch.outer(t, inv_freq)  # (seq_len, half)
+     cos = torch.cos(freqs).to(device)
+     sin = torch.sin(freqs).to(device)
+     return cos, sin
+
+ def apply_rope_tensor(x, cos, sin):
+     # x: (B, heads, T, head_dim); head_dim is assumed even
+     x1 = x[..., 0::2]
+     x2 = x[..., 1::2]
+     cos = cos.unsqueeze(0).unsqueeze(0)  # (1,1,T,half)
+     sin = sin.unsqueeze(0).unsqueeze(0)
+     xr0 = x1 * cos - x2 * sin
+     xr1 = x1 * sin + x2 * cos
+     xr = torch.stack([xr0, xr1], dim=-1)
+     return xr.reshape_as(x)
+
+ # GQA attention
+ class GQAAttention(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.cfg = cfg
+         self.num_heads = cfg.num_heads
+         self.kv_heads = cfg.kv_heads
+         self.head_dim = cfg.d_model // cfg.num_heads
+         assert self.head_dim % 2 == 0, "head_dim must be even for RoPE"
+         self.rep = self.num_heads // self.kv_heads
+         self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+         kv_dim = self.kv_heads * self.head_dim
+         self.kv_proj = nn.Linear(cfg.d_model, 2 * kv_dim, bias=False)
+         self.out = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+     def split_heads(self, x, heads):
+         B, T, C = x.shape
+         return x.view(B, T, heads, C // heads).transpose(1, 2)  # (B, heads, T, head_dim)
+     def forward(self, x, cos, sin, att_mask, past=None):
+         B, T, C = x.shape
+         q = self.q_proj(x)
+         kv = self.kv_proj(x)
+         k, v = kv.chunk(2, dim=-1)
+         q = self.split_heads(q, self.num_heads)  # (B, Hq, T, hd)
+         k = self.split_heads(k, self.kv_heads)   # (B, Hk, T, hd)
+         v = self.split_heads(v, self.kv_heads)
+         # apply RoPE to the full head_dim (head_dim is even)
+         if cos is not None and sin is not None:
+             # cos/sin shapes: (T, head_dim/2) for the full head_dim per head
+             # Apply on q per head, and on k per kv_head (works because head_dim is the same)
+             q_rot = apply_rope_tensor(q, cos, sin)
+             k_rot = apply_rope_tensor(k, cos, sin)
+             q = q_rot
+             k = k_rot
+         # past handling: past expected as (pk, pv) per layer where pk shape (B, Hk, Tpast, hd)
+         if past is not None:
+             pk, pv = past
+             if pk is not None:
+                 k = torch.cat([pk, k], dim=2)
+             if pv is not None:
+                 v = torch.cat([pv, v], dim=2)
+         present = (k, v)
+         # expand k/v to q-heads
+         if self.rep > 1:
+             # repeat_interleave across the head dim
+             k = k.unsqueeze(2).repeat(1, 1, self.rep, 1, 1).view(B, self.num_heads, -1, self.head_dim)
+             v = v.unsqueeze(2).repeat(1, 1, self.rep, 1, 1).view(B, self.num_heads, -1, self.head_dim)
+         dk = q.shape[-1]
+         # q @ k^T => (B, H, Tq, Tk)
+         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)
+         # att_mask shape must broadcast to (B,1,Tq,Tk) or (1,1,Tq,Tk)
+         scores = scores.masked_fill(~att_mask, torch.finfo(scores.dtype).min)
+         att = torch.softmax(scores, dim=-1)
+         out = torch.matmul(att, v)
+         out = out.transpose(1, 2).reshape(B, T, C)
+         return self.out(out), present
+
+ # SwiGLU MLP with ~2.667x expansion
+ class SwiGLUMLP(nn.Module):
+     def __init__(self, d_model):
+         super().__init__()
+         hidden = int(d_model * 8 / 3)  # ~2.667x
+         self.w1 = nn.Linear(d_model, hidden, bias=False)
+         self.w2 = nn.Linear(d_model, hidden, bias=False)
+         self.w3 = nn.Linear(hidden, d_model, bias=False)
+     def forward(self, x):
+         return self.w3(F.silu(self.w1(x)) * self.w2(x))
+
+ # Transformer Block (pre-norm)
+ class Block(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.norm1 = RMSNorm(cfg.d_model)
+         self.att = GQAAttention(cfg)
+         self.norm2 = RMSNorm(cfg.d_model)
+         self.mlp = SwiGLUMLP(cfg.d_model)
+     def forward(self, x, cos, sin, att_mask, past=None):
+         h, present = self.att(self.norm1(x), cos, sin, att_mask, past)
+         x = x + h
+         x = x + self.mlp(self.norm2(x))
+         return x, present
+
+ # Full model
+ class ShivikM2Model(nn.Module):
+     def __init__(self, cfg: ShivikM2Config):
+         super().__init__()
+         self.cfg = cfg
+         self.embed = nn.Embedding(cfg.vocab_size, cfg.d_model)
+         # precompute RoPE cos/sin for context_length and head_dim/2
+         cos, sin = precompute_freqs_cis(cfg.d_model // cfg.num_heads, cfg.context_length)
+         self.register_buffer("cos", cos)  # shape (T, head_dim/2)
+         self.register_buffer("sin", sin)
+         self.register_buffer("att_mask", torch.tril(torch.ones(cfg.context_length, cfg.context_length)).bool().unsqueeze(0).unsqueeze(0))
+         self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+         self.norm = RMSNorm(cfg.d_model)
+         self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+         # weights are tied from ShivikM2ForCausalLM below
+
+     def forward(self, input_ids, past_key_values=None, use_cache=False):
+         B, T = input_ids.shape
+         x = self.embed(input_ids)
+         att_mask = self.att_mask[:, :, :T, :T].to(x.device)
+         cos = self.cos[:T].to(x.device)
+         sin = self.sin[:T].to(x.device)
+         if past_key_values is None:
+             past_key_values = [None] * len(self.blocks)
+         presents = []
+         for block, p in zip(self.blocks, past_key_values):
+             x, present = block(x, cos, sin, att_mask, p)
+             presents.append(present)
+         x = self.norm(x)
+         logits = self.lm_head(x)
+         past_key_values_out = None
+         if use_cache:
+             past_key_values_out = tuple((p[0], p[1]) if p is not None else (None, None) for p in presents)
+         return CausalLMOutputWithCrossAttentions(logits=logits, past_key_values=past_key_values_out, hidden_states=None, attentions=None, cross_attentions=None)
+
+ class ShivikM2ForCausalLM(PreTrainedModel, GenerationMixin):
+     config_class = ShivikM2Config
+     base_model_prefix = "shivik_m2"
+     def __init__(self, config: ShivikM2Config):
+         PreTrainedModel.__init__(self, config)
+         # normalize n_layers fields
+         n = int(getattr(config, "n_layers", config.num_hidden_layers))
+         config.n_layers = n
+         config.num_hidden_layers = n
+         self.model = ShivikM2Model(config)
+         # tie lm_head weight to embedding
+         self.model.lm_head.weight = self.model.embed.weight
+     def forward(self, input_ids=None, past_key_values=None, **kwargs):
+         return self.model(input_ids, past_key_values, use_cache=kwargs.get("use_cache", False))
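The `rep`-fold KV expansion can be checked in isolation; a sketch of the same repeat-then-view trick used in `GQAAttention`:

```py
import torch

B, Hk, rep, T, hd = 2, 4, 4, 8, 128   # 4 KV heads serving 16 query heads
k = torch.randn(B, Hk, T, hd)
k_exp = k.unsqueeze(2).repeat(1, 1, rep, 1, 1).view(B, Hk * rep, T, hd)
# query head h reads KV head h // rep:
assert torch.equal(k_exp[:, 5], k[:, 5 // rep])
```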
shivik-tokenizer-v120k/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "special_tokens": [
+     "<unk>",
+     "<pad>",
+     "<bos>",
+     "<eos>",
+     "<think>",
+     "<context>",
+     "<answer>",
+     "<end>",
+     "<thought_step>",
+     "<thought_branch>",
+     "<thought_end>",
+     "<thought_vote>"
+   ]
+ }
shivik-tokenizer-v120k/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
shivik-tokenizer-v120k/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "vocab_size": 120000,
+   "special_tokens": [
+     "<unk>",
+     "<pad>",
+     "<bos>",
+     "<eos>",
+     "<think>",
+     "<context>",
+     "<answer>",
+     "<end>",
+     "<thought_step>",
+     "<thought_branch>",
+     "<thought_end>",
+     "<thought_vote>"
+   ],
+   "model": "BPE",
+   "training_samples": 2300000,
+   "training_time_minutes": 17.45083087682724
+ }
shivik-tokenizer-v200k/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
shivik-tokenizer-v200k/token_ids.json ADDED
The diff for this file is too large to render. See raw diff
 
shivik-tokenizer-v200k/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df201412b2416c3076b005ed2cc217aeba2615391bb727d4d89fefa03a2dedf3
+ size 20886503
shivik-tokenizer-v200k/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
shivik-tokenizer-v200k/tokenizer_metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "total_vocab_size": 100000,
+   "base_vocab_size": 100000,
+   "special_tokens_count": 93406,
+   "training_samples": 2300000,
+   "training_time_minutes": 17.01,
+   "model": "BPE",
+   "categories": {
+     "reasoning_core": 6,
+     "tot_branching": 2100,
+     "reasoning_steps": 15000,
+     "voting": 1300,
+     "path_tracking": 15000,
+     "reward_policy": 15000,
+     "multi_agent": 15000,
+     "semantic": 10000,
+     "execution": 10000,
+     "summary": 10000
+   }
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "unk_token": "<unk>",
+   "pad_token": "<pad_000000>",
+   "bos_token": "<think>",
+   "eos_token": "</think>",
+   "additional_special_tokens": [
+     "<step>",
+     "</step>",
+     "<path>",
+     "</path>",
+     "<graph>",
+     "</graph>",
+     "<score>",
+     "</score>",
+     "<final>",
+     "</final>",
+     "<context>",
+     "</context>",
+     "<analysis>",
+     "</analysis>",
+     "<answer>",
+     "</answer>",
+     "<evaluate>",
+     "</evaluate>"
+   ]
+ }
tokenization_shivik_m1.py ADDED
@@ -0,0 +1,181 @@
+ import json
+ import re
+ import os
+ from transformers import PreTrainedTokenizer
+
+
+ class ShivikM1Tokenizer(PreTrainedTokenizer):
+     """
+     Clean HF-compatible Python BPE tokenizer.
+     """
+
+     vocab_files_names = {
+         "vocab_file": "vocab.json",
+         "merges_file": "merges.txt",
+     }
+
+     def __init__(self, vocab_file, merges_file, **kwargs):
+         # -------------------------
+         # Validate paths
+         # -------------------------
+         if vocab_file is None or not os.path.exists(vocab_file):
+             raise FileNotFoundError(f"vocab_file missing: {vocab_file}")
+
+         if merges_file is None or not os.path.exists(merges_file):
+             raise FileNotFoundError(f"merges_file missing: {merges_file}")
+
+         # -------------------------
+         # Load vocab + decoder
+         # -------------------------
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             self.encoder = json.load(f)
+
+         self.decoder = {v: k for k, v in self.encoder.items()}
+
+         # -------------------------
+         # Load merges
+         # -------------------------
+         merges = []
+         with open(merges_file, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line or line.startswith("#"):
+                     continue
+                 parts = tuple(line.split())
+                 if len(parts) == 2:
+                     merges.append(parts)
+
+         self.bpe_ranks = dict(zip(merges, range(len(merges))))
+         self.cache = {}
+
+         # -------------------------
+         # Regex (HF-required)
+         # -------------------------
+         self.pat = re.compile(r"\S+")
+
+         # Store file paths
+         self.vocab_file = vocab_file
+         self.merges_file = merges_file
+
+         # -------------------------
+         # Default special tokens
+         # -------------------------
+         kwargs.setdefault("unk_token", "<unk>")
+         kwargs.setdefault("pad_token", "<pad_000000>")
+         kwargs.setdefault("bos_token", "<think>")
+         kwargs.setdefault("eos_token", "</think>")
+
+         super().__init__(**kwargs)
+
+     # -----------------------------------------------------------
+     # TOKENIZER REQUIRED API
+     # -----------------------------------------------------------
+     @property
+     def vocab_size(self):
+         return len(self.encoder)
+
+     def get_vocab(self):
+         return dict(self.encoder)
+
+     # -----------------------------------------------------------
+     # BPE IMPLEMENTATION
+     # -----------------------------------------------------------
+     def get_pairs(self, word):
+         pairs = set()
+         prev = word[0]
+         for ch in word[1:]:
+             pairs.add((prev, ch))
+             prev = ch
+         return pairs
+
+     def bpe(self, token):
+         if token in self.cache:
+             return self.cache[token]
+
+         word = tuple(token) + ("</w>",)
+         pairs = self.get_pairs(word)
+
+         if not pairs:
+             result = token + "</w>"
+             self.cache[token] = result
+             return result
+
+         while True:
+             bigram = min(pairs, key=lambda p: self.bpe_ranks.get(p, float("inf")))
+
+             if bigram not in self.bpe_ranks:
+                 break
+
+             first, second = bigram
+             new_word = []
+             i = 0
+
+             while i < len(word):
+                 try:
+                     j = word.index(first, i)
+                 except ValueError:
+                     new_word.extend(word[i:])
+                     break
+
+                 new_word.extend(word[i:j])
+                 i = j
+
+                 if word[i:i+2] == bigram:
+                     new_word.append(first + second)
+                     i += 2
+                 else:
+                     new_word.append(word[i])
+                     i += 1
+
+             word = tuple(new_word)
+             pairs = self.get_pairs(word)
+
+         result = " ".join(word)
+         self.cache[token] = result
+         return result
+
+     # -----------------------------------------------------------
+     # Tokenization
+     # -----------------------------------------------------------
+     def _tokenize(self, text, **kwargs):
+         tokens = []
+         for word in re.findall(self.pat, text):
+             pieces = self.bpe(word).split(" ")
+             tokens.extend(pieces)
+         return tokens
+
+     def tokenize(self, text, **kwargs):
+         # Ignore HF-only kwargs safely
+         return self._tokenize(text)
+
+     # -----------------------------------------------------------
+     # Token ↔ ID
+     # -----------------------------------------------------------
+     def _convert_token_to_id(self, token):
+         return self.encoder.get(token, self.encoder.get("<unk>", 0))
+
+     def _convert_id_to_token(self, idx):
+         return self.decoder.get(idx, "<unk>")
+
+     def convert_tokens_to_string(self, tokens):
+         return " ".join(tokens).replace("</w>", "")
+
+     # -----------------------------------------------------------
+     # HF Special Token Helpers
+     # -----------------------------------------------------------
+     def build_inputs_with_special_tokens(self, ids_0, ids_1=None):
+         return list(ids_0) if ids_1 is None else list(ids_0) + list(ids_1)
+
+     def num_special_tokens_to_add(self, pair=False):
+         return 0
+
+     def get_special_tokens_mask(self, ids_0, ids_1=None, already_has_special_tokens=False):
+         special = set(self.all_special_ids)
+         if ids_1 is None:
+             return [1 if t in special else 0 for t in ids_0]
+         all_ids = ids_0 + ids_1
+         return [1 if t in special else 0 for t in all_ids]
+
+     def decode(self, ids, **kwargs):
+         toks = [self._convert_id_to_token(int(i)) for i in ids]
+         return self.convert_tokens_to_string(toks)
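A quick round trip through the pure-Python tokenizer; a sketch that assumes `vocab.json` and `merges.txt` are in the working directory and that `<unk>` exists in the vocab:

```py
from tokenization_shivik_m1 import ShivikM1Tokenizer

tok = ShivikM1Tokenizer("vocab.json", "merges.txt")
ids = tok.convert_tokens_to_ids(tok.tokenize("explain step by step"))
print(ids)
print(tok.decode(ids))  # "</w>" end-of-word markers are stripped on decode
```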
tokenization_shivik_m1.py.bak ADDED
@@ -0,0 +1,175 @@
+ import json
+ import os
+ import re
+ from typing import List, Optional, Union
+ from transformers import PreTrainedTokenizer
+
+ class ShivikM1Tokenizer(PreTrainedTokenizer):
+     """
+     HuggingFace-compatible custom BPE tokenizer
+     """
+
+     vocab_files_names = {
+         "vocab_file": "vocab.json",
+         "merges_file": "merges.txt"
+     }
+
+     def __init__(self, vocab_file: str, merges_file: str, **kwargs):
+         super().__init__(**kwargs)
+
+         # -------------------------
+         # Load vocab
+         # -------------------------
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             self.encoder = json.load(f)
+         self.decoder = {v: k for k, v in self.encoder.items()}
+
+         # -------------------------
+         # Load merges
+         # -------------------------
+         merges = []
+         with open(merges_file, "r", encoding="utf-8") as f:
+             for line in f:
+                 if line.startswith("#") or not line.strip():
+                     continue
+                 merges.append(tuple(line.strip().split()))
+         self.bpe_ranks = dict(zip(merges, range(len(merges))))
+         self.cache = {}
+
+         self.pat = re.compile(r"\S+")
+
+         self.vocab_file = vocab_file
+         self.merges_file = merges_file
+
+         # Default special tokens
+         self.unk_token = kwargs.get("unk_token", "<unk>")
+         self.pad_token = kwargs.get("pad_token", "<pad_000000>")
+         self.bos_token = kwargs.get("bos_token", "<think>")
+         self.eos_token = kwargs.get("eos_token", "</think>")
+
+     # ============================
+     # HF Required Methods
+     # ============================
+
+     def get_vocab(self):
+         return dict(self.encoder)
+
+     @property
+     def vocab_size(self):
+         return len(self.encoder)
+
+     # ============================
+     # BPE IMPLEMENTATION
+     # ============================
+
+     def get_pairs(self, word):
+         pairs = set()
+         prev = word[0]
+         for char in word[1:]:
+             pairs.add((prev, char))
+             prev = char
+         return pairs
+
+     def bpe(self, token):
+         if token in self.cache:
+             return self.cache[token]
+
+         word = tuple(token) + ("</w>",)
+         pairs = self.get_pairs(word)
+
+         if not pairs:
+             return token + "</w>"
+
+         while True:
+             bigram = min(
+                 pairs,
+                 key=lambda x: self.bpe_ranks.get(x, 1e10)
+             )
+             if bigram not in self.bpe_ranks:
+                 break
+
+             first, second = bigram
+             new_word = []
+             i = 0
+
+             while i < len(word):
+                 try:
+                     j = word.index(first, i)
+                 except ValueError:
+                     new_word.extend(word[i:])
+                     break
+
+                 new_word.extend(word[i:j])
+                 i = j
+
+                 if word[i:i+2] == bigram:
+                     new_word.append(first + second)
+                     i += 2
+                 else:
+                     new_word.append(word[i])
+                     i += 1
+
+             word = tuple(new_word)
+             pairs = self.get_pairs(word)
+
+         word_str = " ".join(word)
+         self.cache[token] = word_str
+         return word_str
+
+     def _tokenize(self, text):
+         bpe_tokens = []
+         for token in re.findall(self.pat, text):
+             bpe = self.bpe(token)
+             bpe_tokens.extend(bpe.split(" "))
+         return bpe_tokens
+
+     # ============================
+     # Token <-> ID Mapping
+     # ============================
+
+     def _convert_token_to_id(self, token):
+         return self.encoder.get(token, self.encoder.get("<unk>", 0))
+
+     def _convert_id_to_token(self, idx):
+         return self.decoder.get(idx, "<unk>")
+
+     def convert_tokens_to_string(self, tokens):
+         return " ".join(tokens).replace("</w>", "")
+
+     # ============================
+     # HuggingFace Compatibility
+     # ============================
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         HF expects two args; we do not auto-insert BOS/EOS.
+         """
+         if token_ids_1 is None:
+             return list(token_ids_0)
+         return list(token_ids_0) + list(token_ids_1)
+
+     def num_special_tokens_to_add(self, pair=False):
+         return 0
+
+     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+         """
+         Required by HF. Marks special tokens = 1, others = 0.
+         """
+         if already_has_special_tokens:
+             special = set(self.all_special_ids)
+             return [1 if t in special else 0 for t in token_ids_0]
+
+         if token_ids_1 is None:
+             return [0] * len(token_ids_0)
+
+         combined = list(token_ids_0) + list(token_ids_1)
+         special = set(self.all_special_ids)
+         return [1 if t in special else 0 for t in combined]
+
+     # Optional but helpful
+     def decode(self, token_ids, **kwargs):
+         tokens = [self._convert_id_to_token(int(i)) for i in token_ids]
+         return self.convert_tokens_to_string(tokens)
+
+     def tokenize(self, text):
+         return self._tokenize(text)
tokenization_shivik_m1_fast.py ADDED
@@ -0,0 +1,9 @@
+
+ from transformers import PreTrainedTokenizerFast
+
+ class ShivikM1TokenizerFast(PreTrainedTokenizerFast):
+     """
+     Custom fast tokenizer for Shivik-M1 models.
+     Uses tokenizer.json + merges + vocab from the HuggingFace repo.
+     """
+     model_input_names = ["input_ids", "attention_mask"]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "unk_token": "<unk>",
+   "additional_special_tokens": [
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|end|>",
+     "<pad>",
+     "<think>",
+     "</think>",
+     "<context>",
+     "</context>",
+     "<answer>",
+     "</answer>",
+     "<end>",
+     "<instruction>",
+     "<tool>",
+     "<tool_input>",
+     "<tool_output>",
+     "<safety>",
+     "<e>"
+   ]
+ }
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,345 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {"id": 0,  "content": "<pad>",          "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 1,  "content": "<unk>",          "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 2,  "content": "<bos>",          "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 3,  "content": "<eos>",          "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 4,  "content": "<|system|>",     "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 5,  "content": "<|user|>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 6,  "content": "<|assistant|>",  "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 7,  "content": "<|end|>",        "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 8,  "content": "<think>",        "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 9,  "content": "</think>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 10, "content": "<context>",      "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 11, "content": "</context>",     "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 12, "content": "<answer>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 13, "content": "</answer>",      "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 14, "content": "<instruction>",  "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 15, "content": "<tool>",         "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 16, "content": "<tool_input>",   "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 17, "content": "<tool_output>",  "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 18, "content": "<safety>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 19, "content": "<e>",            "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 20, "content": "<branch>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 21, "content": "</branch>",      "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 22, "content": "<select>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 23, "content": "</select>",      "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 24, "content": "<evaluate>",     "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 25, "content": "</evaluate>",    "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 26, "content": "<confidence>",   "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 27, "content": "</confidence>",  "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 28, "content": "<merge>",        "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 29, "content": "</merge>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 30, "content": "<path_1>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 31, "content": "<path_2>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false},
+     {"id": 32, "content": "<path_3>",       "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false}
+   ],
+   "normalizer": {
+     "type": "Sequence",
+     "normalizers": [
+       {"type": "NFC"},
+       {"type": "Lowercase"}
+     ]
+   },
+   "pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
+   "post_processor": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
+   "decoder": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
+   "model": {
+     "type": "BPE",
+     "dropout": null,
+     "unk_token": null,
+     "continuing_subword_prefix": null,
+     "end_of_word_suffix": null,
+     "fuse_unk": false,
+     "byte_fallback": false,
+     "ignore_merges": false,
+     "vocab": {},
+     "merges": []
+   }
+ }
tokenizer/tokenizer_metadata.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "vocab_size": 200000,
+   "training_time_minutes": 144.34882674217224,
+   "timestamp": 1763844808.7371492,
+   "missing_tokens": []
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "tokenizer_class": "ShivikM1Tokenizer",
+   "vocab_file": "vocab.json",
+   "merges_file": "merges.txt",
+   "do_lower_case": false
+ }
tokenizer_fast.json ADDED
The diff for this file is too large to render. See raw diff
 
train_aries.py ADDED
@@ -0,0 +1,76 @@
+ # train_aries.py
+ # Skeleton training pipeline for:
+ #  - SFT (supervised fine-tuning)
+ #  - hooks to plug GRPO/TRL reward models (placeholders provided)
+ #
+ # Usage:
+ #   export HF_TOKEN="hf_xxx"
+ #   python train_aries.py --data /path/to/data.jsonl --output_dir /path/to/out --epochs 3 --batch 2
+
+ import os, argparse, json
+ from pathlib import Path
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+ from transformers import DataCollatorForLanguageModeling
+ from datasets import load_dataset
+
+ def load_tokenizer_and_model(repo_or_local):
+     tok = AutoTokenizer.from_pretrained(repo_or_local, trust_remote_code=True, use_fast=False)
+     model = AutoModelForCausalLM.from_pretrained(repo_or_local, trust_remote_code=True)
+     return tok, model
+
+ def prepare_dataset(path, tok, max_length=512):
+     # expects jsonl with {"prompt": "...", "response": "..."}
+     ds = load_dataset('json', data_files={'train': str(path)}, split='train')
+     def map_fn(x):
+         text = x.get('prompt', '') + '\n' + x.get('response', '')
+         return tok(text, truncation=True, max_length=max_length)
+     ds = ds.map(map_fn, batched=False)
+     ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+     return ds
+
+ def main():
+     p = argparse.ArgumentParser()
+     p.add_argument('--data', required=True)
+     p.add_argument('--repo', default='.', help='local folder or HF repo id')
+     p.add_argument('--output_dir', default='./out')
+     p.add_argument('--epochs', type=int, default=1)
+     p.add_argument('--batch', type=int, default=2)
+     args = p.parse_args()
+
+     tok, model = load_tokenizer_and_model(args.repo)
+     ds = prepare_dataset(args.data, tok)
+
+     # bf16 and fp16 are mutually exclusive in TrainingArguments; pick one by hardware
+     use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+     training_args = TrainingArguments(
+         output_dir=args.output_dir,
+         per_device_train_batch_size=args.batch,
+         num_train_epochs=args.epochs,
+         bf16=use_bf16,
+         fp16=torch.cuda.is_available() and not use_bf16,
+         logging_steps=10,
+         save_strategy='epoch',
+         push_to_hub=False
+     )
+
+     # Basic SFT trainer; the collator copies input_ids into labels for the causal-LM loss
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=ds,
+         tokenizer=tok,
+         data_collator=DataCollatorForLanguageModeling(tok, mlm=False)
+     )
+     trainer.train()
+
+     # === Hooks: attach GRPO/TRL ===
+     # After SFT completes, you may want to:
+     #  1) Initialize a reward model and KTO/GRPO loop (placeholder)
+     #  2) Use `trl`'s PPOTrainer or a custom GRPO trainer
+     # Example (pseudo):
+     #   from trl import PPOTrainer
+     #   reward_fn = lambda queries, generations: compute_rewards(queries, generations, reward_model)
+     #   ppo_trainer = PPOTrainer(...)
+     #   ppo_trainer.train()
+
+     print("Done SFT. Model checkpoint in", args.output_dir)
+
+ if __name__ == '__main__':
+     main()
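The JSONL format `prepare_dataset` expects is one record per line, for example (contents illustrative):

```json
{"prompt": "Hello <think> explain step by step </think>", "response": "<analysis>...</analysis><final>42</final>"}
```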
upload_to_hf.py ADDED
@@ -0,0 +1,18 @@
+ # upload_to_hf.py
+ # Usage: export HF_TOKEN='hf_xxx' ; python upload_to_hf.py --repo_id username/repo
+ import os, argparse
+ from huggingface_hub import HfApi, create_repo, upload_folder
+
+ p = argparse.ArgumentParser()
+ p.add_argument('--repo_id', required=True)
+ p.add_argument('--folder', default='.')
+ args = p.parse_args()
+
+ token = os.environ.get('HF_TOKEN')
+ if not token:
+     raise SystemExit('Please set HF_TOKEN in environment.')
+
+ create_repo(repo_id=args.repo_id, token=token, exist_ok=True)
+ print('Uploading folder', args.folder, 'to', args.repo_id)
+ upload_folder(folder_path=args.folder, repo_id=args.repo_id, token=token)
+ print('Done.')
vocab.json ADDED
The diff for this file is too large to render. See raw diff