Mindigenous commited on
Commit
0d29ee0
·
verified ·
1 Parent(s): 6838b81

Upload MINDI 1.0 420M full release

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MINDI 1.0 420M Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ library_name: transformers
6
+ pipeline_tag: text-generation
7
+ tags:
8
+ - code
9
+ - python
10
+ - javascript
11
+ - local-llm
12
+ - offline
13
+ ---
14
+
15
+ # MINDI 1.0 420M
16
+
17
+ MINDI 1.0 420M is a 420M-parameter coding language model focused primarily on Python, with secondary support for JavaScript.
18
+ It is built for local, offline code generation workflows.
19
+
20
+ ## Capabilities
21
+
22
+ - Code generation from natural language prompts
23
+ - Code completion
24
+ - Bug-fix suggestions
25
+ - Code explanation
26
+
27
+ ## Model Details
28
+
29
+ - Parameters: 423,934,848
30
+ - Architecture: Decoder-only Transformer
31
+ - Context length: 2048 tokens
32
+ - Focus languages: Python, JavaScript
33
+
34
+ ## Hardware Requirements
35
+
36
+ Recommended:
37
+ - NVIDIA GPU with 8GB+ VRAM
38
+ - CUDA-enabled PyTorch
39
+
40
+ Minimum:
41
+ - CPU inference works but is slower
42
+
43
+ ## Quick Start (GPU)
44
+
45
+ ```python
46
+ from transformers import AutoTokenizer, AutoModelForCausalLM
47
+ import torch
48
+
49
+ repo_id = "YOUR_USERNAME/MINDI-1.0-420M"
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
52
+ model = AutoModelForCausalLM.from_pretrained(
53
+ repo_id,
54
+ trust_remote_code=True,
55
+ torch_dtype=torch.float16,
56
+ ).cuda()
57
+
58
+ prompt = "Write a Python function to check if a string is a palindrome."
59
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
60
+
61
+ with torch.no_grad():
62
+ output = model.generate(
63
+ **inputs,
64
+ max_new_tokens=220,
65
+ temperature=0.2,
66
+ top_p=0.9,
67
+ do_sample=True,
68
+ )
69
+
70
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
71
+ ```
72
+
73
+ ## Limitations
74
+
75
+ - The model can still produce syntax or logic errors.
76
+ - Generated code should always be reviewed and tested.
77
+ - Not intended for safety-critical production use without validation.
78
+
79
+ ## Safety
80
+
81
+ Always run tests and static checks before using generated code in production.
UPLOAD_TO_HF.ps1 ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Upload helper for MINDI 1.0 420M
# Run from PowerShell.
#
# Prerequisites: `pip install huggingface_hub` so `huggingface-cli` is on PATH.
# Replace YOUR_USERNAME with your Hugging Face namespace before running.

# Authenticate (prompts for an access token with write scope).
huggingface-cli login
# Create the target repo.
# NOTE(review): confirm `--public` is accepted by your CLI version — new repos
# are public by default and some versions only expose a `--private` flag;
# check `huggingface-cli repo create --help`.
huggingface-cli repo create MINDI-1.0-420M --type model --public
# Upload the entire local release folder to the repo root.
huggingface-cli upload YOUR_USERNAME/MINDI-1.0-420M "C:\AI 2\hf_release\MINDI-1.0-420M" . --repo-type model
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "mindi",
3
+ "architectures": [
4
+ "MindiForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_mindi.MindiConfig",
8
+ "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM",
9
+ "AutoTokenizer": [
10
+ null,
11
+ "tokenization_mindi.MindiTokenizer"
12
+ ]
13
+ },
14
+ "vocab_size": 50000,
15
+ "max_seq_len": 2048,
16
+ "d_model": 1152,
17
+ "n_layers": 23,
18
+ "n_heads": 16,
19
+ "d_ff": 4608,
20
+ "dropout": 0.1,
21
+ "tie_embeddings": true,
22
+ "init_std": 0.02,
23
+ "rms_norm_eps": 1e-05,
24
+ "bos_token_id": 2,
25
+ "eos_token_id": 3,
26
+ "pad_token_id": 0,
27
+ "torch_dtype": "float16",
28
+ "transformers_version": "4.46.3"
29
+ }
configuration_mindi.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face config class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class MindiConfig(PretrainedConfig):
    """Hugging Face configuration for the MINDI 1.0 420M decoder-only LM.

    Default values mirror the released checkpoint's ``config.json``; every
    argument is persisted as a plain attribute so the config round-trips
    through serialization unchanged.
    """

    model_type = "mindi"

    def __init__(
        self,
        vocab_size=50000,
        max_seq_len=2048,
        d_model=1152,
        n_layers=23,
        n_heads=16,
        d_ff=4608,
        dropout=0.1,
        tie_embeddings=True,
        init_std=0.02,
        rms_norm_eps=1e-5,
        bos_token_id=2,
        eos_token_id=3,
        pad_token_id=0,
        **kwargs,
    ):
        # Special-token ids are owned by the PretrainedConfig base class.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        # Model geometry.
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        # Regularization / initialization knobs.
        self.dropout = dropout
        self.tie_embeddings = tie_embeddings
        self.init_std = init_std
        self.rms_norm_eps = rms_norm_eps
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "eos_token_id": 3,
4
+ "pad_token_id": 0,
5
+ "max_new_tokens": 220,
6
+ "temperature": 0.2,
7
+ "top_p": 0.9,
8
+ "do_sample": true
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d5df76ccfe5be47eaf94b1d58eec9b36276c4c1c2bb235766c766e1dd838a0
3
+ size 1695758072
modeling_mindi.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face model class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedModel
14
+ from transformers.modeling_outputs import CausalLMOutputWithPast
15
+
16
+ from .configuration_mindi import MindiConfig
17
+
18
+
19
+ @dataclass
20
+ class _Cfg:
21
+ vocab_size: int
22
+ max_seq_len: int
23
+ d_model: int
24
+ n_layers: int
25
+ n_heads: int
26
+ d_ff: int
27
+ dropout: float
28
+ tie_embeddings: bool
29
+ init_std: float
30
+ rms_norm_eps: float
31
+
32
+ @property
33
+ def head_dim(self) -> int:
34
+ if self.d_model % self.n_heads != 0:
35
+ raise ValueError("d_model must be divisible by n_heads")
36
+ return self.d_model // self.n_heads
37
+
38
+
39
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias)."""

    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps  # added inside the rsqrt for numerical stability
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each vector by the reciprocal of its RMS over the last dim.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return self.weight * (x * torch.rsqrt(mean_sq + self.eps))
49
+
50
+
51
class RotaryEmbedding(nn.Module):
    """Interleaved rotary position embeddings with precomputed cos/sin tables.

    The tables are built once for ``max_seq_len`` positions and stored as
    non-persistent buffers (they are cheap to rebuild and are excluded from
    the checkpoint).
    """

    def __init__(self, head_dim: int, max_seq_len: int) -> None:
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError("head_dim must be even for rotary embeddings")
        # Standard RoPE frequency schedule: theta_i = 10000^(-2i / head_dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
        t = torch.arange(max_seq_len, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)  # (max_seq_len, head_dim // 2)
        self.register_buffer("cos_cached", torch.cos(freqs), persistent=False)
        self.register_buffer("sin_cached", torch.sin(freqs), persistent=False)

    def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Rotate ``q`` and ``k`` (shape (B, H, S, head_dim)) in place-pairs.

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed table length
                (previously this failed later with a cryptic broadcast error).
        """
        if seq_len > self.cos_cached.size(0):
            raise ValueError(
                f"seq_len {seq_len} exceeds cached max_seq_len {self.cos_cached.size(0)}"
            )
        # Cast the float32 tables to the activation dtype: under fp16
        # inference, q * cos would otherwise promote q/k to float32 and then
        # mismatch v's dtype inside scaled_dot_product_attention.
        cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0).to(q.dtype)
        sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0).to(q.dtype)
        return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin)

    @staticmethod
    def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        # Rotate each (even, odd) channel pair by the position-dependent angle,
        # then re-interleave so the layout matches the input.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        xe = x1 * cos - x2 * sin
        xo = x1 * sin + x2 * cos
        return torch.stack((xe, xo), dim=-1).flatten(-2)
74
+
75
+
76
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.head_dim
        self.scale = self.head_dim ** -0.5
        # Separate, bias-free projections; attribute names must match the
        # released checkpoint's state dict.
        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)
        self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len)

    def _split_heads(self, t: torch.Tensor, bsz: int, seq_len: int) -> torch.Tensor:
        # (B, S, D) -> (B, H, S, head_dim)
        return t.view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, _ = x.shape
        q = self._split_heads(self.q_proj(x), bsz, seq_len)
        k = self._split_heads(self.k_proj(x), bsz, seq_len)
        v = self._split_heads(self.v_proj(x), bsz, seq_len)
        q, k = self.rotary(q, k, seq_len=seq_len)
        # Fused causal attention kernel; attention dropout is active only in
        # training mode.
        attn = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=True,
            scale=self.scale,
        )
        merged = attn.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        return self.o_proj(merged)
106
+
107
+
108
class FeedForward(nn.Module):
    """Two-layer MLP with tanh-approximate GELU and output dropout."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        # Attribute names must match the released checkpoint's state dict.
        self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Expand -> nonlinearity -> project back -> dropout.
        hidden = F.gelu(self.fc1(x), approximate="tanh")
        return self.dropout(self.fc2(hidden))
121
+
122
+
123
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        # Attribute names must match the released checkpoint's state dict.
        self.norm1 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.attn = CausalSelfAttention(cfg)
        self.norm2 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.ffn = FeedForward(cfg)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_out = self.attn(self.norm1(x))
        x = x + attn_out
        mlp_out = self.ffn(self.norm2(x))
        return x + mlp_out
135
+
136
+
137
class MindiForCausalLM(PreTrainedModel):
    """Decoder-only causal language model for MINDI 1.0 420M.

    Integrates the plain PyTorch stack above with the Hugging Face
    ``PreTrainedModel`` machinery (weight init via ``post_init``, optional
    input/output embedding tying, and ``generate()`` support).

    NOTE(review): ``forward`` discards ``attention_mask`` and relies purely on
    the causal mask inside the attention layers, so padded positions in a
    batch are attended to like real tokens — with right-padding (the
    tokenizer's default side) and single-sequence generation this is benign,
    but batched generation with padding should be verified. No KV cache is
    returned, so each generation step recomputes the full prefix.
    """

    config_class = MindiConfig
    base_model_prefix = "mindi"
    supports_gradient_checkpointing = False

    def __init__(self, config: MindiConfig):
        super().__init__(config)
        # Mirror the HF config into the internal dataclass the sub-modules use.
        cfg = _Cfg(
            vocab_size=config.vocab_size,
            max_seq_len=config.max_seq_len,
            d_model=config.d_model,
            n_layers=config.n_layers,
            n_heads=config.n_heads,
            d_ff=config.d_ff,
            dropout=config.dropout,
            tie_embeddings=config.tie_embeddings,
            init_std=config.init_std,
            rms_norm_eps=config.rms_norm_eps,
        )

        self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        if cfg.tie_embeddings:
            # Weight tying: the output projection shares the embedding matrix.
            self.lm_head.weight = self.embed_tokens.weight

        # HF hook: runs _init_weights over all sub-modules (must come after
        # tying so the shared parameter is initialized once, consistently).
        self.post_init()

    def _init_weights(self, module: nn.Module) -> None:
        # Called by post_init() for every sub-module; both Linear and Embedding
        # weights get the same normal(0, init_std) initialization.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Run the decoder stack and optionally compute the LM loss.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: accepted for HF-API compatibility but ignored
                (see class note above).
            labels: (batch, seq_len) target ids; positions set to -100 are
                excluded from the loss.

        Returns:
            CausalLMOutputWithPast with ``logits`` of shape
            (batch, seq_len, vocab_size) and ``loss`` when labels are given.
        """
        # attention_mask is intentionally unused; extra HF kwargs are dropped.
        del attention_mask, kwargs

        x = self.embed_tokens(input_ids)
        x = self.dropout(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm_final(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Standard next-token shift: position t predicts token t+1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(loss=loss, logits=logits)

    @torch.no_grad()
    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs):
        # No KV cache: feed the full token prefix on every generation step.
        del kwargs
        return {"input_ids": input_ids}
requirements_runtime.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.4.1
2
+ transformers>=4.46.3
3
+ safetensors>=0.4.5
4
+ tokenizers>=0.20.1
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<BOS>",
3
+ "eos_token": "<EOS>",
4
+ "unk_token": "<UNK>",
5
+ "pad_token": "<PAD>"
6
+ }
tokenization_mindi.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face tokenizer class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from transformers import PreTrainedTokenizerFast
7
+
8
+
9
class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for MINDI 1.0 420M backed by a local ``tokenizer.json``.

    Resolution order for the tokenizer file: an explicit ``tokenizer_file``
    kwarg, then ``tokenizer.json`` inside the model directory, then the copy
    shipped next to this module.
    """

    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        # Prefer a tokenizer.json sitting inside the model directory when the
        # caller did not pass one explicitly.
        if kwargs.get("tokenizer_file") is None:
            candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if candidate.exists():
                kwargs["tokenizer_file"] = str(candidate)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        source_dir = kwargs.pop("name_or_path", None)
        if tokenizer_file is None and source_dir is not None:
            local_file = Path(source_dir) / "tokenizer.json"
            if local_file.exists():
                tokenizer_file = str(local_file)
        if tokenizer_file is None:
            # Last resort: the tokenizer.json bundled alongside this module.
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")
        for token_kwarg, token in (
            ("bos_token", "<BOS>"),
            ("eos_token", "<EOS>"),
            ("unk_token", "<UNK>"),
            ("pad_token", "<PAD>"),
        ):
            kwargs.setdefault(token_kwarg, token)
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)
tokenizer.json ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<PAD>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<UNK>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<BOS>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<EOS>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<NL>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<INDENT>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<DEDENT>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<PROMPT>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<CODE>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ },
87
+ {
88
+ "id": 9,
89
+ "content": "<PYTHON>",
90
+ "single_word": false,
91
+ "lstrip": false,
92
+ "rstrip": false,
93
+ "normalized": false,
94
+ "special": true
95
+ },
96
+ {
97
+ "id": 10,
98
+ "content": "<JAVASCRIPT>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ }
105
+ ],
106
+ "normalizer": {
107
+ "type": "Sequence",
108
+ "normalizers": [
109
+ {
110
+ "type": "NFKC"
111
+ }
112
+ ]
113
+ },
114
+ "pre_tokenizer": {
115
+ "type": "Sequence",
116
+ "pretokenizers": [
117
+ {
118
+ "type": "Split",
119
+ "pattern": {
120
+ "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)"
121
+ },
122
+ "behavior": "Isolated",
123
+ "invert": false
124
+ },
125
+ {
126
+ "type": "Split",
127
+ "pattern": {
128
+ "Regex": "([()\\[\\]{}.,:;])"
129
+ },
130
+ "behavior": "Isolated",
131
+ "invert": false
132
+ },
133
+ {
134
+ "type": "Metaspace",
135
+ "replacement": "_",
136
+ "prepend_scheme": "always",
137
+ "split": true
138
+ }
139
+ ]
140
+ },
141
+ "post_processor": {
142
+ "type": "TemplateProcessing",
143
+ "single": [
144
+ {
145
+ "SpecialToken": {
146
+ "id": "<BOS>",
147
+ "type_id": 0
148
+ }
149
+ },
150
+ {
151
+ "Sequence": {
152
+ "id": "A",
153
+ "type_id": 0
154
+ }
155
+ },
156
+ {
157
+ "SpecialToken": {
158
+ "id": "<EOS>",
159
+ "type_id": 0
160
+ }
161
+ }
162
+ ],
163
+ "pair": [
164
+ {
165
+ "Sequence": {
166
+ "id": "A",
167
+ "type_id": 0
168
+ }
169
+ },
170
+ {
171
+ "Sequence": {
172
+ "id": "B",
173
+ "type_id": 1
174
+ }
175
+ }
176
+ ],
177
+ "special_tokens": {
178
+ "<BOS>": {
179
+ "id": "<BOS>",
180
+ "ids": [
181
+ 2
182
+ ],
183
+ "tokens": [
184
+ "<BOS>"
185
+ ]
186
+ },
187
+ "<EOS>": {
188
+ "id": "<EOS>",
189
+ "ids": [
190
+ 3
191
+ ],
192
+ "tokens": [
193
+ "<EOS>"
194
+ ]
195
+ }
196
+ }
197
+ },
198
+ "decoder": {
199
+ "type": "BPEDecoder",
200
+ "suffix": "</w>"
201
+ },
202
+ "model": {
203
+ "type": "BPE",
204
+ "dropout": null,
205
+ "unk_token": "<UNK>",
206
+ "continuing_subword_prefix": null,
207
+ "end_of_word_suffix": null,
208
+ "fuse_unk": false,
209
+ "byte_fallback": false,
210
+ "ignore_merges": false,
211
+ "vocab": {
212
+ "<PAD>": 0,
213
+ "<UNK>": 1,
214
+ "<BOS>": 2,
215
+ "<EOS>": 3,
216
+ "<NL>": 4,
217
+ "<INDENT>": 5,
218
+ "<DEDENT>": 6,
219
+ "<PROMPT>": 7,
220
+ "<CODE>": 8,
221
+ "<PYTHON>": 9,
222
+ "<JAVASCRIPT>": 10,
223
+ "(": 11,
224
+ ")": 12,
225
+ "+": 13,
226
+ ",": 14,
227
+ ".": 15,
228
+ "0": 16,
229
+ "4": 17,
230
+ "5": 18,
231
+ ":": 19,
232
+ ";": 20,
233
+ "<": 21,
234
+ "=": 22,
235
+ ">": 23,
236
+ "A": 24,
237
+ "C": 25,
238
+ "D": 26,
239
+ "E": 27,
240
+ "F": 28,
241
+ "H": 29,
242
+ "I": 30,
243
+ "J": 31,
244
+ "L": 32,
245
+ "M": 33,
246
+ "N": 34,
247
+ "O": 35,
248
+ "P": 36,
249
+ "R": 37,
250
+ "S": 38,
251
+ "T": 39,
252
+ "V": 40,
253
+ "W": 41,
254
+ "Y": 42,
255
+ "_": 43,
256
+ "a": 44,
257
+ "b": 45,
258
+ "c": 46,
259
+ "d": 47,
260
+ "e": 48,
261
+ "f": 49,
262
+ "g": 50,
263
+ "h": 51,
264
+ "i": 52,
265
+ "l": 53,
266
+ "m": 54,
267
+ "n": 55,
268
+ "o": 56,
269
+ "p": 57,
270
+ "r": 58,
271
+ "s": 59,
272
+ "t": 60,
273
+ "u": 61,
274
+ "v": 62,
275
+ "w": 63,
276
+ "x": 64,
277
+ "y": 65,
278
+ "{": 66,
279
+ "}": 67,
280
+ "_<": 68,
281
+ "DE": 69,
282
+ "T>": 70,
283
+ "_a": 71,
284
+ "L>": 72,
285
+ "NL>": 73,
286
+ "_<NL>": 74,
287
+ "NT>": 75,
288
+ "_t": 76,
289
+ "DENT>": 77,
290
+ "_i": 78,
291
+ "PT>": 79,
292
+ "_(": 80,
293
+ "_)": 81,
294
+ "on": 82,
295
+ "_<P": 83,
296
+ "_f": 84,
297
+ "_l": 85,
298
+ "re": 86,
299
+ "ri": 87,
300
+ "CO": 88,
301
+ "IN": 89,
302
+ "MPT>": 90,
303
+ "OMPT>": 91,
304
+ "ROMPT>": 92,
305
+ "_;": 93,
306
+ "_b": 94,
307
+ "at": 95,
308
+ "_<DE": 96,
309
+ "_<CO": 97,
310
+ "_<IN": 98,
311
+ "DE>": 99,
312
+ "_to": 100,
313
+ "_<PROMPT>": 101,
314
+ "_lo": 102,
315
+ "_<DEDENT>": 103,
316
+ "_<CODE>": 104,
317
+ "_<INDENT>": 105,
318
+ "_+": 106,
319
+ "_0": 107,
320
+ "_re": 108,
321
+ "ct": 109,
322
+ "dd": 110,
323
+ "ion": 111,
324
+ "nct": 112,
325
+ "rn": 113,
326
+ "tu": 114,
327
+ "unct": 115,
328
+ "va": 116,
329
+ "_add": 117,
330
+ "_th": 118,
331
+ "_funct": 119,
332
+ "_retu": 120,
333
+ "_function": 121,
334
+ "_return": 122,
335
+ "AS": 123,
336
+ "AV": 124,
337
+ "CR": 125,
338
+ "Cre": 126,
339
+ "HO": 127,
340
+ "IPT>": 128,
341
+ "Ja": 129,
342
+ "JAV": 130,
343
+ "N>": 131,
344
+ "Py": 132,
345
+ "Sc": 133,
346
+ "THO": 134,
347
+ "YTHO": 135,
348
+ "_,": 136,
349
+ "_4": 137,
350
+ "_5": 138,
351
+ "_:": 139,
352
+ "_p": 140,
353
+ "_{": 141,
354
+ "_}": 142,
355
+ "_Cre": 143,
356
+ "_Ja": 144,
357
+ "_Py": 145,
358
+ "hon": 146,
359
+ "nt": 147,
360
+ "op": 148,
361
+ "or": 149,
362
+ "pt": 150,
363
+ "thon": 151,
364
+ "_<JAV": 152,
365
+ "_<PYTHO": 153,
366
+ "_for": 154,
367
+ "rint": 155,
368
+ "ript": 156,
369
+ "ate": 157,
370
+ "_log": 158,
371
+ "_loop": 159,
372
+ "vaSc": 160,
373
+ "_that": 161,
374
+ "ASCR": 162,
375
+ "_print": 163,
376
+ "_Create": 164,
377
+ "_JavaSc": 165,
378
+ "_Python": 166,
379
+ "_<JAVASCR": 167,
380
+ "_<PYTHON>": 168,
381
+ "_JavaScript": 169,
382
+ "_<JAVASCRIPT>": 170
383
+ },
384
+ "merges": [
385
+ [
386
+ "_",
387
+ "<"
388
+ ],
389
+ [
390
+ "D",
391
+ "E"
392
+ ],
393
+ [
394
+ "T",
395
+ ">"
396
+ ],
397
+ [
398
+ "_",
399
+ "a"
400
+ ],
401
+ [
402
+ "L",
403
+ ">"
404
+ ],
405
+ [
406
+ "N",
407
+ "L>"
408
+ ],
409
+ [
410
+ "_<",
411
+ "NL>"
412
+ ],
413
+ [
414
+ "N",
415
+ "T>"
416
+ ],
417
+ [
418
+ "_",
419
+ "t"
420
+ ],
421
+ [
422
+ "DE",
423
+ "NT>"
424
+ ],
425
+ [
426
+ "_",
427
+ "i"
428
+ ],
429
+ [
430
+ "P",
431
+ "T>"
432
+ ],
433
+ [
434
+ "_",
435
+ "("
436
+ ],
437
+ [
438
+ "_",
439
+ ")"
440
+ ],
441
+ [
442
+ "o",
443
+ "n"
444
+ ],
445
+ [
446
+ "_<",
447
+ "P"
448
+ ],
449
+ [
450
+ "_",
451
+ "f"
452
+ ],
453
+ [
454
+ "_",
455
+ "l"
456
+ ],
457
+ [
458
+ "r",
459
+ "e"
460
+ ],
461
+ [
462
+ "r",
463
+ "i"
464
+ ],
465
+ [
466
+ "C",
467
+ "O"
468
+ ],
469
+ [
470
+ "I",
471
+ "N"
472
+ ],
473
+ [
474
+ "M",
475
+ "PT>"
476
+ ],
477
+ [
478
+ "O",
479
+ "MPT>"
480
+ ],
481
+ [
482
+ "R",
483
+ "OMPT>"
484
+ ],
485
+ [
486
+ "_",
487
+ ";"
488
+ ],
489
+ [
490
+ "_",
491
+ "b"
492
+ ],
493
+ [
494
+ "a",
495
+ "t"
496
+ ],
497
+ [
498
+ "_<",
499
+ "DE"
500
+ ],
501
+ [
502
+ "_<",
503
+ "CO"
504
+ ],
505
+ [
506
+ "_<",
507
+ "IN"
508
+ ],
509
+ [
510
+ "DE",
511
+ ">"
512
+ ],
513
+ [
514
+ "_t",
515
+ "o"
516
+ ],
517
+ [
518
+ "_<P",
519
+ "ROMPT>"
520
+ ],
521
+ [
522
+ "_l",
523
+ "o"
524
+ ],
525
+ [
526
+ "_<DE",
527
+ "DENT>"
528
+ ],
529
+ [
530
+ "_<CO",
531
+ "DE>"
532
+ ],
533
+ [
534
+ "_<IN",
535
+ "DENT>"
536
+ ],
537
+ [
538
+ "_",
539
+ "+"
540
+ ],
541
+ [
542
+ "_",
543
+ "0"
544
+ ],
545
+ [
546
+ "_",
547
+ "re"
548
+ ],
549
+ [
550
+ "c",
551
+ "t"
552
+ ],
553
+ [
554
+ "d",
555
+ "d"
556
+ ],
557
+ [
558
+ "i",
559
+ "on"
560
+ ],
561
+ [
562
+ "n",
563
+ "ct"
564
+ ],
565
+ [
566
+ "r",
567
+ "n"
568
+ ],
569
+ [
570
+ "t",
571
+ "u"
572
+ ],
573
+ [
574
+ "u",
575
+ "nct"
576
+ ],
577
+ [
578
+ "v",
579
+ "a"
580
+ ],
581
+ [
582
+ "_a",
583
+ "dd"
584
+ ],
585
+ [
586
+ "_t",
587
+ "h"
588
+ ],
589
+ [
590
+ "_f",
591
+ "unct"
592
+ ],
593
+ [
594
+ "_re",
595
+ "tu"
596
+ ],
597
+ [
598
+ "_funct",
599
+ "ion"
600
+ ],
601
+ [
602
+ "_retu",
603
+ "rn"
604
+ ],
605
+ [
606
+ "A",
607
+ "S"
608
+ ],
609
+ [
610
+ "A",
611
+ "V"
612
+ ],
613
+ [
614
+ "C",
615
+ "R"
616
+ ],
617
+ [
618
+ "C",
619
+ "re"
620
+ ],
621
+ [
622
+ "H",
623
+ "O"
624
+ ],
625
+ [
626
+ "I",
627
+ "PT>"
628
+ ],
629
+ [
630
+ "J",
631
+ "a"
632
+ ],
633
+ [
634
+ "J",
635
+ "AV"
636
+ ],
637
+ [
638
+ "N",
639
+ ">"
640
+ ],
641
+ [
642
+ "P",
643
+ "y"
644
+ ],
645
+ [
646
+ "S",
647
+ "c"
648
+ ],
649
+ [
650
+ "T",
651
+ "HO"
652
+ ],
653
+ [
654
+ "Y",
655
+ "THO"
656
+ ],
657
+ [
658
+ "_",
659
+ ","
660
+ ],
661
+ [
662
+ "_",
663
+ "4"
664
+ ],
665
+ [
666
+ "_",
667
+ "5"
668
+ ],
669
+ [
670
+ "_",
671
+ ":"
672
+ ],
673
+ [
674
+ "_",
675
+ "p"
676
+ ],
677
+ [
678
+ "_",
679
+ "{"
680
+ ],
681
+ [
682
+ "_",
683
+ "}"
684
+ ],
685
+ [
686
+ "_",
687
+ "Cre"
688
+ ],
689
+ [
690
+ "_",
691
+ "Ja"
692
+ ],
693
+ [
694
+ "_",
695
+ "Py"
696
+ ],
697
+ [
698
+ "h",
699
+ "on"
700
+ ],
701
+ [
702
+ "n",
703
+ "t"
704
+ ],
705
+ [
706
+ "o",
707
+ "p"
708
+ ],
709
+ [
710
+ "o",
711
+ "r"
712
+ ],
713
+ [
714
+ "p",
715
+ "t"
716
+ ],
717
+ [
718
+ "t",
719
+ "hon"
720
+ ],
721
+ [
722
+ "_<",
723
+ "JAV"
724
+ ],
725
+ [
726
+ "_<P",
727
+ "YTHO"
728
+ ],
729
+ [
730
+ "_f",
731
+ "or"
732
+ ],
733
+ [
734
+ "ri",
735
+ "nt"
736
+ ],
737
+ [
738
+ "ri",
739
+ "pt"
740
+ ],
741
+ [
742
+ "at",
743
+ "e"
744
+ ],
745
+ [
746
+ "_lo",
747
+ "g"
748
+ ],
749
+ [
750
+ "_lo",
751
+ "op"
752
+ ],
753
+ [
754
+ "va",
755
+ "Sc"
756
+ ],
757
+ [
758
+ "_th",
759
+ "at"
760
+ ],
761
+ [
762
+ "AS",
763
+ "CR"
764
+ ],
765
+ [
766
+ "_p",
767
+ "rint"
768
+ ],
769
+ [
770
+ "_Cre",
771
+ "ate"
772
+ ],
773
+ [
774
+ "_Ja",
775
+ "vaSc"
776
+ ],
777
+ [
778
+ "_Py",
779
+ "thon"
780
+ ],
781
+ [
782
+ "_<JAV",
783
+ "ASCR"
784
+ ],
785
+ [
786
+ "_<PYTHO",
787
+ "N>"
788
+ ],
789
+ [
790
+ "_JavaSc",
791
+ "ript"
792
+ ],
793
+ [
794
+ "_<JAVASCR",
795
+ "IPT>"
796
+ ]
797
+ ]
798
+ }
799
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "MindiTokenizer",
3
+ "model_max_length": 2048,
4
+ "bos_token": "<BOS>",
5
+ "eos_token": "<EOS>",
6
+ "unk_token": "<UNK>",
7
+ "pad_token": "<PAD>",
8
+ "tokenizer_file": "tokenizer.json",
9
+ "auto_map": {
10
+ "AutoTokenizer": [
11
+ null,
12
+ "tokenization_mindi.MindiTokenizer"
13
+ ]
14
+ },
15
+ "padding_side": "right",
16
+ "truncation_side": "right"
17
+ }