Mindigenous commited on
Commit
0d29ee0
·
verified ·
1 Parent(s): 6838b81

Upload MINDI 1.0 420M full release

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MINDI 1.0 420M Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ library_name: transformers
6
+ pipeline_tag: text-generation
7
+ tags:
8
+ - code
9
+ - python
10
+ - javascript
11
+ - local-llm
12
+ - offline
13
+ ---
14
+
15
+ # MINDI 1.0 420M
16
+
17
+ MINDI 1.0 420M is a 420M-parameter coding language model focused primarily on Python, with secondary support for JavaScript.
18
+ It is built for local, offline code generation workflows.
19
+
20
+ ## Capabilities
21
+
22
+ - Code generation from natural language prompts
23
+ - Code completion
24
+ - Bug-fix suggestions
25
+ - Code explanation
26
+
27
+ ## Model Details
28
+
29
+ - Parameters: 423,934,848
30
+ - Architecture: Decoder-only Transformer
31
+ - Context length: 2048 tokens
32
+ - Focus languages: Python, JavaScript
33
+
34
+ ## Hardware Requirements
35
+
36
+ Recommended:
37
+ - NVIDIA GPU with 8GB+ VRAM
38
+ - CUDA-enabled PyTorch
39
+
40
+ Minimum:
41
+ - CPU inference works but is slower
42
+
43
+ ## Quick Start (GPU)
44
+
45
+ ```python
46
+ from transformers import AutoTokenizer, AutoModelForCausalLM
47
+ import torch
48
+
49
+ repo_id = "YOUR_USERNAME/MINDI-1.0-420M"
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
52
+ model = AutoModelForCausalLM.from_pretrained(
53
+ repo_id,
54
+ trust_remote_code=True,
55
+ torch_dtype=torch.float16,
56
+ ).cuda()
57
+
58
+ prompt = "Write a Python function to check if a string is a palindrome."
59
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
60
+
61
+ with torch.no_grad():
62
+ output = model.generate(
63
+ **inputs,
64
+ max_new_tokens=220,
65
+ temperature=0.2,
66
+ top_p=0.9,
67
+ do_sample=True,
68
+ )
69
+
70
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
71
+ ```
72
+
73
+ ## Limitations
74
+
75
+ - The model can still produce syntax or logic errors.
76
+ - Generated code should always be reviewed and tested.
77
+ - Not intended for safety-critical production use without validation.
78
+
79
+ ## Safety
80
+
81
+ Always run tests and static checks before using generated code in production.
UPLOAD_TO_HF.ps1 ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Upload helper for MINDI 1.0 420M
# Run from PowerShell.
#
# Prerequisites: `pip install huggingface_hub` so `huggingface-cli` is on PATH.
# Replace YOUR_USERNAME with your Hugging Face namespace before running.

# Authenticate (prompts for an access token with write scope).
huggingface-cli login
# Create the target repo.
# NOTE(review): confirm `--public` is accepted by your CLI version — new repos
# are public by default and some versions only expose a `--private` flag;
# check `huggingface-cli repo create --help`.
huggingface-cli repo create MINDI-1.0-420M --type model --public
# Upload the entire local release folder to the repo root.
huggingface-cli upload YOUR_USERNAME/MINDI-1.0-420M "C:\AI 2\hf_release\MINDI-1.0-420M" . --repo-type model
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "mindi",
3
+ "architectures": [
4
+ "MindiForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_mindi.MindiConfig",
8
+ "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM",
9
+ "AutoTokenizer": [
10
+ null,
11
+ "tokenization_mindi.MindiTokenizer"
12
+ ]
13
+ },
14
+ "vocab_size": 50000,
15
+ "max_seq_len": 2048,
16
+ "d_model": 1152,
17
+ "n_layers": 23,
18
+ "n_heads": 16,
19
+ "d_ff": 4608,
20
+ "dropout": 0.1,
21
+ "tie_embeddings": true,
22
+ "init_std": 0.02,
23
+ "rms_norm_eps": 1e-05,
24
+ "bos_token_id": 2,
25
+ "eos_token_id": 3,
26
+ "pad_token_id": 0,
27
+ "torch_dtype": "float16",
28
+ "transformers_version": "4.46.3"
29
+ }
configuration_mindi.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face config class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class MindiConfig(PretrainedConfig):
    """Hugging Face configuration for the MINDI 1.0 420M decoder-only LM.

    Default values mirror the released checkpoint's ``config.json``; every
    argument is persisted as a plain attribute so the config round-trips
    through serialization unchanged.
    """

    model_type = "mindi"

    def __init__(
        self,
        vocab_size=50000,
        max_seq_len=2048,
        d_model=1152,
        n_layers=23,
        n_heads=16,
        d_ff=4608,
        dropout=0.1,
        tie_embeddings=True,
        init_std=0.02,
        rms_norm_eps=1e-5,
        bos_token_id=2,
        eos_token_id=3,
        pad_token_id=0,
        **kwargs,
    ):
        # Special-token ids are owned by the PretrainedConfig base class.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        # Model geometry.
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        # Regularization / initialization knobs.
        self.dropout = dropout
        self.tie_embeddings = tie_embeddings
        self.init_std = init_std
        self.rms_norm_eps = rms_norm_eps
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "eos_token_id": 3,
4
+ "pad_token_id": 0,
5
+ "max_new_tokens": 220,
6
+ "temperature": 0.2,
7
+ "top_p": 0.9,
8
+ "do_sample": true
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d5df76ccfe5be47eaf94b1d58eec9b36276c4c1c2bb235766c766e1dd838a0
3
+ size 1695758072
modeling_mindi.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face model class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedModel
14
+ from transformers.modeling_outputs import CausalLMOutputWithPast
15
+
16
+ from .configuration_mindi import MindiConfig
17
+
18
+
19
+ @dataclass
20
+ class _Cfg:
21
+ vocab_size: int
22
+ max_seq_len: int
23
+ d_model: int
24
+ n_layers: int
25
+ n_heads: int
26
+ d_ff: int
27
+ dropout: float
28
+ tie_embeddings: bool
29
+ init_std: float
30
+ rms_norm_eps: float
31
+
32
+ @property
33
+ def head_dim(self) -> int:
34
+ if self.d_model % self.n_heads != 0:
35
+ raise ValueError("d_model must be divisible by n_heads")
36
+ return self.d_model // self.n_heads
37
+
38
+
39
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias)."""

    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps  # added inside the rsqrt for numerical stability
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each vector by the reciprocal of its RMS over the last dim.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return self.weight * (x * torch.rsqrt(mean_sq + self.eps))
49
+
50
+
51
class RotaryEmbedding(nn.Module):
    """Interleaved rotary position embeddings with precomputed cos/sin tables.

    The tables are built once for ``max_seq_len`` positions and stored as
    non-persistent buffers (they are cheap to rebuild and are excluded from
    the checkpoint).
    """

    def __init__(self, head_dim: int, max_seq_len: int) -> None:
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError("head_dim must be even for rotary embeddings")
        # Standard RoPE frequency schedule: theta_i = 10000^(-2i / head_dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
        t = torch.arange(max_seq_len, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)  # (max_seq_len, head_dim // 2)
        self.register_buffer("cos_cached", torch.cos(freqs), persistent=False)
        self.register_buffer("sin_cached", torch.sin(freqs), persistent=False)

    def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Rotate ``q`` and ``k`` (shape (B, H, S, head_dim)) in place-pairs.

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed table length
                (previously this failed later with a cryptic broadcast error).
        """
        if seq_len > self.cos_cached.size(0):
            raise ValueError(
                f"seq_len {seq_len} exceeds cached max_seq_len {self.cos_cached.size(0)}"
            )
        # Cast the float32 tables to the activation dtype: under fp16
        # inference, q * cos would otherwise promote q/k to float32 and then
        # mismatch v's dtype inside scaled_dot_product_attention.
        cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0).to(q.dtype)
        sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0).to(q.dtype)
        return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin)

    @staticmethod
    def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        # Rotate each (even, odd) channel pair by the position-dependent angle,
        # then re-interleave so the layout matches the input.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        xe = x1 * cos - x2 * sin
        xo = x1 * sin + x2 * cos
        return torch.stack((xe, xo), dim=-1).flatten(-2)
74
+
75
+
76
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.head_dim
        self.scale = self.head_dim ** -0.5
        # Separate, bias-free projections; attribute names must match the
        # released checkpoint's state dict.
        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)
        self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len)

    def _split_heads(self, t: torch.Tensor, bsz: int, seq_len: int) -> torch.Tensor:
        # (B, S, D) -> (B, H, S, head_dim)
        return t.view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, _ = x.shape
        q = self._split_heads(self.q_proj(x), bsz, seq_len)
        k = self._split_heads(self.k_proj(x), bsz, seq_len)
        v = self._split_heads(self.v_proj(x), bsz, seq_len)
        q, k = self.rotary(q, k, seq_len=seq_len)
        # Fused causal attention kernel; attention dropout is active only in
        # training mode.
        attn = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=True,
            scale=self.scale,
        )
        merged = attn.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        return self.o_proj(merged)
106
+
107
+
108
class FeedForward(nn.Module):
    """Two-layer MLP with tanh-approximate GELU and output dropout."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        # Attribute names must match the released checkpoint's state dict.
        self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Expand -> nonlinearity -> project back -> dropout.
        hidden = F.gelu(self.fc1(x), approximate="tanh")
        return self.dropout(self.fc2(hidden))
121
+
122
+
123
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        # Attribute names must match the released checkpoint's state dict.
        self.norm1 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.attn = CausalSelfAttention(cfg)
        self.norm2 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.ffn = FeedForward(cfg)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_out = self.attn(self.norm1(x))
        x = x + attn_out
        mlp_out = self.ffn(self.norm2(x))
        return x + mlp_out
135
+
136
+
137
class MindiForCausalLM(PreTrainedModel):
    """Decoder-only causal language model for MINDI 1.0 420M.

    Integrates the plain PyTorch stack above with the Hugging Face
    ``PreTrainedModel`` machinery (weight init via ``post_init``, optional
    input/output embedding tying, and ``generate()`` support).

    NOTE(review): ``forward`` discards ``attention_mask`` and relies purely on
    the causal mask inside the attention layers, so padded positions in a
    batch are attended to like real tokens — with right-padding (the
    tokenizer's default side) and single-sequence generation this is benign,
    but batched generation with padding should be verified. No KV cache is
    returned, so each generation step recomputes the full prefix.
    """

    config_class = MindiConfig
    base_model_prefix = "mindi"
    supports_gradient_checkpointing = False

    def __init__(self, config: MindiConfig):
        super().__init__(config)
        # Mirror the HF config into the internal dataclass the sub-modules use.
        cfg = _Cfg(
            vocab_size=config.vocab_size,
            max_seq_len=config.max_seq_len,
            d_model=config.d_model,
            n_layers=config.n_layers,
            n_heads=config.n_heads,
            d_ff=config.d_ff,
            dropout=config.dropout,
            tie_embeddings=config.tie_embeddings,
            init_std=config.init_std,
            rms_norm_eps=config.rms_norm_eps,
        )

        self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        if cfg.tie_embeddings:
            # Weight tying: the output projection shares the embedding matrix.
            self.lm_head.weight = self.embed_tokens.weight

        # HF hook: runs _init_weights over all sub-modules (must come after
        # tying so the shared parameter is initialized once, consistently).
        self.post_init()

    def _init_weights(self, module: nn.Module) -> None:
        # Called by post_init() for every sub-module; both Linear and Embedding
        # weights get the same normal(0, init_std) initialization.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Run the decoder stack and optionally compute the LM loss.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: accepted for HF-API compatibility but ignored
                (see class note above).
            labels: (batch, seq_len) target ids; positions set to -100 are
                excluded from the loss.

        Returns:
            CausalLMOutputWithPast with ``logits`` of shape
            (batch, seq_len, vocab_size) and ``loss`` when labels are given.
        """
        # attention_mask is intentionally unused; extra HF kwargs are dropped.
        del attention_mask, kwargs

        x = self.embed_tokens(input_ids)
        x = self.dropout(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm_final(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Standard next-token shift: position t predicts token t+1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(loss=loss, logits=logits)

    @torch.no_grad()
    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs):
        # No KV cache: feed the full token prefix on every generation step.
        del kwargs
        return {"input_ids": input_ids}
requirements_runtime.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.4.1
2
+ transformers>=4.46.3
3
+ safetensors>=0.4.5
4
+ tokenizers>=0.20.1
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<BOS>",
3
+ "eos_token": "<EOS>",
4
+ "unk_token": "<UNK>",
5
+ "pad_token": "<PAD>"
6
+ }
tokenization_mindi.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face tokenizer class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from transformers import PreTrainedTokenizerFast
7
+
8
+
9
class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for MINDI 1.0 420M backed by a local ``tokenizer.json``.

    Resolution order for the tokenizer file: an explicit ``tokenizer_file``
    kwarg, then ``tokenizer.json`` inside the model directory, then the copy
    shipped next to this module.
    """

    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        # Prefer a tokenizer.json sitting inside the model directory when the
        # caller did not pass one explicitly.
        if kwargs.get("tokenizer_file") is None:
            candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if candidate.exists():
                kwargs["tokenizer_file"] = str(candidate)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        source_dir = kwargs.pop("name_or_path", None)
        if tokenizer_file is None and source_dir is not None:
            local_file = Path(source_dir) / "tokenizer.json"
            if local_file.exists():
                tokenizer_file = str(local_file)
        if tokenizer_file is None:
            # Last resort: the tokenizer.json bundled alongside this module.
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")
        for token_kwarg, token in (
            ("bos_token", "<BOS>"),
            ("eos_token", "<EOS>"),
            ("unk_token", "<UNK>"),
            ("pad_token", "<PAD>"),
        ):
            kwargs.setdefault(token_kwarg, token)
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)
tokenizer.json ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<PAD>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<UNK>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<BOS>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<EOS>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<NL>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<INDENT>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<DEDENT>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<PROMPT>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<CODE>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ },
87
+ {
88
+ "id": 9,
89
+ "content": "<PYTHON>",
90
+ "single_word": false,
91
+ "lstrip": false,
92
+ "rstrip": false,
93
+ "normalized": false,
94
+ "special": true
95
+ },
96
+ {
97
+ "id": 10,
98
+ "content": "<JAVASCRIPT>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ }
105
+ ],
106
+ "normalizer": {
107
+ "type": "Sequence",
108
+ "normalizers": [
109
+ {
110
+ "type": "NFKC"
111
+ }
112
+ ]
113
+ },
114
+ "pre_tokenizer": {
115
+ "type": "Sequence",
116
+ "pretokenizers": [
117
+ {
118
+ "type": "Split",
119
+ "pattern": {
120
+ "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)"
121
+ },
122
+ "behavior": "Isolated",
123
+ "invert": false
124
+ },
125
+ {
126
+ "type": "Split",
127
+ "pattern": {
128
+ "Regex": "([()\\[\\]{}.,:;])"
129
+ },
130
+ "behavior": "Isolated",
131
+ "invert": false
132
+ },
133
+ {
134
+ "type": "Metaspace",
135
+ "replacement": "_",
136
+ "prepend_scheme": "always",
137
+ "split": true
138
+ }
139
+ ]
140
+ },
141
+ "post_processor": {
142
+ "type": "TemplateProcessing",
143
+ "single": [
144
+ {
145
+ "SpecialToken": {
146
+ "id": "<BOS>",
147
+ "type_id": 0
148
+ }
149
+ },
150
+ {
151
+ "Sequence": {
152
+ "id": "A",
153
+ "type_id": 0
154
+ }
155
+ },
156
+ {
157
+ "SpecialToken": {
158
+ "id": "<EOS>",
159
+ "type_id": 0
160
+ }
161
+ }
162
+ ],
163
+ "pair": [
164
+ {
165
+ "Sequence": {
166
+ "id": "A",
167
+ "type_id": 0
168
+ }
169
+ },
170
+ {
171
+ "Sequence": {
172
+ "id": "B",
173
+ "type_id": 1
174
+ }
175
+ }
176
+ ],
177
+ "special_tokens": {
178
+ "<BOS>": {
179
+ "id": "<BOS>",
180
+ "ids": [
181
+ 2
182
+ ],
183
+ "tokens": [
184
+ "<BOS>"
185
+ ]
186
+ },
187
+ "<EOS>": {
188
+ "id": "<EOS>",
189
+ "ids": [
190
+ 3
191
+ ],
192
+ "tokens": [
193
+ "<EOS>"
194
+ ]
195
+ }
196
+ }
197
+ },
198
+ "decoder": {
199
+ "type": "BPEDecoder",
200
+ "suffix": "</w>"
201
+ },
202
+ "model": {
203
+ "type": "BPE",
204
+ "dropout": null,
205
+ "unk_token": "<UNK>",
206
+ "continuing_subword_prefix": null,
207
+ "end_of_word_suffix": null,
208
+ "fuse_unk": false,
209
+ "byte_fallback": false,
210
+ "ignore_merges": false,
211
+ "vocab": {
212
+ "<PAD>": 0,
213
+ "<UNK>": 1,
214
+ "<BOS>": 2,
215
+ "<EOS>": 3,
216
+ "<NL>": 4,
217
+ "<INDENT>": 5,
218
+ "<DEDENT>": 6,
219
+ "<PROMPT>": 7,
220
+ "<CODE>": 8,
221
+ "<PYTHON>": 9,
222
+ "<JAVASCRIPT>": 10,
223
+ "(": 11,
224
+ ")": 12,
225
+ "+": 13,
226
+ ",": 14,
227
+ ".": 15,
228
+ "0": 16,
229
+ "4": 17,
230
+ "5": 18,
231
+ ":": 19,
232
+ ";": 20,
233
+ "<": 21,
234
+ "=": 22,
235
+ ">": 23,
236
+ "A": 24,
237
+ "C": 25,
238
+ "D": 26,
239
+ "E": 27,
240
+ "F": 28,
241
+ "H": 29,
242
+ "I": 30,
243
+ "J": 31,
244
+ "L": 32,
245
+ "M": 33,
246
+ "N": 34,
247
+ "O": 35,
248
+ "P": 36,
249
+ "R": 37,
250
+ "S": 38,
251
+ "T": 39,
252
+ "V": 40,
253
+ "W": 41,
254
+ "Y": 42,
255
+ "_": 43,
256
+ "a": 44,
257
+ "b": 45,
258
+ "c": 46,
259
+ "d": 47,
260
+ "e": 48,
261
+ "f": 49,
262
+ "g": 50,
263
+ "h": 51,
264
+ "i": 52,
265
+ "l": 53,
266
+ "m": 54,
267
+ "n": 55,
268
+ "o": 56,
269
+ "p": 57,
270
+ "r": 58,
271
+ "s": 59,
272
+ "t": 60,
273
+ "u": 61,
274
+ "v": 62,
275
+ "w": 63,
276
+ "x": 64,
277
+ "y": 65,
278
+ "{": 66,
279
+ "}": 67,
280
+ "_<": 68,
281
+ "DE": 69,
282
+ "T>": 70,
283
+ "_a": 71,
284
+ "L>": 72,
285
+ "NL>": 73,
286
+ "_<NL>": 74,
287
+ "NT>": 75,
288
+ "_t": 76,
289
+ "DENT>": 77,
290
+ "_i": 78,
291
+ "PT>": 79,
292
+ "_(": 80,
293
+ "_)": 81,
294
+ "on": 82,
295
+ "_<P": 83,
296
+ "_f": 84,
297
+ "_l": 85,
298
+ "re": 86,
299
+ "ri": 87,
300
+ "CO": 88,
301
+ "IN": 89,
302
+ "MPT>": 90,
303
+ "OMPT>": 91,
304
+ "ROMPT>": 92,
305
+ "_;": 93,
306
+ "_b": 94,
307
+ "at": 95,
308
+ "_<DE": 96,
309
+ "_<CO": 97,
310
+ "_<IN": 98,
311
+ "DE>": 99,
312
+ "_to": 100,
313
+ "_<PROMPT>": 101,
314
+ "_lo": 102,
315
+ "_<DEDENT>": 103,
316
+ "_<CODE>": 104,
317
+ "_<INDENT>": 105,
318
+ "_+": 106,
319
+ "_0": 107,
320
+ "_re": 108,
321
+ "ct": 109,
322
+ "dd": 110,
323
+ "ion": 111,
324
+ "nct": 112,
325
+ "rn": 113,
326
+ "tu": 114,
327
+ "unct": 115,
328
+ "va": 116,
329
+ "_add": 117,
330
+ "_th": 118,
331
+ "_funct": 119,
332
+ "_retu": 120,
333
+ "_function": 121,
334
+ "_return": 122,
335
+ "AS": 123,
336
+ "AV": 124,
337
+ "CR": 125,
338
+ "Cre": 126,
339
+ "HO": 127,
340
+ "IPT>": 128,
341
+ "Ja": 129,
342
+ "JAV": 130,
343
+ "N>": 131,
344
+ "Py": 132,
345
+ "Sc": 133,
346
+ "THO": 134,
347
+ "YTHO": 135,
348
+ "_,": 136,
349
+ "_4": 137,
350
+ "_5": 138,
351
+ "_:": 139,
352
+ "_p": 140,
353
+ "_{": 141,
354
+ "_}": 142,
355
+ "_Cre": 143,
356
+ "_Ja": 144,
357
+ "_Py": 145,
358
+ "hon": 146,
359
+ "nt": 147,
360
+ "op": 148,
361
+ "or": 149,
362
+ "pt": 150,
363
+ "thon": 151,
364
+ "_<JAV": 152,
365
+ "_<PYTHO": 153,
366
+ "_for": 154,
367
+ "rint": 155,
368
+ "ript": 156,
369
+ "ate": 157,
370
+ "_log": 158,
371
+ "_loop": 159,
372
+ "vaSc": 160,
373
+ "_that": 161,
374
+ "ASCR": 162,
375
+ "_print": 163,
376
+ "_Create": 164,
377
+ "_JavaSc": 165,
378
+ "_Python": 166,
379
+ "_<JAVASCR": 167,
380
+ "_<PYTHON>": 168,
381
+ "_JavaScript": 169,
382
+ "_<JAVASCRIPT>": 170
383
+ },
384
+ "merges": [
385
+ [
386
+ "_",
387
+ "<"
388
+ ],
389
+ [
390
+ "D",
391
+ "E"
392
+ ],
393
+ [
394
+ "T",
395
+ ">"
396
+ ],
397
+ [
398
+ "_",
399
+ "a"
400
+ ],
401
+ [
402
+ "L",
403
+ ">"
404
+ ],
405
+ [
406
+ "N",
407
+ "L>"
408
+ ],
409
+ [
410
+ "_<",
411
+ "NL>"
412
+ ],
413
+ [
414
+ "N",
415
+ "T>"
416
+ ],
417
+ [
418
+ "_",
419
+ "t"
420
+ ],
421
+ [
422
+ "DE",
423
+ "NT>"
424
+ ],
425
+ [
426
+ "_",
427
+ "i"
428
+ ],
429
+ [
430
+ "P",
431
+ "T>"
432
+ ],
433
+ [
434
+ "_",
435
+ "("
436
+ ],
437
+ [
438
+ "_",
439
+ ")"
440
+ ],
441
+ [
442
+ "o",
443
+ "n"
444
+ ],
445
+ [
446
+ "_<",
447
+ "P"
448
+ ],
449
+ [
450
+ "_",
451
+ "f"
452
+ ],
453
+ [
454
+ "_",
455
+ "l"
456
+ ],
457
+ [
458
+ "r",
459
+ "e"
460
+ ],
461
+ [
462
+ "r",
463
+ "i"
464
+ ],
465
+ [
466
+ "C",
467
+ "O"
468
+ ],
469
+ [
470
+ "I",
471
+ "N"
472
+ ],
473
+ [
474
+ "M",
475
+ "PT>"
476
+ ],
477
+ [
478
+ "O",
479
+ "MPT>"
480
+ ],
481
+ [
482
+ "R",
483
+ "OMPT>"
484
+ ],
485
+ [
486
+ "_",
487
+ ";"
488
+ ],
489
+ [
490
+ "_",
491
+ "b"
492
+ ],
493
+ [
494
+ "a",
495
+ "t"
496
+ ],
497
+ [
498
+ "_<",
499
+ "DE"
500
+ ],
501
+ [
502
+ "_<",
503
+ "CO"
504
+ ],
505
+ [
506
+ "_<",
507
+ "IN"
508
+ ],
509
+ [
510
+ "DE",
511
+ ">"
512
+ ],
513
+ [
514
+ "_t",
515
+ "o"
516
+ ],
517
+ [
518
+ "_<P",
519
+ "ROMPT>"
520
+ ],
521
+ [
522
+ "_l",
523
+ "o"
524
+ ],
525
+ [
526
+ "_<DE",
527
+ "DENT>"
528
+ ],
529
+ [
530
+ "_<CO",
531
+ "DE>"
532
+ ],
533
+ [
534
+ "_<IN",
535
+ "DENT>"
536
+ ],
537
+ [
538
+ "_",
539
+ "+"
540
+ ],
541
+ [
542
+ "_",
543
+ "0"
544
+ ],
545
+ [
546
+ "_",
547
+ "re"
548
+ ],
549
+ [
550
+ "c",
551
+ "t"
552
+ ],
553
+ [
554
+ "d",
555
+ "d"
556
+ ],
557
+ [
558
+ "i",
559
+ "on"
560
+ ],
561
+ [
562
+ "n",
563
+ "ct"
564
+ ],
565
+ [
566
+ "r",
567
+ "n"
568
+ ],
569
+ [
570
+ "t",
571
+ "u"
572
+ ],
573
+ [
574
+ "u",
575
+ "nct"
576
+ ],
577
+ [
578
+ "v",
579
+ "a"
580
+ ],
581
+ [
582
+ "_a",
583
+ "dd"
584
+ ],
585
+ [
586
+ "_t",
587
+ "h"
588
+ ],
589
+ [
590
+ "_f",
591
+ "unct"
592
+ ],
593
+ [
594
+ "_re",
595
+ "tu"
596
+ ],
597
+ [
598
+ "_funct",
599
+ "ion"
600
+ ],
601
+ [
602
+ "_retu",
603
+ "rn"
604
+ ],
605
+ [
606
+ "A",
607
+ "S"
608
+ ],
609
+ [
610
+ "A",
611
+ "V"
612
+ ],
613
+ [
614
+ "C",
615
+ "R"
616
+ ],
617
+ [
618
+ "C",
619
+ "re"
620
+ ],
621
+ [
622
+ "H",
623
+ "O"
624
+ ],
625
+ [
626
+ "I",
627
+ "PT>"
628
+ ],
629
+ [
630
+ "J",
631
+ "a"
632
+ ],
633
+ [
634
+ "J",
635
+ "AV"
636
+ ],
637
+ [
638
+ "N",
639
+ ">"
640
+ ],
641
+ [
642
+ "P",
643
+ "y"
644
+ ],
645
+ [
646
+ "S",
647
+ "c"
648
+ ],
649
+ [
650
+ "T",
651
+ "HO"
652
+ ],
653
+ [
654
+ "Y",
655
+ "THO"
656
+ ],
657
+ [
658
+ "_",
659
+ ","
660
+ ],
661
+ [
662
+ "_",
663
+ "4"
664
+ ],
665
+ [
666
+ "_",
667
+ "5"
668
+ ],
669
+ [
670
+ "_",
671
+ ":"
672
+ ],
673
+ [
674
+ "_",
675
+ "p"
676
+ ],
677
+ [
678
+ "_",
679
+ "{"
680
+ ],
681
+ [
682
+ "_",
683
+ "}"
684
+ ],
685
+ [
686
+ "_",
687
+ "Cre"
688
+ ],
689
+ [
690
+ "_",
691
+ "Ja"
692
+ ],
693
+ [
694
+ "_",
695
+ "Py"
696
+ ],
697
+ [
698
+ "h",
699
+ "on"
700
+ ],
701
+ [
702
+ "n",
703
+ "t"
704
+ ],
705
+ [
706
+ "o",
707
+ "p"
708
+ ],
709
+ [
710
+ "o",
711
+ "r"
712
+ ],
713
+ [
714
+ "p",
715
+ "t"
716
+ ],
717
+ [
718
+ "t",
719
+ "hon"
720
+ ],
721
+ [
722
+ "_<",
723
+ "JAV"
724
+ ],
725
+ [
726
+ "_<P",
727
+ "YTHO"
728
+ ],
729
+ [
730
+ "_f",
731
+ "or"
732
+ ],
733
+ [
734
+ "ri",
735
+ "nt"
736
+ ],
737
+ [
738
+ "ri",
739
+ "pt"
740
+ ],
741
+ [
742
+ "at",
743
+ "e"
744
+ ],
745
+ [
746
+ "_lo",
747
+ "g"
748
+ ],
749
+ [
750
+ "_lo",
751
+ "op"
752
+ ],
753
+ [
754
+ "va",
755
+ "Sc"
756
+ ],
757
+ [
758
+ "_th",
759
+ "at"
760
+ ],
761
+ [
762
+ "AS",
763
+ "CR"
764
+ ],
765
+ [
766
+ "_p",
767
+ "rint"
768
+ ],
769
+ [
770
+ "_Cre",
771
+ "ate"
772
+ ],
773
+ [
774
+ "_Ja",
775
+ "vaSc"
776
+ ],
777
+ [
778
+ "_Py",
779
+ "thon"
780
+ ],
781
+ [
782
+ "_<JAV",
783
+ "ASCR"
784
+ ],
785
+ [
786
+ "_<PYTHO",
787
+ "N>"
788
+ ],
789
+ [
790
+ "_JavaSc",
791
+ "ript"
792
+ ],
793
+ [
794
+ "_<JAVASCR",
795
+ "IPT>"
796
+ ]
797
+ ]
798
+ }
799
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "MindiTokenizer",
3
+ "model_max_length": 2048,
4
+ "bos_token": "<BOS>",
5
+ "eos_token": "<EOS>",
6
+ "unk_token": "<UNK>",
7
+ "pad_token": "<PAD>",
8
+ "tokenizer_file": "tokenizer.json",
9
+ "auto_map": {
10
+ "AutoTokenizer": [
11
+ null,
12
+ "tokenization_mindi.MindiTokenizer"
13
+ ]
14
+ },
15
+ "padding_side": "right",
16
+ "truncation_side": "right"
17
+ }