| | """HuggingFace Transformers-compatible wrapper for JSONTokenizer. |
| | |
| | Provides JSONPreTrainedTokenizer, a PreTrainedTokenizer subclass that |
| | wraps JSONTokenizer for use with the HuggingFace ecosystem: |
| | - save_pretrained / from_pretrained |
| | - AutoTokenizer.from_pretrained (with trust_remote_code=True) |
| | - tokenizer(json_string) -> BatchEncoding |
| | - Padding, truncation, batch processing, return_tensors |
| | |
| | Requires: pip install json-tokenizer[huggingface] |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import json |
| | import os |
| | from typing import Any, Dict, List, Optional, Tuple, Union |
| |
|
| | try: |
| | from transformers import PreTrainedTokenizer |
| | except ImportError: |
| | raise ImportError( |
| | "The HuggingFace transformers library is required for this module. " |
| | "Install it with: pip install json-tokenizer[huggingface]" |
| | ) |
| |
|
| | from json_tokenizer.tokenizer import JSONTokenizer, StructuralTokens |
| | from json_tokenizer.bpe import BPETrainer |
| |
|
| |
|
| | VOCAB_FILES_NAMES = {"vocab_file": "json_tokenizer_vocab.json"} |
| |
|
| | |
| | |
| | |
| | _STRUCTURAL_TOKEN_NAMES = { |
| | StructuralTokens.PAD: "<pad>", |
| | StructuralTokens.START: "<s>", |
| | StructuralTokens.END: "</s>", |
| | StructuralTokens.OBJ_START: "<obj_start>", |
| | StructuralTokens.OBJ_END: "<obj_end>", |
| | StructuralTokens.ARR_START: "<arr_start>", |
| | StructuralTokens.ARR_END: "<arr_end>", |
| | StructuralTokens.COLON: "<colon>", |
| | StructuralTokens.COMMA: "<comma>", |
| | StructuralTokens.NULL: "<null>", |
| | StructuralTokens.TRUE: "<true>", |
| | StructuralTokens.FALSE: "<false>", |
| | StructuralTokens.STR_DELIM: "<str_delim>", |
| | StructuralTokens.NUM_PREFIX: "<num_prefix>", |
| | StructuralTokens.KEY_PREFIX: "<key_prefix>", |
| | StructuralTokens.UNK: "<unk>", |
| | } |
| |
|
| | _STRUCTURAL_NAME_TO_ID = {v: k for k, v in _STRUCTURAL_TOKEN_NAMES.items()} |
| |
|
| |
|
| | class JSONPreTrainedTokenizer(PreTrainedTokenizer): |
| | """HuggingFace-compatible wrapper around JSONTokenizer. |
| | |
| | Usage: |
| | # From a trained JSONTokenizer: |
| | tok = JSONTokenizer(bpe_vocab_size=4096) |
| | tok.train(data) |
| | hf_tok = JSONPreTrainedTokenizer.from_json_tokenizer(tok) |
| | |
| | # Encode/decode via HF API: |
| | output = hf_tok('{"name": "Alice", "age": 30}') |
| | print(output["input_ids"]) |
| | print(hf_tok.decode(output["input_ids"])) |
| | |
| | # Save and reload: |
| | hf_tok.save_pretrained("./my_tokenizer") |
| | loaded = JSONPreTrainedTokenizer.from_pretrained("./my_tokenizer") |
| | """ |
| |
|
| | vocab_files_names = VOCAB_FILES_NAMES |
| | model_input_names = ["input_ids", "attention_mask"] |
| |
|
| | def __init__( |
| | self, |
| | vocab_file: Optional[str] = None, |
| | unk_token: str = "<unk>", |
| | bos_token: str = "<s>", |
| | eos_token: str = "</s>", |
| | pad_token: str = "<pad>", |
| | **kwargs, |
| | ): |
| | |
| | if not hasattr(self, "_json_tokenizer"): |
| | self._json_tokenizer: Optional[JSONTokenizer] = None |
| | if not hasattr(self, "_hf_vocab"): |
| | self._hf_vocab: Dict[str, int] = {} |
| | if not hasattr(self, "_hf_id_to_token"): |
| | self._hf_id_to_token: Dict[int, str] = {} |
| |
|
| | if vocab_file is not None and os.path.isfile(vocab_file): |
| | self._load_vocab_file(vocab_file) |
| |
|
| | super().__init__( |
| | unk_token=unk_token, |
| | bos_token=bos_token, |
| | eos_token=eos_token, |
| | pad_token=pad_token, |
| | **kwargs, |
| | ) |
| |
|
| | |
| |
|
| | @classmethod |
| | def from_json_tokenizer( |
| | cls, tokenizer: JSONTokenizer, **kwargs |
| | ) -> "JSONPreTrainedTokenizer": |
| | """Create from a trained JSONTokenizer instance. |
| | |
| | Args: |
| | tokenizer: A trained JSONTokenizer. |
| | **kwargs: Additional arguments passed to __init__. |
| | |
| | Returns: |
| | A new JSONPreTrainedTokenizer wrapping the provided tokenizer. |
| | """ |
| | if not tokenizer._trained: |
| | raise ValueError("JSONTokenizer must be trained before wrapping.") |
| |
|
| | instance = cls.__new__(cls) |
| | instance._json_tokenizer = tokenizer |
| | instance._hf_vocab = {} |
| | instance._hf_id_to_token = {} |
| | instance._build_hf_vocab() |
| | instance.__init__(vocab_file=None, **kwargs) |
| | return instance |
| |
|
| | |
| |
|
| | def _load_vocab_file(self, vocab_file: str) -> None: |
| | """Reconstruct a JSONTokenizer from our saved vocab file.""" |
| | with open(vocab_file, "r", encoding="utf-8") as f: |
| | data = json.load(f) |
| |
|
| | config = data["config"] |
| | tok = JSONTokenizer( |
| | bpe_vocab_size=config["bpe_vocab_size"], |
| | max_key_vocab=config["max_key_vocab"], |
| | min_key_freq=config["min_key_freq"], |
| | bpe_min_freq=config["bpe_min_freq"], |
| | ) |
| | tok._key_to_id = {k: int(v) for k, v in data["key_vocab"].items()} |
| | tok._id_to_key = {int(v): k for k, v in data["key_vocab"].items()} |
| | tok._key_offset = config["key_offset"] |
| | tok._bpe_offset = config["bpe_offset"] |
| |
|
| | bpe_data = data["bpe_model"] |
| | bpe = BPETrainer( |
| | vocab_size=bpe_data["vocab_size"], |
| | min_frequency=bpe_data["min_frequency"], |
| | ) |
| | bpe.merges = [tuple(m) for m in bpe_data["merges"]] |
| | bpe.vocab = bpe_data["vocab"] |
| | bpe._id_to_tok = None |
| | tok._bpe = bpe |
| |
|
| | tok._build_vocab_lookup() |
| | tok._trained = True |
| |
|
| | self._json_tokenizer = tok |
| | self._build_hf_vocab() |
| |
|
| | def _build_hf_vocab(self) -> None: |
| | """Build the unified {token_string: id} mapping across all tiers.""" |
| | tok = self._json_tokenizer |
| | self._hf_vocab = {} |
| | self._hf_id_to_token = {} |
| |
|
| | |
| | for tid, name in _STRUCTURAL_TOKEN_NAMES.items(): |
| | self._hf_vocab[name] = tid |
| | self._hf_id_to_token[tid] = name |
| |
|
| | |
| | for tid in range(16, StructuralTokens.RESERVED_END): |
| | name = f"<reserved_{tid}>" |
| | self._hf_vocab[name] = tid |
| | self._hf_id_to_token[tid] = name |
| |
|
| | |
| | for key_str, tid in tok._key_to_id.items(): |
| | name = f"<key:{key_str}>" |
| | self._hf_vocab[name] = tid |
| | self._hf_id_to_token[tid] = name |
| |
|
| | |
| | for bpe_token, bpe_local_id in tok._bpe.vocab.items(): |
| | full_id = tok._bpe_offset + bpe_local_id |
| | |
| | if bpe_token in self._hf_vocab: |
| | bpe_token_name = f"bpe:{bpe_token}" |
| | else: |
| | bpe_token_name = bpe_token |
| | self._hf_vocab[bpe_token_name] = full_id |
| | self._hf_id_to_token[full_id] = bpe_token_name |
| |
|
| | |
| |
|
| | @property |
| | def vocab_size(self) -> int: |
| | if self._json_tokenizer is None: |
| | return len(_STRUCTURAL_TOKEN_NAMES) |
| | return self._json_tokenizer.vocab_size |
| |
|
| | def get_vocab(self) -> Dict[str, int]: |
| | vocab = dict(self._hf_vocab) |
| | vocab.update(self.added_tokens_encoder) |
| | return vocab |
| |
|
| | def _tokenize(self, text: str, **kwargs) -> List[str]: |
| | """Tokenize a JSON string into HF token strings. |
| | |
| | The HF pipeline calls: tokenize(text) -> _tokenize -> list[str] |
| | then convert_tokens_to_ids maps those to IDs. |
| | |
| | We parse the JSON, encode via JSONTokenizer (skipping START/END |
| | since HF adds special tokens via build_inputs_with_special_tokens), |
| | then convert IDs to our HF token string names. |
| | """ |
| | if self._json_tokenizer is None: |
| | return [self.unk_token] |
| |
|
| | try: |
| | ids = self._json_tokenizer.encode(text) |
| | except (ValueError, json.JSONDecodeError): |
| | |
| | ids = [StructuralTokens.START] |
| | self._json_tokenizer._encode_string(text, ids) |
| | ids.append(StructuralTokens.END) |
| |
|
| | |
| | if ids and ids[0] == StructuralTokens.START: |
| | ids = ids[1:] |
| | if ids and ids[-1] == StructuralTokens.END: |
| | ids = ids[:-1] |
| |
|
| | return [self._hf_id_to_token.get(tid, self.unk_token) for tid in ids] |
| |
|
| | def _convert_token_to_id(self, token: str) -> int: |
| | return self._hf_vocab.get( |
| | token, self._hf_vocab.get(self.unk_token, StructuralTokens.UNK) |
| | ) |
| |
|
| | def _convert_id_to_token(self, index: int) -> str: |
| | return self._hf_id_to_token.get(index, self.unk_token) |
| |
|
| | def convert_tokens_to_string(self, tokens: List[str]) -> str: |
| | """Reconstruct a JSON string from token strings. |
| | |
| | Converts token strings -> IDs, wraps with START/END, |
| | and delegates to JSONTokenizer.decode(). |
| | """ |
| | if self._json_tokenizer is None: |
| | return "" |
| |
|
| | ids = [StructuralTokens.START] |
| | for token in tokens: |
| | tid = self._convert_token_to_id(token) |
| | ids.append(tid) |
| | ids.append(StructuralTokens.END) |
| |
|
| | try: |
| | return self._json_tokenizer.decode(ids) |
| | except Exception: |
| | return " ".join(tokens) |
| |
|
| | |
| |
|
| | def build_inputs_with_special_tokens( |
| | self, |
| | token_ids_0: List[int], |
| | token_ids_1: Optional[List[int]] = None, |
| | ) -> List[int]: |
| | """Wrap with START (bos) and END (eos) tokens.""" |
| | bos = [self.bos_token_id] |
| | eos = [self.eos_token_id] |
| | if token_ids_1 is None: |
| | return bos + token_ids_0 + eos |
| | return bos + token_ids_0 + eos + bos + token_ids_1 + eos |
| |
|
| | def get_special_tokens_mask( |
| | self, |
| | token_ids_0: List[int], |
| | token_ids_1: Optional[List[int]] = None, |
| | already_has_special_tokens: bool = False, |
| | ) -> List[int]: |
| | """1 for special tokens (START/END), 0 for content tokens.""" |
| | if already_has_special_tokens: |
| | return super().get_special_tokens_mask( |
| | token_ids_0=token_ids_0, |
| | token_ids_1=token_ids_1, |
| | already_has_special_tokens=True, |
| | ) |
| | if token_ids_1 is None: |
| | return [1] + [0] * len(token_ids_0) + [1] |
| | return ( |
| | [1] + [0] * len(token_ids_0) + [1] |
| | + [1] + [0] * len(token_ids_1) + [1] |
| | ) |
| |
|
| | def create_token_type_ids_from_sequences( |
| | self, |
| | token_ids_0: List[int], |
| | token_ids_1: Optional[List[int]] = None, |
| | ) -> List[int]: |
| | """Segment IDs: 0 for first sequence, 1 for second.""" |
| | bos_eos = 2 |
| | if token_ids_1 is None: |
| | return [0] * (len(token_ids_0) + bos_eos) |
| | return [0] * (len(token_ids_0) + bos_eos) + [1] * (len(token_ids_1) + bos_eos) |
| |
|
| | |
| |
|
| | def save_vocabulary( |
| | self, |
| | save_directory: str, |
| | filename_prefix: Optional[str] = None, |
| | ) -> Tuple[str]: |
| | """Save the vocabulary to a single JSON file. |
| | |
| | This file contains everything needed to reconstruct the |
| | JSONTokenizer: config, key vocab, and BPE model. |
| | """ |
| | if not os.path.isdir(save_directory): |
| | raise ValueError(f"Not a directory: {save_directory}") |
| |
|
| | vocab_file = os.path.join( |
| | save_directory, |
| | (filename_prefix + "-" if filename_prefix else "") |
| | + VOCAB_FILES_NAMES["vocab_file"], |
| | ) |
| |
|
| | tok = self._json_tokenizer |
| | data = { |
| | "version": "json-tokenizer-hf-v1", |
| | "config": { |
| | "bpe_vocab_size": tok.bpe_vocab_size, |
| | "max_key_vocab": tok.max_key_vocab, |
| | "min_key_freq": tok.min_key_freq, |
| | "bpe_min_freq": tok.bpe_min_freq, |
| | "key_offset": tok._key_offset, |
| | "bpe_offset": tok._bpe_offset, |
| | }, |
| | "key_vocab": tok._key_to_id, |
| | "bpe_model": { |
| | "vocab_size": tok._bpe.vocab_size, |
| | "min_frequency": tok._bpe.min_frequency, |
| | "merges": [list(m) for m in tok._bpe.merges], |
| | "vocab": tok._bpe.vocab, |
| | }, |
| | } |
| |
|
| | with open(vocab_file, "w", encoding="utf-8") as f: |
| | json.dump(data, f, indent=2, ensure_ascii=False) |
| |
|
| | return (vocab_file,) |
| |
|