gbyuvd commited on
Commit
ad6a9f0
·
verified ·
1 Parent(s): 07e938b

Upload FastChemTokenizerHF2.py

Browse files
Files changed (1) hide show
  1. FastChemTokenizerHF2.py +813 -0
FastChemTokenizerHF2.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+ from typing import List, Union, Optional, Tuple, Dict, Any
5
+ from functools import lru_cache
6
+
7
+ # Minimal equivalents for problematic imports
8
class BatchEncoding:
    """Minimal stand-in for ``transformers.BatchEncoding``.

    Wraps the tokenizer output dict and exposes both mapping-style access
    (``enc["input_ids"]``) and attribute access (``enc.input_ids``).
    """

    def __init__(self, data, tensor_type=None):
        # tensor_type is accepted for HF API compatibility; tensor conversion
        # happens in the tokenizer before construction, so it is unused here.
        self.data = data
        for key, value in data.items():
            # Bug fix: the original setattr unconditionally, which would
            # clobber the mapping methods (or `data` itself) for keys named
            # e.g. "items" or "data". Only mirror safe keys as attributes.
            if key != "data" and not hasattr(type(self), key):
                setattr(self, key, value)

    def __getitem__(self, key):
        return self.data[key]

    def __contains__(self, key):
        return key in self.data

    def __iter__(self):
        # Iterate keys, like a dict (backward-compatible addition).
        return iter(self.data)

    def __len__(self):
        # Number of stored fields (backward-compatible addition).
        return len(self.data)

    def get(self, key, default=None):
        # dict-style safe lookup (backward-compatible addition).
        return self.data.get(key, default)

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()
28
class PreTrainedTokenizerBase:
    """Lightweight replacement for the HF tokenizer base class.

    Stores every ``*_token`` keyword under a private ``_<name>`` slot and
    records the common tokenizer configuration knobs with HF defaults.
    The matching ``<name>_id`` attributes are left for subclasses to set.
    """

    def __init__(self, **kwargs):
        # Mirror special-token kwargs (bos_token, pad_token, ...) into
        # private attributes; ids are intentionally NOT derived here.
        for name, token in kwargs.items():
            if name.endswith('_token'):
                setattr(self, f"_{name}", token)

        # General configuration with the usual HuggingFace defaults.
        self.model_max_length = kwargs.get('model_max_length', 512)
        self.padding_side = kwargs.get('padding_side', 'right')
        self.truncation_side = kwargs.get('truncation_side', 'right')
        self.chat_template = kwargs.get('chat_template')
41
+
42
class TrieNode:
    """One node of the vocabulary trie used for longest-match encoding."""
    __slots__ = ('children', 'token_id')

    def __init__(self):
        self.children = {}    # maps a single character -> child TrieNode
        self.token_id = None  # non-None marks the end of a complete token
47
+
48
class FastChemTokenizer(PreTrainedTokenizerBase):
    """
    Fully HuggingFace API compatible tokenizer for chemical representations.
    """

    def __init__(
        self,
        token_to_id=None,
        vocab_file=None,
        model_max_length=512,
        padding_side="right",
        truncation_side="right",
        chat_template=None,
        **kwargs
    ):
        """Build the tokenizer from an in-memory vocab or a JSON vocab file.

        Exactly one of ``token_to_id`` (dict token -> id) or ``vocab_file``
        (path to a JSON mapping) must be provided. The vocab MUST contain
        the special tokens <s>, </s>, <pad>, <unk>, <mask>.

        Raises:
            ValueError: neither vocab source was given.
            KeyError: a required special token is missing from the vocab.
        """
        # Handle vocab loading
        if token_to_id is None and vocab_file is None:
            raise ValueError("Either token_to_id or vocab_file must be provided")

        if vocab_file is not None:
            with open(vocab_file, "r", encoding="utf-8") as f:
                token_to_id = json.load(f)
            # JSON object keys are always str; coerce values to int defensively.
            token_to_id = {str(k): int(v) for k, v in token_to_id.items()}

        self.token_to_id = token_to_id
        # Reverse mapping for decoding (assumes ids are unique in the vocab).
        self.id_to_token = {v: k for k, v in token_to_id.items()}

        # Precompute max token length for possible use & clarity
        self.max_token_len = max(len(t) for t in token_to_id.keys()) if token_to_id else 0

        # Build trie for fast longest-match lookup
        self.trie_root = self._build_trie(token_to_id)

        # Validate required special tokens
        required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
        for tok in required_special_tokens:
            if tok not in token_to_id:
                raise KeyError(f"Required special token '{tok}' not found in vocab.")

        # ✅ Assign special token IDs explicitly - MUST be done before parent init
        self.bos_token_id = token_to_id["<s>"]
        self.eos_token_id = token_to_id["</s>"]
        self.pad_token_id = token_to_id["<pad>"]
        self.unk_token_id = token_to_id["<unk>"]
        self.mask_token_id = token_to_id["<mask>"]

        # Initialize parent class (stores _<name>_token slots and config).
        super().__init__(
            bos_token="<s>",
            eos_token="</s>",
            unk_token="<unk>",
            sep_token=None,
            pad_token="<pad>",
            cls_token=None,
            mask_token="<mask>",
            additional_special_tokens=[],
            model_max_length=model_max_length,
            padding_side=padding_side,
            truncation_side=truncation_side,
            chat_template=chat_template,
            **kwargs,
        )

        # ✅ Set all token attributes for compatibility
        # Both the private (_x_token) and public (x_token) forms are set so
        # code written against either HF convention keeps working.
        self._unk_token = "<unk>"
        self._bos_token = "<s>"
        self._eos_token = "</s>"
        self._pad_token = "<pad>"
        self._mask_token = "<mask>"
        self._sep_token = None
        self._cls_token = None

        self.unk_token = "<unk>"
        self.bos_token = "<s>"
        self.eos_token = "</s>"
        self.pad_token = "<pad>"
        self.mask_token = "<mask>"
        self.sep_token = None
        self.cls_token = None
127
+
128
+ def _build_trie(self, token_to_id):
129
+ root = TrieNode()
130
+ for token, tid in token_to_id.items():
131
+ node = root
132
+ for char in token:
133
+ if char not in node.children:
134
+ node.children[char] = TrieNode()
135
+ node = node.children[char]
136
+ node.token_id = tid
137
+ return root
138
+
139
    @property
    def vocab_size(self):
        # Number of entries in the vocabulary (specials included).
        return len(self.token_to_id)

    def __len__(self):
        # len(tokenizer) mirrors vocab_size, matching HF convention.
        return len(self.token_to_id)

    def get_vocab(self) -> Dict[str, int]:
        # Return a copy so callers cannot mutate the live vocab mapping.
        return self.token_to_id.copy()
148
+
149
    @lru_cache(maxsize=10000)
    def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
        # Memoized wrapper around _encode_core; returns a tuple so the
        # cached value is immutable/hashable (callers convert to list).
        # NOTE(review): lru_cache on an instance method keys on `self`, so
        # the cache keeps tokenizer instances alive for its lifetime —
        # fine for a long-lived singleton tokenizer, but worth confirming.
        return tuple(self._encode_core(s))
152
+
153
+ def _encode_core(self, text: str) -> List[int]:
154
+ """Core encoding logic using Trie — no caching."""
155
+ tokens = text
156
+ result_ids = []
157
+ i = 0
158
+ n = len(tokens)
159
+
160
+ while i < n:
161
+ node = self.trie_root
162
+ j = i
163
+ last_match_id = None
164
+ last_match_end = i
165
+
166
+ while j < n and tokens[j] in node.children:
167
+ node = node.children[tokens[j]]
168
+ j += 1
169
+ if node.token_id is not None:
170
+ last_match_id = node.token_id
171
+ last_match_end = j
172
+
173
+ if last_match_id is not None:
174
+ result_ids.append(last_match_id)
175
+ i = last_match_end
176
+ else:
177
+ tok = tokens[i]
178
+ result_ids.append(self.token_to_id.get(tok, self.unk_token_id))
179
+ i += 1
180
+
181
+ return result_ids
182
+
183
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
184
+ token_ids = self._encode_core(text.strip())
185
+ return [self.id_to_token[tid] for tid in token_ids]
186
+
187
+ def _convert_token_to_id(self, token: str) -> int:
188
+ return self.token_to_id.get(token, self.unk_token_id)
189
+
190
+ def _convert_id_to_token(self, index: int) -> str:
191
+ return self.id_to_token.get(index, self.unk_token)
192
+
193
+ # ✅ Public methods
194
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
195
+ if isinstance(tokens, str):
196
+ return self._convert_token_to_id(tokens)
197
+ return [self._convert_token_to_id(tok) for tok in tokens]
198
+
199
+ def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
200
+ if isinstance(ids, int):
201
+ return self._convert_id_to_token(ids)
202
+ return [self._convert_id_to_token(i) for i in ids]
203
+
204
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """SMILES-style decoding: no spaces between tokens."""
        # SMILES atoms/bonds concatenate directly; inserting whitespace
        # would corrupt the string, so tokens are joined with no separator.
        return "".join(tokens)
207
+
208
    def encode(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
    ) -> List[int]:
        """Simple encode method that returns list of token IDs.

        Pipeline order: special-token wrapping, then truncation, then
        padding. Returns a plain list of ids, or a 1-D LongTensor when
        ``return_tensors == "pt"``.

        Raises:
            NotImplementedError: if ``text_pair`` is given (pairs are only
                supported via ``encode_plus``).
        """
        if text_pair is not None:
            raise NotImplementedError("text_pair not supported in simple encode method")

        # Get core token IDs (cached trie encoding of the stripped text)
        token_ids = list(self._cached_encode_str(text.strip()))

        # Add special tokens if requested
        if add_special_tokens:
            token_ids = [self.bos_token_id] + token_ids + [self.eos_token_id]

        # Handle truncation (note: a trailing EOS may be cut when over limit)
        if truncation:
            if max_length is None:
                max_length = self.model_max_length
            if len(token_ids) > max_length:
                token_ids = token_ids[:max_length]

        # Handle padding up to max_length on the configured side
        if padding:
            if max_length is None:
                max_length = self.model_max_length
            pad_len = max_length - len(token_ids)
            if pad_len > 0:
                if self.padding_side == "right":
                    token_ids = token_ids + [self.pad_token_id] * pad_len
                else:
                    token_ids = [self.pad_token_id] * pad_len + token_ids

        # Return as tensor if requested
        if return_tensors == "pt":
            token_ids = torch.tensor(token_ids, dtype=torch.long)
            if token_ids.dim() == 0:  # scalar (defensive; lists yield >= 1-D)
                token_ids = token_ids.unsqueeze(0)

        return token_ids
254
+
255
+ def decode(
256
+ self,
257
+ token_ids: Union[List[int], torch.Tensor],
258
+ skip_special_tokens: bool = False,
259
+ clean_up_tokenization_spaces: bool = None,
260
+ **kwargs
261
+ ) -> str:
262
+ if isinstance(token_ids, torch.Tensor):
263
+ token_ids = token_ids.tolist()
264
+
265
+ if skip_special_tokens:
266
+ special_ids = {
267
+ self.bos_token_id,
268
+ self.eos_token_id,
269
+ self.pad_token_id,
270
+ self.mask_token_id,
271
+ }
272
+ else:
273
+ special_ids = set()
274
+
275
+ tokens = []
276
+ for tid in token_ids:
277
+ if tid in special_ids:
278
+ continue
279
+ token = self.id_to_token.get(tid, self.unk_token)
280
+ tokens.append(token)
281
+
282
+ return "".join(tokens)
283
+
284
+ def batch_decode(
285
+ self,
286
+ sequences: Union[List[List[int]], torch.Tensor],
287
+ skip_special_tokens: bool = False,
288
+ clean_up_tokenization_spaces: bool = None,
289
+ **kwargs
290
+ ) -> List[str]:
291
+ """Batch decode sequences."""
292
+ if isinstance(sequences, torch.Tensor):
293
+ sequences = sequences.tolist()
294
+
295
+ return [
296
+ self.decode(
297
+ seq,
298
+ skip_special_tokens=skip_special_tokens,
299
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
300
+ **kwargs
301
+ )
302
+ for seq in sequences
303
+ ]
304
+
305
    def decode_with_trace(self, token_ids: List[int]) -> None:
        """Debug method to trace decoding process.

        Prints one line per id showing its position, numeric id, and the
        token it maps to. Pure side-effect helper; returns None.
        """
        print(f"\n🔍 Decoding {len(token_ids)} tokens:")
        for i, tid in enumerate(token_ids):
            token = self.id_to_token.get(tid, self.unk_token)
            print(f"  [{i:03d}] ID={tid:5d} → '{token}'")
311
+
312
    def __call__(
        self,
        text: Union[str, List[str]],
        text_pair: Optional[Union[str, List[str]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, Any]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main callable method that handles both single and batch inputs.

        Dispatches to ``batch_encode_plus`` when *text* is a list (zipping
        with *text_pair* if given) and to ``encode_plus`` otherwise, passing
        every option through unchanged.
        """
        # Handle defaults: attention mask and token type ids are returned
        # unless the caller explicitly opts out (HF default behavior).
        if return_token_type_ids is None:
            return_token_type_ids = True
        if return_attention_mask is None:
            return_attention_mask = True

        if isinstance(text, list):
            # Batch path: pair up texts when a parallel pair list is given.
            if text_pair is not None:
                batch = [(t, p) for t, p in zip(text, text_pair)]
            else:
                batch = text
            return self.batch_encode_plus(
                batch,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs
            )
        else:
            return self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs
            )
387
+
388
    def encode_plus(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, Any]] = None,
        return_token_type_ids: Optional[bool] = True,
        return_attention_mask: Optional[bool] = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode one text (or text pair) into a BatchEncoding.

        Layout with specials: ``<s> a </s>`` or ``<s> a </s> b </s>``
        (token_type_ids 0 for the first segment, 1 for the second).
        Order of operations: build ids, truncate, pad, derive masks.
        Only ``padding=True``/``"max_length"`` pads here; batch-level
        "longest" padding happens in batch_encode_plus.
        """
        if max_length is None:
            max_length = self.model_max_length

        ids_a = list(self._cached_encode_str(text.strip()))

        if text_pair is not None:
            ids_b = list(self._cached_encode_str(text_pair.strip()))
        else:
            ids_b = None

        input_ids = []
        token_type_ids = []

        if add_special_tokens:
            input_ids.append(self.bos_token_id)
            token_type_ids.append(0)
            if ids_b is not None:
                # Pair: <s> a </s> b </s>, segment ids 0 then 1.
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)

                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(1)
            else:
                # Single: <s> a </s>, all segment 0.
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)
        else:
            input_ids = ids_a.copy()
            token_type_ids = [0] * len(input_ids)
            if ids_b is not None:
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))

        # Handle truncation (may drop the trailing EOS when over limit)
        if truncation and len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            token_type_ids = token_type_ids[:max_length]

        # Handle padding (single-sequence padding to max_length only)
        if padding == True or padding == "max_length":
            pad_len = max_length - len(input_ids)
            if pad_len > 0:
                if self.padding_side == "right":
                    input_ids.extend([self.pad_token_id] * pad_len)
                    token_type_ids.extend([0] * pad_len)
                else:
                    input_ids = [self.pad_token_id] * pad_len + input_ids
                    token_type_ids = [0] * pad_len + token_type_ids

        # Attention mask is derived from pad positions after padding.
        attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]

        encoded_dict = {
            "input_ids": input_ids,
        }

        if return_attention_mask:
            encoded_dict["attention_mask"] = attention_mask

        if return_token_type_ids:
            encoded_dict["token_type_ids"] = token_type_ids

        if return_special_tokens_mask:
            special_tokens_mask = [
                1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id} else 0
                for tid in input_ids
            ]
            encoded_dict["special_tokens_mask"] = special_tokens_mask

        if return_length:
            # Length counts non-pad positions only.
            encoded_dict["length"] = len([tid for tid in input_ids if tid != self.pad_token_id])

        if return_tensors == "pt":
            # Convert every field to a (1, seq_len) LongTensor.
            output = {}
            for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)
                if tensor.ndim == 1:
                    tensor = tensor.unsqueeze(0)
                output[k] = tensor
        else:
            output = encoded_dict

        return BatchEncoding(output, tensor_type=return_tensors)
496
+
497
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, Any]] = None,
        return_token_type_ids: Optional[bool] = True,
        return_attention_mask: Optional[bool] = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode a batch of texts or (text, pair) tuples.

        Each item is encoded individually via ``encode_plus`` (without
        per-item padding), then the batch is padded to the longest
        sequence when ``padding`` is True/"longest", and finally
        converted to tensors if requested.
        """
        all_input_ids = []
        all_attention_masks = []
        all_token_type_ids = []
        all_special_tokens_masks = []
        all_lengths = []

        for item in batch_text_or_text_pairs:
            # Items may be plain strings or (text, text_pair) tuples.
            if isinstance(item, tuple):
                text, text_pair = item
            else:
                text, text_pair = item, None

            encoded = self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=False,  # We'll handle batch padding later
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=None,  # Don't convert to tensors yet
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs
            )

            all_input_ids.append(encoded["input_ids"])
            if "attention_mask" in encoded:
                all_attention_masks.append(encoded["attention_mask"])
            if "token_type_ids" in encoded:
                all_token_type_ids.append(encoded["token_type_ids"])
            if "special_tokens_mask" in encoded:
                all_special_tokens_masks.append(encoded["special_tokens_mask"])
            if "length" in encoded:
                all_lengths.append(encoded["length"])

        batched = {
            "input_ids": all_input_ids,
        }

        if all_attention_masks:
            batched["attention_mask"] = all_attention_masks
        if all_token_type_ids:
            batched["token_type_ids"] = all_token_type_ids
        if all_special_tokens_masks:
            batched["special_tokens_mask"] = all_special_tokens_masks
        if all_lengths:
            batched["length"] = all_lengths

        # Handle batch padding: pad every sequence field to the longest row.
        if padding == True or padding == "longest":
            max_len = max(len(ids) for ids in all_input_ids)
            for key in batched:
                if key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
                    padded_seqs = []
                    for seq in batched[key]:
                        pad_len = max_len - len(seq)
                        if pad_len > 0:
                            # input_ids pad with pad_token_id; masks pad with 0.
                            if key == "input_ids":
                                padding_value = self.pad_token_id
                            else:
                                padding_value = 0

                            if self.padding_side == "right":
                                padded_seq = seq + [padding_value] * pad_len
                            else:
                                padded_seq = [padding_value] * pad_len + seq
                        else:
                            padded_seq = seq
                        padded_seqs.append(padded_seq)
                    batched[key] = padded_seqs

        if return_tensors == "pt":
            # NOTE(review): pad_sequence pads to the longest row even when
            # padding=False was requested — ragged batches cannot become a
            # rectangular tensor otherwise. Confirm this is intended.
            def to_tensor_list(lst):
                return [torch.tensor(item, dtype=torch.long) for item in lst]

            for key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
                if key in batched:
                    batched[key] = torch.nn.utils.rnn.pad_sequence(
                        to_tensor_list(batched[key]),
                        batch_first=True,
                        padding_value=self.pad_token_id if key == "input_ids" else 0
                    )

            # Handle non-sequence data
            if "length" in batched:
                batched["length"] = torch.tensor(batched["length"], dtype=torch.long)

        return BatchEncoding(batched, tensor_type=return_tensors)
613
+
614
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
615
+ """Save vocabulary to files."""
616
+ if not os.path.isdir(save_directory):
617
+ os.makedirs(save_directory)
618
+
619
+ vocab_file = os.path.join(
620
+ save_directory,
621
+ (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
622
+ )
623
+
624
+ with open(vocab_file, "w", encoding="utf-8") as f:
625
+ json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
626
+
627
+ return (vocab_file,)
628
+
629
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs
    ):
        """Save tokenizer to directory.

        Writes vocab.json (via save_vocabulary) plus a tokenizer_config.json
        with the class name, length/padding settings and special tokens.
        ``legacy_format`` and ``push_to_hub`` are accepted for HF API
        compatibility but are not used here.
        """
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        # Save vocabulary
        vocab_files = self.save_vocabulary(save_directory, filename_prefix)

        # Save tokenizer config
        tokenizer_config = {
            "tokenizer_class": self.__class__.__name__,
            "model_max_length": self.model_max_length,
            "padding_side": self.padding_side,
            "truncation_side": self.truncation_side,
            "special_tokens": {
                "bos_token": self.bos_token,
                "eos_token": self.eos_token,
                "pad_token": self.pad_token,
                "unk_token": self.unk_token,
                "mask_token": self.mask_token,
            }
        }

        config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        print(f"✅ Tokenizer saved to: {save_directory}")

        return (save_directory,)
666
+
667
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        **kwargs
    ):
        """Load tokenizer from pretrained directory or hub.

        Only local directories are supported: reads vocab.json plus an
        optional tokenizer_config.json whose entries are merged with (and
        overridden by) explicit kwargs, then forwarded to ``cls(...)``.

        Raises:
            NotImplementedError: when the path is not a local directory
                (Hub download is not implemented).
        """
        if os.path.isdir(pretrained_model_name_or_path):
            vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
            config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")

            # Load config if available
            config = {}
            if os.path.exists(config_file):
                with open(config_file, "r", encoding="utf-8") as f:
                    config = json.load(f)

            # Merge config with kwargs (kwargs win on conflicts).
            # NOTE(review): config keys like "tokenizer_class" are forwarded
            # into **kwargs of __init__ — harmless today, but verify.
            merged_config = {**config, **kwargs}

            return cls(vocab_file=vocab_file, **merged_config)
        else:
            raise NotImplementedError("Loading from HuggingFace Hub not implemented yet")
691
+
692
+ def get_special_tokens_mask(
693
+ self,
694
+ token_ids_0: List[int],
695
+ token_ids_1: Optional[List[int]] = None,
696
+ already_has_special_tokens: bool = False
697
+ ) -> List[int]:
698
+ """Get special tokens mask."""
699
+ if already_has_special_tokens:
700
+ return [
701
+ 1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id}
702
+ else 0 for tid in token_ids_0
703
+ ]
704
+
705
+ mask = [1] # BOS
706
+ mask.extend([0] * len(token_ids_0)) # Token sequence
707
+ mask.append(1) # EOS
708
+
709
+ if token_ids_1 is not None:
710
+ mask.extend([0] * len(token_ids_1)) # Second sequence
711
+ mask.append(1) # EOS
712
+
713
+ return mask
714
+
715
+ def create_token_type_ids_from_sequences(
716
+ self,
717
+ token_ids_0: List[int],
718
+ token_ids_1: Optional[List[int]] = None
719
+ ) -> List[int]:
720
+ """Create token type IDs for sequences."""
721
+ sep = [self.eos_token_id]
722
+ cls = [self.bos_token_id]
723
+
724
+ if token_ids_1 is None:
725
+ return len(cls + token_ids_0 + sep) * [0]
726
+
727
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
728
+
729
+ def build_inputs_with_special_tokens(
730
+ self,
731
+ token_ids_0: List[int],
732
+ token_ids_1: Optional[List[int]] = None
733
+ ) -> List[int]:
734
+ """Build inputs with special tokens."""
735
+ if token_ids_1 is None:
736
+ return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
737
+
738
+ return ([self.bos_token_id] + token_ids_0 + [self.eos_token_id] +
739
+ token_ids_1 + [self.eos_token_id])
740
+
741
+
742
class FastChemTokenizerSelfies(FastChemTokenizer):
    """
    SELFIES variant that handles whitespace-separated tokens.
    Uses trie-based longest-match encoding (same as original working version).
    """

    def _encode_core(self, text: str) -> List[int]:
        """Trie-based encoding for SELFIES with fragment + atom vocab.

        Identical to the base greedy longest-match, except that literal
        whitespace between tokens is skipped before each match attempt
        (SELFIES input is space-separated, e.g. "[C] [O]").
        """
        result_ids = []
        i = 0
        n = len(text)

        while i < n:
            if text[i].isspace():  # skip literal whitespace
                i += 1
                continue

            node = self.trie_root
            j = i
            last_match_id = None
            last_match_end = i

            # Traverse trie character by character (including spaces if part of vocab key)
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.token_id is not None:
                    last_match_id = node.token_id
                    last_match_end = j

            if last_match_id is not None:
                result_ids.append(last_match_id)
                i = last_match_end
            else:
                # Fallback: encode one char as unk or atom
                result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
                i += 1

        return result_ids
781
+
782
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """SELFIES decoding: join tokens with spaces (preserve original format)."""
        # Unlike SMILES, SELFIES tokens are whitespace-separated on input,
        # so decoding re-inserts a single space between tokens.
        return " ".join(tokens)
785
+
786
+ def decode(
787
+ self,
788
+ token_ids: Union[List[int], torch.Tensor],
789
+ skip_special_tokens: bool = False,
790
+ clean_up_tokenization_spaces: bool = None,
791
+ **kwargs
792
+ ) -> str:
793
+ if isinstance(token_ids, torch.Tensor):
794
+ token_ids = token_ids.tolist()
795
+
796
+ if skip_special_tokens:
797
+ special_ids = {
798
+ self.bos_token_id,
799
+ self.eos_token_id,
800
+ self.pad_token_id,
801
+ self.mask_token_id,
802
+ }
803
+ else:
804
+ special_ids = set()
805
+
806
+ tokens = []
807
+ for tid in token_ids:
808
+ if tid in special_ids:
809
+ continue
810
+ token = self.id_to_token.get(tid, self.unk_token)
811
+ tokens.append(token)
812
+
813
+ return " ".join(tokens) # ✅ preserve spaces return " ".join(tokens) # ✅ preserve spaces