Upload SMILES tokenizer package
Files changed:

- README.md +43 -10
- setup.py +14 -0
- smiles_tokenizer/__init__.py +6 -0
- smiles_tokenizer/tokenizer.py +189 -0
- smiles_tokenizer/utils.py +84 -0
- smiles_tokenizer/vocabulary.py +269 -0
- test_tokenizer.py +26 -0
README.md
CHANGED
@@ -1,10 +1,43 @@
# SMILES Tokenizer

This is a custom tokenizer for SMILES (Simplified Molecular Input Line Entry System) strings.

## Installation

```bash
pip install git+https://huggingface.co/suku9/smiles-tokenizer-package
```

## Usage

```python
# Basic usage
from smiles_tokenizer import SmilesTokenizer

tokenizer = SmilesTokenizer()
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin

# Tokenize
tokens = tokenizer.tokenize([smiles])[0]
print(tokens)

# Encode
encoded = tokenizer.encode([smiles])[0]
print(encoded)

# Use with GPT-2
from smiles_tokenizer.utils import prepare_for_gpt2

model, tokenizer_wrapper = prepare_for_gpt2(tokenizer)

# Now you can use it like a regular Hugging Face tokenizer
inputs = tokenizer_wrapper(smiles, return_tensors="pt")
outputs = model(**inputs)
```

## Features

- Specialized for SMILES strings
- Compatible with Hugging Face's transformers library
- Designed to work with GPT-2 models
- Preserves all functionality of the original SMILES tokenizer
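Beyond the snippets in the README, the `decode` method defined in `smiles_tokenizer/tokenizer.py` reverses `encode`. A minimal round-trip sketch (the behaviour follows from the code further down; the assertion is illustrative):

```python
from smiles_tokenizer import SmilesTokenizer

tokenizer = SmilesTokenizer()
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"

# encode() returns one LongTensor per input string, with <go>/</s> markers
ids = tokenizer.encode([smiles])

# decode() strips the control tokens and joins the tokens back together
roundtrip = tokenizer.decode(ids)
assert roundtrip == [smiles]
```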
setup.py
ADDED
@@ -0,0 +1,14 @@

```python
"""Setup script for SMILES tokenizer package."""

from setuptools import setup, find_packages

setup(
    name="smiles_tokenizer",
    version="0.1.0",
    description="SMILES tokenizer from suku9/smiles-tokenizer-package",
    packages=find_packages(),
    install_requires=[
        "torch>=1.0.0",
        "transformers>=4.0.0",
    ],
)
```
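Since the package uses a standard setuptools layout, an editable install from a checkout of this repository (`pip install -e .` from the directory containing `setup.py`) should also work for local development.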
smiles_tokenizer/__init__.py
ADDED
@@ -0,0 +1,6 @@

```python
"""SMILES Tokenizer package."""

from .tokenizer import SmilesTokenizer
from .vocabulary import SmilesVocabulary, Vocabulary

__all__ = ["SmilesTokenizer", "SmilesVocabulary", "Vocabulary"]
```
smiles_tokenizer/tokenizer.py
ADDED
@@ -0,0 +1,189 @@

```python
"""SMILES tokenizer implementation."""

import re
from re import Pattern
from typing import List, Optional

import torch

from .vocabulary import Vocabulary, SmilesVocabulary

Tokens = List[str]


class SmilesTokenizer:
    """SMILES tokenizer that splits strings with a vocabulary-derived regex."""

    def __init__(self, vocabulary: Optional[Vocabulary] = None) -> None:
        if vocabulary is None:
            self.vocabulary = SmilesVocabulary()
        else:
            self.vocabulary = vocabulary
        self._re: Optional[Pattern] = None

    @property
    def re(self) -> Pattern:
        """Compiled regular expression matching every vocabulary token.

        :return: compiled tokens regex
        """
        if not self._re:
            self._re = self._get_compiled_regex(self.vocabulary.symbols)
        return self._re

    def tokenize(self, smiles: List[str], enclose: bool = True) -> List[List[str]]:
        """Convert a list of SMILES strings to a list of token lists, one per string."""
        if isinstance(smiles, str):
            # Convert a single string to a list with one string
            smiles = [smiles]

        tokenized_data = []
        for smi in smiles:
            tokens = self.re.findall(smi)
            if enclose:
                tokenized_data.append(
                    [self.vocabulary.go_word] + tokens + [self.vocabulary.eos_word]
                )
            else:
                tokenized_data.append(tokens)

        return tokenized_data

    def encode(self, smiles: List[str], enclose: bool = True, aslist: bool = False):
        """Convert a list of SMILES strings to a list of tensors of token indices."""
        if isinstance(smiles, str):
            # Convert a single string to a list with one string
            smiles = [smiles]

        tokenized_smiles = self.tokenize(smiles, enclose=enclose)
        ids_list = []

        for tokens in tokenized_smiles:
            ids = [self.vocabulary.index(token) for token in tokens]
            if not aslist:
                ids = torch.tensor(ids, dtype=torch.long)
            ids_list.append(ids)

        return ids_list

    def detokenize(
        self,
        token_data: List[List[str]],
        include_control_tokens: bool = False,
        include_end_of_line_token: bool = False,
        truncate_at_end_token: bool = False,
    ) -> List[str]:
        """Detokenize lists of tokens into SMILES by concatenating the token strings."""
        character_lists = [tokens.copy() for tokens in token_data]

        character_lists = [
            self._strip_list(
                tokens,
                strip_control_tokens=not include_control_tokens,
                truncate_at_end_token=truncate_at_end_token,
            )
            for tokens in character_lists
        ]

        if include_end_of_line_token:
            for s in character_lists:
                s.append("\n")

        return ["".join(s) for s in character_lists]

    def decode(self, ids_list: List[torch.Tensor]) -> List[str]:
        """Decode lists of encodings (ids as tensors or lists) back into SMILES strings."""
        tokenized_smiles = []
        for ids in ids_list:
            if not isinstance(ids, list):
                ids = ids.tolist()
            tokens = [self.vocabulary[i] for i in ids]
            tokenized_smiles.append(tokens)
        return self.detokenize(tokenized_smiles, truncate_at_end_token=True)

    def tokens_to_smiles(self, tokens):
        """Convert generated tokens to SMILES.

        Arguments:
            tokens: list of encoded token tensors

        Returns:
            list of SMILES strings
        """
        smiles = self.decode(tokens)
        return [smi.replace("<unk>", "") for smi in smiles]

    def _strip_list(
        self,
        tokens: List[str],
        strip_control_tokens: bool = False,
        truncate_at_end_token: bool = False,
    ) -> List[str]:
        """Clean up a token list by removing control tokens.

        :param tokens: list of tokens
        :param strip_control_tokens: flag to remove control tokens, defaults to False
        :param truncate_at_end_token: if True, truncate tokens after the end token
        """
        if truncate_at_end_token and self.vocabulary.eos_word in tokens:
            end_token_idx = tokens.index(self.vocabulary.eos_word)
            tokens = tokens[: end_token_idx + 1]

        strip_characters: List[str] = [self.vocabulary.pad_word]
        if strip_control_tokens:
            strip_characters.extend([self.vocabulary.go_word, self.vocabulary.eos_word])
        while len(tokens) > 0 and tokens[0] in strip_characters:
            tokens.pop(0)

        while len(tokens) > 0 and tokens[-1] in strip_characters:
            tokens.pop()

        return tokens

    def _get_compiled_regex(self, tokens: List[str]) -> Pattern:
        """Compile a regular expression that matches any of the given tokens.

        :param tokens: list of tokens
        :return: compiled regular expression
        """
        # Escape regex metacharacters so tokens such as "(", "[" and "+"
        # match literally inside the alternation
        regex_string = r"(" + "|".join(re.escape(token) for token in tokens) + r")"
        return re.compile(regex_string)
```
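One consequence of the character-level vocabulary in `smiles_tokenizer/vocabulary.py` (below) is that two-character element symbols such as `Cl` and `Br` are covered by the lowercase `l` and `r` tokens rather than by dedicated multi-character tokens. A quick illustrative sketch of what that implies for tokenization:

```python
from smiles_tokenizer import SmilesTokenizer

tokenizer = SmilesTokenizer()

# "Cl" is matched as two tokens, "C" and "l"; detokenization still
# reproduces the original string because tokens are joined with "".
tokens = tokenizer.tokenize(["CCCl"], enclose=False)[0]
print(tokens)  # expected: ['C', 'C', 'C', 'l']
```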
smiles_tokenizer/utils.py
ADDED
@@ -0,0 +1,84 @@

```python
"""Utility functions for using the SMILES tokenizer with transformers."""

import torch
from transformers import PreTrainedTokenizer, GPT2LMHeadModel

from .tokenizer import SmilesTokenizer


def get_tokenizer():
    """Get a new instance of the SMILES tokenizer."""
    return SmilesTokenizer()


def prepare_for_gpt2(tokenizer, model_name="gpt2"):
    """Prepare a GPT-2 model to work with the SMILES tokenizer.

    Args:
        tokenizer: A SmilesTokenizer instance
        model_name: Name of the GPT-2 model to load from Hugging Face

    Returns:
        tuple: (model, tokenizer_wrapper)
    """

    # Create a wrapper class exposing the Hugging Face tokenizer interface
    class SmilesTokenizerWrapper(PreTrainedTokenizer):
        def __init__(self, smiles_tokenizer):
            # Set the vocab before calling super().__init__(), which may query it
            self.smiles_tokenizer = smiles_tokenizer
            self.vocab = {
                token: idx
                for idx, token in enumerate(smiles_tokenizer.vocabulary.symbols)
            }
            super().__init__()

        @property
        def vocab_size(self):
            return len(self.vocab)

        def get_vocab(self):
            return self.vocab

        def _tokenize(self, text):
            if isinstance(text, list):
                return self.smiles_tokenizer.tokenize(text, enclose=False)[0]
            return self.smiles_tokenizer.tokenize([text], enclose=False)[0]

        def _convert_token_to_id(self, token):
            return self.smiles_tokenizer.vocabulary.index(token)

        def _convert_id_to_token(self, index):
            return self.smiles_tokenizer.vocabulary[index]

        def convert_tokens_to_string(self, tokens):
            return "".join(tokens)

        def __call__(self, text, return_tensors=None, **kwargs):
            if isinstance(text, str):
                text = [text]
            encoded = self.smiles_tokenizer.encode(text, enclose=True)
            if return_tensors == "pt":
                # Convert to PyTorch tensors if needed
                if not isinstance(encoded[0], torch.Tensor):
                    encoded = [torch.tensor(ids) for ids in encoded]
                # Create attention masks (all ones before padding)
                attention_mask = [torch.ones_like(ids) for ids in encoded]
                # Pad sequences to the batch maximum if there are multiple
                if len(encoded) > 1:
                    max_len = max(len(ids) for ids in encoded)
                    padded_ids = []
                    padded_masks = []
                    for ids, mask in zip(encoded, attention_mask):
                        if len(ids) < max_len:
                            padding = torch.full(
                                (max_len - len(ids),),
                                self.smiles_tokenizer.vocabulary.pad_index,
                                dtype=torch.long,
                            )
                            padded_ids.append(torch.cat([ids, padding]))
                            padded_masks.append(
                                torch.cat([mask, torch.zeros_like(padding)])
                            )
                        else:
                            padded_ids.append(ids)
                            padded_masks.append(mask)
                    return {
                        "input_ids": torch.stack(padded_ids),
                        "attention_mask": torch.stack(padded_masks),
                    }
                return {
                    "input_ids": encoded[0].unsqueeze(0),
                    "attention_mask": attention_mask[0].unsqueeze(0),
                }
            return {"input_ids": encoded}

    # Load the GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Create the tokenizer wrapper
    tokenizer_wrapper = SmilesTokenizerWrapper(tokenizer)
    # Resize the model embeddings to match the SMILES vocabulary size
    model.resize_token_embeddings(len(tokenizer_wrapper))
    return model, tokenizer_wrapper
```
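The wrapper pads batched inputs to the longest sequence with the vocabulary's pad index and zeros out the corresponding attention-mask positions. A minimal sketch of a batched call (the two molecule strings are just examples; shapes depend on the inputs):

```python
from smiles_tokenizer import SmilesTokenizer
from smiles_tokenizer.utils import prepare_for_gpt2

model, wrapper = prepare_for_gpt2(SmilesTokenizer())

# Two molecules of different lengths; the shorter one is padded
batch = wrapper(["CCO", "CC(=O)OC1=CC=CC=C1C(=O)O"], return_tensors="pt")
print(batch["input_ids"].shape)    # torch.Size([2, max_len])
print(batch["attention_mask"][0])  # 1s for real tokens, 0s over the padding
outputs = model(**batch)
```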
smiles_tokenizer/vocabulary.py
ADDED
@@ -0,0 +1,269 @@

```python
"""Vocabulary classes for SMILES tokenization."""

import os
from collections import Counter

import torch


class Vocabulary(object):
    """A mapping from symbols to consecutive integers."""

    def __init__(self, pad="<pad>", eos="</s>", unk="<unk>"):
        self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
        self.symbols = []
        self.count = []
        self.indices = {}

        self.pad_index = self.add_symbol(pad)
        self.eos_index = self.add_symbol(eos)
        self.unk_index = self.add_symbol(unk)
        self.nspecial = len(self.symbols)

    def __eq__(self, other):
        return self.indices == other.indices

    def __getitem__(self, idx):
        if idx < len(self.symbols):
            return self.symbols[idx]
        return self.unk_word

    def __len__(self):
        """Return the number of symbols in the dictionary."""
        return len(self.symbols)

    def index(self, sym):
        """Return the index of the specified symbol, or the unk index if absent."""
        if sym in self.indices:
            return self.indices[sym]
        return self.unk_index

    def string(self, tensor, bpe_symbol=None, escape_unk=False):
        """Helper for converting a tensor of token indices to a string.

        Can optionally remove BPE symbols or escape <unk> words.
        """
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return "\n".join(self.string(t) for t in tensor)

        def token_string(i):
            if i == self.unk():
                return self.unk_string(escape_unk)
            return self[i]

        sent = " ".join(token_string(i) for i in tensor if i != self.eos())
        if bpe_symbol is not None:
            sent = (sent + " ").replace(bpe_symbol, "").rstrip()
        return sent

    def unk_string(self, escape=False):
        """Return the unknown string, optionally escaped as: <<unk>>."""
        if escape:
            return "<{}>".format(self.unk_word)
        return self.unk_word

    def add_symbol(self, word, n=1):
        """Add a word to the dictionary and return its index."""
        if word in self.indices:
            idx = self.indices[word]
            self.count[idx] = self.count[idx] + n
            return idx
        idx = len(self.symbols)
        self.indices[word] = idx
        self.symbols.append(word)
        self.count.append(n)
        return idx

    def update(self, new_dict):
        """Update counts from another dictionary."""
        for word in new_dict.symbols:
            idx2 = new_dict.indices[word]
            if word in self.indices:
                idx = self.indices[word]
                self.count[idx] = self.count[idx] + new_dict.count[idx2]
            else:
                idx = len(self.symbols)
                self.indices[word] = idx
                self.symbols.append(word)
                self.count.append(new_dict.count[idx2])

    def finalize(self, threshold=-1, nwords=-1, padding_factor=8):
        """Sort symbols by frequency in descending order, ignoring special ones.

        Args:
            threshold: minimum word count for a symbol to be kept
            nwords: total number of words in the final dictionary,
                including special symbols
            padding_factor: pad the dictionary size to a multiple of this
                value, which is important on some hardware (e.g., NVIDIA
                Tensor Cores)
        """
        if nwords <= 0:
            nwords = len(self)

        new_indices = dict(zip(self.symbols[: self.nspecial], range(self.nspecial)))
        new_symbols = self.symbols[: self.nspecial]
        new_count = self.count[: self.nspecial]

        c = Counter(
            dict(zip(self.symbols[self.nspecial :], self.count[self.nspecial :]))
        )
        for symbol, count in c.most_common(nwords - self.nspecial):
            if count >= threshold:
                new_indices[symbol] = len(new_symbols)
                new_symbols.append(symbol)
                new_count.append(count)
            else:
                break

        threshold_nwords = len(new_symbols)
        if padding_factor > 1:
            i = 0
            while threshold_nwords % padding_factor != 0:
                symbol = "madeupword{:04d}".format(i)
                new_indices[symbol] = len(new_symbols)
                new_symbols.append(symbol)
                new_count.append(0)
                i += 1
                threshold_nwords += 1

        assert len(new_symbols) % padding_factor == 0
        assert len(new_symbols) == len(new_indices)

        self.count = list(new_count)
        self.symbols = list(new_symbols)
        self.indices = new_indices

    def pad(self):
        """Helper to get the index of the pad symbol."""
        return self.pad_index

    def eos(self):
        """Helper to get the index of the end-of-sentence symbol."""
        return self.eos_index

    def unk(self):
        """Helper to get the index of the unk symbol."""
        return self.unk_index

    @classmethod
    def load(cls, f, ignore_utf_errors=False):
        """Load the dictionary from a text file with the format:

        ```
        <symbol0> <count0>
        <symbol1> <count1>
        ...
        ```
        """
        if isinstance(f, str):
            try:
                if not ignore_utf_errors:
                    with open(f, "r", encoding="utf-8") as fd:
                        return cls.load(fd)
                else:
                    with open(f, "r", encoding="utf-8", errors="ignore") as fd:
                        return cls.load(fd)
            except FileNotFoundError as fnfe:
                raise fnfe
            except Exception:
                raise Exception(
                    "Incorrect encoding detected in {}, please "
                    "rebuild the dataset".format(f)
                )

        d = cls()
        for line in f.readlines():
            idx = line.rfind(" ")
            word = line[:idx]
            count = int(line[idx + 1 :])
            d.indices[word] = len(d.symbols)
            d.symbols.append(word)
            d.count.append(count)
        return d

    def save(self, f):
        """Store the dictionary in a text file."""
        if isinstance(f, str):
            # Only create parent directories when the path actually has one
            dirname = os.path.dirname(f)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            with open(f, "w", encoding="utf-8") as fd:
                return self.save(fd)
        for symbol, count in zip(
            self.symbols[self.nspecial :], self.count[self.nspecial :]
        ):
            print("{} {}".format(symbol, count), file=f)

    def dummy_sentence(self, length):
        t = torch.Tensor(length).uniform_(self.nspecial + 1, len(self)).long()
        t[-1] = self.eos()
        return t


class SmilesVocabulary(Vocabulary):
    def __init__(self, pad="<pad>", eos="</s>", unk="<unk>", go="<go>"):
        self.unk_word, self.pad_word, self.eos_word, self.go_word = (
            unk,
            pad,
            eos,
            go,
        )
        self.symbols = []
        self.count = []
        self.indices = {}

        self.pad_index = self.add_symbol(pad)
        self.eos_index = self.add_symbol(eos)
        self.unk_index = self.add_symbol(unk)
        self.go_index = self.add_symbol(go)
        self.nspecial = len(self.symbols)
        for token in self.__get_smile_tokens():
            self.add_symbol(token)

    def __get_smile_tokens(self):
        # Character-level SMILES alphabet; "l" and "r" cover Cl and Br
        SMILE_TOKENS = [
            "S", "O", "2", "n", "l", "F", "H", "C", "o", "5",
            "r", "s", "=", "6", "[", "N", "4", "c", "-", "3",
            ")", "#", "]", "B", "(", "1",
        ]
        return SMILE_TOKENS

    def finalize(self, threshold=-1, nwords=-1, padding_factor=1):
        super(SmilesVocabulary, self).finalize(
            threshold=threshold, nwords=nwords, padding_factor=padding_factor
        )

    def go(self):
        """GO index."""
        return self.go_index

    @classmethod
    def load(cls, f=None, ignore_utf_errors=False):
        """Load function for SMILES data.

        Ignores the file and just initializes the fixed vocabulary.
        """
        return cls()
```
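The base `Vocabulary` persists only the non-special symbols, so a save/load round trip re-creates `<pad>`/`</s>`/`<unk>` from the constructor and reads the remaining symbols back with their counts. A minimal sketch (the `vocab.txt` path is just an example):

```python
from smiles_tokenizer.vocabulary import Vocabulary

v = Vocabulary()
v.add_symbol("C", n=3)
v.add_symbol("O", n=1)
v.save("vocab.txt")  # writes "C 3" and "O 1"

v2 = Vocabulary.load("vocab.txt")
assert v2.index("C") == v.index("C")
```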
test_tokenizer.py
ADDED
@@ -0,0 +1,26 @@

```python
"""Test script for SMILES tokenizer."""

from smiles_tokenizer import SmilesTokenizer
from smiles_tokenizer.utils import prepare_for_gpt2


def main():
    tokenizer = SmilesTokenizer()
    smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin

    print(f"Tokenizing SMILES: {smiles}")
    tokens = tokenizer.tokenize([smiles])[0]
    print(f"Tokens: {tokens}")

    encoded = tokenizer.encode([smiles])[0]
    print(f"Encoded: {encoded}")

    print("Testing with GPT-2...")
    model, tokenizer_wrapper = prepare_for_gpt2(tokenizer)
    inputs = tokenizer_wrapper(smiles, return_tensors="pt")
    print(f"Model inputs: {inputs}")
    outputs = model(**inputs)
    print(f"Model output shape: {outputs.logits.shape}")
    print("Test completed successfully!")


if __name__ == "__main__":
    main()
```
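Note that `prepare_for_gpt2` calls `GPT2LMHeadModel.from_pretrained("gpt2")`, so the first run of `python test_tokenizer.py` downloads the GPT-2 weights from the Hugging Face Hub and needs network access.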