suku9 committed on
Commit
0f51b32
·
verified ·
1 Parent(s): 6425080

Upload SMILES tokenizer

Browse files
Files changed (2) hide show
  1. gpt2_tokenizer.py +25 -0
  2. tokenizer_config.json +1 -1
gpt2_tokenizer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPT2 Tokenizer that loads our custom SMILES tokenizer."""
2
+
3
+ import os
4
+ import json
5
+ import torch
6
+ from transformers import PreTrainedTokenizer
7
+
8
+ from .smiles_tokenizer import SmilesTokenizer, SmilesVocabulary
9
+
10
class GPT2Tokenizer(PreTrainedTokenizer):
    """
    GPT2Tokenizer wrapper for our SMILES tokenizer.

    This class exists only to make AutoTokenizer find our tokenizer. It
    holds an ``HFSmilesTokenizer`` instance in ``self.tokenizer`` and
    forwards calls and otherwise-missing attribute lookups to it.
    """

    def __init__(self, **kwargs):
        """Build the wrapped tokenizer, then initialise the base class.

        Args:
            **kwargs: Passed unchanged to both ``HFSmilesTokenizer`` and
                ``PreTrainedTokenizer.__init__``.
        """
        # Imported inside the method rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .tokenizer_class import HFSmilesTokenizer

        # Assigned before the base initialiser runs so that any lookup that
        # falls through to __getattr__ during it can already be delegated.
        self.tokenizer = HFSmilesTokenizer(**kwargs)
        super().__init__(**kwargs)

    def __getattr__(self, name):
        """Delegate attributes not found on the wrapper to the wrapped tokenizer.

        Python only calls this after normal attribute lookup has failed.

        Args:
            name: Name of the attribute being looked up.

        Returns:
            The attribute of the same name on ``self.tokenizer``.

        Raises:
            AttributeError: If the wrapped tokenizer has not been assigned
                yet, or if it does not provide ``name`` either.
        """
        # Read the wrapped object straight from the instance dict: going
        # through ``self.tokenizer`` would re-enter __getattr__ and recurse
        # without bound whenever the attribute has not been set (failed or
        # bypassed __init__, copy/unpickle probing a bare instance).
        try:
            wrapped = self.__dict__["tokenizer"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        return getattr(wrapped, name)

    def __call__(self, *args, **kwargs):
        """Forward the call, with all arguments, to the wrapped tokenizer."""
        return self.tokenizer(*args, **kwargs)
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "model_type": "gpt2",
3
- "tokenizer_class": "HFSmilesTokenizer",
4
  "bos_token": "<go>",
5
  "eos_token": "</s>",
6
  "unk_token": "<unk>",
 
1
  {
2
  "model_type": "gpt2",
3
+ "tokenizer_class": "GPT2Tokenizer",
4
  "bos_token": "<go>",
5
  "eos_token": "</s>",
6
  "unk_token": "<unk>",