saicharan2804 committed on
Commit
5cf5457
·
1 Parent(s): 0478e60

First commit

Browse files
BpeTokenizer.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer
2
+
3
# File names tried when locating the serialized tokenizer, in priority order.
# "chembl_bpe_tokenizer.json" is the name the training script saves under;
# "bpe_tokenizer.json" is the name this module originally hard-coded.
_TOKENIZER_FILENAMES = ("chembl_bpe_tokenizer.json", "bpe_tokenizer.json")

# Loaded lazily on first use and reused afterwards, so the JSON file is not
# re-parsed on every call.
_tokenizer = None


def _get_tokenizer():
    """Return the shared tokenizer, loading it from disk on first use."""
    global _tokenizer
    if _tokenizer is None:
        import os

        module_dir = os.path.dirname(os.path.abspath(__file__))
        # Look next to this module first (independent of the working
        # directory), then fall back to the original cwd-relative name.
        candidates = [os.path.join(module_dir, name) for name in _TOKENIZER_FILENAMES]
        candidates.append("bpe_tokenizer.json")
        # If nothing exists, hand the preferred path to the library so that
        # its own "file not found" error is raised.
        path = next((p for p in candidates if os.path.isfile(p)), candidates[0])
        _tokenizer = Tokenizer.from_file(path)
    return _tokenizer


def bpe_tokenizer(smiles_string):
    """Tokenize a SMILES string with the saved BPE tokenizer.

    Args:
        smiles_string: The text to encode.

    Returns:
        The ``tokens`` attribute of the encoding produced by
        ``Tokenizer.encode``, i.e. the tokens as text rather than ids.
        NOTE(review): if the loaded file carries a post-processor that adds
        special tokens, they will appear in the result as well -- confirm
        this is what the UI should display.

    Raises:
        Whatever ``Tokenizer.from_file`` raises when no tokenizer file can
        be found or parsed.
    """
    return _get_tokenizer().encode(smiles_string).tokens
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from BpeTokenizer import bpe_tokenizer
3
+
4
+ # def tem(name, num = 3):
5
+ # return name + num
6
+
7
+
8
+ # iface = gr.Interface(fn=tem, inputs=["text", "text"], outputs="text")
9
+
10
# Gradio UI: a single textbox labelled "SMILES" is passed to bpe_tokenizer,
# and the function's return value is shown through the "text" output.
iface = gr.Interface(fn=bpe_tokenizer, inputs=[gr.Textbox(label="SMILES")], outputs="text")

# Start serving the interface.
iface.launch()
19
+
chembl_bpe_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tokenizers
trainBpeTokenizer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer
2
+ from tokenizers.models import BPE
3
+ from tokenizers.trainers import BpeTrainer
4
+ from tokenizers.pre_tokenizers import ByteLevel
5
+ from tokenizers.processors import TemplateProcessing
6
+
7
import argparse

# Command-line overrides for the input corpus and the output location.
# The defaults are the paths this script originally hard-coded, so running it
# with no arguments behaves exactly as before.
parser = argparse.ArgumentParser(
    description="Train a BPE tokenizer on text files and save it as JSON."
)
parser.add_argument(
    "--files",
    nargs="+",
    default=["/home/saicharan/Downloads/chembl.csv"],
    help="One or more files to train the tokenizer on.",
)
parser.add_argument(
    "--output",
    default="/home/saicharan/Downloads/chembl_bpe_tokenizer.json",
    help="Where to write the trained tokenizer.",
)
args = parser.parse_args()

# BPE model with "[UNK]" as the unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Split the input with the byte-level pre-tokenizer before BPE merges apply.
tokenizer.pre_tokenizer = ByteLevel()

# Reserve the special tokens so they receive ids in the trained vocabulary;
# "[CLS]" and "[SEP]" are looked up by id further down.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# NOTE(review): the files are handed to the trainer as-is; if the CSV holds a
# header row or columns other than SMILES, those end up in the training text
# too -- confirm the file contents.
files = args.files
tokenizer.train(files, trainer)

# Wrap encoded sequences in [CLS] ... [SEP]; for pairs, the second segment and
# its closing [SEP] get type id 1.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Persist the trained tokenizer (model, pre-tokenizer and post-processor).
tokenizer.save(args.output)