import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from pretokenizer import atomwise_tokenizer
from t5_tokenizer_model import SentencePieceAtomwiseTokenizer

# Target vocabulary size for the trained SentencePiece tokenizer.
vocab_size = 32_000
# None means "use every row" (resolved inside batch_iterator).
input_sentence_size = None

# Training corpus: one SMILES string per row.
dataset = pd.read_csv('/home/zoez/Chem-T5/train-file.csv')

tokenizer = SentencePieceAtomwiseTokenizer(
    unk_token="<unk>", eos_token="</s>", pad_token="<pad>"
)

dataset.columns = ['SMILES']
# Re-frame with an extra 'SMILESs' column that will accumulate the
# pre-tokenized SMILES, 50 source rows packed per output row (see loop).
dataset = pd.DataFrame(columns=['SMILES', 'SMILESs'], data=dataset)
# The new 'SMILESs' column starts as NaN; make it an empty string so the
# '+=' concatenation below works.
dataset.fillna('', inplace=True)

for i, line in tqdm(enumerate(dataset['SMILES']), total=len(dataset)):
    # Strip a leading "<number><TAB>" id if present.  Raw string avoids the
    # invalid-escape warning the original '\d+\t' literal triggered.
    line = re.sub(r'\d+\t', '', line)

    # NOTE(review): assumes atomwise_tokenizer returns a string here (it is
    # concatenated with '&' below) — confirm against pretokenizer.
    new_line = atomwise_tokenizer(line)

    # Pack 50 pre-tokenized SMILES per row, '&'-separated.  Use .at instead
    # of the original chained `iloc[...][...] +=`, which assigns through a
    # possibly-copied intermediate Series and can silently fail to write
    # back (SettingWithCopyWarning).  The frame has the default RangeIndex,
    # so the .at label equals the original positional index.
    dataset.at[i // 50, 'SMILESs'] += "&" + new_line
| |
| |
| |
| |
| |
| |
def batch_iterator(input_sentence_size=None, data=None, batch_length=100):
    """Yield successive row-slices of the 'SMILESs' column for training.

    Parameters
    ----------
    input_sentence_size : int or None
        Number of leading rows to draw batches from; ``None`` (the
        default) means the whole frame.
    data : pandas.DataFrame or None
        Frame holding a ``'SMILESs'`` column.  Defaults to the
        module-level ``dataset`` so existing callers are unchanged.
    batch_length : int
        Rows per yielded batch (the final batch may be shorter).
        Defaults to the original hard-coded 100.

    Yields
    ------
    pandas.Series
        Up to ``batch_length`` pre-tokenized SMILES strings per batch.
    """
    if data is None:
        data = dataset  # fall back to the module-level training frame
    if input_sentence_size is None:
        input_sentence_size = len(data)
    for start in range(0, input_sentence_size, batch_length):
        # Positional row slice, then the column — same access pattern as
        # the original implementation.
        yield data[start:start + batch_length]['SMILESs']
|
|
|
|
| |
# Train the tokenizer over the '&'-joined, pre-tokenized SMILES batches.
training_batches = batch_iterator(input_sentence_size=input_sentence_size)
tokenizer.train_from_iterator(
    iterator=training_batches,
    vocab_size=vocab_size,
    show_progress=True,
)

# Persist the trained tokenizer to disk.
# NOTE(review): this path says "chemT5" while the CSV was read from
# "Chem-T5" — confirm which spelling is the real directory.
tokenizer.save("/home/zoez/chemT5/tokenizer.json")

# Sanity check: pre-tokenize a sample SMILES and show the resulting tokens.
sample = "O=[N+]([O-])c1ccc(Cl)cc1O=[N+]([O-])c1ccc(Cl)cc1"
print(tokenizer.encode(atomwise_tokenizer(sample)).tokens)
|
|
| |
|
|
|
|
| |
| |
|
|