import re

import numpy as np
import pandas as pd
import tensorflow as tf
import torch as pt
from tokenizers import Tokenizer
from tqdm import tqdm
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config

from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer
# Pre-tokenize the SMILES corpus for T5 pretraining.
#
# Loads the tokenizer saved in the current directory, re-encodes every
# SMILES string into its list of token strings, and writes the tokenized
# dataset to pretrain.csv.

tokenizer = AutoTokenizer.from_pretrained("./")

dataset = pd.read_csv('./chemT5_data.csv')
# Take an explicit independent copy so the per-row writes below can never
# alias (or silently fail to reach) the source frame.
train = dataset.copy()

for i, line in tqdm(enumerate(dataset['SMILES']), total=len(dataset)):
    ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # Direct scalar write by label. The original chained indexing
    # `train.iloc[i]['SMILES'] = ...` assigned into a temporary row copy
    # (SettingWithCopy) and was not guaranteed to modify `train` at all.
    # `.at[i, ...]` is safe here because read_csv produced a default
    # RangeIndex, so position i and label i coincide.
    train.at[i, 'SMILES'] = tokens

train.to_csv('pretrain.csv', index=False)