import tensorflow as tf
import torch as pt
import pandas as pd
import re
from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer
from tqdm import tqdm
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
from tokenizers import Tokenizer
import numpy as np
# Tokenize every SMILES string in chemT5_data.csv with the locally saved
# tokenizer and write the tokenized dataset out as pretrain.csv.
tokenizer = AutoTokenizer.from_pretrained("./")
dataset = pd.read_csv('./chemT5_data.csv')
train = pd.DataFrame(data=dataset)

# Collect the tokenized rows first and assign the whole column in one go.
# The original wrote via chained indexing (train.iloc[i]['SMILES'] = ...),
# which assigns into a temporary copy (pandas SettingWithCopy pitfall) and
# can silently leave the DataFrame unchanged.
tokenized_rows = []
for line in tqdm(dataset['SMILES']):
    ids = tokenizer.encode(line)  # SMILES string -> token ids
    # ids -> human-readable token strings, one list per row
    tokenized_rows.append(tokenizer.convert_ids_to_tokens(ids))
train['SMILES'] = tokenized_rows

train.to_csv('pretrain.csv', index=False)