import pandas as pd
from t5_tokenizer_model import SentencePieceUnigramTokenizer
# from pretokenizer import atomwise_tokenizer  # optional atom-wise pre-tokenization (disabled)
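# For reference, atom-wise pre-tokenization splits a SMILES string into
# atom-level tokens before SentencePiece sees it. The example below assumes an
# implementation like SmilesPE's atomwise_tokenizer; the local pretokenizer
# module may differ.
#
#   atomwise_tokenizer('CC(=O)Oc1ccccc1C(=O)O')
#   # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1',
#   #  'C', '(', '=', 'O', ')', 'O']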
# Tokenizer training settings (consumed by the training sketch at the bottom).
vocab_size = 32_000
input_sentence_size = None  # None = train on the full corpus
# Load the SMILES corpus. Earlier revisions loaded it with
# datasets.load_dataset('csv', data_files=..., split="train"), stripped leading
# "<number> " prefixes with re.sub, and ran atomwise_tokenizer row by row;
# those steps are disabled, and plain pandas is sufficient here.
dataset = pd.read_csv('./chemT5_data.csv')
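# Should the prefix cleanup ever be needed again, a vectorized equivalent of
# the disabled row-by-row loop (hypothetical, untested against this corpus):
# dataset['SMILES'] = dataset['SMILES'].str.replace(r'\d+ ', '', regex=True)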
# Normalize to a single 'SMILES' column.
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)
# Drop malformed rows: SMILES strings containing a literal "" (escaped quotes
# from the CSV export) and missing values (na=True makes NaN rows match, so
# the negation removes them as well).
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]
dataset.to_csv('chemT5_data.csv', index=False)
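# --- Training sketch ---------------------------------------------------------
# vocab_size, input_sentence_size, and the SentencePieceUnigramTokenizer import
# are otherwise unused above, so the cleaned corpus presumably feeds a Unigram
# tokenizer. A minimal sketch, assuming t5_tokenizer_model matches the Hugging
# Face Flax T5 example (whose SentencePieceUnigramTokenizer exposes
# train_from_iterator); the special tokens and output path are assumptions.

def batch_iterator(batch_size=1_000):
    # Stream SMILES strings in batches so the trainer never materializes the
    # whole corpus as one list.
    n = len(dataset) if input_sentence_size is None else input_sentence_size
    for i in range(0, n, batch_size):
        yield dataset['SMILES'].iloc[i:i + batch_size].tolist()

tokenizer = SentencePieceUnigramTokenizer(
    unk_token="<unk>", eos_token="</s>", pad_token="<pad>"
)
tokenizer.train_from_iterator(
    iterator=batch_iterator(),
    vocab_size=vocab_size,
    show_progress=True,
)
tokenizer.save("./tokenizer.json")  # assumed output location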