ZoeMC committed on
Commit 0ab88d6 · 1 parent: fd41599

Saving weights and logs of step 10000

.gitattributes CHANGED
@@ -27,4 +27,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 chemT5_data.csv filter=lfs diff=lfs merge=lfs -text
-chemT5_data.tsv filter=lfs diff=lfs merge=lfs -text
__pycache__/pretokenizer.cpython-39.pyc CHANGED
Binary files a/__pycache__/pretokenizer.cpython-39.pyc and b/__pycache__/pretokenizer.cpython-39.pyc differ
 
chemT5_data.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:790657db4eff6c29407874fc4eb06ecfa134b91f924a44c215a0bf8b556ad307
-size 48054222
+oid sha256:7af01ac70fdcb2008a1bdc777d14181ab7f02ec48a55522710d2a7cd3f4e3952
+size 41116570
dataset-clean.py CHANGED
@@ -13,13 +13,38 @@ input_sentence_size = None
 # Initialize a dataset
 #dataset = load_dataset('csv', data_files='/home/zoez/Chem-T5/train-file.csv',split="train")
 dataset = pd.read_csv('./chemT5_data.csv')#('/home/zoez/Chem-T5/train-file.csv')
+#dataset=pd.DataFrame(columns=['SMILES'],data=dataset)
+#dataset['SMILES']=dataset['SMILES'].str[2:]
+# for i, line in tqdm(enumerate(dataset['SMILES'])):
+#     print(line)
+#     line = re.sub('\d+ ', '',line)
+#     #
+#     #newLine=line#atomwise_tokenizer(line)
+#     #print(newLine)
+#     #print(int(i/10))
+#     dataset.iloc[i]['SMILES']=line
+# print(dataset[0:5])
+# dataset.dropna()
+#dataset.to_csv('chemT5_data.csv',index=False)
+
 #print(dataset.iloc[0])
 
 dataset=pd.DataFrame(columns=['SMILES'],data=dataset)
-#dataset.drop('Unnamed: 0',1)
-#print(dataset.columns)
-dataset.columns=['SMILES']
+# print(dataset[0:5])
+# print(dataset.columns)
 
 
-dataset.fillna('', inplace=True)
-dataset.to_csv('chemT5_data.csv',sep = ' ')
+# #dataset.drop('Unnamed: 0',1)
+# print(dataset.columns)
+# dataset.columns=['SMILES']
+# for i, line in tqdm(enumerate(dataset['SMILES'])):
+#     #line = re.sub('\d+ ', '',line)
+#     #print(line)
+#     newLine=line#atomwise_tokenizer(line)
+#     #print(newLine)
+#     #print(int(i/10))
+#     dataset.iloc[i]['SMILES']=newLine
+# print(dataset['SMILES'][0:5])
+dataset=dataset[~dataset.SMILES.str.contains("\"\"", regex=False,na=True)]
+#print(dataset[0:5])
+dataset.to_csv('chemT5_data.csv',index=False)
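Note: the live change in this file is the new filtering step, which drops rows whose SMILES field contains a literal "" artifact (treating NaN rows as matches too) and rewrites the CSV without the index column. A minimal self-contained sketch of just that step, assuming the single-column chemT5_data.csv layout used here:

import pandas as pd

# load the raw corpus; assumes one SMILES string per row in a 'SMILES' column
dataset = pd.read_csv('./chemT5_data.csv')
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)

# drop rows containing a literal "" artifact; na=True also drops NaN rows
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]
dataset.to_csv('chemT5_data.csv', index=False)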
events.out.tfevents.1651650601.toxicgpu.cs.vt.edu.23181.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49e1c63d58bd954395e335ebfaa4e9dccfab105ab974c2f7da7eda1f8472a523
+size 40

events.out.tfevents.1651774324.toxicgpu.cs.vt.edu.2962.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de2218534456aa901282dfc1ace06039a8590cb8aac77177f745e1029c656314
+size 40

events.out.tfevents.1651774377.toxicgpu.cs.vt.edu.4116.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:911544eef0373a776c4d8a9035cae7a97d5c5a1fafa70b6ca2f59b3ac76a9609
+size 40

events.out.tfevents.1651774597.toxicgpu.cs.vt.edu.5771.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63b787fcb2b7e97f89d7be11ebb8859ab1ff98b882da0ff9fa1a7bb5a0abc8fe
+size 40

events.out.tfevents.1651774686.toxicgpu.cs.vt.edu.6128.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d8b9fb2ddaf0b2d87eb166d90dc0980e5386459906b5cf7b6135bd6ad5cf153
+size 40

events.out.tfevents.1651774751.toxicgpu.cs.vt.edu.7181.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:151565b73a0af3c7639f03dea6c25e2b440161f176f6661357a49e9122eb9a37
+size 40

events.out.tfevents.1651822478.toxicgpu.cs.vt.edu.31615.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e75d0d56e2064320d0238b790bc4c51826bea5c2d88d20ddec70797701248e2e
+size 40

events.out.tfevents.1651823225.toxicgpu.cs.vt.edu.32383.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f5c234e3e2b32fafd655d01824aaa400935ed79ce280c85a95b97f6805d0967
+size 40

events.out.tfevents.1651824342.toxicgpu.cs.vt.edu.2732.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:687c32f6139062f90c29c191620ff300b24cf3115e28c1abd8f548c58f8e31bd
+size 40

events.out.tfevents.1651824633.toxicgpu.cs.vt.edu.3509.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39b77e5779e2dc13eeaa6bd2d850552966aae69aa2940d2ab67ee3f25a368dff
+size 40

events.out.tfevents.1651824828.toxicgpu.cs.vt.edu.3970.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8bec9562ab542ab8e34c2b3588e4b11fd53fa8d347624565638bc96eece2004
+size 40

events.out.tfevents.1651824941.toxicgpu.cs.vt.edu.4751.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7156fe5e376bc60524bc25626cc042152247d31606161c575a2ec8c53a80d7
+size 1471867
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d99ed12fc3df890828fc608bde1949bb19fce1d45e4117685d366f0b31787a9
+oid sha256:568d3d6b8d71a100dda0d44d84d3e5704afd75c510ac8e4edd6e57c2ac2d0076
 size 990170015
pretrain_data.py ADDED
@@ -0,0 +1,27 @@
+import tensorflow as tf
+import torch as pt
+import pandas as pd
+import re
+from t5_tokenizer_model import SentencePieceUnigramTokenizer
+#from pretokenizer import atomwise_tokenizer
+from tqdm import tqdm
+from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
+from tokenizers import Tokenizer
+import numpy as np
+
+
+tokenizer = AutoTokenizer.from_pretrained("./")
+dataset = pd.read_csv('./chemT5_data.csv')
+train = pd.DataFrame(data=dataset)
+
+for i, line in tqdm(enumerate(dataset['SMILES'])):
+    print(i, " " + line)
+    line = tokenizer.encode(line)
+    #print(line)
+    newLine = tokenizer.convert_ids_to_tokens(line)
+    #print(newLine)
+    #print(int(i/10))
+    train.iloc[i]['SMILES'] = newLine
+
+#print(train[0:5])
+train.to_csv('pretrain.csv', index=False)
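One caveat with the loop in this new file: train.iloc[i]['SMILES'] = newLine is chained indexing, which pandas may apply to a temporary copy rather than the frame itself. A hedged alternative sketch that avoids the row-by-row write, assuming the same tokenizer directory and CSV layout as above:

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./")
train = pd.read_csv('./chemT5_data.csv')

def to_tokens(smiles: str):
    # encode to ids, then map the ids back to the tokenizer's string tokens
    return tokenizer.convert_ids_to_tokens(tokenizer.encode(smiles))

# direct column assignment is guaranteed to write back, unlike iloc chaining
train['SMILES'] = train['SMILES'].apply(to_tokens)
train.to_csv('pretrain.csv', index=False)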
run_t5_mlm_flax.py CHANGED
@@ -29,6 +29,7 @@ from typing import Dict, List, Optional
 
 import numpy as np
 from datasets import load_dataset
+from tokenizer import split_into_sentences
 from tqdm import tqdm
 
 import flax
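The imported tokenizer module is local to this repo and not part of the commit, so the behavior of split_into_sentences is an assumption. For a one-SMILES-per-line corpus like chemT5_data.csv, a minimal hypothetical version might be:

def split_into_sentences(text: str) -> list[str]:
    # hypothetical sketch: treat each non-empty line as one "sentence" (one SMILES string)
    return [line.strip() for line in text.splitlines() if line.strip()]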
tokenizer-trainer_uni.py CHANGED
@@ -57,13 +57,21 @@ tokenizer.train_from_iterator(
 
 
 # Save files to disk
-tokenizer.save("/home/zoez/chemT5/uni-tokenizer.json")
+#tokenizer.save("/home/zoez/chemT5/uni-tokenizer.json")
 
 
 print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
 
-from transformers import T5Config
+#from transformers import T5Config
+
+for i, line in tqdm(enumerate(dataset['SMILES'])):
+    #line = re.sub('\d+\t', '',line)
+    #print(line)
+    newLine = tokenizer.encode(line).tokens  #atomwise_tokenizer(line)
+    #print(newLine)
+    #print(int(i/10))
+    dataset.iloc[i]['SMILES'] = newLine
 
 
-config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
-config.save_pretrained("./")
+#config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
+#config.save_pretrained("./")
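The new loop references dataset and tqdm, which are presumably defined earlier in the file. A self-contained sketch of the same idea, with the tokenizer file name and CSV path assumed from this repo (the save call above is commented out, so the path is a guess):

import pandas as pd
from tqdm import tqdm
from tokenizers import Tokenizer

# assumed paths: the trained Unigram tokenizer and the cleaned corpus
tokenizer = Tokenizer.from_file("uni-tokenizer.json")
dataset = pd.read_csv("chemT5_data.csv")

# tokenize each SMILES string into its token list, then assign the whole column
tokens = [tokenizer.encode(line).tokens for line in tqdm(dataset["SMILES"])]
dataset["SMILES"] = tokens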
train_scprit.sh CHANGED
@@ -4,10 +4,10 @@ python run_t5_mlm_flax.py \
     --model_type="t5" \
     --config_name="./" \
     --tokenizer_name="./" \
-    --train_file="chemT5_data.csv" \
+    --train_file="./chemT5_data.csv" \
     --max_seq_length="256" \
-    --per_device_train_batch_size="8" \
-    --per_device_eval_batch_size="8" \
+    --per_device_train_batch_size="1" \
+    --per_device_eval_batch_size="1" \
     --adafactor \
     --learning_rate="0.005" \
     --weight_decay="0.001" \
@@ -20,3 +20,4 @@ python run_t5_mlm_flax.py \
 
 
 
+~
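A hedged aside on the batch-size change: the Hugging Face Flax example scripts compute the effective train batch as the per-device size times the number of JAX devices, so dropping from 8 to 1 shrinks the effective batch eightfold on the same hardware (presumably to fit GPU memory). Roughly:

import jax

# per_device_train_batch_size as set in the script above
per_device_train_batch_size = 1

# how the Flax example scripts derive the effective batch size
train_batch_size = per_device_train_batch_size * jax.device_count()
print(train_batch_size)  # 1 per device, times however many devices JAX sees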
try.py CHANGED
@@ -13,30 +13,51 @@ import numpy as np
 
 
 
-#model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="/home/zoez/Chem-T5", from_flax=True)
-tokenizer = AutoTokenizer.from_pretrained("/home/zoez/chemT5")
-#tokenizer = Tokenizer.from_file("/home/zoez/Chem-T5/tokenizer.json")
+#model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./", from_flax=True)
+tokenizer = AutoTokenizer.from_pretrained("./")
+#tokenizer = Tokenizer.from_file("/home/zoez/chemT5")
 #model = model.to(device)
 
-print(tokenizer.encode(atomwise_tokenizer("O=[N+]([O-])c1ccc(Cl)cc1O=[N+]([O-])c1ccc(Cl)cc1")).tokens)
+#print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
 
 
-# # encode context the generation is conditioned on
-# input_ids1 = tokenizer.encode(": O[N+]([O-])c1ccc(Cl)cc1",return_tensors='pt')
+# # # encode context the generation is conditioned on
+# input_ids1 = tokenizer.encode("1",return_tensors='pt')
+# print(input_ids1)
 
-# # activate beam search and early_stopping
+# # # activate beam search and early_stopping
 # beam_output1 = model.generate(
 #     input_ids1,
 #     max_length=50,
 #     num_beams=5,
 #     early_stopping=True
 # )
+encoding = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1")
+print(tokenizer.convert_ids_to_tokens(encoding))
 # #print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
+
+# # set seed to reproduce results. Feel free to change the seed though to get different results
+# tf.random.set_seed(0)
+
+# # use temperature to decrease the sensitivity to low probability candidates
+# sample_output = model.generate(
+#     input_ids1,
+#     do_sample=True,
+#     max_length=50,
+#     top_k=0,
+#     temperature=0.7
+# )
+
+# print("Output:\n" + 100 * '-')
+# print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
+
 # print("Output: 1\n" + 100 * '-')
 # print(tokenizer.decode(beam_output1[0], skip_special_tokens=True))
+# decoding = tokenizer.decode(beam_output1[0], skip_special_tokens=True)
+# print(tokenizer.convert_ids_to_tokens(decoding))
 
 # # encode context the generation is conditioned on
-# input_ids2 = tokenizer.encode("SMILES: ",return_tensors='pt')
+# input_ids2 = tokenizer.encode(": ",return_tensors='pt')
 
 # # activate beam search and early_stopping
 # beam_output2 = model.generate(
@@ -47,12 +68,12 @@ print(tokenizer.encode(atomwise_tokenizer("O=[N+]([O-])c1ccc(Cl)cc1O=[N+]([O-])c1ccc(Cl)cc1")).tokens)
 #     num_return_sequences=9,
 #     early_stopping=True
 # )
-# #print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
+# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1"))
 # print("Output: 2\n" + 100 * '-')
-# #print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))
+# print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))
 
-# #start = latent_to_string(latent0)
-# #destination = latent_to_string(latent1)
+# # #start = latent_to_string(latent0)
+# # #destination = latent_to_string(latent1)
 # mols1 = []
 # step = np.linspace(0,1,100)
 # invalid = 0
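Most of try.py is commented-out generation scaffolding around the live tokenizer round-trip. A hedged sketch of that generation path end to end, assuming the Flax checkpoint in this repo loads into PyTorch (from_flax=True requires both torch and flax to be installed):

from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("./")
model = T5ForConditionalGeneration.from_pretrained("./", from_flax=True)

# condition generation on the nitro-chlorobenzene SMILES used in the tests above
input_ids = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1", return_tensors="pt")
beam_output = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))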