Defetya commited on
Commit
592e96e
·
verified ·
1 Parent(s): 10ffb04

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +76 -0
  2. simson_modeling/.create_augmented_dataset.py.swp +0 -0
  3. simson_modeling/.create_splits.py.swp +0 -0
  4. simson_modeling/.ipynb_checkpoints/create_augmented_dataset-checkpoint.py +83 -0
  5. simson_modeling/.ipynb_checkpoints/create_augmented_dataset.py-checkpoint.save +83 -0
  6. simson_modeling/.ipynb_checkpoints/create_splits-checkpoint.py +200 -0
  7. simson_modeling/.ipynb_checkpoints/fingerprint_training-checkpoint.ipynb +1550 -0
  8. simson_modeling/.ipynb_checkpoints/simson_ddp_train-checkpoint.py +545 -0
  9. simson_modeling/.ipynb_checkpoints/upload_state_to_hf-checkpoint.py +23 -0
  10. simson_modeling/.simson_ddp_train.py.swp +0 -0
  11. simson_modeling/.upload_state_to_hf.py.swp +0 -0
  12. simson_modeling/__pycache__/create_augmented_dataset.cpython-312.pyc +0 -0
  13. simson_modeling/__pycache__/create_splits.cpython-312.pyc +0 -0
  14. simson_modeling/__pycache__/simson_ddp_train.cpython-312.pyc +0 -0
  15. simson_modeling/__pycache__/upload_state_to_hf.cpython-312.pyc +0 -0
  16. simson_modeling/create_augmented_dataset.py +83 -0
  17. simson_modeling/create_augmented_dataset.py.save +83 -0
  18. simson_modeling/create_splits.py +200 -0
  19. simson_modeling/data/polymer_1M.parquet +3 -0
  20. simson_modeling/data/polymer_splits/test.parquet +3 -0
  21. simson_modeling/data/polymer_splits/train.parquet +3 -0
  22. simson_modeling/data/polymer_splits/validation.parquet +3 -0
  23. simson_modeling/data/pubchem_119m_splits/test.parquet +3 -0
  24. simson_modeling/data/pubchem_119m_splits/train.parquet +3 -0
  25. simson_modeling/data/pubchem_119m_splits/validation.parquet +3 -0
  26. simson_modeling/data/pubchem_2_epoch_50M +3 -0
  27. simson_modeling/data/splits/test.parquet +3 -0
  28. simson_modeling/data/splits/train.parquet +3 -0
  29. simson_modeling/data/splits/validation.parquet +3 -0
  30. simson_modeling/fingerprint_training.ipynb +0 -0
  31. simson_modeling/kaggle_comp/.ipynb_checkpoints/simson-fine-tune-checkpoint.ipynb +1608 -0
  32. simson_modeling/kaggle_comp/checkpoints/clf_kaggle.bin +3 -0
  33. simson_modeling/kaggle_comp/sample_submission.csv +4 -0
  34. simson_modeling/kaggle_comp/simson-fine-tune.ipynb +1742 -0
  35. simson_modeling/kaggle_comp/simson_polymer_1m_uncompiled.pth +3 -0
  36. simson_modeling/kaggle_comp/test.csv +4 -0
  37. simson_modeling/kaggle_comp/train.csv +0 -0
  38. simson_modeling/kaggle_comp/train_supplement/dataset1.csv +875 -0
  39. simson_modeling/kaggle_comp/train_supplement/dataset2.csv +0 -0
  40. simson_modeling/kaggle_comp/train_supplement/dataset3.csv +47 -0
  41. simson_modeling/kaggle_comp/train_supplement/dataset4.csv +863 -0
  42. simson_modeling/moleculenet_eval/.ipynb_checkpoints/better_eval-checkpoint.py +671 -0
  43. simson_modeling/moleculenet_eval/.ipynb_checkpoints/eval-checkpoint.py +457 -0
  44. simson_modeling/moleculenet_eval/.ipynb_checkpoints/showcase-checkpoint.ipynb +0 -0
  45. simson_modeling/moleculenet_eval/.ipynb_checkpoints/visualizations-checkpoint.ipynb +0 -0
  46. simson_modeling/moleculenet_eval/__pycache__/better_eval.cpython-312.pyc +0 -0
  47. simson_modeling/moleculenet_eval/__pycache__/eval.cpython-312.pyc +0 -0
  48. simson_modeling/moleculenet_eval/better_eval.py +671 -0
  49. simson_modeling/moleculenet_eval/eval.py +457 -0
  50. simson_modeling/moleculenet_eval/eval.py.save +360 -0
.gitattributes CHANGED
@@ -104,3 +104,79 @@ wandb/run-20250720_171431-y7ebdfsw/run-y7ebdfsw.wandb filter=lfs diff=lfs merge=
104
  wandb/run-20250720_203832-beztzgvz/run-beztzgvz.wandb filter=lfs diff=lfs merge=lfs -text
105
  wandb/run-20250720_210448-dhzemttz/run-dhzemttz.wandb filter=lfs diff=lfs merge=lfs -text
106
  wandb/run-20250721_155939-ib4az6kg/run-ib4az6kg.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  wandb/run-20250720_203832-beztzgvz/run-beztzgvz.wandb filter=lfs diff=lfs merge=lfs -text
105
  wandb/run-20250720_210448-dhzemttz/run-dhzemttz.wandb filter=lfs diff=lfs merge=lfs -text
106
  wandb/run-20250721_155939-ib4az6kg/run-ib4az6kg.wandb filter=lfs diff=lfs merge=lfs -text
107
+ simson_modeling/data/pubchem_2_epoch_50M filter=lfs diff=lfs merge=lfs -text
108
+ simson_modeling/regression/PI_Tg_P308K_synth_db_chem.csv filter=lfs diff=lfs merge=lfs -text
109
+ simson_modeling/regression/polyGNN_combined_mols_.csv filter=lfs diff=lfs merge=lfs -text
110
+ simson_modeling/wandb/offline-run-20250722_135829-kk172e65/run-kk172e65.wandb filter=lfs diff=lfs merge=lfs -text
111
+ simson_modeling/wandb/offline-run-20250723_215057-mgnloe91/run-mgnloe91.wandb filter=lfs diff=lfs merge=lfs -text
112
+ simson_modeling/wandb/offline-run-20250723_220103-olccslmp/run-olccslmp.wandb filter=lfs diff=lfs merge=lfs -text
113
+ simson_modeling/wandb/offline-run-20250723_221245-dtzrj4ds/run-dtzrj4ds.wandb filter=lfs diff=lfs merge=lfs -text
114
+ simson_modeling/wandb/offline-run-20250723_222918-2pja97j3/.ipynb_checkpoints/run-2pja97j3-checkpoint.wandb filter=lfs diff=lfs merge=lfs -text
115
+ simson_modeling/wandb/offline-run-20250723_222918-2pja97j3/run-2pja97j3.wandb filter=lfs diff=lfs merge=lfs -text
116
+ simson_modeling/wandb/offline-run-20250807_164332-cd5u0onl/run-cd5u0onl.wandb filter=lfs diff=lfs merge=lfs -text
117
+ simson_modeling/wandb/offline-run-20250807_165940-vwaul17m/run-vwaul17m.wandb filter=lfs diff=lfs merge=lfs -text
118
+ simson_modeling/wandb/run-20250717_233437-v8x24r2k/run-v8x24r2k.wandb filter=lfs diff=lfs merge=lfs -text
119
+ simson_modeling/wandb/run-20250717_235006-x4au33ay/run-x4au33ay.wandb filter=lfs diff=lfs merge=lfs -text
120
+ simson_modeling/wandb/run-20250717_235346-1bpueejc/run-1bpueejc.wandb filter=lfs diff=lfs merge=lfs -text
121
+ simson_modeling/wandb/run-20250718_000137-paiph8s9/run-paiph8s9.wandb filter=lfs diff=lfs merge=lfs -text
122
+ simson_modeling/wandb/run-20250718_000333-i8pujeh7/run-i8pujeh7.wandb filter=lfs diff=lfs merge=lfs -text
123
+ simson_modeling/wandb/run-20250718_001003-mng2732j/run-mng2732j.wandb filter=lfs diff=lfs merge=lfs -text
124
+ simson_modeling/wandb/run-20250718_001219-y8uqpgnp/run-y8uqpgnp.wandb filter=lfs diff=lfs merge=lfs -text
125
+ simson_modeling/wandb/run-20250718_001953-ewfpx2ex/run-ewfpx2ex.wandb filter=lfs diff=lfs merge=lfs -text
126
+ simson_modeling/wandb/run-20250718_005226-taow6dte/run-taow6dte.wandb filter=lfs diff=lfs merge=lfs -text
127
+ simson_modeling/wandb/run-20250718_005742-vqbu5dpw/run-vqbu5dpw.wandb filter=lfs diff=lfs merge=lfs -text
128
+ simson_modeling/wandb/run-20250718_010148-xoj8gemp/run-xoj8gemp.wandb filter=lfs diff=lfs merge=lfs -text
129
+ simson_modeling/wandb/run-20250718_010520-r0bl1m8z/run-r0bl1m8z.wandb filter=lfs diff=lfs merge=lfs -text
130
+ simson_modeling/wandb/run-20250718_010659-paetqmbl/run-paetqmbl.wandb filter=lfs diff=lfs merge=lfs -text
131
+ simson_modeling/wandb/run-20250718_010918-245xlfgs/run-245xlfgs.wandb filter=lfs diff=lfs merge=lfs -text
132
+ simson_modeling/wandb/run-20250718_011120-bbibop4i/run-bbibop4i.wandb filter=lfs diff=lfs merge=lfs -text
133
+ simson_modeling/wandb/run-20250718_011447-dpcb0hru/run-dpcb0hru.wandb filter=lfs diff=lfs merge=lfs -text
134
+ simson_modeling/wandb/run-20250718_012254-nm4a474i/run-nm4a474i.wandb filter=lfs diff=lfs merge=lfs -text
135
+ simson_modeling/wandb/run-20250718_012858-z6yfzboi/run-z6yfzboi.wandb filter=lfs diff=lfs merge=lfs -text
136
+ simson_modeling/wandb/run-20250718_013314-g5gjoskc/run-g5gjoskc.wandb filter=lfs diff=lfs merge=lfs -text
137
+ simson_modeling/wandb/run-20250718_013859-x86s4jyn/run-x86s4jyn.wandb filter=lfs diff=lfs merge=lfs -text
138
+ simson_modeling/wandb/run-20250718_015325-kvu5ufqh/run-kvu5ufqh.wandb filter=lfs diff=lfs merge=lfs -text
139
+ simson_modeling/wandb/run-20250718_015851-qc2xd8ms/run-qc2xd8ms.wandb filter=lfs diff=lfs merge=lfs -text
140
+ simson_modeling/wandb/run-20250718_020908-wpetqjkc/run-wpetqjkc.wandb filter=lfs diff=lfs merge=lfs -text
141
+ simson_modeling/wandb/run-20250718_031955-y1l0kk0w/run-y1l0kk0w.wandb filter=lfs diff=lfs merge=lfs -text
142
+ simson_modeling/wandb/run-20250718_032230-55hrsoty/run-55hrsoty.wandb filter=lfs diff=lfs merge=lfs -text
143
+ simson_modeling/wandb/run-20250718_032523-4u6u8fp8/run-4u6u8fp8.wandb filter=lfs diff=lfs merge=lfs -text
144
+ simson_modeling/wandb/run-20250718_033334-wu8q3oan/run-wu8q3oan.wandb filter=lfs diff=lfs merge=lfs -text
145
+ simson_modeling/wandb/run-20250718_034824-816krhgj/run-816krhgj.wandb filter=lfs diff=lfs merge=lfs -text
146
+ simson_modeling/wandb/run-20250718_041024-clv8syyg/run-clv8syyg.wandb filter=lfs diff=lfs merge=lfs -text
147
+ simson_modeling/wandb/run-20250718_041516-qnappj7f/run-qnappj7f.wandb filter=lfs diff=lfs merge=lfs -text
148
+ simson_modeling/wandb/run-20250718_041821-cgsw0t5g/run-cgsw0t5g.wandb filter=lfs diff=lfs merge=lfs -text
149
+ simson_modeling/wandb/run-20250718_043059-am04u0y9/run-am04u0y9.wandb filter=lfs diff=lfs merge=lfs -text
150
+ simson_modeling/wandb/run-20250718_043256-7xdyfnu4/run-7xdyfnu4.wandb filter=lfs diff=lfs merge=lfs -text
151
+ simson_modeling/wandb/run-20250718_043758-vlc0kqs1/run-vlc0kqs1.wandb filter=lfs diff=lfs merge=lfs -text
152
+ simson_modeling/wandb/run-20250718_044126-kmk3sbwz/run-kmk3sbwz.wandb filter=lfs diff=lfs merge=lfs -text
153
+ simson_modeling/wandb/run-20250718_070426-51daq8vc/run-51daq8vc.wandb filter=lfs diff=lfs merge=lfs -text
154
+ simson_modeling/wandb/run-20250718_175738-ujiejljj/run-ujiejljj.wandb filter=lfs diff=lfs merge=lfs -text
155
+ simson_modeling/wandb/run-20250718_180457-hq6awr0t/run-hq6awr0t.wandb filter=lfs diff=lfs merge=lfs -text
156
+ simson_modeling/wandb/run-20250718_183118-1pjl1dv9/run-1pjl1dv9.wandb filter=lfs diff=lfs merge=lfs -text
157
+ simson_modeling/wandb/run-20250718_183521-04jk637s/run-04jk637s.wandb filter=lfs diff=lfs merge=lfs -text
158
+ simson_modeling/wandb/run-20250718_184320-412ie9zf/run-412ie9zf.wandb filter=lfs diff=lfs merge=lfs -text
159
+ simson_modeling/wandb/run-20250718_185342-3oree65g/run-3oree65g.wandb filter=lfs diff=lfs merge=lfs -text
160
+ simson_modeling/wandb/run-20250718_185832-apvwgarm/run-apvwgarm.wandb filter=lfs diff=lfs merge=lfs -text
161
+ simson_modeling/wandb/run-20250718_190229-vt2nljbo/run-vt2nljbo.wandb filter=lfs diff=lfs merge=lfs -text
162
+ simson_modeling/wandb/run-20250718_190608-gr6g2t8w/run-gr6g2t8w.wandb filter=lfs diff=lfs merge=lfs -text
163
+ simson_modeling/wandb/run-20250718_190920-9osopmwr/run-9osopmwr.wandb filter=lfs diff=lfs merge=lfs -text
164
+ simson_modeling/wandb/run-20250718_191351-ratgo9nz/run-ratgo9nz.wandb filter=lfs diff=lfs merge=lfs -text
165
+ simson_modeling/wandb/run-20250718_191845-jdddfbuz/run-jdddfbuz.wandb filter=lfs diff=lfs merge=lfs -text
166
+ simson_modeling/wandb/run-20250718_192326-kna5ek5b/run-kna5ek5b.wandb filter=lfs diff=lfs merge=lfs -text
167
+ simson_modeling/wandb/run-20250718_193021-f1whl1ia/run-f1whl1ia.wandb filter=lfs diff=lfs merge=lfs -text
168
+ simson_modeling/wandb/run-20250718_193519-iabkuiuw/run-iabkuiuw.wandb filter=lfs diff=lfs merge=lfs -text
169
+ simson_modeling/wandb/run-20250718_195706-6kh0cz53/run-6kh0cz53.wandb filter=lfs diff=lfs merge=lfs -text
170
+ simson_modeling/wandb/run-20250718_200252-d4gaj5js/run-d4gaj5js.wandb filter=lfs diff=lfs merge=lfs -text
171
+ simson_modeling/wandb/run-20250719_150546-agpbqxyy/run-agpbqxyy.wandb filter=lfs diff=lfs merge=lfs -text
172
+ simson_modeling/wandb/run-20250719_153452-unskjdue/run-unskjdue.wandb filter=lfs diff=lfs merge=lfs -text
173
+ simson_modeling/wandb/run-20250719_160238-27fpkd86/run-27fpkd86.wandb filter=lfs diff=lfs merge=lfs -text
174
+ simson_modeling/wandb/run-20250719_192211-3rfzbgfw/run-3rfzbgfw.wandb filter=lfs diff=lfs merge=lfs -text
175
+ simson_modeling/wandb/run-20250719_193400-bhymonq7/run-bhymonq7.wandb filter=lfs diff=lfs merge=lfs -text
176
+ simson_modeling/wandb/run-20250719_195543-l3p5eitx/run-l3p5eitx.wandb filter=lfs diff=lfs merge=lfs -text
177
+ simson_modeling/wandb/run-20250719_200341-hnsgsmyc/run-hnsgsmyc.wandb filter=lfs diff=lfs merge=lfs -text
178
+ simson_modeling/wandb/run-20250720_160322-rfm1amqg/run-rfm1amqg.wandb filter=lfs diff=lfs merge=lfs -text
179
+ simson_modeling/wandb/run-20250720_171431-y7ebdfsw/run-y7ebdfsw.wandb filter=lfs diff=lfs merge=lfs -text
180
+ simson_modeling/wandb/run-20250720_203832-beztzgvz/run-beztzgvz.wandb filter=lfs diff=lfs merge=lfs -text
181
+ simson_modeling/wandb/run-20250720_210448-dhzemttz/run-dhzemttz.wandb filter=lfs diff=lfs merge=lfs -text
182
+ simson_modeling/wandb/run-20250721_155939-ib4az6kg/run-ib4az6kg.wandb filter=lfs diff=lfs merge=lfs -text
simson_modeling/.create_augmented_dataset.py.swp ADDED
Binary file (1.02 kB). View file
 
simson_modeling/.create_splits.py.swp ADDED
Binary file (1.02 kB). View file
 
simson_modeling/.ipynb_checkpoints/create_augmented_dataset-checkpoint.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from tqdm import tqdm
3
+ from rdkit import Chem, RDLogger
4
+ from datasets import load_dataset
5
+ from multiprocessing import Pool, cpu_count
6
+ import os
7
+
8
+ # Suppress RDKit console output for cleaner logs
9
+ RDLogger.DisableLog('rdApp.*')
10
+
11
+ class SmilesEnumerator:
12
+ """
13
+ A simple class to encapsulate the SMILES randomization logic.
14
+ Needed for multiprocessing to work correctly with instance methods.
15
+ """
16
+ def randomize_smiles(self, smiles):
17
+ """Generates a randomized SMILES string."""
18
+ try:
19
+ mol = Chem.MolFromSmiles(smiles)
20
+ # Return a randomized, non-canonical SMILES string
21
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
22
+ except:
23
+ # If RDKit fails, return the original smiles string
24
+ return smiles
25
+
26
+ def create_augmented_pair(smiles_string):
27
+ """
28
+ Worker function: takes one SMILES string and returns a tuple
29
+ containing two different randomized versions of it.
30
+ """
31
+ enumerator = SmilesEnumerator()
32
+ smiles_1 = enumerator.randomize_smiles(smiles_string)
33
+ smiles_2 = enumerator.randomize_smiles(smiles_string)
34
+ return smiles_1, smiles_2
35
+
36
+ def main():
37
+ """
38
+ Main function to run the parallel data preprocessing.
39
+ """
40
+ # --- Configuration ---
41
+ # Load your desired dataset from Hugging Face
42
+ dataset_name = 'jablonkagroup/pubchem-smiles-molecular-formula'
43
+ # Specify the column containing the SMILES strings
44
+ smiles_column_name = 'smiles'
45
+ # Set the output file path
46
+ output_path = 'data/pubchem_2_epoch'
47
+
48
+ # --- Data Loading ---
49
+ print(f"Loading dataset '{dataset_name}'...")
50
+ # Use streaming to avoid downloading the whole dataset if you only need a subset
51
+ #dataset = pd.read_csv('/home/jovyan/simson_training_bolgov/data/PI1M_v2.csv')
52
+ dataset = load_dataset(dataset_name)['train']
53
+ # Take the desired number of samples
54
+ smiles_list = dataset[smiles_column_name].to_list()
55
+ print(f"Successfully fetched {len(smiles_list)} SMILES strings.")
56
+
57
+ # --- Parallel Processing ---
58
+ # Use all available CPU cores for maximum speed
59
+ num_workers = cpu_count()
60
+ print(f"Starting SMILES augmentation with {num_workers} worker processes...")
61
+
62
+ # A Pool of processes will run the `create_augmented_pair` function in parallel
63
+ with Pool(num_workers) as p:
64
+ # Use tqdm to create a progress bar for the mapping operation
65
+ results = list(tqdm(p.imap(create_augmented_pair, smiles_list), total=len(smiles_list), desc="Augmenting Pairs"))
66
+
67
+ # --- Saving Data ---
68
+ print("Processing complete. Converting to DataFrame...")
69
+ # Convert the list of tuples into a pandas DataFrame
70
+ df = pd.DataFrame(results, columns=['smiles_1', 'smiles_2'])
71
+
72
+ # Ensure the output directory exists
73
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
74
+
75
+ print(f"Saving augmented pairs to '{output_path}'...")
76
+ # Save the DataFrame to a Parquet file for efficient storage and loading
77
+ df.to_parquet(output_path)
78
+
79
+ print("All done. Your pre-computed dataset is ready!")
80
+
81
+ if __name__ == '__main__':
82
+ main()
83
+
simson_modeling/.ipynb_checkpoints/create_augmented_dataset.py-checkpoint.save ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from tqdm import tqdm
3
+ from rdkit import Chem, RDLogger
4
+ from datasets import load_dataset
5
+ from multiprocessing import Pool, cpu_count
6
+ import os
7
+
8
+ # Suppress RDKit console output for cleaner logs
9
+ RDLogger.DisableLog('rdApp.*')
10
+
11
+ class SmilesEnumerator:
12
+ """
13
+ A simple class to encapsulate the SMILES randomization logic.
14
+ Needed for multiprocessing to work correctly with instance methods.
15
+ """
16
+ def randomize_smiles(self, smiles):
17
+ """Generates a randomized SMILES string."""
18
+ try:
19
+ mol = Chem.MolFromSmiles(smiles)
20
+ # Return a randomized, non-canonical SMILES string
21
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
22
+ except:
23
+ # If RDKit fails, return the original smiles string
24
+ return smiles
25
+
26
+ def create_augmented_pair(smiles_string):
27
+ """
28
+ Worker function: takes one SMILES string and returns a tuple
29
+ containing two different randomized versions of it.
30
+ """
31
+ enumerator = SmilesEnumerator()
32
+ smiles_1 = enumerator.randomize_smiles(smiles_string)
33
+ smiles_2 = enumerator.randomize_smiles(smiles_string)
34
+ return smiles_1, smiles_2
35
+
36
+ def main():
37
+ """
38
+ Main function to run the parallel data preprocessing.
39
+ """
40
+ # --- Configuration ---
41
+ # Load your desired dataset from Hugging Face
42
+ dataset_name = 'jablonkagroup/pubchem-smiles-molecular-formula'
43
+ # Specify the column containing the SMILES strings
44
+ smiles_column_name = 'smiles'
45
+ # Set the output file path
46
+ output_path = 'data/pubchem_computed_110_end_M.parquet'
47
+
48
+ # --- Data Loading ---
49
+ print(f"Loading dataset '{dataset_name}'...")
50
+ # Use streaming to avoid downloading the whole dataset if you only need a subset
51
+ dataset = load_dataset(dataset_name, split='train').select(range(110_000_000, ))
52
+
53
+ # Take the desired number of samples
54
+ smiles_list = dataset[smiles_column_name]
55
+ print(f"Successfully fetched {len(smiles_list)} SMILES strings.")
56
+
57
+ # --- Parallel Processing ---
58
+ # Use all available CPU cores for maximum speed
59
+ num_workers = cpu_count()
60
+ print(f"Starting SMILES augmentation with {num_workers} worker processes...")
61
+
62
+ # A Pool of processes will run the `create_augmented_pair` function in parallel
63
+ with Pool(num_workers) as p:
64
+ # Use tqdm to create a progress bar for the mapping operation
65
+ results = list(tqdm(p.imap(create_augmented_pair, smiles_list), total=len(smiles_list), desc="Augmenting Pairs"))
66
+
67
+ # --- Saving Data ---
68
+ print("Processing complete. Converting to DataFrame...")
69
+ # Convert the list of tuples into a pandas DataFrame
70
+ df = pd.DataFrame(results, columns=['smiles_1', 'smiles_2'])
71
+
72
+ # Ensure the output directory exists
73
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
74
+
75
+ print(f"Saving augmented pairs to '{output_path}'...")
76
+ # Save the DataFrame to a Parquet file for efficient storage and loading
77
+ df.to_parquet(output_path)
78
+
79
+ print("All done. Your pre-computed dataset is ready!")
80
+
81
+ if __name__ == '__main__':
82
+ main()
83
+
simson_modeling/.ipynb_checkpoints/create_splits-checkpoint.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ def concatenate_and_split_parquet(
8
+ input_dir: str,
9
+ output_dir: str,
10
+ val_size: int = 10000,
11
+ test_size: int = 5000,
12
+ random_state: int = 42
13
+ ):
14
+ """
15
+ Concatenate all parquet files in a directory and split into train/val/test sets.
16
+
17
+ Args:
18
+ input_dir: Path to directory containing parquet files
19
+ output_dir: Path to directory where split files will be saved
20
+ val_size: Number of samples for validation set (default: 10000)
21
+ test_size: Number of samples for test set (default: 5000)
22
+ random_state: Random seed for reproducibility
23
+ """
24
+
25
+ # Create output directory if it doesn't exist
26
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
27
+
28
+ # Find all parquet files in the input directory
29
+ input_path = Path(input_dir)
30
+ parquet_files = list(input_path.glob("*.parquet"))
31
+
32
+ if not parquet_files:
33
+ raise ValueError(f"No parquet files found in {input_dir}")
34
+
35
+ print(f"Found {len(parquet_files)} parquet files")
36
+
37
+ # Read and concatenate all parquet files
38
+ print("Reading and concatenating parquet files...")
39
+ dataframes = []
40
+
41
+ for file_path in parquet_files:
42
+ print(f"Reading {file_path.name}...")
43
+ df = pd.read_parquet(file_path)
44
+ dataframes.append(df)
45
+
46
+ # Concatenate all dataframes
47
+ combined_df = pd.concat(dataframes, ignore_index=True)
48
+ print(f"Combined dataset shape: {combined_df.shape}")
49
+
50
+ # Check if we have enough samples
51
+ total_samples = len(combined_df)
52
+ required_samples = val_size + test_size
53
+
54
+ if total_samples < required_samples:
55
+ raise ValueError(
56
+ f"Not enough samples. Required: {required_samples}, Available: {total_samples}"
57
+ )
58
+
59
+ # Shuffle the data
60
+ combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
61
+
62
+ # Split the data
63
+ print("Splitting data...")
64
+
65
+ # First split: separate test set
66
+ temp_df, test_df = train_test_split(
67
+ combined_df,
68
+ test_size=test_size,
69
+ random_state=random_state
70
+ )
71
+
72
+ # Second split: separate validation from remaining data
73
+ train_df, val_df = train_test_split(
74
+ temp_df,
75
+ test_size=val_size,
76
+ random_state=random_state
77
+ )
78
+
79
+ print(f"Training set shape: {train_df.shape}")
80
+ print(f"Validation set shape: {val_df.shape}")
81
+ print(f"Test set shape: {test_df.shape}")
82
+
83
+ # Save the splits as parquet files
84
+ output_path = Path(output_dir)
85
+
86
+ train_path = output_path / "train.parquet"
87
+ val_path = output_path / "validation.parquet"
88
+ test_path = output_path / "test.parquet"
89
+
90
+ print("Saving split datasets...")
91
+ train_df.to_parquet(train_path, index=False)
92
+ val_df.to_parquet(val_path, index=False)
93
+ test_df.to_parquet(test_path, index=False)
94
+
95
+ print(f"Files saved to:")
96
+ print(f" Training: {train_path}")
97
+ print(f" Validation: {val_path}")
98
+ print(f" Test: {test_path}")
99
+
100
+ return train_df, val_df, test_df
101
+
102
+ # Alternative version using PyArrow for better performance with large files
103
+ def concatenate_and_split_parquet_arrow(
104
+ input_dir: str,
105
+ output_dir: str,
106
+ val_size: int = 10000,
107
+ test_size: int = 5000,
108
+ random_state: int = 42
109
+ ):
110
+ """
111
+ Same functionality as above but using PyArrow for better performance.
112
+ """
113
+ import pyarrow as pa
114
+ import pyarrow.parquet as pq
115
+
116
+ # Create output directory if it doesn't exist
117
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
118
+
119
+ # Find all parquet files
120
+ input_path = Path(input_dir)
121
+ parquet_files = list(input_path.glob("*.parquet"))
122
+
123
+ if not parquet_files:
124
+ raise ValueError(f"No parquet files found in {input_dir}")
125
+
126
+ print(f"Found {len(parquet_files)} parquet files")
127
+
128
+ # Read and concatenate using PyArrow
129
+ print("Reading and concatenating parquet files...")
130
+ tables = []
131
+
132
+ for file_path in parquet_files:
133
+ print(f"Reading {file_path.name}...")
134
+ table = pq.read_table(file_path)
135
+ tables.append(table)
136
+
137
+ # Concatenate tables
138
+ combined_table = pa.concat_tables(tables)
139
+ combined_df = combined_table.to_pandas()
140
+
141
+ print(f"Combined dataset shape: {combined_df.shape}")
142
+
143
+ # Rest of the function is the same as above
144
+ total_samples = len(combined_df)
145
+ required_samples = val_size + test_size
146
+
147
+ if total_samples < required_samples:
148
+ raise ValueError(
149
+ f"Not enough samples. Required: {required_samples}, Available: {total_samples}"
150
+ )
151
+
152
+ # Shuffle and split
153
+ combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
154
+
155
+ temp_df, test_df = train_test_split(
156
+ combined_df, test_size=test_size, random_state=random_state
157
+ )
158
+
159
+ train_df, val_df = train_test_split(
160
+ temp_df, test_size=val_size, random_state=random_state
161
+ )
162
+
163
+ print(f"Training set shape: {train_df.shape}")
164
+ print(f"Validation set shape: {val_df.shape}")
165
+ print(f"Test set shape: {test_df.shape}")
166
+
167
+ # Save using PyArrow
168
+ output_path = Path(output_dir)
169
+
170
+ pq.write_table(pa.Table.from_pandas(train_df), output_path / "train.parquet")
171
+ pq.write_table(pa.Table.from_pandas(val_df), output_path / "validation.parquet")
172
+ pq.write_table(pa.Table.from_pandas(test_df), output_path / "test.parquet")
173
+
174
+ print(f"Files saved to {output_dir}")
175
+
176
+ return train_df, val_df, test_df
177
+
178
+ # Example usage
179
+ if __name__ == "__main__":
180
+ # Example usage
181
+ input_directory = "data"
182
+ output_directory = "data/polymer_splits"
183
+
184
+ # Using pandas version
185
+ train_df, val_df, test_df = concatenate_and_split_parquet(
186
+ input_dir=input_directory,
187
+ output_dir=output_directory,
188
+ val_size=10000,
189
+ test_size=5000,
190
+ random_state=42
191
+ )
192
+
193
+ # Or using PyArrow version for better performance
194
+ # train_df, val_df, test_df = concatenate_and_split_parquet_arrow(
195
+ # input_dir=input_directory,
196
+ # output_dir=output_directory,
197
+ # val_size=10000,
198
+ # test_size=5000,
199
+ # random_state=42
200
+ # )
simson_modeling/.ipynb_checkpoints/fingerprint_training-checkpoint.ipynb ADDED
@@ -0,0 +1,1550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "3d5d52d1-4874-44b5-b532-ef03da47644a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import numpy as np\n",
11
+ "import pandas as pd\n",
12
+ "from rdkit import Chem\n",
13
+ "from rdkit.Chem import Descriptors, rdMolDescriptors, Crippen, Lipinski\n",
14
+ "from tqdm import tqdm\n",
15
+ "import warnings\n",
16
+ "from sklearn.preprocessing import StandardScaler\n",
17
+ "from sklearn.model_selection import train_test_split\n",
18
+ "import random\n",
19
+ "from concurrent.futures import ProcessPoolExecutor\n",
20
+ "import multiprocessing\n",
21
+ "\n",
22
+ "def analyze_polymer_features_rdkit(smiles):\n",
23
+ " mol = Chem.MolFromSmiles(smiles)\n",
24
+ " if mol is None:\n",
25
+ " return None\n",
26
+ " \n",
27
+ " features = {}\n",
28
+ " \n",
29
+ " # Basic molecular properties\n",
30
+ " features['mol_weight'] = Descriptors.MolWt(mol)\n",
31
+ " features['exact_mol_weight'] = Descriptors.ExactMolWt(mol)\n",
32
+ " features['num_heavy_atoms'] = mol.GetNumHeavyAtoms()\n",
33
+ " features['num_atoms'] = mol.GetNumAtoms()\n",
34
+ " features['num_bonds'] = mol.GetNumBonds()\n",
35
+ " \n",
36
+ " # Hydrogen bonding features\n",
37
+ " features['num_hbond_donors'] = Descriptors.NumHDonors(mol)\n",
38
+ " features['num_hbond_acceptors'] = Descriptors.NumHAcceptors(mol)\n",
39
+ " features['num_heteroatoms'] = Descriptors.NumHeteroatoms(mol)\n",
40
+ " \n",
41
+ " # Structural complexity\n",
42
+ " features['num_rotatable_bonds'] = Descriptors.NumRotatableBonds(mol)\n",
43
+ " features['num_saturated_rings'] = Descriptors.NumSaturatedRings(mol)\n",
44
+ " features['num_aromatic_rings'] = Descriptors.NumAromaticRings(mol)\n",
45
+ " features['num_aliphatic_rings'] = Descriptors.NumAliphaticRings(mol)\n",
46
+ " features['ring_count'] = Descriptors.RingCount(mol)\n",
47
+ " features['fraction_csp3'] = Descriptors.FractionCSP3(mol)\n",
48
+ " \n",
49
+ " # Surface area and polarity\n",
50
+ " features['tpsa'] = Descriptors.TPSA(mol)\n",
51
+ " features['polar_surface_area'] = rdMolDescriptors.CalcTPSA(mol)\n",
52
+ " \n",
53
+ " # Lipophilicity and solubility\n",
54
+ " features['logp'] = Descriptors.MolLogP(mol)\n",
55
+ " features['crippen_logp'] = Crippen.MolLogP(mol)\n",
56
+ " features['crippen_mr'] = Crippen.MolMR(mol) # Molar refractivity\n",
57
+ " \n",
58
+ " # Flexibility and rigidity\n",
59
+ " features['kappa1'] = Descriptors.Kappa1(mol) # Molecular shape index\n",
60
+ " features['kappa2'] = Descriptors.Kappa2(mol)\n",
61
+ " features['kappa3'] = Descriptors.Kappa3(mol)\n",
62
+ " features['chi0v'] = Descriptors.Chi0v(mol) # Connectivity indices\n",
63
+ " features['chi1v'] = Descriptors.Chi1v(mol)\n",
64
+ " features['chi2v'] = Descriptors.Chi2v(mol)\n",
65
+ " \n",
66
+ " # Electronic properties\n",
67
+ " features['balaban_j'] = Descriptors.BalabanJ(mol)\n",
68
+ " features['bertz_ct'] = Descriptors.BertzCT(mol) # Complexity index\n",
69
+ " \n",
70
+ " # Polymer-specific features\n",
71
+ " features['num_radical_electrons'] = Descriptors.NumRadicalElectrons(mol)\n",
72
+ " features['num_valence_electrons'] = Descriptors.NumValenceElectrons(mol)\n",
73
+ " \n",
74
+ " # Atom type counts\n",
75
+ " atom_counts = {}\n",
76
+ " for atom in mol.GetAtoms():\n",
77
+ " symbol = atom.GetSymbol()\n",
78
+ " atom_counts[symbol] = atom_counts.get(symbol, 0) + 1\n",
79
+ " \n",
80
+ " # Add individual atom counts as features\n",
81
+ " for element in ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I']:\n",
82
+ " features[f'count_{element}'] = atom_counts.get(element, 0)\n",
83
+ " features[f'ratio_{element}'] = atom_counts.get(element, 0) / features['num_atoms'] if features['num_atoms'] > 0 else 0\n",
84
+ " \n",
85
+ " # Bond type analysis\n",
86
+ " bond_types = {'SINGLE': 0, 'DOUBLE': 0, 'TRIPLE': 0, 'AROMATIC': 0}\n",
87
+ " for bond in mol.GetBonds():\n",
88
+ " bond_type = str(bond.GetBondType())\n",
89
+ " if bond_type in bond_types:\n",
90
+ " bond_types[bond_type] += 1\n",
91
+ " \n",
92
+ " for bond_type, count in bond_types.items():\n",
93
+ " features[f'num_{bond_type.lower()}_bonds'] = count\n",
94
+ " features[f'ratio_{bond_type.lower()}_bonds'] = count / features['num_bonds'] if features['num_bonds'] > 0 else 0\n",
95
+ " \n",
96
+ " # Hybridization analysis\n",
97
+ " hybridization_counts = {'SP': 0, 'SP2': 0, 'SP3': 0, 'SP3D': 0, 'SP3D2': 0}\n",
98
+ " for atom in mol.GetAtoms():\n",
99
+ " hyb = str(atom.GetHybridization())\n",
100
+ " if hyb in hybridization_counts:\n",
101
+ " hybridization_counts[hyb] += 1\n",
102
+ " \n",
103
+ " for hyb_type, count in hybridization_counts.items():\n",
104
+ " features[f'num_{hyb_type.lower()}_carbons'] = count\n",
105
+ " features[f'ratio_{hyb_type.lower()}_carbons'] = count / features['num_atoms'] if features['num_atoms'] > 0 else 0\n",
106
+ " \n",
107
+ " # Formal charge analysis\n",
108
+ " formal_charges = [atom.GetFormalCharge() for atom in mol.GetAtoms()]\n",
109
+ " features['total_formal_charge'] = sum(formal_charges)\n",
110
+ " features['abs_total_formal_charge'] = sum(abs(charge) for charge in formal_charges)\n",
111
+ " features['max_formal_charge'] = max(formal_charges) if formal_charges else 0\n",
112
+ " features['min_formal_charge'] = min(formal_charges) if formal_charges else 0\n",
113
+ " \n",
114
+ " # Aromaticity features\n",
115
+ " aromatic_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetIsAromatic())\n",
116
+ " features['num_aromatic_atoms'] = aromatic_atoms\n",
117
+ " features['aromatic_ratio'] = aromatic_atoms / features['num_atoms'] if features['num_atoms'] > 0 else 0\n",
118
+ " \n",
119
+ " # Ring size analysis\n",
120
+ " ring_info = mol.GetRingInfo()\n",
121
+ " ring_sizes = [len(ring) for ring in ring_info.AtomRings()]\n",
122
+ " if ring_sizes:\n",
123
+ " features['avg_ring_size'] = sum(ring_sizes) / len(ring_sizes)\n",
124
+ " features['max_ring_size'] = max(ring_sizes)\n",
125
+ " features['min_ring_size'] = min(ring_sizes)\n",
126
+ " features['num_3_rings'] = sum(1 for size in ring_sizes if size == 3)\n",
127
+ " features['num_4_rings'] = sum(1 for size in ring_sizes if size == 4)\n",
128
+ " features['num_5_rings'] = sum(1 for size in ring_sizes if size == 5)\n",
129
+ " features['num_6_rings'] = sum(1 for size in ring_sizes if size == 6)\n",
130
+ " features['num_7_rings'] = sum(1 for size in ring_sizes if size == 7)\n",
131
+ " features['num_large_rings'] = sum(1 for size in ring_sizes if size > 7)\n",
132
+ " else:\n",
133
+ " features.update({\n",
134
+ " 'avg_ring_size': 0, 'max_ring_size': 0, 'min_ring_size': 0,\n",
135
+ " 'num_3_rings': 0, 'num_4_rings': 0, 'num_5_rings': 0,\n",
136
+ " 'num_6_rings': 0, 'num_7_rings': 0, 'num_large_rings': 0\n",
137
+ " })\n",
138
+ " \n",
139
+ " # Polymer-specific structural features\n",
140
+ " features['has_polymer_notation'] = '*' in smiles\n",
141
+ " features['smiles_length'] = len(smiles)\n",
142
+ " features['branch_count'] = smiles.count('(')\n",
143
+ " features['branch_ratio'] = smiles.count('(') / len(smiles) if len(smiles) > 0 else 0\n",
144
+ " \n",
145
+ " return features\n",
146
+ "\n",
147
+ "def add_features(df, num_workers=None):\n",
148
+ " \"\"\"\n",
149
+ " Improved version using multiprocessing to calculate RDKit descriptors efficiently.\n",
150
+ " \n",
151
+ " Parameters:\n",
152
+ " df: pandas DataFrame with 'Smiles' column\n",
153
+ " num_workers: Number of worker processes (defaults to number of CPU cores)\n",
154
+ " \"\"\"\n",
155
+ " if num_workers is None:\n",
156
+ " num_workers = multiprocessing.cpu_count()\n",
157
+ " \n",
158
+ " smiles_list = df['Smiles'].tolist()\n",
159
+ " \n",
160
+ " with ProcessPoolExecutor(max_workers=num_workers) as executor:\n",
161
+ " # Use tqdm with executor.map for progress tracking\n",
162
+ " features_list = list(tqdm(executor.map(analyze_polymer_features_rdkit, smiles_list), \n",
163
+ " total=len(smiles_list), \n",
164
+ " desc=\"Computing RDKit descriptors\"))\n",
165
+ " \n",
166
+ " # Convert results to DataFrame\n",
167
+ " features_df = pd.DataFrame(features_list)\n",
168
+ " \n",
169
+ " # Concatenate with original DataFrame\n",
170
+ " df_result = pd.concat([df, features_df], axis=1)\n",
171
+ " \n",
172
+ " return df_result\n",
173
+ "\n",
174
+ "def get_list_dif(l1, l2):\n",
175
+ " return list(set(l1) - set(l2))\n",
176
+ "\n",
177
+ "# Usage example:\n",
178
+ "# df_with_features = add_features(df, num_workers=4)\n"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "id": "155598af-79f3-4933-8b5c-1fd11f64b870",
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "df = pd.read_csv('/home/jovyan/simson_training_bolgov/regression/PI_Tg_P308K_synth_db_chem.csv').drop(columns=['Unnamed: 0'], axis=1)"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "c69cc497-9fb6-4f74-96eb-257d7aa4a91a",
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "df = pd.read_csv('/home/jovyan/simson_training_bolgov/kaggle_comp/train.csv')\n",
199
+ "df['Smiles'] = df['SMILES']\n",
200
+ "df"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "id": "7b076c55-d6ef-4780-af97-5fccd5062661",
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "sample_df = df.iloc[:10_000]"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": null,
216
+ "id": "96313883-c2ca-4eb8-9ec7-9aaca8dba077",
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "features_df = add_features(sample_df)"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 2,
226
+ "id": "41c7f85a-ea65-42e5-b315-ef304ba311c4",
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "selected_features = ['mol_weight', 'exact_mol_weight', 'num_heavy_atoms', 'num_atoms',\n",
231
+ " 'num_bonds', 'num_hbond_donors', 'num_hbond_acceptors',\n",
232
+ " 'num_heteroatoms', 'num_rotatable_bonds', 'num_saturated_rings',\n",
233
+ " 'num_aromatic_rings', 'num_aliphatic_rings', 'ring_count',\n",
234
+ " 'fraction_csp3', 'tpsa', 'polar_surface_area', 'logp', 'crippen_logp',\n",
235
+ " 'crippen_mr', 'kappa1', 'kappa2', 'kappa3', 'chi0v', 'chi1v', 'chi2v',\n",
236
+ " 'balaban_j', 'bertz_ct', 'num_radical_electrons',\n",
237
+ " 'num_valence_electrons',\n",
238
+ " 'count_O', 'ratio_O', 'count_S', 'ratio_S', 'count_P', 'ratio_P',\n",
239
+ " 'count_F', 'ratio_F', 'count_Cl', 'ratio_Cl', 'count_Br', 'ratio_Br',\n",
240
+ " 'count_I', 'ratio_I', 'num_single_bonds', 'ratio_single_bonds',\n",
241
+ " 'num_double_bonds', 'ratio_double_bonds', 'num_triple_bonds',\n",
242
+ " 'ratio_triple_bonds', 'num_aromatic_bonds', 'ratio_aromatic_bonds',\n",
243
+ " 'num_sp_carbons', 'ratio_sp_carbons', 'num_sp2_carbons',\n",
244
+ " 'ratio_sp2_carbons', 'num_sp3_carbons', 'ratio_sp3_carbons',\n",
245
+ " 'num_sp3d_carbons', 'ratio_sp3d_carbons', 'num_sp3d2_carbons',\n",
246
+ " 'ratio_sp3d2_carbons', 'total_formal_charge', 'abs_total_formal_charge',\n",
247
+ " 'max_formal_charge', 'min_formal_charge', 'num_aromatic_atoms',\n",
248
+ " 'aromatic_ratio', 'avg_ring_size', 'max_ring_size', 'min_ring_size',\n",
249
+ " 'num_3_rings', 'num_4_rings', 'num_5_rings', 'num_6_rings',\n",
250
+ " 'num_7_rings', 'num_large_rings', 'has_polymer_notation',\n",
251
+ " 'branch_count', 'branch_ratio']"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": null,
257
+ "id": "fc31605d-cc21-4533-b04e-f8acdaef1a65",
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "scalers = []\n",
262
+ "for col in selected_features:\n",
263
+ " scaler = StandardScaler()\n",
264
+ " features_df[col] = scaler.fit_transform(features_df[col].to_numpy().reshape(-1, 1)).flatten()\n",
265
+ " scalers.append(scaler)\n",
266
+ " \n",
267
+ "features_df.columns"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 4,
273
+ "id": "f2f1a614-0ba7-4a01-9731-532afc1d14e0",
274
+ "metadata": {},
275
+ "outputs": [
276
+ {
277
+ "name": "stdout",
278
+ "output_type": "stream",
279
+ "text": [
280
+ "['mol_weight', 'exact_mol_weight', 'fraction_csp3', 'tpsa', 'polar_surface_area', 'logp', 'crippen_logp', 'crippen_mr', 'kappa1', 'kappa2', 'kappa3', 'chi0v', 'chi1v', 'chi2v', 'balaban_j', 'bertz_ct', 'ratio_O', 'ratio_single_bonds', 'ratio_double_bonds', 'ratio_aromatic_bonds', 'ratio_sp2_carbons', 'ratio_sp3_carbons', 'aromatic_ratio', 'branch_ratio', 'Smiles']\n"
281
+ ]
282
+ },
283
+ {
284
+ "data": {
285
+ "text/plain": [
286
+ "(25, 79)"
287
+ ]
288
+ },
289
+ "execution_count": 4,
290
+ "metadata": {},
291
+ "output_type": "execute_result"
292
+ }
293
+ ],
294
+ "source": [
295
+ "new_features = []\n",
296
+ "\n",
297
+ "for feature in selected_features:\n",
298
+ " unique_list = features_df[feature].unique()\n",
299
+ " if len(unique_list) > 300:\n",
300
+ " new_features.append(feature)\n",
301
+ "new_features.append('Smiles')\n",
302
+ "print(new_features)\n",
303
+ "len(new_features), len(selected_features)"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": null,
309
+ "id": "28cbac75-8a9f-4292-aedb-11f33f5a6056",
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": []
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "id": "c065d950-7a63-4424-9923-1072d2e2268c",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "features_df.to_csv('7k_w_descriptors.csv', index=False)"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 3,
327
+ "id": "069a9021-d440-4bf1-9882-a2af25f2e801",
328
+ "metadata": {},
329
+ "outputs": [
330
+ {
331
+ "data": {
332
+ "text/html": [
333
+ "<div>\n",
334
+ "<style scoped>\n",
335
+ " .dataframe tbody tr th:only-of-type {\n",
336
+ " vertical-align: middle;\n",
337
+ " }\n",
338
+ "\n",
339
+ " .dataframe tbody tr th {\n",
340
+ " vertical-align: top;\n",
341
+ " }\n",
342
+ "\n",
343
+ " .dataframe thead th {\n",
344
+ " text-align: right;\n",
345
+ " }\n",
346
+ "</style>\n",
347
+ "<table border=\"1\" class=\"dataframe\">\n",
348
+ " <thead>\n",
349
+ " <tr style=\"text-align: right;\">\n",
350
+ " <th></th>\n",
351
+ " <th>id</th>\n",
352
+ " <th>SMILES</th>\n",
353
+ " <th>Tg</th>\n",
354
+ " <th>FFV</th>\n",
355
+ " <th>Tc</th>\n",
356
+ " <th>Density</th>\n",
357
+ " <th>Rg</th>\n",
358
+ " <th>Smiles</th>\n",
359
+ " <th>mol_weight</th>\n",
360
+ " <th>exact_mol_weight</th>\n",
361
+ " <th>...</th>\n",
362
+ " <th>num_3_rings</th>\n",
363
+ " <th>num_4_rings</th>\n",
364
+ " <th>num_5_rings</th>\n",
365
+ " <th>num_6_rings</th>\n",
366
+ " <th>num_7_rings</th>\n",
367
+ " <th>num_large_rings</th>\n",
368
+ " <th>has_polymer_notation</th>\n",
369
+ " <th>smiles_length</th>\n",
370
+ " <th>branch_count</th>\n",
371
+ " <th>branch_ratio</th>\n",
372
+ " </tr>\n",
373
+ " </thead>\n",
374
+ " <tbody>\n",
375
+ " <tr>\n",
376
+ " <th>0</th>\n",
377
+ " <td>87817</td>\n",
378
+ " <td>*CC(*)c1ccccc1C(=O)OCCCCCC</td>\n",
379
+ " <td>NaN</td>\n",
380
+ " <td>0.374645</td>\n",
381
+ " <td>0.205667</td>\n",
382
+ " <td>NaN</td>\n",
383
+ " <td>NaN</td>\n",
384
+ " <td>*CC(*)c1ccccc1C(=O)OCCCCCC</td>\n",
385
+ " <td>-0.875755</td>\n",
386
+ " <td>-0.875617</td>\n",
387
+ " <td>...</td>\n",
388
+ " <td>-0.048476</td>\n",
389
+ " <td>-0.069289</td>\n",
390
+ " <td>-0.626991</td>\n",
391
+ " <td>-0.788904</td>\n",
392
+ " <td>-0.051542</td>\n",
393
+ " <td>-0.047917</td>\n",
394
+ " <td>0.0</td>\n",
395
+ " <td>26</td>\n",
396
+ " <td>-0.985221</td>\n",
397
+ " <td>-0.813832</td>\n",
398
+ " </tr>\n",
399
+ " <tr>\n",
400
+ " <th>1</th>\n",
401
+ " <td>106919</td>\n",
402
+ " <td>*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...</td>\n",
403
+ " <td>NaN</td>\n",
404
+ " <td>0.370410</td>\n",
405
+ " <td>NaN</td>\n",
406
+ " <td>NaN</td>\n",
407
+ " <td>NaN</td>\n",
408
+ " <td>*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...</td>\n",
409
+ " <td>0.651876</td>\n",
410
+ " <td>0.651916</td>\n",
411
+ " <td>...</td>\n",
412
+ " <td>-0.048476</td>\n",
413
+ " <td>-0.069289</td>\n",
414
+ " <td>-0.626991</td>\n",
415
+ " <td>0.736852</td>\n",
416
+ " <td>-0.051542</td>\n",
417
+ " <td>-0.047917</td>\n",
418
+ " <td>0.0</td>\n",
419
+ " <td>82</td>\n",
420
+ " <td>0.336345</td>\n",
421
+ " <td>-0.286141</td>\n",
422
+ " </tr>\n",
423
+ " <tr>\n",
424
+ " <th>2</th>\n",
425
+ " <td>388772</td>\n",
426
+ " <td>*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...</td>\n",
427
+ " <td>NaN</td>\n",
428
+ " <td>0.378860</td>\n",
429
+ " <td>NaN</td>\n",
430
+ " <td>NaN</td>\n",
431
+ " <td>NaN</td>\n",
432
+ " <td>*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...</td>\n",
433
+ " <td>2.336573</td>\n",
434
+ " <td>2.336165</td>\n",
435
+ " <td>...</td>\n",
436
+ " <td>-0.048476</td>\n",
437
+ " <td>-0.069289</td>\n",
438
+ " <td>-0.626991</td>\n",
439
+ " <td>2.644047</td>\n",
440
+ " <td>-0.051542</td>\n",
441
+ " <td>-0.047917</td>\n",
442
+ " <td>0.0</td>\n",
443
+ " <td>134</td>\n",
444
+ " <td>1.657910</td>\n",
445
+ " <td>-0.109289</td>\n",
446
+ " </tr>\n",
447
+ " <tr>\n",
448
+ " <th>3</th>\n",
449
+ " <td>519416</td>\n",
450
+ " <td>*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...</td>\n",
451
+ " <td>NaN</td>\n",
452
+ " <td>0.387324</td>\n",
453
+ " <td>NaN</td>\n",
454
+ " <td>NaN</td>\n",
455
+ " <td>NaN</td>\n",
456
+ " <td>*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...</td>\n",
457
+ " <td>0.417716</td>\n",
458
+ " <td>0.417722</td>\n",
459
+ " <td>...</td>\n",
460
+ " <td>-0.048476</td>\n",
461
+ " <td>-0.069289</td>\n",
462
+ " <td>-0.626991</td>\n",
463
+ " <td>1.118291</td>\n",
464
+ " <td>-0.051542</td>\n",
465
+ " <td>-0.047917</td>\n",
466
+ " <td>0.0</td>\n",
467
+ " <td>79</td>\n",
468
+ " <td>0.556606</td>\n",
469
+ " <td>0.132247</td>\n",
470
+ " </tr>\n",
471
+ " <tr>\n",
472
+ " <th>4</th>\n",
473
+ " <td>539187</td>\n",
474
+ " <td>*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...</td>\n",
475
+ " <td>NaN</td>\n",
476
+ " <td>0.355470</td>\n",
477
+ " <td>NaN</td>\n",
478
+ " <td>NaN</td>\n",
479
+ " <td>NaN</td>\n",
480
+ " <td>*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...</td>\n",
481
+ " <td>2.178003</td>\n",
482
+ " <td>2.178499</td>\n",
483
+ " <td>...</td>\n",
484
+ " <td>-0.048476</td>\n",
485
+ " <td>-0.069289</td>\n",
486
+ " <td>1.501149</td>\n",
487
+ " <td>0.355413</td>\n",
488
+ " <td>-0.051542</td>\n",
489
+ " <td>-0.047917</td>\n",
490
+ " <td>0.0</td>\n",
491
+ " <td>118</td>\n",
492
+ " <td>0.556606</td>\n",
493
+ " <td>-0.830501</td>\n",
494
+ " </tr>\n",
495
+ " <tr>\n",
496
+ " <th>...</th>\n",
497
+ " <td>...</td>\n",
498
+ " <td>...</td>\n",
499
+ " <td>...</td>\n",
500
+ " <td>...</td>\n",
501
+ " <td>...</td>\n",
502
+ " <td>...</td>\n",
503
+ " <td>...</td>\n",
504
+ " <td>...</td>\n",
505
+ " <td>...</td>\n",
506
+ " <td>...</td>\n",
507
+ " <td>...</td>\n",
508
+ " <td>...</td>\n",
509
+ " <td>...</td>\n",
510
+ " <td>...</td>\n",
511
+ " <td>...</td>\n",
512
+ " <td>...</td>\n",
513
+ " <td>...</td>\n",
514
+ " <td>...</td>\n",
515
+ " <td>...</td>\n",
516
+ " <td>...</td>\n",
517
+ " <td>...</td>\n",
518
+ " </tr>\n",
519
+ " <tr>\n",
520
+ " <th>7968</th>\n",
521
+ " <td>2146592435</td>\n",
522
+ " <td>*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1</td>\n",
523
+ " <td>NaN</td>\n",
524
+ " <td>0.367498</td>\n",
525
+ " <td>NaN</td>\n",
526
+ " <td>NaN</td>\n",
527
+ " <td>NaN</td>\n",
528
+ " <td>*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1</td>\n",
529
+ " <td>-0.375261</td>\n",
530
+ " <td>-0.375084</td>\n",
531
+ " <td>...</td>\n",
532
+ " <td>-0.048476</td>\n",
533
+ " <td>-0.069289</td>\n",
534
+ " <td>-0.626991</td>\n",
535
+ " <td>-0.407465</td>\n",
536
+ " <td>-0.051542</td>\n",
537
+ " <td>-0.047917</td>\n",
538
+ " <td>0.0</td>\n",
539
+ " <td>44</td>\n",
540
+ " <td>-0.324438</td>\n",
541
+ " <td>0.124891</td>\n",
542
+ " </tr>\n",
543
+ " <tr>\n",
544
+ " <th>7969</th>\n",
545
+ " <td>2146810552</td>\n",
546
+ " <td>*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...</td>\n",
547
+ " <td>NaN</td>\n",
548
+ " <td>0.353280</td>\n",
549
+ " <td>NaN</td>\n",
550
+ " <td>NaN</td>\n",
551
+ " <td>NaN</td>\n",
552
+ " <td>*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...</td>\n",
553
+ " <td>1.284275</td>\n",
554
+ " <td>1.284737</td>\n",
555
+ " <td>...</td>\n",
556
+ " <td>-0.048476</td>\n",
557
+ " <td>-0.069289</td>\n",
558
+ " <td>1.501149</td>\n",
559
+ " <td>0.736852</td>\n",
560
+ " <td>-0.051542</td>\n",
561
+ " <td>-0.047917</td>\n",
562
+ " <td>0.0</td>\n",
563
+ " <td>110</td>\n",
564
+ " <td>1.217388</td>\n",
565
+ " <td>0.008668</td>\n",
566
+ " </tr>\n",
567
+ " <tr>\n",
568
+ " <th>7970</th>\n",
569
+ " <td>2147191531</td>\n",
570
+ " <td>*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...</td>\n",
571
+ " <td>NaN</td>\n",
572
+ " <td>0.369411</td>\n",
573
+ " <td>NaN</td>\n",
574
+ " <td>NaN</td>\n",
575
+ " <td>NaN</td>\n",
576
+ " <td>*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...</td>\n",
577
+ " <td>0.329570</td>\n",
578
+ " <td>0.329823</td>\n",
579
+ " <td>...</td>\n",
580
+ " <td>-0.048476</td>\n",
581
+ " <td>-0.069289</td>\n",
582
+ " <td>1.501149</td>\n",
583
+ " <td>-0.026026</td>\n",
584
+ " <td>-0.051542</td>\n",
585
+ " <td>-0.047917</td>\n",
586
+ " <td>0.0</td>\n",
587
+ " <td>73</td>\n",
588
+ " <td>0.336345</td>\n",
589
+ " <td>0.021405</td>\n",
590
+ " </tr>\n",
591
+ " <tr>\n",
592
+ " <th>7971</th>\n",
593
+ " <td>2147435020</td>\n",
594
+ " <td>*C=C(*)c1ccccc1C</td>\n",
595
+ " <td>261.662355</td>\n",
596
+ " <td>NaN</td>\n",
597
+ " <td>NaN</td>\n",
598
+ " <td>NaN</td>\n",
599
+ " <td>NaN</td>\n",
600
+ " <td>*C=C(*)c1ccccc1C</td>\n",
601
+ " <td>-1.359802</td>\n",
602
+ " <td>-1.359728</td>\n",
603
+ " <td>...</td>\n",
604
+ " <td>-0.048476</td>\n",
605
+ " <td>-0.069289</td>\n",
606
+ " <td>-0.626991</td>\n",
607
+ " <td>-0.788904</td>\n",
608
+ " <td>-0.051542</td>\n",
609
+ " <td>-0.047917</td>\n",
610
+ " <td>0.0</td>\n",
611
+ " <td>16</td>\n",
612
+ " <td>-1.205481</td>\n",
613
+ " <td>-1.182617</td>\n",
614
+ " </tr>\n",
615
+ " <tr>\n",
616
+ " <th>7972</th>\n",
617
+ " <td>2147438299</td>\n",
618
+ " <td>*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...</td>\n",
619
+ " <td>NaN</td>\n",
620
+ " <td>0.374049</td>\n",
621
+ " <td>NaN</td>\n",
622
+ " <td>NaN</td>\n",
623
+ " <td>NaN</td>\n",
624
+ " <td>*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...</td>\n",
625
+ " <td>1.160667</td>\n",
626
+ " <td>1.160653</td>\n",
627
+ " <td>...</td>\n",
628
+ " <td>-0.048476</td>\n",
629
+ " <td>-0.069289</td>\n",
630
+ " <td>0.437079</td>\n",
631
+ " <td>-0.407465</td>\n",
632
+ " <td>-0.051542</td>\n",
633
+ " <td>-0.047917</td>\n",
634
+ " <td>0.0</td>\n",
635
+ " <td>72</td>\n",
636
+ " <td>-0.324438</td>\n",
637
+ " <td>-1.005054</td>\n",
638
+ " </tr>\n",
639
+ " </tbody>\n",
640
+ "</table>\n",
641
+ "<p>7973 rows × 92 columns</p>\n",
642
+ "</div>"
643
+ ],
644
+ "text/plain": [
645
+ " id SMILES \\\n",
646
+ "0 87817 *CC(*)c1ccccc1C(=O)OCCCCCC \n",
647
+ "1 106919 *Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5... \n",
648
+ "2 388772 *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(... \n",
649
+ "3 519416 *Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)... \n",
650
+ "4 539187 *Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N... \n",
651
+ "... ... ... \n",
652
+ "7968 2146592435 *Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1 \n",
653
+ "7969 2146810552 *C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3... \n",
654
+ "7970 2147191531 *c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(... \n",
655
+ "7971 2147435020 *C=C(*)c1ccccc1C \n",
656
+ "7972 2147438299 *c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC... \n",
657
+ "\n",
658
+ " Tg FFV Tc Density Rg \\\n",
659
+ "0 NaN 0.374645 0.205667 NaN NaN \n",
660
+ "1 NaN 0.370410 NaN NaN NaN \n",
661
+ "2 NaN 0.378860 NaN NaN NaN \n",
662
+ "3 NaN 0.387324 NaN NaN NaN \n",
663
+ "4 NaN 0.355470 NaN NaN NaN \n",
664
+ "... ... ... ... ... .. \n",
665
+ "7968 NaN 0.367498 NaN NaN NaN \n",
666
+ "7969 NaN 0.353280 NaN NaN NaN \n",
667
+ "7970 NaN 0.369411 NaN NaN NaN \n",
668
+ "7971 261.662355 NaN NaN NaN NaN \n",
669
+ "7972 NaN 0.374049 NaN NaN NaN \n",
670
+ "\n",
671
+ " Smiles mol_weight \\\n",
672
+ "0 *CC(*)c1ccccc1C(=O)OCCCCCC -0.875755 \n",
673
+ "1 *Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5... 0.651876 \n",
674
+ "2 *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(... 2.336573 \n",
675
+ "3 *Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)... 0.417716 \n",
676
+ "4 *Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N... 2.178003 \n",
677
+ "... ... ... \n",
678
+ "7968 *Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1 -0.375261 \n",
679
+ "7969 *C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3... 1.284275 \n",
680
+ "7970 *c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(... 0.329570 \n",
681
+ "7971 *C=C(*)c1ccccc1C -1.359802 \n",
682
+ "7972 *c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC... 1.160667 \n",
683
+ "\n",
684
+ " exact_mol_weight ... num_3_rings num_4_rings num_5_rings \\\n",
685
+ "0 -0.875617 ... -0.048476 -0.069289 -0.626991 \n",
686
+ "1 0.651916 ... -0.048476 -0.069289 -0.626991 \n",
687
+ "2 2.336165 ... -0.048476 -0.069289 -0.626991 \n",
688
+ "3 0.417722 ... -0.048476 -0.069289 -0.626991 \n",
689
+ "4 2.178499 ... -0.048476 -0.069289 1.501149 \n",
690
+ "... ... ... ... ... ... \n",
691
+ "7968 -0.375084 ... -0.048476 -0.069289 -0.626991 \n",
692
+ "7969 1.284737 ... -0.048476 -0.069289 1.501149 \n",
693
+ "7970 0.329823 ... -0.048476 -0.069289 1.501149 \n",
694
+ "7971 -1.359728 ... -0.048476 -0.069289 -0.626991 \n",
695
+ "7972 1.160653 ... -0.048476 -0.069289 0.437079 \n",
696
+ "\n",
697
+ " num_6_rings num_7_rings num_large_rings has_polymer_notation \\\n",
698
+ "0 -0.788904 -0.051542 -0.047917 0.0 \n",
699
+ "1 0.736852 -0.051542 -0.047917 0.0 \n",
700
+ "2 2.644047 -0.051542 -0.047917 0.0 \n",
701
+ "3 1.118291 -0.051542 -0.047917 0.0 \n",
702
+ "4 0.355413 -0.051542 -0.047917 0.0 \n",
703
+ "... ... ... ... ... \n",
704
+ "7968 -0.407465 -0.051542 -0.047917 0.0 \n",
705
+ "7969 0.736852 -0.051542 -0.047917 0.0 \n",
706
+ "7970 -0.026026 -0.051542 -0.047917 0.0 \n",
707
+ "7971 -0.788904 -0.051542 -0.047917 0.0 \n",
708
+ "7972 -0.407465 -0.051542 -0.047917 0.0 \n",
709
+ "\n",
710
+ " smiles_length branch_count branch_ratio \n",
711
+ "0 26 -0.985221 -0.813832 \n",
712
+ "1 82 0.336345 -0.286141 \n",
713
+ "2 134 1.657910 -0.109289 \n",
714
+ "3 79 0.556606 0.132247 \n",
715
+ "4 118 0.556606 -0.830501 \n",
716
+ "... ... ... ... \n",
717
+ "7968 44 -0.324438 0.124891 \n",
718
+ "7969 110 1.217388 0.008668 \n",
719
+ "7970 73 0.336345 0.021405 \n",
720
+ "7971 16 -1.205481 -1.182617 \n",
721
+ "7972 72 -0.324438 -1.005054 \n",
722
+ "\n",
723
+ "[7973 rows x 92 columns]"
724
+ ]
725
+ },
726
+ "execution_count": 3,
727
+ "metadata": {},
728
+ "output_type": "execute_result"
729
+ }
730
+ ],
731
+ "source": [
732
+ "features_df = pd.read_csv('7k_w_descriptors.csv')\n",
733
+ "features_df"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": 5,
739
+ "id": "49998b8a-3925-4383-917a-116f70187d46",
740
+ "metadata": {},
741
+ "outputs": [
742
+ {
743
+ "name": "stdout",
744
+ "output_type": "stream",
745
+ "text": [
746
+ "0\n"
747
+ ]
748
+ }
749
+ ],
750
+ "source": [
751
+ "old_len = len(features_df)\n",
752
+ "new_len = len(features_df.drop_duplicates())\n",
753
+ "print(new_len - old_len)"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "code",
758
+ "execution_count": 6,
759
+ "id": "c2f08ca9-21f6-4a79-ab94-80556b8dab1d",
760
+ "metadata": {},
761
+ "outputs": [
762
+ {
763
+ "name": "stderr",
764
+ "output_type": "stream",
765
+ "text": [
766
+ "100%|█████████████████████████████████████| 6378/6378 [00:01<00:00, 3382.49it/s]\n",
767
+ "100%|█████████████████████████████████████| 1595/1595 [00:00<00:00, 3554.96it/s]\n"
768
+ ]
769
+ }
770
+ ],
771
+ "source": [
772
+ "import torch\n",
773
+ "from tqdm import tqdm\n",
774
+ "import copy\n",
775
+ "from sklearn.model_selection import train_test_split\n",
776
+ "\n",
777
+ "def create_splits(df):\n",
778
+ " train, test = train_test_split(df, test_size=0.2)\n",
779
+ " return train, test\n",
780
+ "\n",
781
+ "def create_samples(df, features):\n",
782
+ " samples = []\n",
783
+ " features_without_smiles = copy.deepcopy(features)\n",
784
+ " features_without_smiles.remove('Smiles')\n",
785
+ " for i, row in tqdm(df.iterrows(), total=len(df)):\n",
786
+ " properties = torch.Tensor(row[features_without_smiles].to_list())\n",
787
+ " sample = {'Smiles': row['Smiles'], 'property_tensor': properties}\n",
788
+ " samples.append(sample)\n",
789
+ " return samples\n",
790
+ "\n",
791
+ "train, val = create_splits(features_df.reset_index(drop=True))\n",
792
+ "\n",
793
+ "train = train.reset_index(drop=True)\n",
794
+ "val = val.reset_index(drop=True)\n",
795
+ "\n",
796
+ "train_list = create_samples(train, new_features)\n",
797
+ "val_list = create_samples(val, new_features)"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "code",
802
+ "execution_count": 7,
803
+ "id": "2fdb3171-deda-4c1f-ae4b-853d781ffdd5",
804
+ "metadata": {},
805
+ "outputs": [
806
+ {
807
+ "name": "stderr",
808
+ "output_type": "stream",
809
+ "text": [
810
+ "100%|████████████████████████████████████████| 20/20 [00:00<00:00, 74764.78it/s]\n"
811
+ ]
812
+ }
813
+ ],
814
+ "source": [
815
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
816
+ "\n",
817
+ "prop_vectors = [el['property_tensor'] for el in train_list[:20]]\n",
818
+ "\n",
819
+ "sim_matrix = cosine_similarity(prop_vectors)\n",
820
+ " \n",
821
+ "n = len(prop_vectors)\n",
822
+ "positive_pairs, negative_candidates = [], []\n",
823
+ "sims = []\n",
824
+ "\n",
825
+ "positive_threshold = 0.9\n",
826
+ "negative_threshold = 0.2\n",
827
+ "\n",
828
+ "for i in tqdm(range(n)):\n",
829
+ " for j in range(i + 1, n):\n",
830
+ " sim = sim_matrix[i, j]\n",
831
+ "\n",
832
+ " if sim > positive_threshold:\n",
833
+ " positive_pairs.append((i, j, sim))\n",
834
+ " elif sim < negative_threshold:\n",
835
+ " negative_candidates.append((i, j, sim))\n",
836
+ " sims.append(float(sim))\n"
837
+ ]
838
+ },
839
+ {
840
+ "cell_type": "code",
841
+ "execution_count": null,
842
+ "id": "54f29e98-7c32-441c-bb1b-cdaf3fd1df49",
843
+ "metadata": {},
844
+ "outputs": [],
845
+ "source": [
846
+ "len(positive_pairs), len(negative_candidates)"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": null,
852
+ "id": "22e0f46e-2673-4840-95fd-f98914e57b78",
853
+ "metadata": {},
854
+ "outputs": [],
855
+ "source": [
856
+ "from matplotlib import pyplot as plt\n",
857
+ "\n",
858
+ "plt.plot(sims)"
859
+ ]
860
+ },
861
+ {
862
+ "cell_type": "code",
863
+ "execution_count": 8,
864
+ "id": "79e7e873-7950-4123-ab13-299360ae19ca",
865
+ "metadata": {},
866
+ "outputs": [],
867
+ "source": [
868
+ "import os\n",
869
+ "import torch\n",
870
+ "from torch.utils.data import Dataset, DataLoader\n",
871
+ "import torch.nn as nn\n",
872
+ "import torch.nn.functional as F\n",
873
+ "from transformers import BertConfig, BertModel, AutoTokenizer\n",
874
+ "import pickle\n",
875
+ "import numpy as np\n",
876
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
877
+ "\n",
878
+ "def global_ap(x):\n",
879
+ " return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)\n",
880
+ "\n",
881
+ "class SimSonEncoder(nn.Module):\n",
882
+ " def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):\n",
883
+ " super(SimSonEncoder, self).__init__()\n",
884
+ " self.config = config\n",
885
+ " self.max_len = max_len\n",
886
+ " \n",
887
+ " self.bert = BertModel(config, add_pooling_layer=False)\n",
888
+ " \n",
889
+ " self.linear = nn.Linear(config.hidden_size, max_len)\n",
890
+ " self.dropout = nn.Dropout(dropout)\n",
891
+ " \n",
892
+ " def forward(self, input_ids, attention_mask=None):\n",
893
+ " if attention_mask is None:\n",
894
+ " attention_mask = input_ids.ne(0)\n",
895
+ " \n",
896
+ " outputs = self.bert(\n",
897
+ " input_ids=input_ids,\n",
898
+ " attention_mask=attention_mask\n",
899
+ " )\n",
900
+ " \n",
901
+ " hidden_states = outputs.last_hidden_state\n",
902
+ " \n",
903
+ " hidden_states = self.dropout(hidden_states)\n",
904
+ " \n",
905
+ " pooled = global_ap(hidden_states)\n",
906
+ " \n",
907
+ " out = self.linear(pooled)\n",
908
+ " \n",
909
+ " return out\n",
910
+ "\n",
911
+ "def initialize_model_and_tokenizer():\n",
912
+ " \"\"\"Initialize BERT model from config and ChemBERTa tokenizer\"\"\"\n",
913
+ " \n",
914
+ " \n",
915
+ " tokenizer = AutoTokenizer.from_pretrained(\"DeepChem/ChemBERTa-77M-MTR\")\n",
916
+ " config = BertConfig(\n",
917
+ " vocab_size=tokenizer.vocab_size,\n",
918
+ " hidden_size=768,\n",
919
+ " num_hidden_layers=4,\n",
920
+ " num_attention_heads=12,\n",
921
+ " intermediate_size=2048,\n",
922
+ " max_position_embeddings=512,\n",
923
+ " )\n",
924
+ " model = SimSonEncoder(config=config, max_len=512).cuda()\n",
925
+ " return model, tokenizer\n"
926
+ ]
927
+ },
928
+ {
929
+ "cell_type": "code",
930
+ "execution_count": 9,
931
+ "id": "8a3adaff-da65-46b4-b9ee-95851d786a67",
932
+ "metadata": {},
933
+ "outputs": [],
934
+ "source": [
935
+ "import time\n",
936
+ "\n",
937
+ "\n",
938
+ "class MolecularContrastiveDataset(Dataset):\n",
939
+ " def __init__(self, data_list, tokenizer, positive_threshold=0.9, cache_path=None, split_type='train'):\n",
940
+ " \"\"\"\n",
941
+ " Dataset that only contains positive pairs for NT-Xent contrastive learning\n",
942
+ " \"\"\"\n",
943
+ " self.data_list = data_list\n",
944
+ " self.tokenizer = tokenizer\n",
945
+ " self.positive_threshold = positive_threshold\n",
946
+ " self.cache_path = cache_path\n",
947
+ " self.split_type = split_type\n",
948
+ "\n",
949
+ " # Load or compute pairs\n",
950
+ " if cache_path and os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:\n",
951
+ " print(f\"Loading cached pairs from {cache_path}\")\n",
952
+ " self._load_pairs()\n",
953
+ " else:\n",
954
+ " print(\"Computing positive pairs only...\")\n",
955
+ " self._compute_positive_pairs()\n",
956
+ " if cache_path:\n",
957
+ " self._save_pairs()\n",
958
+ " \n",
959
+ " def _compute_positive_pairs(self):\n",
960
+ " \"\"\"\n",
961
+ " Compute ONLY positive pairs based on descriptor similarity\n",
962
+ " \"\"\"\n",
963
+ " # --- 1. Cosine-similarity matrix ---------------------------------------\n",
964
+ " prop_vectors = torch.stack(\n",
965
+ " [item['property_tensor'] for item in self.data_list]\n",
966
+ " ).numpy()\n",
967
+ " sim_matrix = cosine_similarity(prop_vectors)\n",
968
+ "\n",
969
+ " n = len(self.data_list)\n",
970
+ " positive_pairs = []\n",
971
+ " pairs_per_molecule = 1 # STRICTLY ONE FOR CREATING PROPER NEGATIVE PAIRS\n",
972
+ " current_pairs_per_molecule = 0\n",
973
+ " # --- 2. Collect only positive pairs ------------------------------------\n",
974
+ " print(f'Collecting positive pairs with similarity threshold {self.positive_threshold}')\n",
975
+ " for i in tqdm(range(n)):\n",
976
+ " for j in range(i + 1, n):\n",
977
+ " sim = sim_matrix[i, j]\n",
978
+ " if sim > self.positive_threshold:\n",
979
+ " positive_pairs.append((i, j, sim))\n",
980
+ " current_pairs_per_molecule += 1\n",
981
+ " if current_pairs_per_molecule > pairs_per_molecule:\n",
982
+ " current_pairs_per_molecule = 0\n",
983
+ " break\n",
984
+ "\n",
985
+ " # --- 3. Store only positive pairs --------------------------------------\n",
986
+ " if len(positive_pairs) == 0:\n",
987
+ " raise ValueError(\"No positive pairs found – lower the positive_threshold.\")\n",
988
+ "\n",
989
+ " # No shuffling - we want consistent positive pairs\n",
990
+ " self.pairs = [(i, j) for i, j, _ in positive_pairs]\n",
991
+ " self.descriptor_similarities = [sim for _, _, sim in positive_pairs]\n",
992
+ "\n",
993
+ " print(f\"Generated {len(self.pairs)} positive pairs\")\n",
994
+ "\n",
995
+ " def _save_pairs(self):\n",
996
+ " \"\"\"Save computed pairs to cache file\"\"\"\n",
997
+ " cache_data = {\n",
998
+ " 'pairs': self.pairs,\n",
999
+ " 'descriptor_similarities': self.descriptor_similarities\n",
1000
+ " }\n",
1001
+ " with open(self.cache_path, 'wb') as f:\n",
1002
+ " pickle.dump(cache_data, f)\n",
1003
+ " print(f\"Cached pairs saved to {self.cache_path}\")\n",
1004
+ " \n",
1005
+ " def _load_pairs(self):\n",
1006
+ " \"\"\"Load pairs from cache file\"\"\"\n",
1007
+ " with open(self.cache_path, 'rb') as f:\n",
1008
+ " cache_data = pickle.load(f)\n",
1009
+ " \n",
1010
+ " self.pairs = cache_data['pairs']\n",
1011
+ " self.descriptor_similarities = cache_data['descriptor_similarities']\n",
1012
+ " \n",
1013
+ " def __len__(self):\n",
1014
+ " return len(self.pairs)\n",
1015
+ " \n",
1016
+ " def __getitem__(self, idx):\n",
1017
+ " i, j = self.pairs[idx]\n",
1018
+ " desc_sim = self.descriptor_similarities[idx]\n",
1019
+ " \n",
1020
+ " # Get SMILES for both molecules\n",
1021
+ " smiles_i = self.data_list[i]['Smiles']\n",
1022
+ " smiles_j = self.data_list[j]['Smiles']\n",
1023
+ " if self.split_type == 'val':\n",
1024
+ " print(f'POSITIVE PAIR SMILES: \\n{smiles_i} \\n {smiles_j}')\n",
1025
+ " # Tokenize SMILES\n",
1026
+ " tokens_i = self.tokenizer(\n",
1027
+ " smiles_i, \n",
1028
+ " return_tensors='pt', \n",
1029
+ " padding='max_length', \n",
1030
+ " truncation=True, \n",
1031
+ " max_length=256\n",
1032
+ " )\n",
1033
+ " tokens_j = self.tokenizer(\n",
1034
+ " smiles_j, \n",
1035
+ " return_tensors='pt', \n",
1036
+ " padding='max_length', \n",
1037
+ " truncation=True, \n",
1038
+ " max_length=256\n",
1039
+ " )\n",
1040
+ " \n",
1041
+ " # Remove batch dimension\n",
1042
+ " tokens_i = {key: val.squeeze(0) for key, val in tokens_i.items()}\n",
1043
+ " tokens_j = {key: val.squeeze(0) for key, val in tokens_j.items()}\n",
1044
+ " \n",
1045
+ " # Get property vectors\n",
1046
+ " prop_vec_i = self.data_list[i]['property_tensor']\n",
1047
+ " prop_vec_j = self.data_list[j]['property_tensor']\n",
1048
+ " \n",
1049
+ " return {\n",
1050
+ " 'tokens_i': tokens_i,\n",
1051
+ " 'tokens_j': tokens_j,\n",
1052
+ " 'descriptor_similarity': torch.tensor(desc_sim, dtype=torch.float32),\n",
1053
+ " 'property_tensor_i': prop_vec_i,\n",
1054
+ " 'property_tensor_j': prop_vec_j\n",
1055
+ " }\n",
1056
+ "\n",
1057
+ "\n",
1058
+ "def contrastive_collate_fn(batch):\n",
1059
+ " \"\"\"\n",
1060
+ " Collate function that creates proper NT-Xent batches:\n",
1061
+ " - Element 0 and 1 are positive pairs\n",
1062
+ " - Element 2 and 3 are positive pairs \n",
1063
+ " - etc.\n",
1064
+ " \"\"\"\n",
1065
+ " batch_size = len(batch)\n",
1066
+ " \n",
1067
+ " # Ensure even batch size for proper pairing\n",
1068
+ " if batch_size % 2 != 0:\n",
1069
+ " batch = batch[:-1] # Drop last element if odd\n",
1070
+ " batch_size = len(batch)\n",
1071
+ " \n",
1072
+ " # Interleave: [sample1_i, sample1_j, sample2_i, sample2_j, ...]\n",
1073
+ " tokens_list = []\n",
1074
+ " desc_similarities = []\n",
1075
+ " \n",
1076
+ " for i in range(0, batch_size, 1):\n",
1077
+ " # Add first molecule of pair i\n",
1078
+ " tokens_list.append(batch[i]['tokens_i'])\n",
1079
+ " desc_similarities.append(batch[i]['descriptor_similarity'])\n",
1080
+ " \n",
1081
+ " # Add second molecule of pair i (positive pair)\n",
1082
+ " tokens_list.append(batch[i]['tokens_j'])\n",
1083
+ " desc_similarities.append(batch[i]['descriptor_similarity']) # Same similarity for both elements in pair\n",
1084
+ " \n",
1085
+ " # Stack all tokens\n",
1086
+ " tokens = {}\n",
1087
+ " for key in tokens_list[0].keys():\n",
1088
+ " tokens[key] = torch.stack([item[key] for item in tokens_list])\n",
1089
+ " \n",
1090
+ " desc_similarities_tensor = torch.stack(desc_similarities)\n",
1091
+ " \n",
1092
+ " return {\n",
1093
+ " 'tokens': tokens,\n",
1094
+ " 'descriptor_similarities': desc_similarities_tensor,\n",
1095
+ " }\n",
1096
+ "\n",
1097
+ "\n",
1098
+ "def create_dataloaders(train_list, val_list, tokenizer, batch_size=32, \n",
1099
+ " positive_threshold=0.85, cache_dir=\"cache\"):\n",
1100
+ " \"\"\"Create train and validation dataloaders for NT-Xent\"\"\"\n",
1101
+ " os.makedirs(cache_dir, exist_ok=True)\n",
1102
+ " \n",
1103
+ " # Ensure even batch size for proper pairing\n",
1104
+ " if batch_size % 2 != 0:\n",
1105
+ " batch_size += 1\n",
1106
+ " print(f\"Adjusted batch_size to {batch_size} (must be even for NT-Xent)\")\n",
1107
+ " \n",
1108
+ " train_cache = os.path.join(cache_dir, 'train_positive_pairs.pkl')\n",
1109
+ " val_cache = os.path.join(cache_dir, 'val_positive_pairs.pkl')\n",
1110
+ " \n",
1111
+ " train_dataset = MolecularContrastiveDataset(\n",
1112
+ " train_list, tokenizer, positive_threshold=positive_threshold, cache_path=train_cache\n",
1113
+ " )\n",
1114
+ " val_dataset = MolecularContrastiveDataset(\n",
1115
+ " val_list, tokenizer, positive_threshold=positive_threshold, cache_path=val_cache, split_type='val',\n",
1116
+ " )\n",
1117
+ " \n",
1118
+ " train_loader = DataLoader(\n",
1119
+ " train_dataset, batch_size=batch_size, shuffle=True, collate_fn=contrastive_collate_fn, drop_last=True, pin_memory=True\n",
1120
+ " )\n",
1121
+ " val_loader = DataLoader(\n",
1122
+ " val_dataset, batch_size=batch_size, shuffle=False, collate_fn=contrastive_collate_fn, drop_last=True, pin_memory=True\n",
1123
+ " )\n",
1124
+ " \n",
1125
+ " return train_loader, val_loader\n",
1126
+ "\n"
1127
+ ]
1128
+ },
1129
+ {
1130
+ "cell_type": "code",
1131
+ "execution_count": 10,
1132
+ "id": "f956a50b-85a5-49df-b7c6-6e40dce160e1",
1133
+ "metadata": {},
1134
+ "outputs": [
1135
+ {
1136
+ "name": "stdout",
1137
+ "output_type": "stream",
1138
+ "text": [
1139
+ "Model initialized with 23,299,840 trainable parameters\n"
1140
+ ]
1141
+ }
1142
+ ],
1143
+ "source": [
1144
+ "def nt_xent_loss_with_temp_scaling(embeddings1, embeddings2, descriptor_similarity, base_temp=0.02):\n",
1145
+ " batch_size = embeddings1.shape[0]\n",
1146
+ " device = embeddings1.device\n",
1147
+ " #individual_temperatures = sigmoid_temp_scaling(descriptor_similarity, base_temp)\n",
1148
+ " #temperature = individual_temperatures.mean() # Single temperature for the whole batch\n",
1149
+ " temperature = base_temp\n",
1150
+ " # Normalize projections\n",
1151
+ " z_i = F.normalize(embeddings1, p=2, dim=1)\n",
1152
+ " z_j = F.normalize(embeddings2, p=2, dim=1)\n",
1153
+ " \n",
1154
+ " # Concatenate for similarity matrix calculation\n",
1155
+ " representations = torch.cat([z_i, z_j], dim=0)\n",
1156
+ " # Calculate cosine similarity between all pairs\n",
1157
+ " similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)\n",
1158
+ " #similarity_matrix = torch.clamp(similarity_matrix, min=-0.999, max=0.999)\n",
1159
+ " sim_ij = torch.diag(similarity_matrix, batch_size)\n",
1160
+ " sim_ji = torch.diag(similarity_matrix, -batch_size)\n",
1161
+ " positives = torch.cat([sim_ij, sim_ji], dim=0)\n",
1162
+ " \n",
1163
+ " # Create a mask to exclude self-comparisons\n",
1164
+ " nominator = torch.exp(positives / temperature)\n",
1165
+ " mask = (~torch.eye(batch_size * 2, batch_size * 2, dtype=torch.bool, device=device)).float()\n",
1166
+ " denominator = mask * torch.exp(similarity_matrix / temperature)\n",
1167
+ " \n",
1168
+ " # Calculate the final loss\n",
1169
+ " loss = -torch.log(nominator / torch.sum(denominator, dim=1))\n",
1170
+ " if torch.isnan(loss).any():\n",
1171
+ " print(similarity_matrix)\n",
1172
+ " print(f\"Temperature: {temperature}\")\n",
1173
+ " print(f\"Nominator range: {nominator.min().item():.6f} to {nominator.max().item():.6f}\")\n",
1174
+ " \n",
1175
+ " return torch.sum(loss) / (2 * batch_size)\n",
1176
+ "\n",
1177
+ "\n",
1178
+ "def sigmoid_temp_scaling(descriptor_similarity, base_temp=0.05, steepness=10.0, midpoint=0.5):\n",
1179
+ " \"\"\"Smooth sigmoid-based temperature scaling\"\"\"\n",
1180
+ " sigmoid_factor = torch.sigmoid(steepness * (descriptor_similarity - midpoint))\n",
1181
+ " temperature = base_temp * (2.0 - sigmoid_factor)\n",
1182
+ " return temperature\n",
1183
+ "\n",
1184
+ "\n",
1185
+ "def train_step(batch, model, optimizer, device, scheduler, base_temp=0.1):\n",
1186
+ " \"\"\"Single training step for NT-Xent\"\"\"\n",
1187
+ " model.train()\n",
1188
+ " optimizer.zero_grad()\n",
1189
+ " \n",
1190
+ " # Move batch to device\n",
1191
+ " tokens = {k: v.to(device) for k, v in batch['tokens'].items()}\n",
1192
+ " desc_similarities = batch['descriptor_similarities'].to(device)\n",
1193
+ " \n",
1194
+ " # Forward pass - get embeddings for all samples\n",
1195
+ " outputs = model(**tokens) # i1, j1, i2, j2 ...\n",
1196
+ " embeddings = outputs\n",
1197
+ " \n",
1198
+ " # Split embeddings: even indices are embeddings1, odd indices are embeddings2\n",
1199
+ " embeddings1 = embeddings[::2] # [0, 2, 4, ...]\n",
1200
+ " embeddings2 = embeddings[1::2] # [1, 3, 5, ...]\n",
1201
+ " \n",
1202
+ " # Get descriptor similarities for each pair (take every other one since they're duplicated)\n",
1203
+ " pair_desc_similarities = desc_similarities[::2]\n",
1204
+ " #print(f'FIRST TRAIN EMBED: {embeddings1}')\n",
1205
+ " #print(f'SECOND TRAIN EMBED: {embeddings2}')\n",
1206
+ " #print(f'COSINE SIM BETWEEN THEM TRAIN: {F.cosine_similarity(embeddings1, embeddings2, dim=1)}')\n",
1207
+ " # Calculate NT-Xent loss\n",
1208
+ " loss = nt_xent_loss_with_temp_scaling(embeddings1, embeddings2, pair_desc_similarities, base_temp=base_temp)\n",
1209
+ " \n",
1210
+ " # Backward pass\n",
1211
+ " loss.backward()\n",
1212
+ " optimizer.step()\n",
1213
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
1214
+ " scheduler.step()\n",
1215
+ " return loss.item()\n",
1216
+ "\n",
1217
+ "def val_step(batch, model, device, base_temp=0.1):\n",
1218
+ " \"\"\"Single validation step for NT-Xent\"\"\"\n",
1219
+ " model.eval()\n",
1220
+ " with torch.no_grad():\n",
1221
+ " # Move batch to device\n",
1222
+ " tokens = {k: v.to(device) for k, v in batch['tokens'].items()}\n",
1223
+ " desc_similarities = batch['descriptor_similarities'].to(device)\n",
1224
+ " \n",
1225
+ " # Forward pass\n",
1226
+ " outputs = model(**tokens)\n",
1227
+ " embeddings = outputs\n",
1228
+ " \n",
1229
+ " # Split embeddings\n",
1230
+ " embeddings1 = embeddings[::2]\n",
1231
+ " embeddings2 = embeddings[1::2]\n",
1232
+ " \n",
1233
+ " # Get descriptor similarities for pairs\n",
1234
+ " pair_desc_similarities = desc_similarities[::2]\n",
1235
+ " \n",
1236
+ " print(f'FIRST VAL EMBED: {embeddings1}')\n",
1237
+ " print(f'SECOND VAL EMBED: {embeddings2}')\n",
1238
+ " print(f'COSINE SIM BETWEEN THEM: {F.cosine_similarity(embeddings1, embeddings2, dim=1)}')\n",
1239
+ " #print(f'SECOND VAL EMBED: {embeddings2}')\n",
1240
+ " loss = nt_xent_loss_with_temp_scaling(embeddings1, embeddings2, pair_desc_similarities, base_temp=base_temp)\n",
1241
+ " print(f'VAL LOSS: {loss}')\n",
1242
+ " \n",
1243
+ " return loss.item()\n",
1244
+ "\n",
1245
+ "def train_epoch(train_loader, model, optimizer, scheduler, base_temp=0.01):\n",
1246
+ " \"\"\"Train for one epoch\"\"\"\n",
1247
+ " total_loss = 0\n",
1248
+ " num_batches = 0\n",
1249
+ " \n",
1250
+ " progress_bar = tqdm(train_loader, desc=\"Training\")\n",
1251
+ " \n",
1252
+ " for batch in progress_bar:\n",
1253
+ " loss = train_step(batch, model, optimizer, 'cuda', scheduler, base_temp=base_temp)\n",
1254
+ " total_loss += loss\n",
1255
+ " num_batches += 1\n",
1256
+ " \n",
1257
+ " # Calculate running average loss\n",
1258
+ " avg_loss = total_loss / num_batches\n",
1259
+ " \n",
1260
+ " # Update progress bar with current loss info\n",
1261
+ " progress_bar.set_postfix({\n",
1262
+ " 'Loss': f'{loss:.4f}',\n",
1263
+ " 'Avg Loss': f'{avg_loss:.4f}'\n",
1264
+ " })\n",
1265
+ " \n",
1266
+ " return total_loss / num_batches if num_batches > 0 else 0\n",
1267
+ "\n",
1268
+ "\n",
1269
+ "def validate_epoch(val_loader, model, base_temp=0.01):\n",
1270
+ " \"\"\"Validate for one epoch\"\"\"\n",
1271
+ " total_loss = 0\n",
1272
+ " num_batches = 0\n",
1273
+ " print('nah twin')\n",
1274
+ " return 0\n",
1275
+ " for batch in val_loader:\n",
1276
+ " loss = val_step(batch, model, 'cuda', base_temp=base_temp)\n",
1277
+ " total_loss += loss\n",
1278
+ " num_batches += 1\n",
1279
+ " \n",
1280
+ " return total_loss / num_batches if num_batches > 0 else 0\n",
1281
+ "\n",
1282
+ "def training_loop(train_loader, val_loader, model, tokenizer, epochs=50, patience=5, lr=1e-4, base_temp=0.02,\n",
1283
+ " device_name='cuda', save_path='best_model.pt'):\n",
1284
+ " \"\"\"Main training loop with early stopping\"\"\"\n",
1285
+ " device = torch.device(device_name if torch.cuda.is_available() else 'cpu')\n",
1286
+ " print(f\"Using device: {device}\")\n",
1287
+ " \n",
1288
+ " # Initialize model and optimizer\n",
1289
+ " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
1290
+ " optimizer.zero_grad()\n",
1291
+ "\n",
1292
+ " total_steps = epochs * len(train_loader)\n",
1293
+ " scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_mult=1, T_0=total_steps)\n",
1294
+ " # Early stopping variables\n",
1295
+ " best_val_loss = float('inf')\n",
1296
+ " no_improve_epochs = 0\n",
1297
+ " \n",
1298
+ " print(\"Starting training...\")\n",
1299
+ " \n",
1300
+ " for epoch in range(epochs):\n",
1301
+ " # Training\n",
1302
+ " with torch.autocast(dtype=torch.float16, device_type='cuda'):\n",
1303
+ " train_loss = train_epoch(train_loader, model, optimizer, scheduler, base_temp=base_temp)\n",
1304
+ " print('END TRAIN')\n",
1305
+ " # Validation\n",
1306
+ " val_loss = validate_epoch(val_loader, model)\n",
1307
+ " \n",
1308
+ " print(f\"Epoch {epoch + 1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}\")\n",
1309
+ " \n",
1310
+ " # Early stopping check\n",
1311
+ " if val_loss < best_val_loss:\n",
1312
+ " best_val_loss = val_loss\n",
1313
+ " no_improve_epochs = 0\n",
1314
+ " # Save best model\n",
1315
+ " torch.save(model.state_dict(), save_path)\n",
1316
+ " print(f\"New best model saved with val loss: {val_loss:.4f}\")\n",
1317
+ " else:\n",
1318
+ " no_improve_epochs += 1\n",
1319
+ " print(f\"No improvement for {no_improve_epochs} epochs\")\n",
1320
+ " \n",
1321
+ " if no_improve_epochs >= patience:\n",
1322
+ " print(f\"Early stopping triggered after {epoch + 1} epochs\")\n",
1323
+ " break\n",
1324
+ " \n",
1325
+ " # Load best model\n",
1326
+ " print(f\"Loading best model from {save_path}\")\n",
1327
+ " model.load_state_dict(torch.load(save_path))\n",
1328
+ " model.eval()\n",
1329
+ " \n",
1330
+ " print(f\"Training completed. Best validation loss: {best_val_loss:.4f}\")\n",
1331
+ "\n",
1332
+ "\n",
1333
+ "model, tokenizer = initialize_model_and_tokenizer()\n",
1334
+ "#model.load_state_dict(torch.load('/home/jovyan/simson_training_bolgov/regression/actual_encoder_state.pkl', weights_only=False))\n",
1335
+ "print(f\"Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters\")\n"
1336
+ ]
1337
+ },
1338
+ {
1339
+ "cell_type": "code",
1340
+ "execution_count": 11,
1341
+ "id": "c73e2bba-59c1-4b41-b2ff-235526dd2912",
1342
+ "metadata": {},
1343
+ "outputs": [],
1344
+ "source": [
1345
+ "!rm -rf cache"
1346
+ ]
1347
+ },
1348
+ {
1349
+ "cell_type": "code",
1350
+ "execution_count": null,
1351
+ "id": "0072c8f5-c5e9-4590-9544-c73cf1fac1e8",
1352
+ "metadata": {},
1353
+ "outputs": [
1354
+ {
1355
+ "name": "stdout",
1356
+ "output_type": "stream",
1357
+ "text": [
1358
+ "Computing positive pairs only...\n",
1359
+ "Collecting positive pairs with similarity threshold 0.8\n"
1360
+ ]
1361
+ },
1362
+ {
1363
+ "name": "stderr",
1364
+ "output_type": "stream",
1365
+ "text": [
1366
+ "100%|████████████████████████████████████| 6378/6378 [00:00<00:00, 54740.22it/s]\n"
1367
+ ]
1368
+ },
1369
+ {
1370
+ "name": "stdout",
1371
+ "output_type": "stream",
1372
+ "text": [
1373
+ "Generated 12534 positive pairs\n",
1374
+ "Cached pairs saved to cache/train_positive_pairs.pkl\n",
1375
+ "Computing positive pairs only...\n",
1376
+ "Collecting positive pairs with similarity threshold 0.8\n"
1377
+ ]
1378
+ },
1379
+ {
1380
+ "name": "stderr",
1381
+ "output_type": "stream",
1382
+ "text": [
1383
+ "100%|█████████████████████████████████████| 100/100 [00:00<00:00, 200780.47it/s]\n"
1384
+ ]
1385
+ },
1386
+ {
1387
+ "name": "stdout",
1388
+ "output_type": "stream",
1389
+ "text": [
1390
+ "Generated 138 positive pairs\n",
1391
+ "Cached pairs saved to cache/val_positive_pairs.pkl\n",
1392
+ "Using device: cuda\n",
1393
+ "Starting training...\n"
1394
+ ]
1395
+ },
1396
+ {
1397
+ "name": "stderr",
1398
+ "output_type": "stream",
1399
+ "text": [
1400
+ "Training: 100%|█| 1566/1566 [00:31<00:00, 50.05it/s, Loss=1.1129, Avg Loss=1.528\n"
1401
+ ]
1402
+ },
1403
+ {
1404
+ "name": "stdout",
1405
+ "output_type": "stream",
1406
+ "text": [
1407
+ "END TRAIN\n",
1408
+ "nah twin\n",
1409
+ "Epoch 1/10: Train Loss = 1.5288, Val Loss = 0.0000\n",
1410
+ "New best model saved with val loss: 0.0000\n"
1411
+ ]
1412
+ },
1413
+ {
1414
+ "name": "stderr",
1415
+ "output_type": "stream",
1416
+ "text": [
1417
+ "Training: 100%|█| 1566/1566 [00:30<00:00, 50.76it/s, Loss=2.1831, Avg Loss=2.190\n"
1418
+ ]
1419
+ },
1420
+ {
1421
+ "name": "stdout",
1422
+ "output_type": "stream",
1423
+ "text": [
1424
+ "END TRAIN\n",
1425
+ "nah twin\n",
1426
+ "Epoch 2/10: Train Loss = 2.1905, Val Loss = 0.0000\n",
1427
+ "No improvement for 1 epochs\n"
1428
+ ]
1429
+ },
1430
+ {
1431
+ "name": "stderr",
1432
+ "output_type": "stream",
1433
+ "text": [
1434
+ "Training: 100%|█| 1566/1566 [00:30<00:00, 50.69it/s, Loss=2.7081, Avg Loss=2.708\n"
1435
+ ]
1436
+ },
1437
+ {
1438
+ "name": "stdout",
1439
+ "output_type": "stream",
1440
+ "text": [
1441
+ "END TRAIN\n",
1442
+ "nah twin\n",
1443
+ "Epoch 3/10: Train Loss = 2.7081, Val Loss = 0.0000\n",
1444
+ "No improvement for 2 epochs\n"
1445
+ ]
1446
+ },
1447
+ {
1448
+ "name": "stderr",
1449
+ "output_type": "stream",
1450
+ "text": [
1451
+ "Training: 100%|█| 1566/1566 [00:31<00:00, 50.37it/s, Loss=2.7081, Avg Loss=2.708\n"
1452
+ ]
1453
+ },
1454
+ {
1455
+ "name": "stdout",
1456
+ "output_type": "stream",
1457
+ "text": [
1458
+ "END TRAIN\n",
1459
+ "nah twin\n",
1460
+ "Epoch 4/10: Train Loss = 2.7081, Val Loss = 0.0000\n",
1461
+ "No improvement for 3 epochs\n"
1462
+ ]
1463
+ },
1464
+ {
1465
+ "name": "stderr",
1466
+ "output_type": "stream",
1467
+ "text": [
1468
+ "Training: 100%|█| 1566/1566 [00:31<00:00, 50.40it/s, Loss=2.7081, Avg Loss=2.708\n"
1469
+ ]
1470
+ },
1471
+ {
1472
+ "name": "stdout",
1473
+ "output_type": "stream",
1474
+ "text": [
1475
+ "END TRAIN\n",
1476
+ "nah twin\n",
1477
+ "Epoch 5/10: Train Loss = 2.7081, Val Loss = 0.0000\n",
1478
+ "No improvement for 4 epochs\n"
1479
+ ]
1480
+ },
1481
+ {
1482
+ "name": "stderr",
1483
+ "output_type": "stream",
1484
+ "text": [
1485
+ "Training: 13%|▏| 198/1566 [00:04<00:27, 50.58it/s, Loss=2.7081, Avg Loss=2.7081"
1486
+ ]
1487
+ }
1488
+ ],
1489
+ "source": [
1490
+ "train_loader, val_loader = create_dataloaders(\n",
1491
+ " train_list, val_list[:100], tokenizer, \n",
1492
+ " batch_size=8, positive_threshold=0.8\n",
1493
+ ")\n",
1494
+ "\n",
1495
+ "training_loop(\n",
1496
+ " train_loader, val_loader, model, tokenizer,\n",
1497
+ " epochs=10, patience=5, lr=1e-5, \n",
1498
+ " device_name='cuda', base_temp=0.1\n",
1499
+ ")\n"
1500
+ ]
1501
+ },
1502
+ {
1503
+ "cell_type": "code",
1504
+ "execution_count": null,
1505
+ "id": "58343b16-1bdb-4476-ac61-e797fbc661d2",
1506
+ "metadata": {},
1507
+ "outputs": [],
1508
+ "source": [
1509
+ "print(train_list[:5], '\\n\\n', val_list[:5])"
1510
+ ]
1511
+ },
1512
+ {
1513
+ "cell_type": "code",
1514
+ "execution_count": null,
1515
+ "id": "47561022-5f57-4b7b-b903-ef1f8773f903",
1516
+ "metadata": {},
1517
+ "outputs": [],
1518
+ "source": []
1519
+ },
1520
+ {
1521
+ "cell_type": "code",
1522
+ "execution_count": null,
1523
+ "id": "5fcef978-3630-4201-9301-6963a8560517",
1524
+ "metadata": {},
1525
+ "outputs": [],
1526
+ "source": []
1527
+ }
1528
+ ],
1529
+ "metadata": {
1530
+ "kernelspec": {
1531
+ "display_name": "Python [conda env:.mlspace-bolgov_simson_training]",
1532
+ "language": "python",
1533
+ "name": "conda-env-.mlspace-bolgov_simson_training-py"
1534
+ },
1535
+ "language_info": {
1536
+ "codemirror_mode": {
1537
+ "name": "ipython",
1538
+ "version": 3
1539
+ },
1540
+ "file_extension": ".py",
1541
+ "mimetype": "text/x-python",
1542
+ "name": "python",
1543
+ "nbconvert_exporter": "python",
1544
+ "pygments_lexer": "ipython3",
1545
+ "version": "3.12.11"
1546
+ }
1547
+ },
1548
+ "nbformat": 4,
1549
+ "nbformat_minor": 5
1550
+ }
simson_modeling/.ipynb_checkpoints/simson_ddp_train-checkpoint.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # 1. IMPORTS
3
+ # ==============================================================================
4
+ import os
5
+ import warnings
6
+ import wandb
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.optim as optim
11
+ import torch.nn.functional as F
12
+ from torch.utils.data import DataLoader, Dataset
13
+ import numpy as np
14
+ from tqdm import tqdm
15
+ from rdkit import Chem, RDLogger
16
+ from datasets import load_dataset, load_from_disk
17
+ from transformers import AutoTokenizer, BertModel, BertConfig
18
+ import pandas as pd
19
+
20
+ # ==============================================================================
21
+ # 2. INITIAL SETUP
22
+ # ==============================================================================
23
+ # Suppress RDKit console output
24
+ RDLogger.DisableLog('rdApp.*')
25
+ # Ignore warnings for cleaner output
26
+ warnings.filterwarnings("ignore")
27
+
28
+ # ==============================================================================
29
+ # 3. MODEL AND LOSS FUNCTION
30
+ # ==============================================================================
31
+ def global_average_pooling(x):
32
+ """Global Average Pooling: from [B, max_len, hid_dim] to [B, hid_dim]"""
33
+ return torch.mean(x, dim=1)
34
+
35
+ class SimSonEncoder(nn.Module):
36
+ """The main encoder model based on BERT."""
37
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
38
+ super(SimSonEncoder, self).__init__()
39
+ self.bert = BertModel(config, add_pooling_layer=False)
40
+ self.linear = nn.Linear(config.hidden_size, max_len)
41
+ self.dropout = nn.Dropout(dropout)
42
+
43
+ def forward(self, input_ids, attention_mask=None):
44
+ if attention_mask is None:
45
+ attention_mask = input_ids.ne(self.bert.config.pad_token_id)
46
+
47
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
48
+ hidden_states = self.dropout(outputs.last_hidden_state)
49
+ pooled_output = global_average_pooling(hidden_states)
50
+ return self.linear(pooled_output)
51
+
52
+ class ContrastiveLoss(nn.Module):
53
+ """Calculates the contrastive loss for the SimSon model."""
54
+ def __init__(self, temperature=0.2):
55
+ super(ContrastiveLoss, self).__init__()
56
+ self.temperature = temperature
57
+ self.similarity_fn = F.cosine_similarity
58
+
59
+ def forward(self, proj_1, proj_2):
60
+ batch_size = proj_1.shape[0]
61
+ device = proj_1.device
62
+
63
+ # Normalize projections
64
+ z_i = F.normalize(proj_1, p=2, dim=1)
65
+ z_j = F.normalize(proj_2, p=2, dim=1)
66
+
67
+ # Concatenate for similarity matrix calculation
68
+ representations = torch.cat([z_i, z_j], dim=0)
69
+
70
+ # Calculate cosine similarity between all pairs
71
+ similarity_matrix = self.similarity_fn(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
72
+
73
+ # Identify positive pairs (original and its augmentation)
74
+ sim_ij = torch.diag(similarity_matrix, batch_size)
75
+ sim_ji = torch.diag(similarity_matrix, -batch_size)
76
+ positives = torch.cat([sim_ij, sim_ji], dim=0)
77
+
78
+ # Create a mask to exclude self-comparisons
79
+ nominator = torch.exp(positives / self.temperature)
80
+ mask = (~torch.eye(batch_size * 2, batch_size * 2, dtype=torch.bool, device=device)).float()
81
+ denominator = mask * torch.exp(similarity_matrix / self.temperature)
82
+
83
+ # Calculate the final loss
84
+ loss = -torch.log(nominator / torch.sum(denominator, dim=1))
85
+ return torch.sum(loss) / (2 * batch_size)
86
+
87
+ # ==============================================================================
88
+ # 4. DATA HANDLING (Keeping your existing classes unchanged)
89
+ # ==============================================================================
90
+ class SmilesEnumerator:
91
+ """Generates randomized SMILES strings for data augmentation."""
92
+ def randomize_smiles(self, smiles):
93
+ try:
94
+ mol = Chem.MolFromSmiles(smiles)
95
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
96
+ except:
97
+ return smiles
98
+
99
+ class ContrastiveSmilesDataset(Dataset):
100
+ """Dataset for creating pairs of augmented SMILES for contrastive learning."""
101
+ def __init__(self, smiles_list, tokenizer, max_length=512):
102
+ self.smiles_list = smiles_list
103
+ self.tokenizer = tokenizer
104
+ self.max_length = max_length
105
+ self.enumerator = SmilesEnumerator()
106
+
107
+ def __len__(self):
108
+ return len(self.smiles_list)
109
+
110
+ def __getitem__(self, idx):
111
+ original_smiles = self.smiles_list[idx]
112
+
113
+ # Create two different augmentations of the same SMILES
114
+ smiles_1 = self.enumerator.randomize_smiles(original_smiles)
115
+ smiles_2 = self.enumerator.randomize_smiles(original_smiles)
116
+
117
+ # Tokenize and do pad. Padding will be handled by the collate_fn.
118
+ tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
119
+ tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
120
+
121
+ return {
122
+ 'input_ids_1': torch.tensor(tokens_1['input_ids']),
123
+ 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
124
+ 'input_ids_2': torch.tensor(tokens_2['input_ids']),
125
+ 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
126
+ }
127
+
128
+ class PrecomputedContrastiveSmilesDataset(Dataset):
129
+ """
130
+ A Dataset class that reads pre-augmented SMILES pairs from a Parquet file.
131
+ This is significantly faster as it offloads the expensive SMILES randomization
132
+ to a one-time preprocessing step.
133
+ """
134
+ def __init__(self, tokenizer, file_path: str, max_length: int = 512):
135
+ self.tokenizer = tokenizer
136
+ self.max_length = max_length
137
+
138
+ # Load the entire dataset from the Parquet file into memory.
139
+ # This is fast and efficient for subsequent access.
140
+ print(f"Loading pre-computed data from {file_path}...")
141
+ self.data = pd.read_parquet(file_path)
142
+ print("Data loaded successfully.")
143
+
144
+ def __len__(self):
145
+ """Returns the total number of pairs in the dataset."""
146
+ return len(self.data)
147
+
148
+ def __getitem__(self, idx):
149
+ """
150
+ Retrieves a pre-augmented pair, tokenizes it, and returns it
151
+ in the format expected by the DataCollator.
152
+ """
153
+ # Retrieve the pre-augmented pair from the DataFrame
154
+ row = self.data.iloc[idx]
155
+ smiles_1 = row['smiles_1']
156
+ smiles_2 = row['smiles_2']
157
+
158
+ # Tokenize the pair. This operation is fast and remains in the data loader.
159
+ tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
160
+ tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
161
+
162
+ return {
163
+ 'input_ids_1': torch.tensor(tokens_1['input_ids']),
164
+ 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
165
+ 'input_ids_2': torch.tensor(tokens_2['input_ids']),
166
+ 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
167
+ }
168
+
169
+ class PreTokenizedSmilesDataset(Dataset):
170
+ """
171
+ A Dataset that loads a pre-tokenized and pre-padded dataset created
172
+ by the preprocessing script. It uses memory-mapping for instant loads
173
+ and high efficiency.
174
+ """
175
+ def __init__(self, dataset_path: str):
176
+ # Load the dataset from disk. This is very fast due to memory-mapping.
177
+ self.dataset = load_from_disk(dataset_path)
178
+ # Set the format to PyTorch tensors for direct use in the model
179
+ self.dataset.set_format(type='torch', columns=[
180
+ 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'
181
+ ])
182
+ print(f"Successfully loaded pre-tokenized dataset from {dataset_path}.")
183
+
184
+ def __len__(self):
185
+ """Returns the total number of items in the dataset."""
186
+ return len(self.dataset)
187
+
188
+ def __getitem__(self, idx):
189
+ """Retrieves a single pre-processed item."""
190
+ return self.dataset[idx]
191
+
192
+ class DataCollatorWithPadding:
193
+ """
194
+ A collate function that dynamically pads inputs to the longest sequence
195
+ across both augmented views in the batch, ensuring consistent tensor shapes.
196
+ """
197
+ def __init__(self, tokenizer):
198
+ self.tokenizer = tokenizer
199
+
200
+ def __call__(self, features):
201
+ # Create a combined list of features for both views to find the global max length
202
+ combined_features = []
203
+ for feature in features:
204
+ combined_features.append({'input_ids': feature['input_ids_1'], 'attention_mask': feature['attention_mask_1']})
205
+ combined_features.append({'input_ids': feature['input_ids_2'], 'attention_mask': feature['attention_mask_2']})
206
+
207
+ # Pad the combined batch. This ensures all sequences are padded to the same length.
208
+ padded_combined = self.tokenizer.pad(combined_features, padding='longest', return_tensors='pt')
209
+
210
+ # Split the padded tensors back into two views
211
+ batch_size = len(features)
212
+ input_ids_1, input_ids_2 = torch.split(padded_combined['input_ids'], batch_size, dim=0)
213
+ attention_mask_1, attention_mask_2 = torch.split(padded_combined['attention_mask'], batch_size, dim=0)
214
+
215
+ return {
216
+ 'input_ids_1': input_ids_1,
217
+ 'attention_mask_1': attention_mask_1,
218
+ 'input_ids_2': input_ids_2,
219
+ 'attention_mask_2': attention_mask_2,
220
+ }
221
+
222
+ # ==============================================================================
223
+ # 5. CHECKPOINT UTILITIES
224
+ # ==============================================================================
225
+ def save_checkpoint(model, optimizer, scheduler, global_step, save_path):
226
+ """Save complete checkpoint with model, optimizer, scheduler states and step count."""
227
+ checkpoint = {
228
+ 'model_state_dict': model.state_dict(),
229
+ 'optimizer_state_dict': optimizer.state_dict(),
230
+ 'scheduler_state_dict': scheduler.state_dict(),
231
+ 'global_step': global_step,
232
+ }
233
+ torch.save(checkpoint, save_path)
234
+ print(f"Full checkpoint saved at step {global_step}")
235
+
236
+ def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
237
+ """Load checkpoint and return the global step to resume from."""
238
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
239
+ model.load_state_dict(checkpoint['model_state_dict'])
240
+ optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
241
+ scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
242
+ global_step = checkpoint['global_step']
243
+ print(f"Checkpoint loaded from step {global_step}")
244
+ return global_step
245
+
246
+ # ==============================================================================
247
+ # 6. TRAINING AND EVALUATION LOOPS - MODIFIED
248
+ # ==============================================================================
249
+ def evaluation_step(model, batch, criterion, device):
250
+ """Performs a single evaluation step on a batch of data."""
251
+ input_ids_1 = batch['input_ids_1'].to(device)
252
+ attention_mask_1 = batch['attention_mask_1'].to(device)
253
+ input_ids_2 = batch['input_ids_2'].to(device)
254
+ attention_mask_2 = batch['attention_mask_2'].to(device)
255
+
256
+ combined_input_ids = torch.cat([input_ids_1, input_ids_2], dim=0)
257
+ combined_attention_mask = torch.cat([attention_mask_1, attention_mask_2], dim=0)
258
+
259
+ with torch.no_grad():
260
+ combined_proj = model(combined_input_ids, combined_attention_mask)
261
+
262
+ batch_size = input_ids_1.size(0)
263
+ proj_1, proj_2 = torch.split(combined_proj, batch_size, dim=0)
264
+
265
+ loss = criterion(proj_1, proj_2)
266
+ return proj_1, proj_2, loss
267
+
268
+ def train_with_step_based_validation(model, train_loader, val_loader, optimizer, criterion, device,
269
+ scheduler, checkpoint_path, save_steps, validation_steps,
270
+ start_step=0, max_steps=None):
271
+ """
272
+ Modified training function with step-based validation and checkpointing.
273
+ """
274
+ model.train()
275
+ global_step = start_step
276
+ best_val_loss = float('inf')
277
+
278
+ # Calculate total steps if max_steps is not provided
279
+ if max_steps is None:
280
+ max_steps = len(train_loader)
281
+
282
+ progress_bar = tqdm(total=max_steps - start_step, desc="Training Steps", initial=start_step)
283
+
284
+ # Create iterator that can be resumed from any point
285
+ train_iterator = iter(train_loader)
286
+
287
+ # Skip batches if resuming from checkpoint
288
+ if start_step > 0:
289
+ batches_to_skip = start_step % len(train_loader)
290
+ for _ in range(batches_to_skip):
291
+ try:
292
+ next(train_iterator)
293
+ except StopIteration:
294
+ train_iterator = iter(train_loader)
295
+
296
+ while global_step < max_steps:
297
+ try:
298
+ batch = next(train_iterator)
299
+ except StopIteration:
300
+ train_iterator = iter(train_loader)
301
+ batch = next(train_iterator)
302
+
303
+ # Training step
304
+ input_ids_1 = batch['input_ids_1'].to(device)
305
+ attention_mask_1 = batch['attention_mask_1'].to(device)
306
+ input_ids_2 = batch['input_ids_2'].to(device)
307
+ attention_mask_2 = batch['attention_mask_2'].to(device)
308
+
309
+ optimizer.zero_grad()
310
+ with torch.autocast(dtype=torch.float16, device_type="cuda"):
311
+ combined_input_ids = torch.cat([input_ids_1, input_ids_2], dim=0)
312
+ combined_attention_mask = torch.cat([attention_mask_1, attention_mask_2], dim=0)
313
+
314
+ combined_proj = model(combined_input_ids, combined_attention_mask)
315
+
316
+ batch_size = input_ids_1.size(0)
317
+ proj_1, proj_2 = torch.split(combined_proj, batch_size, dim=0)
318
+
319
+ loss = criterion(proj_1, proj_2)
320
+
321
+ loss.backward()
322
+ optimizer.step()
323
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
324
+ scheduler.step()
325
+
326
+ global_step += 1
327
+
328
+ progress_bar.update(1)
329
+ progress_bar.set_postfix(loss=f"{loss.item():.4f}", step=global_step)
330
+
331
+ wandb.log({
332
+ "train_batch_loss": loss.item(),
333
+ "learning_rate": scheduler.get_last_lr()[0],
334
+ "global_step": global_step
335
+ })
336
+
337
+ # Step-based validation
338
+ if global_step % validation_steps == 0:
339
+ val_loss = validate_epoch(model, val_loader, criterion, device)
340
+ wandb.log({
341
+ "val_loss": val_loss,
342
+ "global_step": global_step
343
+ })
344
+
345
+ # Save best model (model state only for best checkpoint)
346
+ if val_loss < best_val_loss:
347
+ best_val_loss = val_loss
348
+ model_save_path = checkpoint_path.replace('.pt', '_best_model.bin')
349
+ torch.save(model.state_dict(), model_save_path)
350
+ progress_bar.write(f"Step {global_step}: New best model saved with val loss {val_loss:.4f}")
351
+
352
+ model.train() # Resume training mode after validation
353
+
354
+ # Step-based checkpointing (full checkpoint)
355
+ if global_step % save_steps == 0:
356
+ save_checkpoint(model, optimizer, scheduler, global_step, checkpoint_path)
357
+
358
+ progress_bar.close()
359
+ return global_step
360
+
361
+ def validate_epoch(model, val_loader, criterion, device):
362
+ """Validation function - unchanged from original."""
363
+ model.eval()
364
+ total_loss = 0
365
+ progress_bar = tqdm(val_loader, desc="Validating", leave=False)
366
+
367
+ for batch in progress_bar:
368
+ _, _, loss = evaluation_step(model, batch, criterion, device)
369
+ total_loss += loss.item()
370
+
371
+ avg_loss = total_loss / len(val_loader)
372
+ print(f'Validation loss: {avg_loss:.4f}')
373
+ return avg_loss
374
+
375
+ def test_model(model, test_loader, criterion, device):
376
+ """Test function - unchanged from original."""
377
+ model.eval()
378
+ total_loss = 0
379
+ all_similarities = []
380
+ progress_bar = tqdm(test_loader, desc="Testing", leave=False)
381
+
382
+ for batch in progress_bar:
383
+ proj_1, proj_2, loss = evaluation_step(model, batch, criterion, device)
384
+ total_loss += loss.item()
385
+
386
+ proj_1_norm = F.normalize(proj_1, p=2, dim=1)
387
+ proj_2_norm = F.normalize(proj_2, p=2, dim=1)
388
+ batch_similarities = F.cosine_similarity(proj_1_norm, proj_2_norm, dim=1)
389
+ all_similarities.extend(batch_similarities.cpu().numpy())
390
+
391
+ avg_loss = total_loss / len(test_loader)
392
+ avg_sim = np.mean(all_similarities)
393
+ std_sim = np.std(all_similarities)
394
+
395
+ return avg_loss, avg_sim, std_sim
396
+
397
+ # ==============================================================================
398
+ # 7. MODIFIED SINGLE-GPU TRAINING
399
+ # ==============================================================================
400
+ def run_training(model_config, hparams, data_splits):
401
+ """The main function to run the training and evaluation process with step-based validation."""
402
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
403
+ print(f"Using device: {device}")
404
+
405
+ wandb_key = os.getenv("WANDB_API_KEY")
406
+ if wandb_key:
407
+ wandb.login(key=wandb_key)
408
+ wandb.init(
409
+ #project="simson-contrastive-learning-single-gpu",
410
+ #name=f"run-{wandb.util.generate_id()}",
411
+ #config=hparams
412
+ )
413
+
414
+ train_smiles, val_smiles, test_smiles = data_splits
415
+
416
+ tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
417
+
418
+ precomputed_train_path = '/home/jovyan/simson_training_bolgov/data/pubchem_119m_splits/train.parquet'
419
+ precomputed_test_path = '/home/jovyan/simson_training_bolgov/data/pubchem_119m_splits/test.parquet'
420
+ precomputed_val_path = '/home/jovyan/simson_training_bolgov/data/pubchem_119m_splits/validation.parquet'
421
+
422
+ train_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_train_path, max_length=hparams['max_length'])
423
+ test_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_test_path, max_length=hparams['max_length'])
424
+ val_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_val_path, max_length=hparams['max_length'])
425
+
426
+ train_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True, num_workers=8, prefetch_factor=128, pin_memory=True)
427
+ val_loader = DataLoader(val_dataset, batch_size=hparams['batch_size'], shuffle=False, num_workers=2, pin_memory=True)
428
+ test_loader = DataLoader(test_dataset, batch_size=hparams['batch_size'], shuffle=False, num_workers=2, pin_memory=True)
429
+
430
+ print('Initialized all data. Compiling the model...')
431
+ model = SimSonEncoder(config=model_config, max_len=hparams['max_embeddings']).to(device)
432
+ model = torch.compile(model)
433
+ model.load_state_dict(torch.load('/home/jovyan/simson_training_bolgov/simson_checkpoints/checkpoint_best_model.bin'))
434
+ print(model)
435
+
436
+ total_params = sum(p.numel() for p in model.parameters())
437
+
438
+ print(f"Total number of parameters: {total_params // 1_000_000} M")
439
+ wandb.config.update({"total_params_M": total_params // 1_000_000})
440
+
441
+ criterion = ContrastiveLoss(temperature=hparams['temperature']).to(device)
442
+ optimizer = optim.AdamW(model.parameters(), lr=hparams['lr'], weight_decay=1e-5, fused=True)
443
+
444
+ total_steps = hparams['epochs'] * len(train_loader)
445
+ scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_mult=1, T_0=total_steps)
446
+
447
+ print("Starting training...")
448
+ wandb.watch(model, log='all', log_freq=5000)
449
+
450
+ start_step = 0
451
+ checkpoint_path = hparams['checkpoint_path']
452
+
453
+ # Resume from checkpoint if provided
454
+ if hparams.get('resume_checkpoint') and os.path.exists(hparams['resume_checkpoint']):
455
+ print(f"Resuming from checkpoint: {hparams['resume_checkpoint']}")
456
+ start_step = load_checkpoint(hparams['resume_checkpoint'], model, optimizer, scheduler)
457
+
458
+ # Train with step-based validation
459
+ final_step = train_with_step_based_validation(
460
+ model, train_loader, val_loader, optimizer, criterion, device,
461
+ scheduler, checkpoint_path, hparams['save_steps'], hparams['validation_steps'],
462
+ start_step=start_step, max_steps=total_steps
463
+ )
464
+
465
+ print("Training complete. Starting final testing...")
466
+
467
+ # Load the best model for testing (model state only)
468
+ best_model_path = checkpoint_path.replace('.pt', '_best_model.bin')
469
+ if os.path.exists(best_model_path):
470
+ model.load_state_dict(torch.load(best_model_path))
471
+ print("Loaded best model for testing")
472
+
473
+ test_loss, avg_sim, std_sim = test_model(model, test_loader, criterion, device)
474
+
475
+ print("\n--- Test Results ---")
476
+ print(f"Test Loss: {test_loss:.4f}")
477
+ print(f"Average Cosine Similarity: {avg_sim:.4f} ± {std_sim:.4f}")
478
+ print("--------------------")
479
+
480
+ wandb.log({
481
+ "test_loss": test_loss,
482
+ "avg_cosine_similarity": avg_sim,
483
+ "std_cosine_similarity": std_sim
484
+ })
485
+
486
+ # Save final model state only
487
+ final_model_path = hparams['save_path']
488
+ torch.save(model.state_dict(), final_model_path)
489
+ print(f"Final model saved to {final_model_path}")
490
+
491
+ wandb.finish()
492
+
493
+ # ==============================================================================
494
+ # 8. MAIN EXECUTION
495
+ # ==============================================================================
496
+ def main():
497
+ """Main function to configure and run the training process."""
498
+ hparams = {
499
+ 'epochs': 2,
500
+ 'lr': 6e-6,
501
+ 'temperature': 0.05,
502
+ 'batch_size': 64,
503
+ 'max_length': 256,
504
+ 'save_path': "simson_checkpoints_more_epochs/simson_model_single_gpu.bin",
505
+ 'checkpoint_path': "simson_checkpoints_more_epochs/checkpoint.pt", # Full checkpoint
506
+ 'save_steps': 50000, # Save checkpoint every 10k steps
507
+ 'validation_steps': 5000, # Validate every 5k steps
508
+ 'max_embeddings': 512,
509
+ 'resume_checkpoint': None, # Set to checkpoint path to resume
510
+ }
511
+
512
+ dataset = load_dataset('HoangHa/SMILES-250M')['train']
513
+ smiles_column_name = 'SMILES'
514
+
515
+ total_size = len(dataset)
516
+ test_size = int(0.1 * total_size)
517
+ val_size = int(0.1 * (total_size - test_size))
518
+
519
+ test_smiles = dataset.select(range(test_size))[smiles_column_name]
520
+ val_smiles = dataset.select(range(test_size, test_size + val_size))[smiles_column_name]
521
+ train_smiles = dataset.select(range(test_size + val_size, total_size))[smiles_column_name]
522
+ data_splits = (train_smiles, val_smiles, test_smiles)
523
+
524
+ tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
525
+ model_config = BertConfig(
526
+ vocab_size=tokenizer.vocab_size,
527
+ hidden_size=768,
528
+ num_hidden_layers=4,
529
+ num_attention_heads=12,
530
+ intermediate_size=2048,
531
+ max_position_embeddings=512
532
+ )
533
+
534
+ # Create directories
535
+ save_dir = os.path.dirname(hparams['save_path'])
536
+ checkpoint_dir = os.path.dirname(hparams['checkpoint_path'])
537
+ for directory in [save_dir, checkpoint_dir]:
538
+ if not os.path.exists(directory):
539
+ os.makedirs(directory)
540
+
541
+ # Directly call the training function for a single-GPU run
542
+ run_training(model_config, hparams, data_splits)
543
+
544
+ if __name__ == '__main__':
545
+ main()
simson_modeling/.ipynb_checkpoints/upload_state_to_hf-checkpoint.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+
3
+ state_path = '/home/jovyan/simson_training_bolgov/simson_checkpoints_1M/checkpoint_best_model.bin'
4
+
5
+ from huggingface_hub import HfApi
6
+ api = HfApi()
7
+ upload_folder = True
8
+ if not upload_folder:
9
+
10
+ api.upload_file(
11
+ path_or_fileobj=state_path,
12
+ path_in_repo="polymer_1M_weights.bin",
13
+ repo_id="Defetya/simson_base",
14
+ repo_type="model",
15
+ )
16
+ else:
17
+
18
+ api.upload_folder(
19
+ folder_path="/home/jovyan/simson_training_bolgov",
20
+ repo_id="Defetya/simson_base",
21
+ path_in_repo="simson_modeling",
22
+ repo_type="model",
23
+ )
simson_modeling/.simson_ddp_train.py.swp ADDED
Binary file (1.02 kB). View file
 
simson_modeling/.upload_state_to_hf.py.swp ADDED
Binary file (1.02 kB). View file
 
simson_modeling/__pycache__/create_augmented_dataset.cpython-312.pyc ADDED
Binary file (3.51 kB). View file
 
simson_modeling/__pycache__/create_splits.cpython-312.pyc ADDED
Binary file (6.44 kB). View file
 
simson_modeling/__pycache__/simson_ddp_train.cpython-312.pyc ADDED
Binary file (27 kB). View file
 
simson_modeling/__pycache__/upload_state_to_hf.cpython-312.pyc ADDED
Binary file (731 Bytes). View file
 
simson_modeling/create_augmented_dataset.py ADDED
@@ -0,0 +1,83 @@
+ import pandas as pd
+ from tqdm import tqdm
+ from rdkit import Chem, RDLogger
+ from datasets import load_dataset
+ from multiprocessing import Pool, cpu_count
+ import os
+
+ # Suppress RDKit console output for cleaner logs
+ RDLogger.DisableLog('rdApp.*')
+
+ class SmilesEnumerator:
+     """
+     A simple class to encapsulate the SMILES randomization logic.
+     Needed for multiprocessing to work correctly with instance methods.
+     """
+     def randomize_smiles(self, smiles):
+         """Generates a randomized SMILES string."""
+         try:
+             mol = Chem.MolFromSmiles(smiles)
+             # Return a randomized, non-canonical SMILES string
+             return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
+         except Exception:
+             # If RDKit fails, return the original SMILES string
+             return smiles
+
+ def create_augmented_pair(smiles_string):
+     """
+     Worker function: takes one SMILES string and returns a tuple
+     containing two different randomized versions of it.
+     """
+     enumerator = SmilesEnumerator()
+     smiles_1 = enumerator.randomize_smiles(smiles_string)
+     smiles_2 = enumerator.randomize_smiles(smiles_string)
+     return smiles_1, smiles_2
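+ # Illustrative only (outputs vary per call, since doRandom shuffles
+ # RDKit's atom ordering):
+ #   create_augmented_pair('c1ccccc1O')  ->  ('Oc1ccccc1', 'c1ccc(O)cc1')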
+
+ def main():
+     """
+     Main function to run the parallel data preprocessing.
+     """
+     # --- Configuration ---
+     # Load your desired dataset from Hugging Face
+     dataset_name = 'jablonkagroup/pubchem-smiles-molecular-formula'
+     # Specify the column containing the SMILES strings
+     smiles_column_name = 'smiles'
+     # Set the output file path
+     output_path = 'data/pubchem_2_epoch_50M'
+
+     # --- Data Loading ---
+     print(f"Loading dataset '{dataset_name}'...")
+     # Load the full train split up front (no streaming) and keep the first 50M rows
+     # dataset = pd.read_csv('/home/jovyan/simson_training_bolgov/data/PI1M_v2.csv')
+     dataset = load_dataset(dataset_name)['train'].select(range(50_000_000))
+     # Pull out the SMILES column
+     smiles_list = dataset[smiles_column_name]
+     print(f"Successfully fetched {len(smiles_list)} SMILES strings.")
+
+     # --- Parallel Processing ---
+     # Use all available CPU cores for maximum speed
+     num_workers = cpu_count()
+     print(f"Starting SMILES augmentation with {num_workers} worker processes...")
+
+     # A Pool of processes will run the `create_augmented_pair` function in parallel
+     with Pool(num_workers) as p:
+         # Use tqdm to create a progress bar for the mapping operation
+         results = list(tqdm(p.imap(create_augmented_pair, smiles_list), total=len(smiles_list), desc="Augmenting Pairs"))
+
+     # --- Saving Data ---
+     print("Processing complete. Converting to DataFrame...")
+     # Convert the list of tuples into a pandas DataFrame
+     df = pd.DataFrame(results, columns=['smiles_1', 'smiles_2'])
+
+     # Ensure the output directory exists
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+     print(f"Saving augmented pairs to '{output_path}'...")
+     # Save the DataFrame to a Parquet file for efficient storage and loading
+     df.to_parquet(output_path)
+
+     print("All done. Your pre-computed dataset is ready!")
+
+ if __name__ == '__main__':
+     main()
+
simson_modeling/create_augmented_dataset.py.save ADDED
@@ -0,0 +1,83 @@
+ import pandas as pd
+ from tqdm import tqdm
+ from rdkit import Chem, RDLogger
+ from datasets import load_dataset
+ from multiprocessing import Pool, cpu_count
+ import os
+
+ # Suppress RDKit console output for cleaner logs
+ RDLogger.DisableLog('rdApp.*')
+
+ class SmilesEnumerator:
+     """
+     A simple class to encapsulate the SMILES randomization logic.
+     Needed for multiprocessing to work correctly with instance methods.
+     """
+     def randomize_smiles(self, smiles):
+         """Generates a randomized SMILES string."""
+         try:
+             mol = Chem.MolFromSmiles(smiles)
+             # Return a randomized, non-canonical SMILES string
+             return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
+         except Exception:
+             # If RDKit fails, return the original SMILES string
+             return smiles
+
+ def create_augmented_pair(smiles_string):
+     """
+     Worker function: takes one SMILES string and returns a tuple
+     containing two different randomized versions of it.
+     """
+     enumerator = SmilesEnumerator()
+     smiles_1 = enumerator.randomize_smiles(smiles_string)
+     smiles_2 = enumerator.randomize_smiles(smiles_string)
+     return smiles_1, smiles_2
+
+ def main():
+     """
+     Main function to run the parallel data preprocessing.
+     """
+     # --- Configuration ---
+     # Load your desired dataset from Hugging Face
+     dataset_name = 'jablonkagroup/pubchem-smiles-molecular-formula'
+     # Specify the column containing the SMILES strings
+     smiles_column_name = 'smiles'
+     # Set the output file path
+     output_path = 'data/pubchem_computed_110_end_M.parquet'
+
+     # --- Data Loading ---
+     print(f"Loading dataset '{dataset_name}'...")
+     # Keep everything from row 110M to the end of the split; note that
+     # range(110_000_000,) on its own would select the *first* 110M rows
+     dataset = load_dataset(dataset_name, split='train')
+     dataset = dataset.select(range(110_000_000, len(dataset)))
+
+     # Pull out the SMILES column
+     smiles_list = dataset[smiles_column_name]
+     print(f"Successfully fetched {len(smiles_list)} SMILES strings.")
+
+     # --- Parallel Processing ---
+     # Use all available CPU cores for maximum speed
+     num_workers = cpu_count()
+     print(f"Starting SMILES augmentation with {num_workers} worker processes...")
+
+     # A Pool of processes will run the `create_augmented_pair` function in parallel
+     with Pool(num_workers) as p:
+         # Use tqdm to create a progress bar for the mapping operation
+         results = list(tqdm(p.imap(create_augmented_pair, smiles_list), total=len(smiles_list), desc="Augmenting Pairs"))
+
+     # --- Saving Data ---
+     print("Processing complete. Converting to DataFrame...")
+     # Convert the list of tuples into a pandas DataFrame
+     df = pd.DataFrame(results, columns=['smiles_1', 'smiles_2'])
+
+     # Ensure the output directory exists
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+     print(f"Saving augmented pairs to '{output_path}'...")
+     # Save the DataFrame to a Parquet file for efficient storage and loading
+     df.to_parquet(output_path)
+
+     print("All done. Your pre-computed dataset is ready!")
+
+ if __name__ == '__main__':
+     main()
+
simson_modeling/create_splits.py ADDED
@@ -0,0 +1,200 @@
+ import os
+ import pandas as pd
+ from pathlib import Path
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+
+ def concatenate_and_split_parquet(
+     input_dir: str,
+     output_dir: str,
+     val_size: int = 10000,
+     test_size: int = 5000,
+     random_state: int = 42
+ ):
+     """
+     Concatenate all parquet files in a directory and split into train/val/test sets.
+
+     Args:
+         input_dir: Path to directory containing parquet files
+         output_dir: Path to directory where split files will be saved
+         val_size: Number of samples for validation set (default: 10000)
+         test_size: Number of samples for test set (default: 5000)
+         random_state: Random seed for reproducibility
+     """
+
+     # Create output directory if it doesn't exist
+     Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+     # Find all parquet files in the input directory
+     input_path = Path(input_dir)
+     parquet_files = list(input_path.glob("*.parquet"))
+
+     if not parquet_files:
+         raise ValueError(f"No parquet files found in {input_dir}")
+
+     print(f"Found {len(parquet_files)} parquet files")
+
+     # Read and concatenate all parquet files
+     print("Reading and concatenating parquet files...")
+     dataframes = []
+
+     for file_path in parquet_files:
+         print(f"Reading {file_path.name}...")
+         df = pd.read_parquet(file_path)
+         dataframes.append(df)
+
+     # Concatenate all dataframes
+     combined_df = pd.concat(dataframes, ignore_index=True)
+     print(f"Combined dataset shape: {combined_df.shape}")
+
+     # Check if we have enough samples
+     total_samples = len(combined_df)
+     required_samples = val_size + test_size
+
+     if total_samples < required_samples:
+         raise ValueError(
+             f"Not enough samples. Required: {required_samples}, Available: {total_samples}"
+         )
+
+     # Shuffle the data
+     combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+     # Split the data
+     print("Splitting data...")
+
+     # First split: separate test set
+     temp_df, test_df = train_test_split(
+         combined_df,
+         test_size=test_size,
+         random_state=random_state
+     )
+
+     # Second split: separate validation from remaining data
+     train_df, val_df = train_test_split(
+         temp_df,
+         test_size=val_size,
+         random_state=random_state
+     )
+
+     print(f"Training set shape: {train_df.shape}")
+     print(f"Validation set shape: {val_df.shape}")
+     print(f"Test set shape: {test_df.shape}")
+
+     # Save the splits as parquet files
+     output_path = Path(output_dir)
+
+     train_path = output_path / "train.parquet"
+     val_path = output_path / "validation.parquet"
+     test_path = output_path / "test.parquet"
+
+     print("Saving split datasets...")
+     train_df.to_parquet(train_path, index=False)
+     val_df.to_parquet(val_path, index=False)
+     test_df.to_parquet(test_path, index=False)
+
+     print("Files saved to:")
+     print(f"  Training: {train_path}")
+     print(f"  Validation: {val_path}")
+     print(f"  Test: {test_path}")
+
+     return train_df, val_df, test_df
+
+ # Alternative version using PyArrow for better performance with large files
+ def concatenate_and_split_parquet_arrow(
+     input_dir: str,
+     output_dir: str,
+     val_size: int = 10000,
+     test_size: int = 5000,
+     random_state: int = 42
+ ):
+     """
+     Same functionality as above but using PyArrow for better performance.
+     """
+     import pyarrow as pa
+     import pyarrow.parquet as pq
+
+     # Create output directory if it doesn't exist
+     Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+     # Find all parquet files
+     input_path = Path(input_dir)
+     parquet_files = list(input_path.glob("*.parquet"))
+
+     if not parquet_files:
+         raise ValueError(f"No parquet files found in {input_dir}")
+
+     print(f"Found {len(parquet_files)} parquet files")
+
+     # Read and concatenate using PyArrow
+     print("Reading and concatenating parquet files...")
+     tables = []
+
+     for file_path in parquet_files:
+         print(f"Reading {file_path.name}...")
+         table = pq.read_table(file_path)
+         tables.append(table)
+
+     # Concatenate tables
+     combined_table = pa.concat_tables(tables)
+     combined_df = combined_table.to_pandas()
+
+     print(f"Combined dataset shape: {combined_df.shape}")
+
+     # Rest of the function is the same as above
+     total_samples = len(combined_df)
+     required_samples = val_size + test_size
+
+     if total_samples < required_samples:
+         raise ValueError(
+             f"Not enough samples. Required: {required_samples}, Available: {total_samples}"
+         )
+
+     # Shuffle and split
+     combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+     temp_df, test_df = train_test_split(
+         combined_df, test_size=test_size, random_state=random_state
+     )
+
+     train_df, val_df = train_test_split(
+         temp_df, test_size=val_size, random_state=random_state
+     )
+
+     print(f"Training set shape: {train_df.shape}")
+     print(f"Validation set shape: {val_df.shape}")
+     print(f"Test set shape: {test_df.shape}")
+
+     # Save using PyArrow
+     output_path = Path(output_dir)
+
+     pq.write_table(pa.Table.from_pandas(train_df), output_path / "train.parquet")
+     pq.write_table(pa.Table.from_pandas(val_df), output_path / "validation.parquet")
+     pq.write_table(pa.Table.from_pandas(test_df), output_path / "test.parquet")
+
+     print(f"Files saved to {output_dir}")
+
+     return train_df, val_df, test_df
+
+ # Example usage
+ if __name__ == "__main__":
+     input_directory = "data"
+     output_directory = "data/polymer_splits"
+
+     # Using pandas version
+     train_df, val_df, test_df = concatenate_and_split_parquet(
+         input_dir=input_directory,
+         output_dir=output_directory,
+         val_size=10000,
+         test_size=5000,
+         random_state=42
+     )
+
+     # Or using PyArrow version for better performance
+     # train_df, val_df, test_df = concatenate_and_split_parquet_arrow(
+     #     input_dir=input_directory,
+     #     output_dir=output_directory,
+     #     val_size=10000,
+     #     test_size=5000,
+     #     random_state=42
+     # )
simson_modeling/data/polymer_1M.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db6b4f85cfbbb110e31a910db1b7160d6f4732b9420e0cef824df581e2802c97
+ size 50029214
simson_modeling/data/polymer_splits/test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba93d4addaef6c9074da8eed669e6921c5bd25c205d79d8e8fb3f01b081ca03f
+ size 268419
simson_modeling/data/polymer_splits/train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c4eef5324339903c17386034a870095bd2ceb790166c86a693ff2e39b070448
+ size 49317149
simson_modeling/data/polymer_splits/validation.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89323d364b5b5db83a0465d29305b8dbfaed3c30f8ff428ff2128efbf27581f2
+ size 536944
simson_modeling/data/pubchem_119m_splits/test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e25b7483f6bc8bf8b9eb3a718f0ac14c69f6000c63ca5b013a4f7e30c58a0ee
+ size 354906
simson_modeling/data/pubchem_119m_splits/train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b97423bdd65c19946ec019cbf8257654ae287045fe6555385c55a27cd5ad7508
+ size 8006185287
simson_modeling/data/pubchem_119m_splits/validation.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e43d9eda261c2e7597f4ac59056143d4c28554e98929d21666da97005136893b
+ size 707989
simson_modeling/data/pubchem_2_epoch_50M ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:faa9fbe85c19fbe67d401fefb7d8cec0ca74fcb405e8faed9100498ea4337c1c
+ size 2280103228
simson_modeling/data/splits/test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e6485072aa9471239e6938836bd19a85d27f5ae63614a7887f2b1a6b464fde3
+ size 305047
simson_modeling/data/splits/train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8497f6811d738b572fc7e30818473fa64668ea79cd407895dd62349c0552c64d
+ size 4881540720
simson_modeling/data/splits/validation.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0583f45131d6c2e16a14c70b7e6e1be0a5ffe8390c8d148b10dc8941d2a4e112
+ size 615952
simson_modeling/fingerprint_training.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
simson_modeling/kaggle_comp/.ipynb_checkpoints/simson-fine-tune-checkpoint.ipynb ADDED
@@ -0,0 +1,1608 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import PreTrainedModel, AutoConfig, BertModel, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer\n",
10
+ "import pandas as pd\n",
11
+ "import torch\n",
12
+ "import os\n",
13
+ "from sklearn.preprocessing import StandardScaler\n",
14
+ "from sklearn.model_selection import train_test_split\n",
15
+ "from tqdm import tqdm\n",
16
+ "import joblib\n",
17
+ "\n",
18
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "df = pd.read_csv('/home/jovyan/simson_training_bolgov/kaggle_comp/train.csv')\n",
28
+ "\n",
29
+ "targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "<div>\n",
41
+ "<style scoped>\n",
42
+ " .dataframe tbody tr th:only-of-type {\n",
43
+ " vertical-align: middle;\n",
44
+ " }\n",
45
+ "\n",
46
+ " .dataframe tbody tr th {\n",
47
+ " vertical-align: top;\n",
48
+ " }\n",
49
+ "\n",
50
+ " .dataframe thead th {\n",
51
+ " text-align: right;\n",
52
+ " }\n",
53
+ "</style>\n",
54
+ "<table border=\"1\" class=\"dataframe\">\n",
55
+ " <thead>\n",
56
+ " <tr style=\"text-align: right;\">\n",
57
+ " <th></th>\n",
58
+ " <th>id</th>\n",
59
+ " <th>SMILES</th>\n",
60
+ " <th>Tg</th>\n",
61
+ " <th>FFV</th>\n",
62
+ " <th>Tc</th>\n",
63
+ " <th>Density</th>\n",
64
+ " <th>Rg</th>\n",
65
+ " </tr>\n",
66
+ " </thead>\n",
67
+ " <tbody>\n",
68
+ " <tr>\n",
69
+ " <th>0</th>\n",
70
+ " <td>NaN</td>\n",
71
+ " <td>*/C=C/C1CC(*)C(C#N)(CCC)C1</td>\n",
72
+ " <td>NaN</td>\n",
73
+ " <td>NaN</td>\n",
74
+ " <td>NaN</td>\n",
75
+ " <td>NaN</td>\n",
76
+ " <td>NaN</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>1</th>\n",
80
+ " <td>NaN</td>\n",
81
+ " <td>*CCCCCCCCCCCCNC(=O)c1ccc(C(=O)N*)cc1</td>\n",
82
+ " <td>NaN</td>\n",
83
+ " <td>NaN</td>\n",
84
+ " <td>NaN</td>\n",
85
+ " <td>NaN</td>\n",
86
+ " <td>NaN</td>\n",
87
+ " </tr>\n",
88
+ " <tr>\n",
89
+ " <th>2</th>\n",
90
+ " <td>NaN</td>\n",
91
+ " <td>*c1ccc(Oc2ccc(C3(c4ccc(Oc5ccc(N6C(=O)c7ccc(Oc8...</td>\n",
92
+ " <td>NaN</td>\n",
93
+ " <td>NaN</td>\n",
94
+ " <td>NaN</td>\n",
95
+ " <td>NaN</td>\n",
96
+ " <td>NaN</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>3</th>\n",
100
+ " <td>1.522414e+09</td>\n",
101
+ " <td>*CC(*)c1cc(C(=O)OCCCC)ccc1-c1ccc(OCCCCCCCC)cc1</td>\n",
102
+ " <td>NaN</td>\n",
103
+ " <td>0.385500</td>\n",
104
+ " <td>NaN</td>\n",
105
+ " <td>NaN</td>\n",
106
+ " <td>NaN</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>4</th>\n",
110
+ " <td>NaN</td>\n",
111
+ " <td>*Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4c(C(=O)c5ccc(*)c...</td>\n",
112
+ " <td>NaN</td>\n",
113
+ " <td>NaN</td>\n",
114
+ " <td>NaN</td>\n",
115
+ " <td>NaN</td>\n",
116
+ " <td>NaN</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>...</th>\n",
120
+ " <td>...</td>\n",
121
+ " <td>...</td>\n",
122
+ " <td>...</td>\n",
123
+ " <td>...</td>\n",
124
+ " <td>...</td>\n",
125
+ " <td>...</td>\n",
126
+ " <td>...</td>\n",
127
+ " </tr>\n",
128
+ " <tr>\n",
129
+ " <th>16958</th>\n",
130
+ " <td>NaN</td>\n",
131
+ " <td>*/C=C/c1ccc2c(c1)Sc1cc(*)ccc1N2c1ccc(OCCCCCCCC...</td>\n",
132
+ " <td>NaN</td>\n",
133
+ " <td>NaN</td>\n",
134
+ " <td>NaN</td>\n",
135
+ " <td>NaN</td>\n",
136
+ " <td>NaN</td>\n",
137
+ " </tr>\n",
138
+ " <tr>\n",
139
+ " <th>16959</th>\n",
140
+ " <td>NaN</td>\n",
141
+ " <td>*Cc1ccc(CSSS*)cc1</td>\n",
142
+ " <td>NaN</td>\n",
143
+ " <td>NaN</td>\n",
144
+ " <td>NaN</td>\n",
145
+ " <td>NaN</td>\n",
146
+ " <td>NaN</td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>16960</th>\n",
150
+ " <td>1.595107e+09</td>\n",
151
+ " <td>*Oc1ccc(C2(c3ccc(Oc4nc(*)nc(OC)n4)cc3)CCCCC2)cc1</td>\n",
152
+ " <td>NaN</td>\n",
153
+ " <td>0.363540</td>\n",
154
+ " <td>NaN</td>\n",
155
+ " <td>NaN</td>\n",
156
+ " <td>NaN</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>16961</th>\n",
160
+ " <td>8.406988e+08</td>\n",
161
+ " <td>*CC(*)C(=O)OCC(C)CC</td>\n",
162
+ " <td>NaN</td>\n",
163
+ " <td>0.372858</td>\n",
164
+ " <td>0.221</td>\n",
165
+ " <td>0.919641</td>\n",
166
+ " <td>13.549867</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>16962</th>\n",
170
+ " <td>1.563977e+08</td>\n",
171
+ " <td>*c1cc(*)cc(-c2nc3ccccc3o2)c1</td>\n",
172
+ " <td>NaN</td>\n",
173
+ " <td>0.390044</td>\n",
174
+ " <td>NaN</td>\n",
175
+ " <td>NaN</td>\n",
176
+ " <td>NaN</td>\n",
177
+ " </tr>\n",
178
+ " </tbody>\n",
179
+ "</table>\n",
180
+ "<p>16963 rows × 7 columns</p>\n",
181
+ "</div>"
182
+ ],
183
+ "text/plain": [
184
+ " id SMILES Tg \\\n",
185
+ "0 NaN */C=C/C1CC(*)C(C#N)(CCC)C1 NaN \n",
186
+ "1 NaN *CCCCCCCCCCCCNC(=O)c1ccc(C(=O)N*)cc1 NaN \n",
187
+ "2 NaN *c1ccc(Oc2ccc(C3(c4ccc(Oc5ccc(N6C(=O)c7ccc(Oc8... NaN \n",
188
+ "3 1.522414e+09 *CC(*)c1cc(C(=O)OCCCC)ccc1-c1ccc(OCCCCCCCC)cc1 NaN \n",
189
+ "4 NaN *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4c(C(=O)c5ccc(*)c... NaN \n",
190
+ "... ... ... .. \n",
191
+ "16958 NaN */C=C/c1ccc2c(c1)Sc1cc(*)ccc1N2c1ccc(OCCCCCCCC... NaN \n",
192
+ "16959 NaN *Cc1ccc(CSSS*)cc1 NaN \n",
193
+ "16960 1.595107e+09 *Oc1ccc(C2(c3ccc(Oc4nc(*)nc(OC)n4)cc3)CCCCC2)cc1 NaN \n",
194
+ "16961 8.406988e+08 *CC(*)C(=O)OCC(C)CC NaN \n",
195
+ "16962 1.563977e+08 *c1cc(*)cc(-c2nc3ccccc3o2)c1 NaN \n",
196
+ "\n",
197
+ " FFV Tc Density Rg \n",
198
+ "0 NaN NaN NaN NaN \n",
199
+ "1 NaN NaN NaN NaN \n",
200
+ "2 NaN NaN NaN NaN \n",
201
+ "3 0.385500 NaN NaN NaN \n",
202
+ "4 NaN NaN NaN NaN \n",
203
+ "... ... ... ... ... \n",
204
+ "16958 NaN NaN NaN NaN \n",
205
+ "16959 NaN NaN NaN NaN \n",
206
+ "16960 0.363540 NaN NaN NaN \n",
207
+ "16961 0.372858 0.221 0.919641 13.549867 \n",
208
+ "16962 0.390044 NaN NaN NaN \n",
209
+ "\n",
210
+ "[16963 rows x 7 columns]"
211
+ ]
212
+ },
213
+ "execution_count": 3,
214
+ "metadata": {},
215
+ "output_type": "execute_result"
216
+ }
217
+ ],
218
+ "source": [
219
+ "for i in range(1, 5):\n",
220
+ " supplement_path = f'/home/jovyan/simson_training_bolgov/kaggle_comp/train_supplement/dataset{i}.csv'\n",
221
+ " supplement_ds = pd.read_csv(supplement_path)\n",
222
+ "\n",
223
+ " if 'TC_mean' in supplement_ds.columns:\n",
224
+ " supplement_ds = supplement_ds.rename(columns = {'TC_mean': 'Tc'})\n",
225
+ "\n",
226
+ " df = pd.concat([df, supplement_ds], axis=0)\n",
227
+ "\n",
228
+ "df = df.sample(frac=1).reset_index(drop=True)\n",
229
+ "df"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 4,
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "import pandas as pd\n",
239
+ "import numpy as np\n",
240
+ "from rdkit import Chem\n",
241
+ "import random\n",
242
+ "from typing import Optional, List, Union\n",
243
+ "\n",
244
+ "def augment_smiles_dataset(df: pd.DataFrame,\n",
245
+ " smiles_column: str = 'SMILES',\n",
246
+ " augmentation_strategies: List[str] = ['enumeration', 'kekulize', 'stereo_enum'],\n",
247
+ " n_augmentations: int = 10,\n",
248
+ " preserve_original: bool = True,\n",
249
+ " random_seed: Optional[int] = None) -> pd.DataFrame:\n",
250
+ " \"\"\"\n",
251
+ " Advanced SMILES augmentation with multiple strategies.\n",
252
+ " \n",
253
+ " Parameters:\n",
254
+ " -----------\n",
255
+ " augmentation_strategies : List[str]\n",
256
+ " List of augmentation strategies: 'enumeration', 'kekulize', 'stereo_enum'\n",
257
+ " \"\"\"\n",
258
+ " \n",
259
+ " if random_seed is not None:\n",
260
+ " random.seed(random_seed)\n",
261
+ " np.random.seed(random_seed)\n",
262
+ " \n",
263
+ " def apply_augmentation_strategy(smiles: str, strategy: str) -> List[str]:\n",
264
+ " \"\"\"Apply specific augmentation strategy\"\"\"\n",
265
+ " try:\n",
266
+ " mol = Chem.MolFromSmiles(smiles)\n",
267
+ " if mol is None:\n",
268
+ " return [smiles]\n",
269
+ " \n",
270
+ " augmented = []\n",
271
+ " \n",
272
+ " if strategy == 'enumeration':\n",
273
+ " # Standard SMILES enumeration\n",
274
+ " for _ in range(n_augmentations):\n",
275
+ " enum_smiles = Chem.MolToSmiles(mol, \n",
276
+ " canonical=False, \n",
277
+ " doRandom=True,\n",
278
+ " isomericSmiles=True)\n",
279
+ " augmented.append(enum_smiles)\n",
280
+ " \n",
281
+ " elif strategy == 'kekulize':\n",
282
+ " # Kekulization variants\n",
283
+ " try:\n",
284
+ " Chem.Kekulize(mol)\n",
285
+ " kek_smiles = Chem.MolToSmiles(mol, kekuleSmiles=True)\n",
286
+ " augmented.append(kek_smiles)\n",
287
+ " except:\n",
288
+ " pass\n",
289
+ " \n",
290
+ " elif strategy == 'stereo_enum':\n",
291
+ " # Stereochemistry enumeration\n",
292
+ " for _ in range(n_augmentations // 2):\n",
293
+ " # Remove stereochemistry\n",
294
+ " Chem.RemoveStereochemistry(mol)\n",
295
+ " no_stereo = Chem.MolToSmiles(mol)\n",
296
+ " augmented.append(no_stereo)\n",
297
+ " \n",
298
+ " return list(set(augmented)) # Remove duplicates\n",
299
+ " \n",
300
+ " except Exception as e:\n",
301
+ " print(f\"Error in {strategy} for {smiles}: {e}\")\n",
302
+ " return [smiles]\n",
303
+ " \n",
304
+ " augmented_rows = []\n",
305
+ " \n",
306
+ " for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
307
+ " original_smiles = row[smiles_column]\n",
308
+ " \n",
309
+ " # Add original if requested\n",
310
+ " if preserve_original:\n",
311
+ " original_row = row.to_dict()\n",
312
+ " original_row['augmentation_strategy'] = 'original'\n",
313
+ " original_row['is_augmented'] = False\n",
314
+ " augmented_rows.append(original_row)\n",
315
+ " \n",
316
+ " # Apply each augmentation strategy\n",
317
+ " for strategy in augmentation_strategies:\n",
318
+ " strategy_smiles = apply_augmentation_strategy(original_smiles, strategy)\n",
319
+ " \n",
320
+ " for aug_smiles in strategy_smiles:\n",
321
+ " if aug_smiles != original_smiles: # Avoid duplicating original\n",
322
+ " new_row = row.to_dict().copy()\n",
323
+ " new_row[smiles_column] = aug_smiles\n",
324
+ " new_row['augmentation_strategy'] = strategy\n",
325
+ " new_row['is_augmented'] = True\n",
326
+ " augmented_rows.append(new_row)\n",
327
+ " \n",
328
+ " augmented_df = pd.DataFrame(augmented_rows)\n",
329
+ " augmented_df = augmented_df.reset_index(drop=True)\n",
330
+ " \n",
331
+ " print(f\"Advanced augmentation completed:\")\n",
332
+ " print(f\"Original size: {len(df)}, Augmented size: {len(augmented_df)}\")\n",
333
+ " print(f\"Augmentation factor: {len(augmented_df) / len(df):.2f}x\")\n",
334
+ " \n",
335
+ " return augmented_df.reset_index(drop=True)\n",
336
+ "\n",
337
+ "def create_splits(df):\n",
338
+ " length = len(df)\n",
339
+ " train_length = int(0.85 * length)\n",
340
+ " train = df.loc[:train_length]\n",
341
+ " test = df.loc[train_length:]\n",
342
+ " return train, test\n",
343
+ "\n",
344
+ "train, test = create_splits(df)\n",
345
+ "\n",
346
+ "train = train.reset_index(drop=True)\n",
347
+ "test = test.reset_index(drop=True)\n",
348
+ "#train = augment_smiles_dataset(train)\n",
349
+ "#test = augment_smiles_dataset(test)"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 5,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "scalers = []\n",
359
+ "\n",
360
+ "for target in targets:\n",
361
+ " target_scaler = StandardScaler()\n",
362
+ " train[target] = target_scaler.fit_transform(train[target].to_numpy().reshape(-1, 1))\n",
363
+ " test[target] = target_scaler.transform(test[target].to_numpy().reshape(-1, 1))\n",
364
+ " \n",
365
+ " scalers.append(target_scaler)\n",
366
+ "\n",
367
+ "smiles_train = train['SMILES']\n",
368
+ "smiles_test = test['SMILES']\n",
369
+ "\n",
370
+ "labels_train = train[targets].values\n",
371
+ "labels_test = test[targets].values"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 6,
377
+ "metadata": {},
378
+ "outputs": [
379
+ {
380
+ "data": {
381
+ "text/plain": [
382
+ "['target_scalers.pkl']"
383
+ ]
384
+ },
385
+ "execution_count": 6,
386
+ "metadata": {},
387
+ "output_type": "execute_result"
388
+ }
389
+ ],
390
+ "source": [
391
+ "joblib.dump(scalers, 'target_scalers.pkl')"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": 10,
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "name": "stderr",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "/tmp/ipykernel_273264/2507782815.py:68: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
404
+ " simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n"
405
+ ]
406
+ }
407
+ ],
408
+ "source": [
409
+ "from sklearn.metrics import mean_absolute_error\n",
410
+ "from transformers import AutoTokenizer, BertModel\n",
411
+ "import torch\n",
412
+ "from torch import nn\n",
413
+ "from transformers.activations import ACT2FN\n",
414
+ "\n",
415
+ "def global_ap(x):\n",
416
+ " return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)\n",
417
+ "\n",
418
+ "class SimSonEncoder(nn.Module):\n",
419
+ " def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):\n",
420
+ " super(SimSonEncoder, self).__init__()\n",
421
+ " self.config = config\n",
422
+ " self.max_len = max_len\n",
423
+ "\n",
424
+ " self.bert = BertModel(config, add_pooling_layer=False)\n",
425
+ "\n",
426
+ " self.linear = nn.Linear(config.hidden_size, max_len)\n",
427
+ " self.dropout = nn.Dropout(dropout)\n",
428
+ "\n",
429
+ " def forward(self, input_ids, attention_mask=None):\n",
430
+ " if attention_mask is None:\n",
431
+ " attention_mask = input_ids.ne(0)\n",
432
+ " \n",
433
+ " outputs = self.bert(\n",
434
+ " input_ids=input_ids,\n",
435
+ " attention_mask=attention_mask\n",
436
+ " )\n",
437
+ "\n",
438
+ " hidden_states = outputs.last_hidden_state\n",
439
+ " \n",
440
+ " hidden_states = self.dropout(hidden_states)\n",
441
+ " \n",
442
+ " pooled = global_ap(hidden_states)\n",
443
+ " \n",
444
+ " out = self.linear(pooled)\n",
445
+ " \n",
446
+ " return out\n",
447
+ "\n",
448
+ "\n",
449
+ "class SimSonClassifier(nn.Module):\n",
450
+ " def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):\n",
451
+ " super(SimSonClassifier, self).__init__()\n",
452
+ " self.encoder = encoder\n",
453
+ " self.clf = nn.Linear(encoder.max_len, num_labels)\n",
454
+ " self.relu = nn.ReLU()\n",
455
+ " self.dropout = nn.Dropout(dropout)\n",
456
+ "\n",
457
+ " def forward(self, input_ids, attention_mask=None, labels=None):\n",
458
+ " x = self.encoder(input_ids, attention_mask)\n",
459
+ " x = self.relu(self.dropout(x))\n",
460
+ " x = self.clf(x)\n",
461
+ " return x\n",
462
+ "\n",
463
+ "tokenizer_path = 'DeepChem/ChemBERTa-77M-MTR'\n",
464
+ "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
465
+ "\n",
466
+ "# Only the hidden size is slightly larger, everything else is the same\n",
467
+ "config = BertConfig(\n",
468
+ " vocab_size=tokenizer.vocab_size,\n",
469
+ " hidden_size=768,\n",
470
+ " num_hidden_layers=4,\n",
471
+ " num_attention_heads=12,\n",
472
+ " intermediate_size=2048,\n",
473
+ " max_position_embeddings=512\n",
474
+ " )\n",
475
+ "\n",
476
+ "simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n",
477
+ "\n",
478
+ "backbone = SimSonEncoder(config=config, max_len=512)\n",
479
+ "backbone.load_state_dict(simson_params)\n",
480
+ "\n",
481
+ "model = SimSonClassifier(encoder=backbone, num_labels=len(targets))"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": 11,
487
+ "metadata": {},
488
+ "outputs": [],
489
+ "source": [
490
+ "import numpy as np\n",
491
+ "import torch\n",
492
+ "from torch.utils.data import Dataset\n",
493
+ "\n",
494
+ "\n",
495
+ "class SMILESDataset(Dataset):\n",
496
+ " def __init__(self, smiles_list, labels, tokenizer, max_length=256):\n",
497
+ " self.smiles_list = smiles_list\n",
498
+ " self.labels = labels # Shape: (num_samples, 5) - already scaled\n",
499
+ " self.tokenizer = tokenizer\n",
500
+ " self.max_length = max_length\n",
501
+ " \n",
502
+ " # Create mask for valid (non-NaN) labels\n",
503
+ " self.label_masks = ~np.isnan(self.labels) # True where label is valid\n",
504
+ " \n",
505
+ " # Replace NaNs with 0 for safe tensor conversion (mask will handle exclusion)\n",
506
+ " self.labels = np.nan_to_num(self.labels, nan=0.0)\n",
507
+ " \n",
508
+ " def __len__(self):\n",
509
+ " return len(self.smiles_list)\n",
510
+ " \n",
511
+ " def __getitem__(self, idx):\n",
512
+ " smiles = self.tokenizer.cls_token + self.smiles_list[idx]\n",
513
+ " \n",
514
+ " # Tokenize the SMILES string\n",
515
+ " encoding = self.tokenizer(\n",
516
+ " smiles,\n",
517
+ " truncation=True,\n",
518
+ " padding='max_length',\n",
519
+ " max_length=self.max_length,\n",
520
+ " return_tensors='pt'\n",
521
+ " )\n",
522
+ " \n",
523
+ " return {\n",
524
+ " 'input_ids': encoding['input_ids'].flatten(),\n",
525
+ " 'attention_mask': encoding['attention_mask'].flatten(),\n",
526
+ " 'labels': torch.tensor(self.labels[idx], dtype=torch.float32),\n",
527
+ " 'label_mask': torch.tensor(self.label_masks[idx], dtype=torch.float32)\n",
528
+ " }\n",
529
+ " \n",
530
+ " def get_label_statistics(self):\n",
531
+ " \"\"\"Return statistics about label availability for 5 labels\"\"\"\n",
532
+ " label_counts = self.label_masks.sum(axis=0)\n",
533
+ " total_samples = len(self.smiles_list)\n",
534
+ " \n",
535
+ " stats = {\n",
536
+ " 'total_samples': total_samples,\n",
537
+ " 'label_0_count': label_counts[0],\n",
538
+ " 'label_1_count': label_counts[1],\n",
539
+ " 'label_2_count': label_counts[2],\n",
540
+ " 'label_3_count': label_counts[3],\n",
541
+ " 'label_4_count': label_counts[4],\n",
542
+ " 'label_0_ratio': label_counts[0] / total_samples,\n",
543
+ " 'label_1_ratio': label_counts[1] / total_samples,\n",
544
+ " 'label_2_ratio': label_counts[2] / total_samples,\n",
545
+ " 'label_3_ratio': label_counts[3] / total_samples,\n",
546
+ " 'label_4_ratio': label_counts[4] / total_samples,\n",
547
+ " 'all_labels_count': (self.label_masks.sum(axis=1) == 5).sum(),\n",
548
+ " 'partial_labels_count': ((self.label_masks.sum(axis=1) > 0) & (self.label_masks.sum(axis=1) < 5)).sum(),\n",
549
+ " 'no_labels_count': (self.label_masks.sum(axis=1) == 0).sum()\n",
550
+ " }\n",
551
+ " \n",
552
+ " return stats\n",
553
+ "\n",
554
+ "def calculate_weighted_loss(predictions, labels, label_mask, label_weights):\n",
555
+ " \"\"\"\n",
556
+ " Calculate weighted loss for five labels with masking\n",
557
+ " \n",
558
+ " Args:\n",
559
+ " predictions: Model outputs (batch_size, 5)\n",
560
+ " labels: Ground truth labels (batch_size, 5)\n",
561
+ " label_mask: Mask for valid labels (batch_size, 5)\n",
562
+ " label_weights: Weights for each label (5,)\n",
563
+ " \"\"\"\n",
564
+ " loss_fn = nn.MSELoss(reduction='none')\n",
565
+ " \n",
566
+ " # Calculate per-sample, per-label losses\n",
567
+ " losses = loss_fn(predictions, labels) # Shape: (batch_size, 5)\n",
568
+ " \n",
569
+ " # Apply masking to exclude NaN labels\n",
570
+ " valid_mask = label_mask.bool()\n",
571
+ " masked_losses = losses * valid_mask.float()\n",
572
+ " \n",
573
+ " # Apply label-specific weights\n",
574
+ " weighted_losses = masked_losses * label_weights.unsqueeze(0) # Broadcast weights\n",
575
+ " \n",
576
+ " # Calculate final loss (only over valid predictions)\n",
577
+ " total_loss = weighted_losses.sum()\n",
578
+ " total_valid = valid_mask.sum()\n",
579
+ " \n",
580
+ " return total_loss / total_valid if total_valid > 0 else torch.tensor(0.0, device=predictions.device, requires_grad=True)\n",
581
+ "\n",
582
+ "def compute_label_weights(dataset):\n",
583
+ " \"\"\"\n",
584
+ " Compute inverse frequency weights based on label availability\n",
585
+ " \n",
586
+ " Args:\n",
587
+ " dataset: SMILESDataset instance\n",
588
+ " \n",
589
+ " Returns:\n",
590
+ " torch.Tensor: Normalized weights for each label\n",
591
+ " \"\"\"\n",
592
+ " # Get label counts from dataset\n",
593
+ " label_counts = dataset.label_masks.sum(axis=0) # Count valid samples per label\n",
594
+ " total_samples = len(dataset)\n",
595
+ " \n",
596
+ " # Inverse frequency weighting\n",
597
+ " weights = total_samples / (5 * label_counts) # 5 is the number of labels\n",
598
+ " \n",
599
+ " # Normalize weights so they sum to number of labels (5)\n",
600
+ " weights = weights / weights.sum() * 5\n",
601
+ " \n",
602
+ " return torch.tensor(weights, dtype=torch.float32)\n",
603
+ "\n",
604
+ "def calculate_true_loss(predictions, labels, label_mask, scalers=None):\n",
605
+ " \"\"\"\n",
606
+ " Calculate unscaled MAE loss for monitoring using separate scalers for each label\n",
607
+ " \n",
608
+ " Args:\n",
609
+ " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n",
610
+ " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n",
611
+ " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n",
612
+ " scalers: List of scaler objects, one for each label\n",
613
+ " \n",
614
+ " Returns:\n",
615
+ " float: Average MAE across all valid samples\n",
616
+ " \"\"\"\n",
617
+ " # Detach tensors from the computation graph and move to CPU\n",
618
+ " predictions_np = predictions.cpu().detach().numpy()\n",
619
+ " labels_np = labels.cpu().numpy()\n",
620
+ " label_mask_np = label_mask.cpu().numpy().astype(bool)\n",
621
+ " \n",
622
+ " total_mae = 0\n",
623
+ " total_samples = 0\n",
624
+ " \n",
625
+ " for label_idx in range(5):\n",
626
+ " # Get valid samples for this label\n",
627
+ " valid_mask = label_mask_np[:, label_idx]\n",
628
+ " \n",
629
+ " if valid_mask.any():\n",
630
+ " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n",
631
+ " valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n",
632
+ " \n",
633
+ " if scalers is not None:\n",
634
+ " # Unscale using the corresponding scaler for this label\n",
635
+ " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n",
636
+ " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n",
637
+ " else:\n",
638
+ " unscaled_preds = valid_preds.flatten()\n",
639
+ " unscaled_labels = valid_labels.flatten()\n",
640
+ " \n",
641
+ " # Calculate MAE for this label\n",
642
+ " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n",
643
+ " total_mae += mae * len(unscaled_preds)\n",
644
+ " total_samples += len(unscaled_preds)\n",
645
+ " \n",
646
+ " return total_mae / total_samples if total_samples > 0 else 0.0\n",
647
+ "\n",
648
+ "def calculate_individual_label_losses(predictions, labels, label_mask, scalers=None):\n",
649
+ " \"\"\"\n",
650
+ " Calculate unscaled MAE loss for each individual label\n",
651
+ " \n",
652
+ " Args:\n",
653
+ " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n",
654
+ " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n",
655
+ " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n",
656
+ " scalers: List of scaler objects, one for each label\n",
657
+ " \n",
658
+ " Returns:\n",
659
+ " dict: Dictionary with MAE for each label\n",
660
+ " \"\"\"\n",
661
+ " # Detach tensors from the computation graph and move to CPU\n",
662
+ " predictions_np = predictions.cpu().detach().numpy()\n",
663
+ " labels_np = labels.cpu().numpy()\n",
664
+ " label_mask_np = label_mask.cpu().numpy().astype(bool)\n",
665
+ " \n",
666
+ " individual_losses = {}\n",
667
+ " \n",
668
+ " for label_idx in range(5):\n",
669
+ " # Get valid samples for this label\n",
670
+ " valid_mask = label_mask_np[:, label_idx]\n",
671
+ " \n",
672
+ " if valid_mask.any():\n",
673
+ " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n",
674
+ " valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n",
675
+ " \n",
676
+ " if scalers is not None:\n",
677
+ " # Unscale using the corresponding scaler for this label\n",
678
+ " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n",
679
+ " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n",
680
+ " else:\n",
681
+ " unscaled_preds = valid_preds.flatten()\n",
682
+ " unscaled_labels = valid_labels.flatten()\n",
683
+ " \n",
684
+ " # Calculate MAE for this label\n",
685
+ " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n",
686
+ " individual_losses[f'label_{label_idx}'] = mae\n",
687
+ " else:\n",
688
+ " individual_losses[f'label_{label_idx}'] = None # No valid samples for this label\n",
689
+ " \n",
690
+ " return individual_losses\n",
691
+ "\n",
692
+ "\n",
693
+ "def train_model(model, train_dataloader, val_dataloader, label_weights, \n",
694
+ " scalers=None, num_epochs=10, learning_rate=2e-5, device='cuda', \n",
695
+ " patience=3, validation_steps=500):\n",
696
+ " \"\"\"\n",
697
+ " Train model with weighted loss for five labels with step-based validation\n",
698
+ " \n",
699
+ " Args:\n",
700
+ " model: CustomModel instance (should output 5 labels)\n",
701
+ " train_dataloader: Training data loader\n",
702
+ " val_dataloader: Validation data loader \n",
703
+ " label_weights: Tensor with weights for each label\n",
704
+ " scalers: List of scalers for unscaled loss monitoring\n",
705
+ " num_epochs: Number of training epochs\n",
706
+ " learning_rate: Learning rate\n",
707
+ " device: Training device\n",
708
+ " patience: Early stopping patience (in validation steps)\n",
709
+ " validation_steps: Perform validation every N training steps\n",
710
+ " \"\"\"\n",
711
+ " model.to(device)\n",
712
+ " label_weights = label_weights.to(device)\n",
713
+ " \n",
714
+ " optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
715
+ " total_steps = len(train_dataloader) * num_epochs\n",
716
+ " scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=total_steps)\n",
717
+ " \n",
718
+ " train_losses = []\n",
719
+ " val_losses = []\n",
720
+ " \n",
721
+ " # Early stopping initialization\n",
722
+ " best_val_loss = float('inf')\n",
723
+ " steps_no_improve = 0\n",
724
+ " best_model_state = None\n",
725
+ " \n",
726
+ " # Training tracking\n",
727
+ " global_step = 0\n",
728
+ " running_train_loss = 0\n",
729
+ " running_true_train_loss = 0\n",
730
+ " train_steps_count = 0\n",
731
+ " \n",
732
+ " print(f\"Label weights: {label_weights.cpu().numpy()}\")\n",
733
+ " print(f\"Validation will be performed every {validation_steps} steps\")\n",
734
+ " \n",
735
+ " model.train()\n",
736
+ " \n",
737
+ " for epoch in range(num_epochs):\n",
738
+ " print(f\"\\nEpoch {epoch + 1}/{num_epochs}\")\n",
739
+ " \n",
740
+ " train_progress = tqdm(train_dataloader, desc=\"Training\", leave=False)\n",
741
+ " \n",
742
+ " for batch_idx, batch in enumerate(train_progress):\n",
743
+ " with torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n",
744
+ " input_ids = batch['input_ids'].to(device)\n",
745
+ " attention_mask = batch['attention_mask'].to(device)\n",
746
+ " labels = batch['labels'].to(device)\n",
747
+ " label_mask = batch['label_mask'].to(device)\n",
748
+ " \n",
749
+ " optimizer.zero_grad()\n",
750
+ " \n",
751
+ " # Model forward pass\n",
752
+ " outputs = model(\n",
753
+ " input_ids=input_ids,\n",
754
+ " attention_mask=attention_mask,\n",
755
+ " )\n",
756
+ " \n",
757
+ " # Calculate weighted loss\n",
758
+ " loss = calculate_weighted_loss(outputs, labels, label_mask, label_weights)\n",
759
+ " \n",
760
+ " # Calculate true loss for monitoring\n",
761
+ " true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n",
762
+ " \n",
763
+ " # Accumulate losses for averaging\n",
764
+ " running_train_loss += loss.item()\n",
765
+ " running_true_train_loss += true_loss\n",
766
+ " train_steps_count += 1\n",
767
+ " \n",
768
+ " loss.backward()\n",
769
+ " \n",
770
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
771
+ " \n",
772
+ " optimizer.step()\n",
773
+ " scheduler.step()\n",
774
+ " \n",
775
+ " global_step += 1\n",
776
+ " \n",
777
+ " train_progress.set_postfix({\n",
778
+ " 'step': global_step,\n",
779
+ " 'loss': f'{loss.item():.4f}',\n",
780
+ " 'true_loss': f'{true_loss:.4f}',\n",
781
+ " 'lr': f'{scheduler.get_last_lr()[0]:.2e}'\n",
782
+ " })\n",
783
+ " \n",
784
+ " # Perform validation every validation_steps\n",
785
+ " if global_step % validation_steps == 0:\n",
786
+ " # Calculate average training losses since last validation\n",
787
+ " avg_train_loss = running_train_loss / train_steps_count\n",
788
+ " avg_true_train_loss = running_true_train_loss / train_steps_count\n",
789
+ " \n",
790
+ " train_losses.append(avg_train_loss)\n",
791
+ " \n",
792
+ " # Reset running averages\n",
793
+ " running_train_loss = 0\n",
794
+ " running_true_train_loss = 0\n",
795
+ " train_steps_count = 0\n",
796
+ " \n",
797
+ " # Validation\n",
798
+ " model.eval()\n",
799
+ " total_val_loss = 0\n",
800
+ " total_true_val_loss = 0\n",
801
+ " val_batches = 0\n",
802
+ " \n",
803
+ " # Track individual label losses across all validation batches\n",
804
+ " accumulated_individual_losses = {f'label_{i}': [] for i in range(5)}\n",
805
+ "\n",
806
+ " with torch.no_grad():\n",
807
+ " for val_batch in val_dataloader:\n",
808
+ " with torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n",
809
+ " input_ids = val_batch['input_ids'].to(device)\n",
810
+ " attention_mask = val_batch['attention_mask'].to(device)\n",
811
+ " labels = val_batch['labels'].to(device)\n",
812
+ " label_mask = val_batch['label_mask'].to(device)\n",
813
+ " \n",
814
+ " outputs = model(\n",
815
+ " input_ids=input_ids,\n",
816
+ " attention_mask=attention_mask,\n",
817
+ " )\n",
818
+ " \n",
819
+ " val_loss = calculate_weighted_loss(outputs, labels, label_mask, label_weights)\n",
820
+ " val_true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n",
821
+ " \n",
822
+ " # Calculate individual label losses for this batch\n",
823
+ " individual_losses = calculate_individual_label_losses(outputs, labels, label_mask, scalers)\n",
824
+ " \n",
825
+ " # Accumulate individual losses\n",
826
+ " for label_key, loss_value in individual_losses.items():\n",
827
+ " if loss_value is not None:\n",
828
+ " accumulated_individual_losses[label_key].append(loss_value)\n",
829
+ "\n",
830
+ " total_val_loss += val_loss.item()\n",
831
+ " total_true_val_loss += val_true_loss\n",
832
+ " val_batches += 1\n",
833
+ " \n",
834
+ " avg_val_loss = total_val_loss / val_batches\n",
835
+ " avg_val_true_loss = total_true_val_loss / val_batches\n",
836
+ " val_losses.append(avg_val_loss)\n",
837
+ " \n",
838
+ " # Calculate average individual label losses\n",
839
+ " avg_individual_losses = {}\n",
840
+ " for label_key, losses in accumulated_individual_losses.items():\n",
841
+ " if losses:\n",
842
+ " avg_individual_losses[label_key] = np.mean(losses)\n",
843
+ " else:\n",
844
+ " avg_individual_losses[label_key] = None\n",
845
+ " \n",
846
+ " # Print validation results with individual label losses\n",
847
+ " print(f\"\\nStep {global_step} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | True train loss: {avg_true_train_loss:.4f} | True val loss: {avg_val_true_loss:.4f}\")\n",
848
+ " print(\"Individual label losses (unscaled):\")\n",
849
+ " for i in range(5):\n",
850
+ " label_key = f'label_{i}'\n",
851
+ " if avg_individual_losses[label_key] is not None:\n",
852
+ " print(f\" Label {i}: {avg_individual_losses[label_key]:.4f}\")\n",
853
+ " else:\n",
854
+ " print(f\" Label {i}: No valid samples\")\n",
855
+ " \n",
856
+ " # Early stopping check and best model saving\n",
857
+ " if avg_val_loss < best_val_loss:\n",
858
+ " best_val_loss = avg_val_loss\n",
859
+ " steps_no_improve = 0\n",
860
+ " best_model_state = model.state_dict().copy()\n",
861
+ " print(f\"New best validation loss: {best_val_loss:.4f}\")\n",
862
+ " else:\n",
863
+ " steps_no_improve += 1\n",
864
+ " if steps_no_improve >= patience:\n",
865
+ " print(f\"Early stopping triggered after {global_step} steps ({steps_no_improve} validation steps without improvement).\")\n",
866
+ " # Load best model and return\n",
867
+ " if best_model_state is not None:\n",
868
+ " model.load_state_dict(best_model_state)\n",
869
+ " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n",
870
+ " return train_losses, val_losses, best_val_loss\n",
871
+ " \n",
872
+ " model.train()\n",
873
+ " \n",
874
+ " # Handle any remaining training loss that hasn't been validated\n",
875
+ " if train_steps_count > 0:\n",
876
+ " avg_train_loss = running_train_loss / train_steps_count\n",
877
+ " train_losses.append(avg_train_loss)\n",
878
+ " \n",
879
+ " # Load the best model state before returning\n",
880
+ " if best_model_state is not None:\n",
881
+ " model.load_state_dict(best_model_state)\n",
882
+ " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n",
883
+ " \n",
884
+ " return train_losses, val_losses, best_val_loss\n",
885
+ "\n",
886
+ "def run_training(smiles_train, smiles_test, labels_train, labels_test, \n",
887
+ " model, tokenizer, scalers, num_epochs=5, learning_rate=1e-5, \n",
888
+ " batch_size=256, validation_steps=500):\n",
889
+ " \"\"\"\n",
890
+ " Complete training pipeline for five labels with step-based validation\n",
891
+ " \n",
892
+ " Args:\n",
893
+ " smiles_train, smiles_test: Lists of SMILES strings\n",
894
+ " labels_train, labels_test: numpy arrays of shape (num_samples, 5) - ALREADY SCALED\n",
895
+ " model: CustomModel instance (configured for 5 outputs)\n",
896
+ " tokenizer: Tokenizer instance\n",
897
+ " scalers: List of 5 scalers, one for each label (for inverse transform only)\n",
898
+ " num_epochs: Number of training epochs\n",
899
+ " learning_rate: Learning rate\n",
900
+ " batch_size: Batch size for training\n",
901
+ " validation_steps: Perform validation every N training steps\n",
902
+ " \"\"\"\n",
903
+ " \n",
904
+ " print(\"Setting up datasets for five-label training (labels assumed pre-scaled)\")\n",
905
+ " \n",
906
+ " # Create datasets - no scaling performed here\n",
907
+ " train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)\n",
908
+ " val_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)\n",
909
+ " \n",
910
+ " # Print dataset statistics\n",
911
+ " train_stats = train_dataset.get_label_statistics()\n",
912
+ " val_stats = val_dataset.get_label_statistics()\n",
913
+ " \n",
914
+ " print(\"Training dataset statistics:\")\n",
915
+ " for key, value in train_stats.items():\n",
916
+ " print(f\" {key}: {value}\")\n",
917
+ " \n",
918
+ " print(\"Validation dataset statistics:\")\n",
919
+ " for key, value in val_stats.items():\n",
920
+ " print(f\" {key}: {value}\")\n",
921
+ " \n",
922
+ " # Compute label weights based on training data\n",
923
+ " label_weights = compute_label_weights(train_dataset)\n",
924
+ " print(f\"Computed label weights: {label_weights.numpy()}\")\n",
925
+ " \n",
926
+ " # Create data loaders\n",
927
+ " train_dataloader = DataLoader(\n",
928
+ " train_dataset,\n",
929
+ " batch_size=batch_size,\n",
930
+ " shuffle=True,\n",
931
+ " num_workers=4,\n",
932
+ " pin_memory=True\n",
933
+ " )\n",
934
+ " \n",
935
+ " val_dataloader = DataLoader(\n",
936
+ " val_dataset,\n",
937
+ " batch_size=batch_size,\n",
938
+ " shuffle=False,\n",
939
+ " num_workers=4,\n",
940
+ " pin_memory=True\n",
941
+ " )\n",
942
+ " \n",
943
+ " # Set device\n",
944
+ " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
945
+ " print(f\"Using device: {device}\")\n",
946
+ " print(f\"Training steps per epoch: {len(train_dataloader)}\")\n",
947
+ " print(f\"Total training steps: {len(train_dataloader) * num_epochs}\")\n",
948
+ " \n",
949
+ " # Train the model\n",
950
+ " train_losses, val_losses, best_val_loss = train_model(\n",
951
+ " model=model,\n",
952
+ " train_dataloader=train_dataloader,\n",
953
+ " val_dataloader=val_dataloader,\n",
954
+ " label_weights=label_weights,\n",
955
+ " scalers=scalers, # Still pass scalers for true loss calculation\n",
956
+ " num_epochs=num_epochs,\n",
957
+ " learning_rate=learning_rate,\n",
958
+ " device=device,\n",
959
+ " patience=10,\n",
960
+ " validation_steps=validation_steps,\n",
961
+ " )\n",
962
+ " \n",
963
+ " print('Training completed.')\n",
964
+ " print(f'Number of validation checkpoints: {len(val_losses)}')\n",
965
+ " print(f'Final training losses: {train_losses[-5:] if len(train_losses) >= 5 else train_losses}')\n",
966
+ " print(f'Best validation loss: {best_val_loss:.4f}')\n",
967
+ " \n",
968
+ " # Save model\n",
969
+ " torch.save(model.state_dict(), '/home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints/simson_clf_kaggle.bin')\n",
970
+ " print(\"Model saved successfully!\")\n",
971
+ " \n",
972
+ " return train_losses, val_losses, best_val_loss\n"
973
+ ]
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "execution_count": 12,
978
+ "metadata": {},
979
+ "outputs": [
980
+ {
981
+ "name": "stdout",
982
+ "output_type": "stream",
983
+ "text": [
984
+ "Setting up datasets for five-label training (labels assumed pre-scaled)\n",
985
+ "Training dataset statistics:\n",
986
+ " total_samples: 14419\n",
987
+ " label_0_count: 472\n",
988
+ " label_1_count: 6700\n",
989
+ " label_2_count: 1368\n",
990
+ " label_3_count: 517\n",
991
+ " label_4_count: 519\n",
992
+ " label_0_ratio: 0.03273458630973022\n",
993
+ " label_1_ratio: 0.46466467854913657\n",
994
+ " label_2_ratio: 0.09487481794854012\n",
995
+ " label_3_ratio: 0.03585546847909009\n",
996
+ " label_4_ratio: 0.035994174353283864\n",
997
+ " all_labels_count: 0\n",
998
+ " partial_labels_count: 8286\n",
999
+ " no_labels_count: 6133\n",
1000
+ "Validation dataset statistics:\n",
1001
+ " total_samples: 2545\n",
1002
+ " label_0_count: 86\n",
1003
+ " label_1_count: 1192\n",
1004
+ " label_2_count: 243\n",
1005
+ " label_3_count: 96\n",
1006
+ " label_4_count: 95\n",
1007
+ " label_0_ratio: 0.03379174852652259\n",
1008
+ " label_1_ratio: 0.46836935166994104\n",
1009
+ " label_2_ratio: 0.09548133595284872\n",
1010
+ " label_3_ratio: 0.037721021611001965\n",
1011
+ " label_4_ratio: 0.03732809430255403\n",
1012
+ " all_labels_count: 0\n",
1013
+ " partial_labels_count: 1470\n",
1014
+ " no_labels_count: 1075\n",
1015
+ "Computed label weights: [1.5442214 0.10878694 0.53280157 1.4098115 1.4043787 ]\n",
1016
+ "Using device: cuda\n",
1017
+ "Training steps per epoch: 113\n",
1018
+ "Total training steps: 2260\n",
1019
+ "Label weights: [1.5442214 0.10878694 0.53280157 1.4098115 1.4043787 ]\n",
1020
+ "Validation will be performed every 113 steps\n",
1021
+ "\n",
1022
+ "Epoch 1/20\n"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "name": "stderr",
1027
+ "output_type": "stream",
1028
+ "text": [
1029
+ " \r"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "name": "stdout",
1034
+ "output_type": "stream",
1035
+ "text": [
1036
+ "\n",
1037
+ "Step 113 | Train Loss: 0.2634 | Val Loss: 0.2308 | True train loss: 3.6083 | True val loss: 3.8929\n",
1038
+ "Individual label losses (unscaled):\n",
1039
+ " Label 0: 74.8777\n",
1040
+ " Label 1: 0.0177\n",
1041
+ " Label 2: 0.0347\n",
1042
+ " Label 3: 0.0917\n",
1043
+ " Label 4: 2.2249\n",
1044
+ "New best validation loss: 0.2308\n",
1045
+ "\n",
1046
+ "Epoch 2/20\n"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "name": "stderr",
1051
+ "output_type": "stream",
1052
+ "text": [
1053
+ " \r"
1054
+ ]
1055
+ },
1056
+ {
1057
+ "name": "stdout",
1058
+ "output_type": "stream",
1059
+ "text": [
1060
+ "\n",
1061
+ "Step 226 | Train Loss: 0.2031 | Val Loss: 0.1903 | True train loss: 2.9943 | True val loss: 3.2239\n",
1062
+ "Individual label losses (unscaled):\n",
1063
+ " Label 0: 60.3146\n",
1064
+ " Label 1: 0.0170\n",
1065
+ " Label 2: 0.0394\n",
1066
+ " Label 3: 0.0517\n",
1067
+ " Label 4: 2.7417\n",
1068
+ "New best validation loss: 0.1903\n",
1069
+ "\n",
1070
+ "Epoch 3/20\n"
1071
+ ]
1072
+ },
1073
+ {
1074
+ "name": "stderr",
1075
+ "output_type": "stream",
1076
+ "text": [
1077
+ " \r"
1078
+ ]
1079
+ },
1080
+ {
1081
+ "name": "stdout",
1082
+ "output_type": "stream",
1083
+ "text": [
1084
+ "\n",
1085
+ "Step 339 | Train Loss: 0.1796 | Val Loss: 0.1766 | True train loss: 2.8803 | True val loss: 3.3839\n",
1086
+ "Individual label losses (unscaled):\n",
1087
+ " Label 0: 63.8170\n",
1088
+ " Label 1: 0.0136\n",
1089
+ " Label 2: 0.0362\n",
1090
+ " Label 3: 0.0785\n",
1091
+ " Label 4: 1.9163\n",
1092
+ "New best validation loss: 0.1766\n",
1093
+ "\n",
1094
+ "Epoch 4/20\n"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "name": "stderr",
1099
+ "output_type": "stream",
1100
+ "text": [
1101
+ " \r"
1102
+ ]
1103
+ },
1104
+ {
1105
+ "name": "stdout",
1106
+ "output_type": "stream",
1107
+ "text": [
1108
+ "\n",
1109
+ "Step 452 | Train Loss: 0.1538 | Val Loss: 0.1525 | True train loss: 2.6186 | True val loss: 3.2207\n",
1110
+ "Individual label losses (unscaled):\n",
1111
+ " Label 0: 61.0257\n",
1112
+ " Label 1: 0.0142\n",
1113
+ " Label 2: 0.0347\n",
1114
+ " Label 3: 0.0523\n",
1115
+ " Label 4: 2.1894\n",
1116
+ "New best validation loss: 0.1525\n",
1117
+ "\n",
1118
+ "Epoch 5/20\n"
1119
+ ]
1120
+ },
1121
+ {
1122
+ "name": "stderr",
1123
+ "output_type": "stream",
1124
+ "text": [
1125
+ " \r"
1126
+ ]
1127
+ },
1128
+ {
1129
+ "name": "stdout",
1130
+ "output_type": "stream",
1131
+ "text": [
1132
+ "\n",
1133
+ "Step 565 | Train Loss: 0.1461 | Val Loss: 0.1432 | True train loss: 2.6254 | True val loss: 3.1313\n",
1134
+ "Individual label losses (unscaled):\n",
1135
+ " Label 0: 57.6456\n",
1136
+ " Label 1: 0.0131\n",
1137
+ " Label 2: 0.0318\n",
1138
+ " Label 3: 0.0394\n",
1139
+ " Label 4: 1.9632\n",
1140
+ "New best validation loss: 0.1432\n",
1141
+ "\n",
1142
+ "Epoch 6/20\n"
1143
+ ]
1144
+ },
1145
+ {
1146
+ "name": "stderr",
1147
+ "output_type": "stream",
1148
+ "text": [
1149
+ " \r"
1150
+ ]
1151
+ },
1152
+ {
1153
+ "name": "stdout",
1154
+ "output_type": "stream",
1155
+ "text": [
1156
+ "\n",
1157
+ "Step 678 | Train Loss: 0.1344 | Val Loss: 0.1506 | True train loss: 2.4023 | True val loss: 3.0162\n",
1158
+ "Individual label losses (unscaled):\n",
1159
+ " Label 0: 55.6189\n",
1160
+ " Label 1: 0.0154\n",
1161
+ " Label 2: 0.0315\n",
1162
+ " Label 3: 0.0464\n",
1163
+ " Label 4: 1.7522\n",
1164
+ "\n",
1165
+ "Epoch 7/20\n"
1166
+ ]
1167
+ },
1168
+ {
1169
+ "name": "stderr",
1170
+ "output_type": "stream",
1171
+ "text": [
1172
+ " \r"
1173
+ ]
1174
+ },
1175
+ {
1176
+ "name": "stdout",
1177
+ "output_type": "stream",
1178
+ "text": [
1179
+ "\n",
1180
+ "Step 791 | Train Loss: 0.1240 | Val Loss: 0.1399 | True train loss: 2.2227 | True val loss: 3.3619\n",
1181
+ "Individual label losses (unscaled):\n",
1182
+ " Label 0: 63.3619\n",
1183
+ " Label 1: 0.0119\n",
1184
+ " Label 2: 0.0301\n",
1185
+ " Label 3: 0.0426\n",
1186
+ " Label 4: 1.8000\n",
1187
+ "New best validation loss: 0.1399\n",
1188
+ "\n",
1189
+ "Epoch 8/20\n"
1190
+ ]
1191
+ },
1192
+ {
1193
+ "name": "stderr",
1194
+ "output_type": "stream",
1195
+ "text": [
1196
+ " \r"
1197
+ ]
1198
+ },
1199
+ {
1200
+ "name": "stdout",
1201
+ "output_type": "stream",
1202
+ "text": [
1203
+ "\n",
1204
+ "Step 904 | Train Loss: 0.1149 | Val Loss: 0.1359 | True train loss: 2.2502 | True val loss: 3.2314\n",
1205
+ "Individual label losses (unscaled):\n",
1206
+ " Label 0: 61.6004\n",
1207
+ " Label 1: 0.0121\n",
1208
+ " Label 2: 0.0311\n",
1209
+ " Label 3: 0.0403\n",
1210
+ " Label 4: 1.7468\n",
1211
+ "New best validation loss: 0.1359\n",
1212
+ "\n",
1213
+ "Epoch 9/20\n"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "name": "stderr",
1218
+ "output_type": "stream",
1219
+ "text": [
1220
+ " \r"
1221
+ ]
1222
+ },
1223
+ {
1224
+ "name": "stdout",
1225
+ "output_type": "stream",
1226
+ "text": [
1227
+ "\n",
1228
+ "Step 1017 | Train Loss: 0.1094 | Val Loss: 0.1506 | True train loss: 2.2638 | True val loss: 3.4009\n",
1229
+ "Individual label losses (unscaled):\n",
1230
+ " Label 0: 63.4786\n",
1231
+ " Label 1: 0.0127\n",
1232
+ " Label 2: 0.0280\n",
1233
+ " Label 3: 0.0495\n",
1234
+ " Label 4: 2.0883\n",
1235
+ "\n",
1236
+ "Epoch 10/20\n"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "name": "stderr",
1241
+ "output_type": "stream",
1242
+ "text": [
1243
+ " \r"
1244
+ ]
1245
+ },
1246
+ {
1247
+ "name": "stdout",
1248
+ "output_type": "stream",
1249
+ "text": [
1250
+ "\n",
1251
+ "Step 1130 | Train Loss: 0.0963 | Val Loss: 0.1467 | True train loss: 1.9351 | True val loss: 3.3818\n",
1252
+ "Individual label losses (unscaled):\n",
1253
+ " Label 0: 62.9852\n",
1254
+ " Label 1: 0.0127\n",
1255
+ " Label 2: 0.0298\n",
1256
+ " Label 3: 0.0511\n",
1257
+ " Label 4: 1.9974\n",
1258
+ "\n",
1259
+ "Epoch 11/20\n"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "name": "stderr",
1264
+ "output_type": "stream",
1265
+ "text": [
1266
+ " \r"
1267
+ ]
1268
+ },
1269
+ {
1270
+ "name": "stdout",
1271
+ "output_type": "stream",
1272
+ "text": [
1273
+ "\n",
1274
+ "Step 1243 | Train Loss: 0.0957 | Val Loss: 0.1321 | True train loss: 1.9361 | True val loss: 2.9691\n",
1275
+ "Individual label losses (unscaled):\n",
1276
+ " Label 0: 55.0661\n",
1277
+ " Label 1: 0.0117\n",
1278
+ " Label 2: 0.0284\n",
1279
+ " Label 3: 0.0351\n",
1280
+ " Label 4: 2.0390\n",
1281
+ "New best validation loss: 0.1321\n",
1282
+ "\n",
1283
+ "Epoch 12/20\n"
1284
+ ]
1285
+ },
1286
+ {
1287
+ "name": "stderr",
1288
+ "output_type": "stream",
1289
+ "text": [
1290
+ " \r"
1291
+ ]
1292
+ },
1293
+ {
1294
+ "name": "stdout",
1295
+ "output_type": "stream",
1296
+ "text": [
1297
+ "\n",
1298
+ "Step 1356 | Train Loss: 0.0823 | Val Loss: 0.1424 | True train loss: 1.8204 | True val loss: 3.5836\n",
1299
+ "Individual label losses (unscaled):\n",
1300
+ " Label 0: 67.2052\n",
1301
+ " Label 1: 0.0123\n",
1302
+ " Label 2: 0.0262\n",
1303
+ " Label 3: 0.0391\n",
1304
+ " Label 4: 1.9690\n",
1305
+ "\n",
1306
+ "Epoch 13/20\n"
1307
+ ]
1308
+ },
1309
+ {
1310
+ "name": "stderr",
1311
+ "output_type": "stream",
1312
+ "text": [
1313
+ " \r"
1314
+ ]
1315
+ },
1316
+ {
1317
+ "name": "stdout",
1318
+ "output_type": "stream",
1319
+ "text": [
1320
+ "\n",
1321
+ "Step 1469 | Train Loss: 0.0797 | Val Loss: 0.1382 | True train loss: 1.7223 | True val loss: 3.2234\n",
1322
+ "Individual label losses (unscaled):\n",
1323
+ " Label 0: 59.5507\n",
1324
+ " Label 1: 0.0118\n",
1325
+ " Label 2: 0.0282\n",
1326
+ " Label 3: 0.0381\n",
1327
+ " Label 4: 1.9066\n",
1328
+ "\n",
1329
+ "Epoch 14/20\n"
1330
+ ]
1331
+ },
1332
+ {
1333
+ "name": "stderr",
1334
+ "output_type": "stream",
1335
+ "text": [
1336
+ " \r"
1337
+ ]
1338
+ },
1339
+ {
1340
+ "name": "stdout",
1341
+ "output_type": "stream",
1342
+ "text": [
1343
+ "\n",
1344
+ "Step 1582 | Train Loss: 0.0728 | Val Loss: 0.1321 | True train loss: 1.5747 | True val loss: 3.3817\n",
1345
+ "Individual label losses (unscaled):\n",
1346
+ " Label 0: 63.2403\n",
1347
+ " Label 1: 0.0115\n",
1348
+ " Label 2: 0.0262\n",
1349
+ " Label 3: 0.0339\n",
1350
+ " Label 4: 1.7621\n",
1351
+ "New best validation loss: 0.1321\n",
1352
+ "\n",
1353
+ "Epoch 15/20\n"
1354
+ ]
1355
+ },
1356
+ {
1357
+ "name": "stderr",
1358
+ "output_type": "stream",
1359
+ "text": [
1360
+ " \r"
1361
+ ]
1362
+ },
1363
+ {
1364
+ "name": "stdout",
1365
+ "output_type": "stream",
1366
+ "text": [
1367
+ "\n",
1368
+ "Step 1695 | Train Loss: 0.0676 | Val Loss: 0.1437 | True train loss: 1.5251 | True val loss: 3.4306\n",
1369
+ "Individual label losses (unscaled):\n",
1370
+ " Label 0: 63.7060\n",
1371
+ " Label 1: 0.0121\n",
1372
+ " Label 2: 0.0274\n",
1373
+ " Label 3: 0.0442\n",
1374
+ " Label 4: 1.9844\n",
1375
+ "\n",
1376
+ "Epoch 16/20\n"
1377
+ ]
1378
+ },
1379
+ {
1380
+ "name": "stderr",
1381
+ "output_type": "stream",
1382
+ "text": [
1383
+ " \r"
1384
+ ]
1385
+ },
1386
+ {
1387
+ "name": "stdout",
1388
+ "output_type": "stream",
1389
+ "text": [
1390
+ "\n",
1391
+ "Step 1808 | Train Loss: 0.0617 | Val Loss: 0.1494 | True train loss: 1.3651 | True val loss: 3.3514\n",
1392
+ "Individual label losses (unscaled):\n",
1393
+ " Label 0: 61.4547\n",
1394
+ " Label 1: 0.0118\n",
1395
+ " Label 2: 0.0260\n",
1396
+ " Label 3: 0.0504\n",
1397
+ " Label 4: 2.0026\n",
1398
+ "\n",
1399
+ "Epoch 17/20\n"
1400
+ ]
1401
+ },
1402
+ {
1403
+ "name": "stderr",
1404
+ "output_type": "stream",
1405
+ "text": [
1406
+ " \r"
1407
+ ]
1408
+ },
1409
+ {
1410
+ "name": "stdout",
1411
+ "output_type": "stream",
1412
+ "text": [
1413
+ "\n",
1414
+ "Step 1921 | Train Loss: 0.0580 | Val Loss: 0.1424 | True train loss: 1.3237 | True val loss: 3.3568\n",
1415
+ "Individual label losses (unscaled):\n",
1416
+ " Label 0: 61.8486\n",
1417
+ " Label 1: 0.0116\n",
1418
+ " Label 2: 0.0252\n",
1419
+ " Label 3: 0.0430\n",
1420
+ " Label 4: 1.9207\n",
1421
+ "\n",
1422
+ "Epoch 18/20\n"
1423
+ ]
1424
+ },
1425
+ {
1426
+ "name": "stderr",
1427
+ "output_type": "stream",
1428
+ "text": [
1429
+ " \r"
1430
+ ]
1431
+ },
1432
+ {
1433
+ "name": "stdout",
1434
+ "output_type": "stream",
1435
+ "text": [
1436
+ "\n",
1437
+ "Step 2034 | Train Loss: 0.0534 | Val Loss: 0.1376 | True train loss: 1.2378 | True val loss: 3.3407\n",
1438
+ "Individual label losses (unscaled):\n",
1439
+ " Label 0: 61.5502\n",
1440
+ " Label 1: 0.0115\n",
1441
+ " Label 2: 0.0247\n",
1442
+ " Label 3: 0.0433\n",
1443
+ " Label 4: 1.7560\n",
1444
+ "\n",
1445
+ "Epoch 19/20\n"
1446
+ ]
1447
+ },
1448
+ {
1449
+ "name": "stderr",
1450
+ "output_type": "stream",
1451
+ "text": [
1452
+ " \r"
1453
+ ]
1454
+ },
1455
+ {
1456
+ "name": "stdout",
1457
+ "output_type": "stream",
1458
+ "text": [
1459
+ "\n",
1460
+ "Step 2147 | Train Loss: 0.0497 | Val Loss: 0.1416 | True train loss: 1.1018 | True val loss: 3.2781\n",
1461
+ "Individual label losses (unscaled):\n",
1462
+ " Label 0: 59.6542\n",
1463
+ " Label 1: 0.0115\n",
1464
+ " Label 2: 0.0251\n",
1465
+ " Label 3: 0.0438\n",
1466
+ " Label 4: 1.8405\n",
1467
+ "\n",
1468
+ "Epoch 20/20\n"
1469
+ ]
1470
+ },
1471
+ {
1472
+ "name": "stderr",
1473
+ "output_type": "stream",
1474
+ "text": [
1475
+ " "
1476
+ ]
1477
+ },
1478
+ {
1479
+ "name": "stdout",
1480
+ "output_type": "stream",
1481
+ "text": [
1482
+ "\n",
1483
+ "Step 2260 | Train Loss: 0.0484 | Val Loss: 0.1329 | True train loss: 1.1016 | True val loss: 3.3233\n",
1484
+ "Individual label losses (unscaled):\n",
1485
+ " Label 0: 61.2343\n",
1486
+ " Label 1: 0.0114\n",
1487
+ " Label 2: 0.0243\n",
1488
+ " Label 3: 0.0402\n",
1489
+ " Label 4: 1.7624\n",
1490
+ "Loaded best model with validation loss: 0.1321\n",
1491
+ "Training completed.\n",
1492
+ "Number of validation checkpoints: 20\n",
1493
+ "Final training losses: [0.061716757780682724, 0.05798421218266002, 0.05344583738628214, 0.04969686268111773, 0.04844354389779336]\n",
1494
+ "Best validation loss: 0.1321\n"
1495
+ ]
1496
+ },
1497
+ {
1498
+ "name": "stderr",
1499
+ "output_type": "stream",
1500
+ "text": [
1501
+ "\r"
1502
+ ]
1503
+ },
1504
+ {
1505
+ "ename": "RuntimeError",
1506
+ "evalue": "File /home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints cannot be opened.",
1507
+ "output_type": "error",
1508
+ "traceback": [
1509
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
1510
+ "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)",
1511
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[12]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtqdm\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m train_losses, val_losses, best_loss = \u001b[43mrun_training\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43msmiles_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msmiles_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1e-4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m128\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m113\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m)\u001b[49m\n",
1512
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 480\u001b[39m, in \u001b[36mrun_training\u001b[39m\u001b[34m(smiles_train, smiles_test, labels_train, labels_test, model, tokenizer, scalers, num_epochs, learning_rate, batch_size, validation_steps)\u001b[39m\n\u001b[32m 477\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mBest validation loss: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbest_val_loss\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.4f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m 479\u001b[39m \u001b[38;5;66;03m# Save model\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m480\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstate_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m/home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 481\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mModel saved successfully!\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 483\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m train_losses, val_losses, best_val_loss\n",
1513
+ "\u001b[36mFile \u001b[39m\u001b[32m~/.mlspace/envs/bolgov_simson_training/lib/python3.12/site-packages/torch/serialization.py:849\u001b[39m, in \u001b[36msave\u001b[39m\u001b[34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[39m\n\u001b[32m 846\u001b[39m _check_save_filelike(f)\n\u001b[32m 848\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _use_new_zipfile_serialization:\n\u001b[32m--> \u001b[39m\u001b[32m849\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43m_open_zipfile_writer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m opened_zipfile:\n\u001b[32m 850\u001b[39m _save(\n\u001b[32m 851\u001b[39m obj,\n\u001b[32m 852\u001b[39m opened_zipfile,\n\u001b[32m (...)\u001b[39m\u001b[32m 855\u001b[39m _disable_byteorder_record,\n\u001b[32m 856\u001b[39m )\n\u001b[32m 857\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m\n",
1514
+ "\u001b[36mFile \u001b[39m\u001b[32m~/.mlspace/envs/bolgov_simson_training/lib/python3.12/site-packages/torch/serialization.py:716\u001b[39m, in \u001b[36m_open_zipfile_writer\u001b[39m\u001b[34m(name_or_buffer)\u001b[39m\n\u001b[32m 714\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 715\u001b[39m container = _open_zipfile_writer_buffer\n\u001b[32m--> \u001b[39m\u001b[32m716\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcontainer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname_or_buffer\u001b[49m\u001b[43m)\u001b[49m\n",
1515
+ "\u001b[36mFile \u001b[39m\u001b[32m~/.mlspace/envs/bolgov_simson_training/lib/python3.12/site-packages/torch/serialization.py:687\u001b[39m, in \u001b[36m_open_zipfile_writer_file.__init__\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 685\u001b[39m \u001b[38;5;28msuper\u001b[39m().\u001b[34m__init__\u001b[39m(torch._C.PyTorchFileWriter(\u001b[38;5;28mself\u001b[39m.file_stream))\n\u001b[32m 686\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m687\u001b[39m \u001b[38;5;28msuper\u001b[39m().\u001b[34m__init__\u001b[39m(\u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_C\u001b[49m\u001b[43m.\u001b[49m\u001b[43mPyTorchFileWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m)\n",
1516
+ "\u001b[31mRuntimeError\u001b[39m: File /home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints cannot be opened."
1517
+ ]
1518
+ }
1519
+ ],
1520
+ "source": [
1521
+ "import numpy as np\n",
1522
+ "import torch\n",
1523
+ "from torch.optim import AdamW\n",
1524
+ "from torch.optim.lr_scheduler import LinearLR\n",
1525
+ "from torch.utils.data import DataLoader\n",
1526
+ "from tqdm import tqdm\n",
1527
+ "\n",
1528
+ "train_losses, val_losses, best_loss = run_training(\n",
1529
+ " smiles_train, smiles_test, labels_train, labels_test, \n",
1530
+ " model, tokenizer, scalers, num_epochs=20, learning_rate=1e-4, batch_size=128, validation_steps=113,\n",
1531
+ ")"
1532
+ ]
1533
+ },
1534
+ {
1535
+ "cell_type": "code",
1536
+ "execution_count": null,
1537
+ "metadata": {},
1538
+ "outputs": [],
1539
+ "source": []
1540
+ }
1541
+ ],
1542
+ "metadata": {
1543
+ "kaggle": {
1544
+ "accelerator": "gpu",
1545
+ "dataSources": [
1546
+ {
1547
+ "databundleVersionId": 12966160,
1548
+ "sourceId": 74608,
1549
+ "sourceType": "competition"
1550
+ },
1551
+ {
1552
+ "datasetId": 7678100,
1553
+ "sourceId": 12189904,
1554
+ "sourceType": "datasetVersion"
1555
+ },
1556
+ {
1557
+ "datasetId": 7690162,
1558
+ "sourceId": 12207625,
1559
+ "sourceType": "datasetVersion"
1560
+ },
1561
+ {
1562
+ "datasetId": 7716502,
1563
+ "sourceId": 12322957,
1564
+ "sourceType": "datasetVersion"
1565
+ },
1566
+ {
1567
+ "datasetId": 7801155,
1568
+ "sourceId": 12372847,
1569
+ "sourceType": "datasetVersion"
1570
+ },
1571
+ {
1572
+ "datasetId": 7809006,
1573
+ "sourceId": 12525286,
1574
+ "sourceType": "datasetVersion"
1575
+ },
1576
+ {
1577
+ "datasetId": 7912957,
1578
+ "sourceId": 12668147,
1579
+ "sourceType": "datasetVersion"
1580
+ }
1581
+ ],
1582
+ "dockerImageVersionId": 31041,
1583
+ "isGpuEnabled": true,
1584
+ "isInternetEnabled": true,
1585
+ "language": "python",
1586
+ "sourceType": "notebook"
1587
+ },
1588
+ "kernelspec": {
1589
+ "display_name": "Python [conda env:.mlspace-bolgov_simson_training]",
1590
+ "language": "python",
1591
+ "name": "conda-env-.mlspace-bolgov_simson_training-py"
1592
+ },
1593
+ "language_info": {
1594
+ "codemirror_mode": {
1595
+ "name": "ipython",
1596
+ "version": 3
1597
+ },
1598
+ "file_extension": ".py",
1599
+ "mimetype": "text/x-python",
1600
+ "name": "python",
1601
+ "nbconvert_exporter": "python",
1602
+ "pygments_lexer": "ipython3",
1603
+ "version": "3.12.11"
1604
+ }
1605
+ },
1606
+ "nbformat": 4,
1607
+ "nbformat_minor": 4
1608
+ }
simson_modeling/kaggle_comp/checkpoints/clf_kaggle.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b78c67a2753f5d5686f0540dbb5ebca530d2f9117ed2a8d176663d9560f24574
3
+ size 93240610
simson_modeling/kaggle_comp/sample_submission.csv ADDED
@@ -0,0 +1,4 @@
1
+ id,Tg,FFV,Tc,Density,Rg
2
+ 1109053969,0,0,0,0,0
3
+ 1422188626,0,0,0,0,0
4
+ 2032016830,0,0,0,0,0
simson_modeling/kaggle_comp/simson-fine-tune.ipynb ADDED
@@ -0,0 +1,1742 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import PreTrainedModel, AutoConfig, BertModel, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer\n",
10
+ "import pandas as pd\n",
11
+ "import torch\n",
12
+ "import os\n",
13
+ "from sklearn.preprocessing import StandardScaler\n",
14
+ "from sklearn.model_selection import train_test_split\n",
15
+ "from tqdm import tqdm\n",
16
+ "import joblib\n",
17
+ "\n",
18
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "df = pd.read_csv('/home/jovyan/simson_training_bolgov/kaggle_comp/train.csv')\n",
28
+ "\n",
29
+ "targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "<div>\n",
41
+ "<style scoped>\n",
42
+ " .dataframe tbody tr th:only-of-type {\n",
43
+ " vertical-align: middle;\n",
44
+ " }\n",
45
+ "\n",
46
+ " .dataframe tbody tr th {\n",
47
+ " vertical-align: top;\n",
48
+ " }\n",
49
+ "\n",
50
+ " .dataframe thead th {\n",
51
+ " text-align: right;\n",
52
+ " }\n",
53
+ "</style>\n",
54
+ "<table border=\"1\" class=\"dataframe\">\n",
55
+ " <thead>\n",
56
+ " <tr style=\"text-align: right;\">\n",
57
+ " <th></th>\n",
58
+ " <th>id</th>\n",
59
+ " <th>SMILES</th>\n",
60
+ " <th>Tg</th>\n",
61
+ " <th>FFV</th>\n",
62
+ " <th>Tc</th>\n",
63
+ " <th>Density</th>\n",
64
+ " <th>Rg</th>\n",
65
+ " </tr>\n",
66
+ " </thead>\n",
67
+ " <tbody>\n",
68
+ " <tr>\n",
69
+ " <th>0</th>\n",
70
+ " <td>4.215886e+08</td>\n",
71
+ " <td>*C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(Oc4ccc(N5...</td>\n",
72
+ " <td>NaN</td>\n",
73
+ " <td>0.376767</td>\n",
74
+ " <td>NaN</td>\n",
75
+ " <td>NaN</td>\n",
76
+ " <td>NaN</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>1</th>\n",
80
+ " <td>7.984549e+08</td>\n",
81
+ " <td>*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...</td>\n",
82
+ " <td>NaN</td>\n",
83
+ " <td>0.346993</td>\n",
84
+ " <td>NaN</td>\n",
85
+ " <td>NaN</td>\n",
86
+ " <td>NaN</td>\n",
87
+ " </tr>\n",
88
+ " <tr>\n",
89
+ " <th>2</th>\n",
90
+ " <td>NaN</td>\n",
91
+ " <td>*CC/C=C(/*)C</td>\n",
92
+ " <td>NaN</td>\n",
93
+ " <td>NaN</td>\n",
94
+ " <td>NaN</td>\n",
95
+ " <td>NaN</td>\n",
96
+ " <td>NaN</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>3</th>\n",
100
+ " <td>NaN</td>\n",
101
+ " <td>*CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc(OC)cc2)...</td>\n",
102
+ " <td>NaN</td>\n",
103
+ " <td>NaN</td>\n",
104
+ " <td>NaN</td>\n",
105
+ " <td>NaN</td>\n",
106
+ " <td>NaN</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>4</th>\n",
110
+ " <td>NaN</td>\n",
111
+ " <td>*Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)c...</td>\n",
112
+ " <td>NaN</td>\n",
113
+ " <td>NaN</td>\n",
114
+ " <td>NaN</td>\n",
115
+ " <td>NaN</td>\n",
116
+ " <td>NaN</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>...</th>\n",
120
+ " <td>...</td>\n",
121
+ " <td>...</td>\n",
122
+ " <td>...</td>\n",
123
+ " <td>...</td>\n",
124
+ " <td>...</td>\n",
125
+ " <td>...</td>\n",
126
+ " <td>...</td>\n",
127
+ " </tr>\n",
128
+ " <tr>\n",
129
+ " <th>16958</th>\n",
130
+ " <td>2.389975e+08</td>\n",
131
+ " <td>*OC(=O)Oc1ccc(S(=O)(=O)c2ccc(OC(=O)OC3CC4CC(*)...</td>\n",
132
+ " <td>NaN</td>\n",
133
+ " <td>0.339596</td>\n",
134
+ " <td>NaN</td>\n",
135
+ " <td>NaN</td>\n",
136
+ " <td>NaN</td>\n",
137
+ " </tr>\n",
138
+ " <tr>\n",
139
+ " <th>16959</th>\n",
140
+ " <td>NaN</td>\n",
141
+ " <td>*c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N5C(=O)c6c...</td>\n",
142
+ " <td>NaN</td>\n",
143
+ " <td>NaN</td>\n",
144
+ " <td>NaN</td>\n",
145
+ " <td>NaN</td>\n",
146
+ " <td>NaN</td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>16960</th>\n",
150
+ " <td>NaN</td>\n",
151
+ " <td>*OC(F)(F)COC(=O)c1cc(OCCCCC)cc(C(=O)OCC(*)(F)F)c1</td>\n",
152
+ " <td>NaN</td>\n",
153
+ " <td>NaN</td>\n",
154
+ " <td>NaN</td>\n",
155
+ " <td>NaN</td>\n",
156
+ " <td>NaN</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>16961</th>\n",
160
+ " <td>1.973417e+09</td>\n",
161
+ " <td>*C=CC1CC(*)C2C(=O)N(c3ccc(F)cc3)C(=O)C12</td>\n",
162
+ " <td>NaN</td>\n",
163
+ " <td>0.374710</td>\n",
164
+ " <td>NaN</td>\n",
165
+ " <td>NaN</td>\n",
166
+ " <td>NaN</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>16962</th>\n",
170
+ " <td>NaN</td>\n",
171
+ " <td>*/C=C/[Ge](/C=C/[Si](*)(c1ccccc1)c1ccccc1)(c1c...</td>\n",
172
+ " <td>NaN</td>\n",
173
+ " <td>NaN</td>\n",
174
+ " <td>NaN</td>\n",
175
+ " <td>NaN</td>\n",
176
+ " <td>NaN</td>\n",
177
+ " </tr>\n",
178
+ " </tbody>\n",
179
+ "</table>\n",
180
+ "<p>16963 rows × 7 columns</p>\n",
181
+ "</div>"
182
+ ],
183
+ "text/plain": [
184
+ " id SMILES Tg \\\n",
185
+ "0 4.215886e+08 *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(Oc4ccc(N5... NaN \n",
186
+ "1 7.984549e+08 *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C... NaN \n",
187
+ "2 NaN *CC/C=C(/*)C NaN \n",
188
+ "3 NaN *CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc(OC)cc2)... NaN \n",
189
+ "4 NaN *Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)c... NaN \n",
190
+ "... ... ... .. \n",
191
+ "16958 2.389975e+08 *OC(=O)Oc1ccc(S(=O)(=O)c2ccc(OC(=O)OC3CC4CC(*)... NaN \n",
192
+ "16959 NaN *c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N5C(=O)c6c... NaN \n",
193
+ "16960 NaN *OC(F)(F)COC(=O)c1cc(OCCCCC)cc(C(=O)OCC(*)(F)F)c1 NaN \n",
194
+ "16961 1.973417e+09 *C=CC1CC(*)C2C(=O)N(c3ccc(F)cc3)C(=O)C12 NaN \n",
195
+ "16962 NaN */C=C/[Ge](/C=C/[Si](*)(c1ccccc1)c1ccccc1)(c1c... NaN \n",
196
+ "\n",
197
+ " FFV Tc Density Rg \n",
198
+ "0 0.376767 NaN NaN NaN \n",
199
+ "1 0.346993 NaN NaN NaN \n",
200
+ "2 NaN NaN NaN NaN \n",
201
+ "3 NaN NaN NaN NaN \n",
202
+ "4 NaN NaN NaN NaN \n",
203
+ "... ... .. ... .. \n",
204
+ "16958 0.339596 NaN NaN NaN \n",
205
+ "16959 NaN NaN NaN NaN \n",
206
+ "16960 NaN NaN NaN NaN \n",
207
+ "16961 0.374710 NaN NaN NaN \n",
208
+ "16962 NaN NaN NaN NaN \n",
209
+ "\n",
210
+ "[16963 rows x 7 columns]"
211
+ ]
212
+ },
213
+ "execution_count": 3,
214
+ "metadata": {},
215
+ "output_type": "execute_result"
216
+ }
217
+ ],
218
+ "source": [
219
+ "for i in range(1, 5):\n",
220
+ " supplement_path = f'/home/jovyan/simson_training_bolgov/kaggle_comp/train_supplement/dataset{i}.csv'\n",
221
+ " supplement_ds = pd.read_csv(supplement_path)\n",
222
+ "\n",
223
+ " if 'TC_mean' in supplement_ds.columns:\n",
224
+ " supplement_ds = supplement_ds.rename(columns = {'TC_mean': 'Tc'})\n",
225
+ "\n",
226
+ " df = pd.concat([df, supplement_ds], axis=0)\n",
227
+ "\n",
228
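+ "# Shuffle once so the later 85/15 positional split in create_splits is random\n",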
+ "df = df.sample(frac=1).reset_index(drop=True)\n",
229
+ "df"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 10,
235
+ "metadata": {},
236
+ "outputs": [
237
+ {
238
+ "name": "stderr",
239
+ "output_type": "stream",
240
+ "text": [
241
+ "100%|████████████████████████████████████| 14419/14419 [00:43<00:00, 328.78it/s]\n"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Advanced augmentation completed:\n",
249
+ "Original size: 14419, Augmented size: 168551\n",
250
+ "Augmentation factor: 11.69x\n"
251
+ ]
252
+ },
253
+ {
254
+ "name": "stderr",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "100%|██████████████████████████████████████| 2545/2545 [00:07<00:00, 333.57it/s]"
258
+ ]
259
+ },
260
+ {
261
+ "name": "stdout",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "Advanced augmentation completed:\n",
265
+ "Original size: 2545, Augmented size: 29716\n",
266
+ "Augmentation factor: 11.68x\n"
267
+ ]
268
+ },
269
+ {
270
+ "name": "stderr",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "\n"
274
+ ]
275
+ }
276
+ ],
277
+ "source": [
278
+ "import pandas as pd\n",
279
+ "import numpy as np\n",
280
+ "from rdkit import Chem\n",
281
+ "import random\n",
282
+ "from typing import Optional, List, Union\n",
283
+ "\n",
284
+ "def augment_smiles_dataset(df: pd.DataFrame,\n",
285
+ " smiles_column: str = 'SMILES',\n",
286
+ " augmentation_strategies: List[str] = ['enumeration', 'kekulize', 'stereo_enum'],\n",
287
+ " n_augmentations: int = 10,\n",
288
+ " preserve_original: bool = True,\n",
289
+ " random_seed: Optional[int] = None) -> pd.DataFrame:\n",
290
+ " \"\"\"\n",
291
+ " Advanced SMILES augmentation with multiple strategies.\n",
292
+ " \n",
293
+ " Parameters:\n",
294
+ " -----------\n",
295
+ " augmentation_strategies : List[str]\n",
296
+ " List of augmentation strategies: 'enumeration', 'kekulize', 'stereo_enum'\n",
297
+ " \"\"\"\n",
298
+ " \n",
299
+ " if random_seed is not None:\n",
300
+ " random.seed(random_seed)\n",
301
+ " np.random.seed(random_seed)\n",
302
+ " \n",
303
+ " def apply_augmentation_strategy(smiles: str, strategy: str) -> List[str]:\n",
304
+ " \"\"\"Apply specific augmentation strategy\"\"\"\n",
305
+ " try:\n",
306
+ " mol = Chem.MolFromSmiles(smiles)\n",
307
+ " if mol is None:\n",
308
+ " return [smiles]\n",
309
+ " \n",
310
+ " augmented = []\n",
311
+ " \n",
312
+ " if strategy == 'enumeration':\n",
313
+ " # Standard SMILES enumeration\n",
314
+ " for _ in range(n_augmentations):\n",
315
+ " enum_smiles = Chem.MolToSmiles(mol, \n",
316
+ " canonical=False, \n",
317
+ " doRandom=True,\n",
318
+ " isomericSmiles=True)\n",
319
+ " augmented.append(enum_smiles)\n",
320
+ " \n",
321
+ " elif strategy == 'kekulize':\n",
322
+ " # Kekulization variants\n",
323
+ " try:\n",
324
+ " Chem.Kekulize(mol)\n",
325
+ " kek_smiles = Chem.MolToSmiles(mol, kekuleSmiles=True)\n",
326
+ " augmented.append(kek_smiles)\n",
327
+ " except:\n",
328
+ " pass\n",
329
+ " \n",
330
+ " elif strategy == 'stereo_enum':\n",
331
+ " # Stereochemistry enumeration\n",
332
+ " for _ in range(n_augmentations // 2):\n",
333
+ " # Remove stereochemistry\n",
334
+ " Chem.RemoveStereochemistry(mol)\n",
335
+ " no_stereo = Chem.MolToSmiles(mol)\n",
336
+ " augmented.append(no_stereo)\n",
337
+ " \n",
338
+ " return list(set(augmented)) # Remove duplicates\n",
339
+ " \n",
340
+ " except Exception as e:\n",
341
+ " print(f\"Error in {strategy} for {smiles}: {e}\")\n",
342
+ " return [smiles]\n",
343
+ " \n",
344
+ " augmented_rows = []\n",
345
+ " \n",
346
+ " for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
347
+ " original_smiles = row[smiles_column]\n",
348
+ " \n",
349
+ " # Add original if requested\n",
350
+ " if preserve_original:\n",
351
+ " original_row = row.to_dict()\n",
352
+ " original_row['augmentation_strategy'] = 'original'\n",
353
+ " original_row['is_augmented'] = False\n",
354
+ " augmented_rows.append(original_row)\n",
355
+ " \n",
356
+ " # Apply each augmentation strategy\n",
357
+ " for strategy in augmentation_strategies:\n",
358
+ " strategy_smiles = apply_augmentation_strategy(original_smiles, strategy)\n",
359
+ " \n",
360
+ " for aug_smiles in strategy_smiles:\n",
361
+ " if aug_smiles != original_smiles: # Avoid duplicating original\n",
362
+ " new_row = row.to_dict().copy()\n",
363
+ " new_row[smiles_column] = aug_smiles\n",
364
+ " new_row['augmentation_strategy'] = strategy\n",
365
+ " new_row['is_augmented'] = True\n",
366
+ " augmented_rows.append(new_row)\n",
367
+ " \n",
368
+ " augmented_df = pd.DataFrame(augmented_rows)\n",
369
+ " augmented_df = augmented_df.reset_index(drop=True)\n",
370
+ " \n",
371
+ " print(f\"Advanced augmentation completed:\")\n",
372
+ " print(f\"Original size: {len(df)}, Augmented size: {len(augmented_df)}\")\n",
373
+ " print(f\"Augmentation factor: {len(augmented_df) / len(df):.2f}x\")\n",
374
+ " \n",
375
+ " return augmented_df.reset_index(drop=True)\n",
376
+ "\n",
377
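+ "# Hedged sanity check of the enumeration strategy: random atom orderings of\n",
+ "# the same molecule ('CCO' is a toy input, not taken from the dataset).\n",
+ "_demo_mol = Chem.MolFromSmiles('CCO')\n",
+ "print([Chem.MolToSmiles(_demo_mol, canonical=False, doRandom=True) for _ in range(3)])\n",
+ "\n",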
+ "def create_splits(df):\n",
378
+ " length = len(df)\n",
379
+ " train_length = int(0.85 * length)\n",
380
+ " train = df.loc[:train_length]\n",
381
+ " test = df.loc[train_length:]\n",
382
+ " return train, test\n",
383
+ "\n",
384
+ "train, test = create_splits(df)\n",
385
+ "\n",
386
+ "train = train.reset_index(drop=True)\n",
387
+ "test = test.reset_index(drop=True)\n",
388
+ "\n",
389
+ "train = augment_smiles_dataset(train)\n",
390
+ "test = augment_smiles_dataset(test)"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 11,
396
+ "metadata": {},
397
+ "outputs": [],
398
+ "source": [
399
+ "scalers = []\n",
400
+ "\n",
401
+ "for target in targets:\n",
402
+ " target_scaler = StandardScaler()\n",
403
+ " train[target] = target_scaler.fit_transform(train[target].to_numpy().reshape(-1, 1))\n",
404
+ " test[target] = target_scaler.transform(test[target].to_numpy().reshape(-1, 1))\n",
405
+ " \n",
406
+ " scalers.append(target_scaler)\n",
407
+ "\n",
408
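+ "# Note: scikit-learn scalers ignore NaN entries when fitting and pass them\n",
+ "# through transform unchanged, so the sparse labels survive scaling; the\n",
+ "# NaNs are masked out later in SMILESDataset.\n",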
+ "smiles_train = train['SMILES']\n",
409
+ "smiles_test = test['SMILES']\n",
410
+ "\n",
411
+ "labels_train = train[targets].values\n",
412
+ "labels_test = test[targets].values"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 6,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/plain": [
423
+ "['target_scalers.pkl']"
424
+ ]
425
+ },
426
+ "execution_count": 6,
427
+ "metadata": {},
428
+ "output_type": "execute_result"
429
+ }
430
+ ],
431
+ "source": [
432
+ "joblib.dump(scalers, 'target_scalers.pkl')"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 12,
438
+ "metadata": {},
439
+ "outputs": [
440
+ {
441
+ "name": "stderr",
442
+ "output_type": "stream",
443
+ "text": [
444
+ "/tmp/ipykernel_279009/2507782815.py:68: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
445
+ " simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n"
446
+ ]
447
+ }
448
+ ],
449
+ "source": [
450
+ "from sklearn.metrics import mean_absolute_error\n",
451
+ "from transformers import AutoTokenizer, BertModel\n",
452
+ "import torch\n",
453
+ "from torch import nn\n",
454
+ "from transformers.activations import ACT2FN\n",
455
+ "\n",
456
+ "def global_ap(x):\n",
457
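+ "    # Global average pooling over the sequence dimension:\n",
+ "    # (batch, seq_len, hidden_size) -> (batch, hidden_size)\n",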
+ " return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)\n",
458
+ "\n",
459
+ "class SimSonEncoder(nn.Module):\n",
460
+ " def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):\n",
461
+ " super(SimSonEncoder, self).__init__()\n",
462
+ " self.config = config\n",
463
+ " self.max_len = max_len\n",
464
+ "\n",
465
+ " self.bert = BertModel(config, add_pooling_layer=False)\n",
466
+ "\n",
467
+ " self.linear = nn.Linear(config.hidden_size, max_len)\n",
468
+ " self.dropout = nn.Dropout(dropout)\n",
469
+ "\n",
470
+ " def forward(self, input_ids, attention_mask=None):\n",
471
+ " if attention_mask is None:\n",
472
+ " attention_mask = input_ids.ne(0)\n",
473
+ " \n",
474
+ " outputs = self.bert(\n",
475
+ " input_ids=input_ids,\n",
476
+ " attention_mask=attention_mask\n",
477
+ " )\n",
478
+ "\n",
479
+ " hidden_states = outputs.last_hidden_state\n",
480
+ " \n",
481
+ " hidden_states = self.dropout(hidden_states)\n",
482
+ " \n",
483
+ " pooled = global_ap(hidden_states)\n",
484
+ " \n",
485
+ " out = self.linear(pooled)\n",
486
+ " \n",
487
+ " return out\n",
488
+ "\n",
489
+ "\n",
490
+ "class SimSonClassifier(nn.Module):\n",
491
+ " def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):\n",
492
+ " super(SimSonClassifier, self).__init__()\n",
493
+ " self.encoder = encoder\n",
494
+ " self.clf = nn.Linear(encoder.max_len, num_labels)\n",
495
+ " self.relu = nn.ReLU()\n",
496
+ " self.dropout = nn.Dropout(dropout)\n",
497
+ "\n",
498
+ " def forward(self, input_ids, attention_mask=None, labels=None):\n",
499
+ " x = self.encoder(input_ids, attention_mask)\n",
500
+ " x = self.relu(self.dropout(x))\n",
501
+ " x = self.clf(x)\n",
502
+ " return x\n",
503
+ "\n",
504
+ "tokenizer_path = 'DeepChem/ChemBERTa-77M-MTR'\n",
505
+ "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
506
+ "\n",
507
+ "# Only the hidden size is slightly larger, everything else is the same\n",
508
+ "config = BertConfig(\n",
509
+ " vocab_size=tokenizer.vocab_size,\n",
510
+ " hidden_size=768,\n",
511
+ " num_hidden_layers=4,\n",
512
+ " num_attention_heads=12,\n",
513
+ " intermediate_size=2048,\n",
514
+ " max_position_embeddings=512\n",
515
+ " )\n",
516
+ "\n",
517
+ "simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n",
518
+ "\n",
519
+ "backbone = SimSonEncoder(config=config, max_len=512)\n",
520
+ "backbone.load_state_dict(simson_params)\n",
521
+ "\n",
522
+ "model = SimSonClassifier(encoder=backbone, num_labels=len(targets))"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "execution_count": 13,
528
+ "metadata": {},
529
+ "outputs": [],
530
+ "source": [
531
+ "import numpy as np\n",
532
+ "import torch\n",
533
+ "from torch.utils.data import Dataset, Sampler, DataLoader\n",
534
+ "\n",
535
+ "\n",
536
+ "class SMILESDataset(Dataset):\n",
537
+ " def __init__(self, smiles_list, labels, tokenizer, max_length=256):\n",
538
+ " self.smiles_list = smiles_list\n",
539
+ " self.labels = labels # Shape: (num_samples, 5) - already scaled\n",
540
+ " self.tokenizer = tokenizer\n",
541
+ " self.max_length = max_length\n",
542
+ " \n",
543
+ " # Create mask for valid (non-NaN) labels\n",
544
+ " self.label_masks = ~np.isnan(self.labels) # True where label is valid\n",
545
+ " \n",
546
+ " # Replace NaNs with 0 for safe tensor conversion (mask will handle exclusion)\n",
547
+ " self.labels = np.nan_to_num(self.labels, nan=0.0)\n",
548
+ " \n",
549
+ " def __len__(self):\n",
550
+ " return len(self.smiles_list)\n",
551
+ " \n",
552
+ " def __getitem__(self, idx):\n",
553
+ " smiles = self.tokenizer.cls_token + self.smiles_list[idx]\n",
554
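+ "        # Note: the tokenizer call below adds special tokens itself, so\n",
+ "        # prepending cls_token likely yields a duplicated leading token;\n",
+ "        # kept as-is to match the recorded run.\n",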
+ " \n",
555
+ " # Tokenize the SMILES string\n",
556
+ " encoding = self.tokenizer(\n",
557
+ " smiles,\n",
558
+ " truncation=True,\n",
559
+ " padding='max_length',\n",
560
+ " max_length=self.max_length,\n",
561
+ " return_tensors='pt'\n",
562
+ " )\n",
563
+ " \n",
564
+ " return {\n",
565
+ " 'input_ids': encoding['input_ids'].flatten(),\n",
566
+ " 'attention_mask': encoding['attention_mask'].flatten(),\n",
567
+ " 'labels': torch.tensor(self.labels[idx], dtype=torch.float32),\n",
568
+ " 'label_mask': torch.tensor(self.label_masks[idx], dtype=torch.float32)\n",
569
+ " }\n",
570
+ " \n",
571
+ " def get_label_statistics(self):\n",
572
+ " \"\"\"Return statistics about label availability for 5 labels\"\"\"\n",
573
+ " label_counts = self.label_masks.sum(axis=0)\n",
574
+ " total_samples = len(self.smiles_list)\n",
575
+ " \n",
576
+ " stats = {\n",
577
+ " 'total_samples': total_samples,\n",
578
+ " 'label_0_count': label_counts[0],\n",
579
+ " 'label_1_count': label_counts[1],\n",
580
+ " 'label_2_count': label_counts[2],\n",
581
+ " 'label_3_count': label_counts[3],\n",
582
+ " 'label_4_count': label_counts[4],\n",
583
+ " 'label_0_ratio': label_counts[0] / total_samples,\n",
584
+ " 'label_1_ratio': label_counts[1] / total_samples,\n",
585
+ " 'label_2_ratio': label_counts[2] / total_samples,\n",
586
+ " 'label_3_ratio': label_counts[3] / total_samples,\n",
587
+ " 'label_4_ratio': label_counts[4] / total_samples,\n",
588
+ " 'all_labels_count': (self.label_masks.sum(axis=1) == 5).sum(),\n",
589
+ " 'partial_labels_count': ((self.label_masks.sum(axis=1) > 0) & (self.label_masks.sum(axis=1) < 5)).sum(),\n",
590
+ " 'no_labels_count': (self.label_masks.sum(axis=1) == 0).sum()\n",
591
+ " }\n",
592
+ " \n",
593
+ " return stats\n",
594
+ "\n",
595
+ "\n",
596
+ "class UnderrepresentedLabelSampler(Sampler):\n",
597
+ " \"\"\"\n",
598
+ " Custom sampler that gives higher sampling probability to samples containing under-represented labels.\n",
599
+ " This ensures each batch contains a good mix of samples with different label availability patterns.\n",
600
+ " \"\"\"\n",
601
+ " def __init__(self, dataset, num_labels=5, underrep_boost=2.0):\n",
602
+ " \"\"\"\n",
603
+ " Args:\n",
604
+ " dataset: SMILESDataset instance\n",
605
+ " num_labels: Number of labels (5)\n",
606
+ " underrep_boost: Multiplier to boost probability of under-represented labels\n",
607
+ " \"\"\"\n",
608
+ " self.dataset = dataset\n",
609
+ " self.num_samples = len(dataset)\n",
610
+ " self.num_labels = num_labels\n",
611
+ " self.underrep_boost = underrep_boost\n",
612
+ " \n",
613
+ " # Calculate label frequencies\n",
614
+ " label_counts = dataset.label_masks.sum(axis=0) # Count valid samples per label\n",
615
+ " total_samples = self.num_samples\n",
616
+ " \n",
617
+ " # Label frequencies (proportion of samples with each label)\n",
618
+ " label_freq = label_counts / total_samples\n",
619
+ " \n",
620
+ " # Inverse frequency weights (higher for under-represented labels)\n",
621
+ " # Add small epsilon to avoid division by zero\n",
622
+ " self.label_weights = 1.0 / (label_freq + 1e-6)\n",
623
+ " \n",
624
+ " # Apply boost to under-represented labels\n",
625
+ " # Labels with frequency < median get boosted\n",
626
+ " median_freq = np.median(label_freq)\n",
627
+ " underrep_mask = label_freq < median_freq\n",
628
+ " self.label_weights[underrep_mask] *= self.underrep_boost\n",
629
+ " \n",
630
+ " # Calculate sample weights based on which labels are present\n",
631
+ " sample_weights = []\n",
632
+ " for i in range(self.num_samples):\n",
633
+ " mask = dataset.label_masks[i] # Boolean mask for present labels\n",
634
+ " if mask.sum() > 0:\n",
635
+ " # Weight is average of present labels' weights\n",
636
+ " weights = self.label_weights[mask]\n",
637
+ " sample_weight = weights.mean()\n",
638
+ " else:\n",
639
+ " # If no labels present, give minimal weight\n",
640
+ " sample_weight = 0.1\n",
641
+ " sample_weights.append(sample_weight)\n",
642
+ " \n",
643
+ " self.sample_weights = torch.tensor(sample_weights, dtype=torch.double)\n",
644
+ " \n",
645
+ " # Print sampling statistics\n",
646
+ " print(f\"Label frequencies: {label_freq}\")\n",
647
+ " print(f\"Label weights: {self.label_weights}\")\n",
648
+ " print(f\"Under-represented labels (< median freq {median_freq:.3f}): {np.where(underrep_mask)[0]}\")\n",
649
+ " print(f\"Sample weight range: [{self.sample_weights.min():.3f}, {self.sample_weights.max():.3f}]\")\n",
650
+ " \n",
651
+ " def __iter__(self):\n",
652
+ " # Sample with replacement according to calculated weights\n",
653
+ " indices = torch.multinomial(self.sample_weights, self.num_samples, replacement=True)\n",
654
+ " return iter(indices.tolist())\n",
655
+ " \n",
656
+ " def __len__(self):\n",
657
+ " return self.num_samples\n",
658
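+ "# Usage sketch (hypothetical wiring; run_training in this notebook uses\n",
+ "# shuffle=True instead - DataLoader forbids combining sampler and shuffle):\n",
+ "# sampler = UnderrepresentedLabelSampler(train_dataset)\n",
+ "# loader = DataLoader(train_dataset, batch_size=128, sampler=sampler)\n",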
+ "\n",
659
+ "\n",
660
+ "def calculate_unweighted_loss(predictions, labels, label_mask):\n",
661
+ " \"\"\"\n",
662
+ " Calculate simple unweighted MSE loss with masking (no label weights)\n",
663
+ " \n",
664
+ " Args:\n",
665
+ " predictions: Model outputs (batch_size, 5)\n",
666
+ " labels: Ground truth labels (batch_size, 5)\n",
667
+ " label_mask: Mask for valid labels (batch_size, 5)\n",
668
+ " \"\"\"\n",
669
+ " loss_fn = nn.MSELoss(reduction='none')\n",
670
+ " \n",
671
+ " # Calculate per-sample, per-label losses\n",
672
+ " losses = loss_fn(predictions, labels) # Shape: (batch_size, 5)\n",
673
+ " \n",
674
+ " # Apply masking to exclude NaN labels\n",
675
+ " valid_mask = label_mask.bool()\n",
676
+ " masked_losses = losses * valid_mask.float()\n",
677
+ " \n",
678
+ " # Calculate final loss (only over valid predictions)\n",
679
+ " total_loss = masked_losses.sum()\n",
680
+ " total_valid = valid_mask.sum()\n",
681
+ " \n",
682
+ " return total_loss / total_valid if total_valid > 0 else torch.tensor(0.0, device=predictions.device, requires_grad=True)\n",
683
+ "\n",
684
+ "\n",
685
+ "def calculate_true_loss(predictions, labels, label_mask, scalers=None):\n",
686
+ " \"\"\"\n",
687
+ " Calculate unscaled MAE loss for monitoring using separate scalers for each label\n",
688
+ " \n",
689
+ " Args:\n",
690
+ " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n",
691
+ " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n",
692
+ " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n",
693
+ " scalers: List of scaler objects, one for each label\n",
694
+ " \n",
695
+ " Returns:\n",
696
+ " float: Average MAE across all valid samples\n",
697
+ " \"\"\"\n",
698
+ " # Detach tensors from the computation graph and move to CPU\n",
699
+ " predictions_np = predictions.cpu().detach().numpy()\n",
700
+ " labels_np = labels.cpu().numpy()\n",
701
+ " label_mask_np = label_mask.cpu().numpy().astype(bool)\n",
702
+ " \n",
703
+ " total_mae = 0\n",
704
+ " total_samples = 0\n",
705
+ " \n",
706
+ " for label_idx in range(5):\n",
707
+ " # Get valid samples for this label\n",
708
+ " valid_mask = label_mask_np[:, label_idx]\n",
709
+ " \n",
710
+ " if valid_mask.any():\n",
711
+ " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n",
712
+ " valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n",
713
+ " \n",
714
+ " if scalers is not None:\n",
715
+ " # Unscale using the corresponding scaler for this label\n",
716
+ " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n",
717
+ " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n",
718
+ " else:\n",
719
+ " unscaled_preds = valid_preds.flatten()\n",
720
+ " unscaled_labels = valid_labels.flatten()\n",
721
+ " \n",
722
+ " # Calculate MAE for this label\n",
723
+ " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n",
724
+ " total_mae += mae * len(unscaled_preds)\n",
725
+ " total_samples += len(unscaled_preds)\n",
726
+ " \n",
727
+ " return total_mae / total_samples if total_samples > 0 else 0.0\n",
728
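+ "# Example (hypothetical): if label 0 has 2 valid samples with MAE 60.0 and\n",
+ "# label 1 has 8 with MAE 0.02, the reported value is\n",
+ "# (60.0*2 + 0.02*8) / 10 = 12.016 - a sample-weighted average in raw units,\n",
+ "# which is why label 0 dominates the 'true' losses in the logs above.\n",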
+ "\n",
729
+ "\n",
730
+ "def calculate_individual_label_losses(predictions, labels, label_mask, scalers=None):\n",
731
+ " \"\"\"\n",
732
+ " Calculate unscaled MAE loss for each individual label\n",
733
+ " \n",
734
+ " Args:\n",
735
+ " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n",
736
+ " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n",
737
+ " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n",
738
+ " scalers: List of scaler objects, one for each label\n",
739
+ " \n",
740
+ " Returns:\n",
741
+ " dict: Dictionary with MAE for each label\n",
742
+ " \"\"\"\n",
743
+ " # Detach tensors from the computation graph and move to CPU\n",
744
+ " predictions_np = predictions.cpu().detach().numpy()\n",
745
+ " labels_np = labels.cpu().numpy()\n",
746
+ " label_mask_np = label_mask.cpu().numpy().astype(bool)\n",
747
+ " \n",
748
+ " individual_losses = {}\n",
749
+ " \n",
750
+ " for label_idx in range(5):\n",
751
+ " # Get valid samples for this label\n",
752
+ " valid_mask = label_mask_np[:, label_idx]\n",
753
+ " \n",
754
+ " if valid_mask.any():\n",
755
+ " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n",
756
+ " valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n",
757
+ " \n",
758
+ " if scalers is not None:\n",
759
+ " # Unscale using the corresponding scaler for this label\n",
760
+ " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n",
761
+ " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n",
762
+ " else:\n",
763
+ " unscaled_preds = valid_preds.flatten()\n",
764
+ " unscaled_labels = valid_labels.flatten()\n",
765
+ " \n",
766
+ " # Calculate MAE for this label\n",
767
+ " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n",
768
+ " individual_losses[f'label_{label_idx}'] = mae\n",
769
+ " else:\n",
770
+ " individual_losses[f'label_{label_idx}'] = None # No valid samples for this label\n",
771
+ " \n",
772
+ " return individual_losses\n",
773
+ "\n",
774
+ "\n",
775
+ "def analyze_batch_composition(dataloader, num_batches=5):\n",
776
+ " \"\"\"\n",
777
+ " Analyze the composition of batches to see label distribution\n",
778
+ " \"\"\"\n",
779
+ " print(\"Analyzing batch composition:\")\n",
780
+ " \n",
781
+ " for batch_idx, batch in enumerate(dataloader):\n",
782
+ " if batch_idx >= num_batches:\n",
783
+ " break\n",
784
+ " \n",
785
+ " label_mask = batch['label_mask'].numpy()\n",
786
+ " \n",
787
+ " # Count samples with each label in this batch\n",
788
+ " label_counts = label_mask.sum(axis=0)\n",
789
+ " batch_size = label_mask.shape[0]\n",
790
+ " \n",
791
+ " print(f\"Batch {batch_idx + 1}: Size={batch_size}\")\n",
792
+ " for i in range(5):\n",
793
+ " print(f\" Label {i}: {label_counts[i]}/{batch_size} ({label_counts[i]/batch_size:.2%})\")\n",
794
+ " print()\n",
795
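+ "# Usage sketch (hypothetical call): analyze_batch_composition(train_dataloader)\n",
+ "# prints the per-label coverage of the first few batches.\n",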
+ "\n",
796
+ "\n",
797
+ "def train_model(model, train_dataloader, val_dataloader, \n",
798
+ " scalers=None, num_epochs=10, learning_rate=2e-5, device='cuda', \n",
799
+ " patience=3, validation_steps=500):\n",
800
+ " \"\"\"\n",
801
+ " Train model with unweighted loss and custom sampler for five labels\n",
802
+ " \n",
803
+ " Args:\n",
804
+ " model: CustomModel instance (should output 5 labels)\n",
805
+ " train_dataloader: Training data loader with custom sampler\n",
806
+ " val_dataloader: Validation data loader \n",
807
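+ "        label_weights: Accepted for compatibility with run_training; unused here (the loss is unweighted)\n",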
+ " scalers: List of scalers for unscaled loss monitoring\n",
808
+ " num_epochs: Number of training epochs\n",
809
+ " learning_rate: Learning rate\n",
810
+ " device: Training device\n",
811
+ " patience: Early stopping patience (in validation steps)\n",
812
+ " validation_steps: Perform validation every N training steps\n",
813
+ " \"\"\"\n",
814
+ " model.to(device)\n",
815
+ " \n",
816
+ " optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n",
817
+ " total_steps = len(train_dataloader) * num_epochs\n",
818
+ " scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=total_steps)\n",
819
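+ "    # LinearLR anneals the learning rate linearly from 1.0x to 0.1x of its\n",
+ "    # initial value over total_steps (start_factor/end_factor above)\n",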
+ " \n",
820
+ " train_losses = []\n",
821
+ " val_losses = []\n",
822
+ " \n",
823
+ " # Early stopping initialization\n",
824
+ " best_val_loss = float('inf')\n",
825
+ " steps_no_improve = 0\n",
826
+ " best_model_state = None\n",
827
+ " \n",
828
+ " # Training tracking\n",
829
+ " global_step = 0\n",
830
+ " running_train_loss = 0\n",
831
+ " running_true_train_loss = 0\n",
832
+ " train_steps_count = 0\n",
833
+ " \n",
834
+ " print(f\"Training with custom sampler (no label weights)\")\n",
835
+ " print(f\"Validation will be performed every {validation_steps} steps\")\n",
836
+ " \n",
837
+ " model.train()\n",
838
+ " \n",
839
+ " for epoch in range(num_epochs):\n",
840
+ " print(f\"\\nEpoch {epoch + 1}/{num_epochs}\")\n",
841
+ " \n",
842
+ " train_progress = tqdm(train_dataloader, desc=\"Training\", leave=False)\n",
843
+ " \n",
844
+ " for batch_idx, batch in enumerate(train_progress):\n",
845
+ " with torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n",
846
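+ "                # Editor's note (assumption): float16 autocast is used without a\n",
+ "                # torch.cuda.amp.GradScaler here; fp16 gradients can underflow\n",
+ "                # without loss scaling, while bfloat16 autocast would not need it.\n",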
+ " input_ids = batch['input_ids'].to(device)\n",
847
+ " attention_mask = batch['attention_mask'].to(device)\n",
848
+ " labels = batch['labels'].to(device)\n",
849
+ " label_mask = batch['label_mask'].to(device)\n",
850
+ " \n",
851
+ " optimizer.zero_grad()\n",
852
+ " \n",
853
+ " # Model forward pass\n",
854
+ " outputs = model(\n",
855
+ " input_ids=input_ids,\n",
856
+ " attention_mask=attention_mask,\n",
857
+ " )\n",
858
+ " \n",
859
+ " # Calculate unweighted loss (sampler handles the balancing)\n",
860
+ " loss = calculate_unweighted_loss(outputs, labels, label_mask)\n",
861
+ " \n",
862
+ " # Calculate true loss for monitoring\n",
863
+ " true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n",
864
+ " \n",
865
+ " # Accumulate losses for averaging\n",
866
+ " running_train_loss += loss.item()\n",
867
+ " running_true_train_loss += true_loss\n",
868
+ " train_steps_count += 1\n",
869
+ " \n",
870
+ " loss.backward()\n",
871
+ " \n",
872
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
873
+ " \n",
874
+ " optimizer.step()\n",
875
+ " scheduler.step()\n",
876
+ " \n",
877
+ " global_step += 1\n",
878
+ " \n",
879
+ " train_progress.set_postfix({\n",
880
+ " 'step': global_step,\n",
881
+ " 'loss': f'{loss.item():.4f}',\n",
882
+ " 'true_loss': f'{true_loss:.4f}',\n",
883
+ " 'lr': f'{scheduler.get_last_lr()[0]:.2e}'\n",
884
+ " })\n",
885
+ " \n",
886
+ " # Perform validation every validation_steps\n",
887
+ " if global_step % validation_steps == 0:\n",
888
+ " # Calculate average training losses since last validation\n",
889
+ " avg_train_loss = running_train_loss / train_steps_count\n",
890
+ " avg_true_train_loss = running_true_train_loss / train_steps_count\n",
891
+ " \n",
892
+ " train_losses.append(avg_train_loss)\n",
893
+ " \n",
894
+ " # Reset running averages\n",
895
+ " running_train_loss = 0\n",
896
+ " running_true_train_loss = 0\n",
897
+ " train_steps_count = 0\n",
898
+ " \n",
899
+ " # Validation\n",
900
+ " model.eval()\n",
901
+ " total_val_loss = 0\n",
902
+ " total_true_val_loss = 0\n",
903
+ " val_batches = 0\n",
904
+ " \n",
905
+ " # Track individual label losses across all validation batches\n",
906
+ " accumulated_individual_losses = {f'label_{i}': [] for i in range(5)}\n",
907
+ "\n",
908
+ " with torch.no_grad():\n",
909
+ " for val_batch in val_dataloader:\n",
910
+ " with torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n",
911
+ " input_ids = val_batch['input_ids'].to(device)\n",
912
+ " attention_mask = val_batch['attention_mask'].to(device)\n",
913
+ " labels = val_batch['labels'].to(device)\n",
914
+ " label_mask = val_batch['label_mask'].to(device)\n",
915
+ " \n",
916
+ " outputs = model(\n",
917
+ " input_ids=input_ids,\n",
918
+ " attention_mask=attention_mask,\n",
919
+ " )\n",
920
+ " \n",
921
+ " val_loss = calculate_unweighted_loss(outputs, labels, label_mask)\n",
922
+ " val_true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n",
923
+ " \n",
924
+ " # Calculate individual label losses for this batch\n",
925
+ " individual_losses = calculate_individual_label_losses(outputs, labels, label_mask, scalers)\n",
926
+ " \n",
927
+ " # Accumulate individual losses\n",
928
+ " for label_key, loss_value in individual_losses.items():\n",
929
+ " if loss_value is not None:\n",
930
+ " accumulated_individual_losses[label_key].append(loss_value)\n",
931
+ "\n",
932
+ " total_val_loss += val_loss.item()\n",
933
+ " total_true_val_loss += val_true_loss\n",
934
+ " val_batches += 1\n",
935
+ " \n",
936
+ " avg_val_loss = total_val_loss / val_batches\n",
937
+ " avg_val_true_loss = total_true_val_loss / val_batches\n",
938
+ " val_losses.append(avg_val_loss)\n",
939
+ " \n",
940
+ " # Calculate average individual label losses\n",
941
+ " avg_individual_losses = {}\n",
942
+ " for label_key, losses in accumulated_individual_losses.items():\n",
943
+ " if losses:\n",
944
+ " avg_individual_losses[label_key] = np.mean(losses)\n",
945
+ " else:\n",
946
+ " avg_individual_losses[label_key] = None\n",
947
+ " \n",
948
+ " # Print validation results with individual label losses\n",
949
+ " print(f\"\\nStep {global_step} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | True train loss: {avg_true_train_loss:.4f} | True val loss: {avg_val_true_loss:.4f}\")\n",
950
+ " print(\"Individual label losses (unscaled):\")\n",
951
+ " for i in range(5):\n",
952
+ " label_key = f'label_{i}'\n",
953
+ " if avg_individual_losses[label_key] is not None:\n",
954
+ " print(f\" Label {i}: {avg_individual_losses[label_key]:.4f}\")\n",
955
+ " else:\n",
956
+ " print(f\" Label {i}: No valid samples\")\n",
957
+ " \n",
958
+ " # Early stopping check and best model saving\n",
959
+ " if avg_val_loss < best_val_loss:\n",
960
+ " best_val_loss = avg_val_loss\n",
961
+ " steps_no_improve = 0\n",
962
+ " best_model_state = model.state_dict().copy()\n",
963
+ " print(f\"New best validation loss: {best_val_loss:.4f}\")\n",
964
+ " else:\n",
965
+ " steps_no_improve += 1\n",
966
+ " if steps_no_improve >= patience:\n",
967
+ " print(f\"Early stopping triggered after {global_step} steps ({steps_no_improve} validation steps without improvement).\")\n",
968
+ " # Load best model and return\n",
969
+ " if best_model_state is not None:\n",
970
+ " model.load_state_dict(best_model_state)\n",
971
+ " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n",
972
+ " return train_losses, val_losses, best_val_loss\n",
973
+ " \n",
974
+ " model.train()\n",
975
+ " \n",
976
+ " # Handle any remaining training loss that hasn't been validated\n",
977
+ " if train_steps_count > 0:\n",
978
+ " avg_train_loss = running_train_loss / train_steps_count\n",
979
+ " train_losses.append(avg_train_loss)\n",
980
+ " \n",
981
+ " # Load the best model state before returning\n",
982
+ " if best_model_state is not None:\n",
983
+ " model.load_state_dict(best_model_state)\n",
984
+ " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n",
985
+ " \n",
986
+ " return train_losses, val_losses, best_val_loss\n",
987
+ "\n",
988
+ "\n",
989
+ "def run_training(smiles_train, smiles_test, labels_train, labels_test, \n",
990
+ " model, tokenizer, scalers, num_epochs=5, learning_rate=1e-5, \n",
991
+ " batch_size=256, validation_steps=500, underrep_boost=2.0):\n",
992
+ " \"\"\"\n",
993
+ " Complete training pipeline for five labels with custom sampler\n",
994
+ " \n",
995
+ " Args:\n",
996
+ " smiles_train, smiles_test: Lists of SMILES strings\n",
997
+ " labels_train, labels_test: numpy arrays of shape (num_samples, 5) - ALREADY SCALED\n",
998
+ " model: CustomModel instance (configured for 5 outputs)\n",
999
+ " tokenizer: Tokenizer instance\n",
1000
+ " scalers: List of 5 scalers, one for each label (for inverse transform only)\n",
1001
+ " num_epochs: Number of training epochs\n",
1002
+ " learning_rate: Learning rate\n",
1003
+ " batch_size: Batch size for training\n",
1004
+ " validation_steps: Perform validation every N training steps\n",
1005
+ " underrep_boost: Boost factor for under-represented labels in sampler\n",
1006
+ " \"\"\"\n",
1007
+ " \n",
1008
+ " print(\"Setting up datasets for five-label training with custom sampler\")\n",
1009
+ " \n",
1010
+ " # Create datasets - no scaling performed here\n",
1011
+ " train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)\n",
1012
+ " val_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)\n",
1013
+ " \n",
1014
+ " # Print dataset statistics\n",
1015
+ " train_stats = train_dataset.get_label_statistics()\n",
1016
+ " val_stats = val_dataset.get_label_statistics()\n",
1017
+ " \n",
1018
+ " print(\"Training dataset statistics:\")\n",
1019
+ " for key, value in train_stats.items():\n",
1020
+ " print(f\" {key}: {value}\")\n",
1021
+ " \n",
1022
+ " print(\"Validation dataset statistics:\")\n",
1023
+ " for key, value in val_stats.items():\n",
1024
+ " print(f\" {key}: {value}\")\n",
1025
+ " \n",
1026
+ " # Create custom sampler for balanced training\n",
1027
+ " train_sampler = UnderrepresentedLabelSampler(\n",
1028
+ " train_dataset, \n",
1029
+ " num_labels=5, \n",
1030
+ " underrep_boost=underrep_boost\n",
1031
+ " )\n",
1032
+ " \n",
1033
+ " # Create data loaders\n",
1034
+ " train_dataloader = DataLoader(\n",
1035
+ " train_dataset,\n",
1036
+ " batch_size=batch_size,\n",
1037
+ " sampler=None, # Use custom sampler instead of shuffle=True\n",
1038
+ " num_workers=4,\n",
1039
+ " pin_memory=True\n",
1040
+ " )\n",
1041
+ " \n",
1042
+ " val_dataloader = DataLoader(\n",
1043
+ " val_dataset,\n",
1044
+ " batch_size=batch_size,\n",
1045
+ " shuffle=False,\n",
1046
+ " num_workers=4,\n",
1047
+ " pin_memory=True\n",
1048
+ " )\n",
1049
+ " \n",
1050
+ " # Analyze batch composition to verify sampler effectiveness\n",
1051
+ " print(\"\\n\" + \"=\"*50)\n",
1052
+ " #analyze_batch_composition(train_dataloader, num_batches=3)\n",
1053
+ " print(\"=\"*50)\n",
1054
+ " \n",
1055
+ " # Set device\n",
1056
+ " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1057
+ " print(f\"Using device: {device}\")\n",
1058
+ " print(f\"Training steps per epoch: {len(train_dataloader)}\")\n",
1059
+ " print(f\"Total training steps: {len(train_dataloader) * num_epochs}\")\n",
1060
+ " \n",
1061
+ " # Train the model\n",
1062
+ " train_losses, val_losses, best_val_loss = train_model(\n",
1063
+ " model=model,\n",
1064
+ " train_dataloader=train_dataloader,\n",
1065
+ " val_dataloader=val_dataloader,\n",
1066
+ " scalers=scalers,\n",
1067
+ " num_epochs=num_epochs,\n",
1068
+ " learning_rate=learning_rate,\n",
1069
+ " device=device,\n",
1070
+ " patience=10,\n",
1071
+ " validation_steps=validation_steps,\n",
1072
+ " )\n",
1073
+ " \n",
1074
+ " print('Training completed.')\n",
1075
+ " print(f'Number of validation checkpoints: {len(val_losses)}')\n",
1076
+ " print(f'Final training losses: {train_losses[-5:] if len(train_losses) >= 5 else train_losses}')\n",
1077
+ " print(f'Best validation loss: {best_val_loss:.4f}')\n",
1078
+ " \n",
1079
+ " # Save model\n",
1080
+ " torch.save(model.state_dict(), '/home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints/clf_kaggle.bin')\n",
1081
+ " print(\"Model saved successfully!\")\n",
1082
+ " \n",
1083
+ " return train_losses, val_losses, best_val_loss\n"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "code",
1088
+ "execution_count": 15,
1089
+ "metadata": {},
1090
+ "outputs": [
1091
+ {
1092
+ "name": "stdout",
1093
+ "output_type": "stream",
1094
+ "text": [
1095
+ "Setting up datasets for five-label training with custom sampler\n",
1096
+ "Training dataset statistics:\n",
1097
+ " total_samples: 168551\n",
1098
+ " label_0_count: 5446\n",
1099
+ " label_1_count: 78850\n",
1100
+ " label_2_count: 14846\n",
1101
+ " label_3_count: 5779\n",
1102
+ " label_4_count: 5782\n",
1103
+ " label_0_ratio: 0.032310695279173664\n",
1104
+ " label_1_ratio: 0.46781092962960763\n",
1105
+ " label_2_ratio: 0.08808016564719284\n",
1106
+ " label_3_ratio: 0.03428635843157264\n",
1107
+ " label_4_ratio: 0.03430415719871137\n",
1108
+ " all_labels_count: 0\n",
1109
+ " partial_labels_count: 96406\n",
1110
+ " no_labels_count: 72145\n",
1111
+ "Validation dataset statistics:\n",
1112
+ " total_samples: 29716\n",
1113
+ " label_0_count: 947\n",
1114
+ " label_1_count: 13878\n",
1115
+ " label_2_count: 2764\n",
1116
+ " label_3_count: 957\n",
1117
+ " label_4_count: 955\n",
1118
+ " label_0_ratio: 0.03186835374882218\n",
1119
+ " label_1_ratio: 0.4670211333961502\n",
1120
+ " label_2_ratio: 0.0930138645847355\n",
1121
+ " label_3_ratio: 0.03220487279580024\n",
1122
+ " label_4_ratio: 0.03213756898640463\n",
1123
+ " all_labels_count: 0\n",
1124
+ " partial_labels_count: 17016\n",
1125
+ " no_labels_count: 12700\n",
1126
+ "Label frequencies: [0.0323107 0.46781093 0.08808017 0.03428636 0.03430416]\n",
1127
+ "Label weights: [61.89709276 2.13761116 11.35316492 58.33053614 29.15013606]\n",
1128
+ "Under-represented labels (< median freq 0.034): [0 3]\n",
1129
+ "Sample weight range: [0.100, 61.897]\n",
1130
+ "\n",
1131
+ "==================================================\n",
1132
+ "==================================================\n",
1133
+ "Using device: cuda\n",
1134
+ "Training steps per epoch: 1317\n",
1135
+ "Total training steps: 26340\n",
1136
+ "Training with custom sampler (no label weights)\n",
1137
+ "Validation will be performed every 1316 steps\n",
1138
+ "\n",
1139
+ "Epoch 1/20\n"
1140
+ ]
1141
+ },
1142
+ {
1143
+ "name": "stderr",
1144
+ "output_type": "stream",
1145
+ "text": [
1146
+ " \r"
1147
+ ]
1148
+ },
1149
+ {
1150
+ "name": "stdout",
1151
+ "output_type": "stream",
1152
+ "text": [
1153
+ "\n",
1154
+ "Step 1316 | Train Loss: 0.6250 | Val Loss: 0.4127 | True train loss: 3.9762 | True val loss: 3.8368\n",
1155
+ "Individual label losses (unscaled):\n",
1156
+ " Label 0: 76.7992\n",
1157
+ " Label 1: 0.0127\n",
1158
+ " Label 2: 0.0372\n",
1159
+ " Label 3: 0.0987\n",
1160
+ " Label 4: 3.3515\n",
1161
+ "New best validation loss: 0.4127\n",
1162
+ "\n",
1163
+ "Epoch 2/20\n"
1164
+ ]
1165
+ },
1166
+ {
1167
+ "name": "stderr",
1168
+ "output_type": "stream",
1169
+ "text": [
1170
+ " \r"
1171
+ ]
1172
+ },
1173
+ {
1174
+ "name": "stdout",
1175
+ "output_type": "stream",
1176
+ "text": [
1177
+ "\n",
1178
+ "Step 2632 | Train Loss: 0.5464 | Val Loss: 0.4244 | True train loss: 3.5447 | True val loss: 3.4895\n",
1179
+ "Individual label losses (unscaled):\n",
1180
+ " Label 0: 68.7228\n",
1181
+ " Label 1: 0.0130\n",
1182
+ " Label 2: 0.0379\n",
1183
+ " Label 3: 0.0952\n",
1184
+ " Label 4: 3.8732\n",
1185
+ "\n",
1186
+ "Epoch 3/20\n"
1187
+ ]
1188
+ },
1189
+ {
1190
+ "name": "stderr",
1191
+ "output_type": "stream",
1192
+ "text": [
1193
+ "Training: 100%|█| 1317/1317 [01:22<00:00, 1.88it/s, step=3951, loss=0.6545, tru"
1194
+ ]
1195
+ },
1196
+ {
1197
+ "name": "stdout",
1198
+ "output_type": "stream",
1199
+ "text": [
1200
+ "\n",
1201
+ "Step 3948 | Train Loss: 0.5242 | Val Loss: 0.4007 | True train loss: 3.4056 | True val loss: 3.2830\n",
1202
+ "Individual label losses (unscaled):\n",
1203
+ " Label 0: 63.8785\n",
1204
+ " Label 1: 0.0130\n",
1205
+ " Label 2: 0.0362\n",
1206
+ " Label 3: 0.1013\n",
1207
+ " Label 4: 3.4475\n",
1208
+ "New best validation loss: 0.4007\n"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "name": "stderr",
1213
+ "output_type": "stream",
1214
+ "text": [
1215
+ " \r"
1216
+ ]
1217
+ },
1218
+ {
1219
+ "name": "stdout",
1220
+ "output_type": "stream",
1221
+ "text": [
1222
+ "\n",
1223
+ "Epoch 4/20\n"
1224
+ ]
1225
+ },
1226
+ {
1227
+ "name": "stderr",
1228
+ "output_type": "stream",
1229
+ "text": [
1230
+ "Training: 100%|▉| 1315/1317 [01:22<00:01, 1.87it/s, step=5267, loss=0.3083, tru"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "name": "stdout",
1235
+ "output_type": "stream",
1236
+ "text": [
1237
+ "\n",
1238
+ "Step 5264 | Train Loss: 0.5011 | Val Loss: 0.3770 | True train loss: 3.1835 | True val loss: 3.3785\n",
1239
+ "Individual label losses (unscaled):\n",
1240
+ " Label 0: 66.0959\n",
1241
+ " Label 1: 0.0124\n",
1242
+ " Label 2: 0.0382\n",
1243
+ " Label 3: 0.0951\n",
1244
+ " Label 4: 3.3052\n",
1245
+ "New best validation loss: 0.3770\n"
1246
+ ]
1247
+ },
1248
+ {
1249
+ "name": "stderr",
1250
+ "output_type": "stream",
1251
+ "text": [
1252
+ " \r"
1253
+ ]
1254
+ },
1255
+ {
1256
+ "name": "stdout",
1257
+ "output_type": "stream",
1258
+ "text": [
1259
+ "\n",
1260
+ "Epoch 5/20\n"
1261
+ ]
1262
+ },
1263
+ {
1264
+ "name": "stderr",
1265
+ "output_type": "stream",
1266
+ "text": [
1267
+ "Training: 100%|▉| 1315/1317 [01:22<00:01, 1.87it/s, step=6583, loss=0.2640, tru"
1268
+ ]
1269
+ },
1270
+ {
1271
+ "name": "stdout",
1272
+ "output_type": "stream",
1273
+ "text": [
1274
+ "\n",
1275
+ "Step 6580 | Train Loss: 0.4860 | Val Loss: 0.3498 | True train loss: 3.2743 | True val loss: 3.4532\n",
1276
+ "Individual label losses (unscaled):\n",
1277
+ " Label 0: 67.9448\n",
1278
+ " Label 1: 0.0116\n",
1279
+ " Label 2: 0.0392\n",
1280
+ " Label 3: 0.0810\n",
1281
+ " Label 4: 3.3704\n",
1282
+ "New best validation loss: 0.3498\n"
1283
+ ]
1284
+ },
1285
+ {
1286
+ "name": "stderr",
1287
+ "output_type": "stream",
1288
+ "text": [
1289
+ " \r"
1290
+ ]
1291
+ },
1292
+ {
1293
+ "name": "stdout",
1294
+ "output_type": "stream",
1295
+ "text": [
1296
+ "\n",
1297
+ "Epoch 6/20\n"
1298
+ ]
1299
+ },
1300
+ {
1301
+ "name": "stderr",
1302
+ "output_type": "stream",
1303
+ "text": [
1304
+ "Training: 100%|▉| 1313/1317 [01:22<00:02, 1.87it/s, step=7899, loss=0.1156, tru"
1305
+ ]
1306
+ },
1307
+ {
1308
+ "name": "stdout",
1309
+ "output_type": "stream",
1310
+ "text": [
1311
+ "\n",
1312
+ "Step 7896 | Train Loss: 0.4671 | Val Loss: 0.3422 | True train loss: 3.1278 | True val loss: 3.3296\n",
1313
+ "Individual label losses (unscaled):\n",
1314
+ " Label 0: 63.2215\n",
1315
+ " Label 1: 0.0117\n",
1316
+ " Label 2: 0.0362\n",
1317
+ " Label 3: 0.0827\n",
1318
+ " Label 4: 3.2292\n",
1319
+ "New best validation loss: 0.3422\n"
1320
+ ]
1321
+ },
1322
+ {
1323
+ "name": "stderr",
1324
+ "output_type": "stream",
1325
+ "text": [
1326
+ " \r"
1327
+ ]
1328
+ },
1329
+ {
1330
+ "name": "stdout",
1331
+ "output_type": "stream",
1332
+ "text": [
1333
+ "\n",
1334
+ "Epoch 7/20\n"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "name": "stderr",
1339
+ "output_type": "stream",
1340
+ "text": [
1341
+ "Training: 100%|▉| 1313/1317 [01:22<00:02, 1.86it/s, step=9215, loss=0.2901, tru"
1342
+ ]
1343
+ },
1344
+ {
1345
+ "name": "stdout",
1346
+ "output_type": "stream",
1347
+ "text": [
1348
+ "\n",
1349
+ "Step 9212 | Train Loss: 0.4557 | Val Loss: 0.3389 | True train loss: 3.0609 | True val loss: 3.2751\n",
1350
+ "Individual label losses (unscaled):\n",
1351
+ " Label 0: 63.4267\n",
1352
+ " Label 1: 0.0114\n",
1353
+ " Label 2: 0.0381\n",
1354
+ " Label 3: 0.0815\n",
1355
+ " Label 4: 2.8806\n",
1356
+ "New best validation loss: 0.3389\n"
1357
+ ]
1358
+ },
1359
+ {
1360
+ "name": "stderr",
1361
+ "output_type": "stream",
1362
+ "text": [
1363
+ " \r"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "name": "stdout",
1368
+ "output_type": "stream",
1369
+ "text": [
1370
+ "\n",
1371
+ "Epoch 8/20\n"
1372
+ ]
1373
+ },
1374
+ {
1375
+ "name": "stderr",
1376
+ "output_type": "stream",
1377
+ "text": [
1378
+ "Training: 100%|▉| 1311/1317 [01:22<00:03, 1.87it/s, step=10531, loss=0.4604, tr"
1379
+ ]
1380
+ },
1381
+ {
1382
+ "name": "stdout",
1383
+ "output_type": "stream",
1384
+ "text": [
1385
+ "\n",
1386
+ "Step 10528 | Train Loss: 0.4474 | Val Loss: 0.3379 | True train loss: 3.0718 | True val loss: 3.2051\n",
1387
+ "Individual label losses (unscaled):\n",
1388
+ " Label 0: 61.2247\n",
1389
+ " Label 1: 0.0113\n",
1390
+ " Label 2: 0.0372\n",
1391
+ " Label 3: 0.0828\n",
1392
+ " Label 4: 2.9602\n",
1393
+ "New best validation loss: 0.3379\n"
1394
+ ]
1395
+ },
1396
+ {
1397
+ "name": "stderr",
1398
+ "output_type": "stream",
1399
+ "text": [
1400
+ " \r"
1401
+ ]
1402
+ },
1403
+ {
1404
+ "name": "stdout",
1405
+ "output_type": "stream",
1406
+ "text": [
1407
+ "\n",
1408
+ "Epoch 9/20\n"
1409
+ ]
1410
+ },
1411
+ {
1412
+ "name": "stderr",
1413
+ "output_type": "stream",
1414
+ "text": [
1415
+ "Training: 100%|▉| 1311/1317 [01:21<00:03, 1.87it/s, step=11847, loss=0.2547, tr"
1416
+ ]
1417
+ },
1418
+ {
1419
+ "name": "stdout",
1420
+ "output_type": "stream",
1421
+ "text": [
1422
+ "\n",
1423
+ "Step 11844 | Train Loss: 0.4285 | Val Loss: 0.3416 | True train loss: 3.0075 | True val loss: 3.1697\n",
1424
+ "Individual label losses (unscaled):\n",
1425
+ " Label 0: 61.3822\n",
1426
+ " Label 1: 0.0112\n",
1427
+ " Label 2: 0.0421\n",
1428
+ " Label 3: 0.0847\n",
1429
+ " Label 4: 3.3251\n"
1430
+ ]
1431
+ },
1432
+ {
1433
+ "name": "stderr",
1434
+ "output_type": "stream",
1435
+ "text": [
1436
+ " \r"
1437
+ ]
1438
+ },
1439
+ {
1440
+ "name": "stdout",
1441
+ "output_type": "stream",
1442
+ "text": [
1443
+ "\n",
1444
+ "Epoch 10/20\n"
1445
+ ]
1446
+ },
1447
+ {
1448
+ "name": "stderr",
1449
+ "output_type": "stream",
1450
+ "text": [
1451
+ "Training: 99%|▉| 1309/1317 [01:21<00:04, 1.87it/s, step=13163, loss=0.2791, tr"
1452
+ ]
1453
+ },
1454
+ {
1455
+ "name": "stdout",
1456
+ "output_type": "stream",
1457
+ "text": [
1458
+ "\n",
1459
+ "Step 13160 | Train Loss: 0.4116 | Val Loss: 0.3174 | True train loss: 2.9027 | True val loss: 3.1666\n",
1460
+ "Individual label losses (unscaled):\n",
1461
+ " Label 0: 59.6537\n",
1462
+ " Label 1: 0.0110\n",
1463
+ " Label 2: 0.0365\n",
1464
+ " Label 3: 0.0877\n",
1465
+ " Label 4: 3.1535\n",
1466
+ "New best validation loss: 0.3174\n"
1467
+ ]
1468
+ },
1469
+ {
1470
+ "name": "stderr",
1471
+ "output_type": "stream",
1472
+ "text": [
1473
+ " \r"
1474
+ ]
1475
+ },
1476
+ {
1477
+ "name": "stdout",
1478
+ "output_type": "stream",
1479
+ "text": [
1480
+ "\n",
1481
+ "Epoch 11/20\n"
1482
+ ]
1483
+ },
1484
+ {
1485
+ "name": "stderr",
1486
+ "output_type": "stream",
1487
+ "text": [
1488
+ "Training: 99%|▉| 1309/1317 [01:21<00:04, 1.87it/s, step=14479, loss=0.3915, tr"
1489
+ ]
1490
+ },
1491
+ {
1492
+ "name": "stdout",
1493
+ "output_type": "stream",
1494
+ "text": [
1495
+ "\n",
1496
+ "Step 14476 | Train Loss: 0.3983 | Val Loss: 0.3039 | True train loss: 2.8602 | True val loss: 3.1240\n",
1497
+ "Individual label losses (unscaled):\n",
1498
+ " Label 0: 60.6528\n",
1499
+ " Label 1: 0.0107\n",
1500
+ " Label 2: 0.0371\n",
1501
+ " Label 3: 0.0827\n",
1502
+ " Label 4: 3.2043\n",
1503
+ "New best validation loss: 0.3039\n"
1504
+ ]
1505
+ },
1506
+ {
1507
+ "name": "stderr",
1508
+ "output_type": "stream",
1509
+ "text": [
1510
+ " \r"
1511
+ ]
1512
+ },
1513
+ {
1514
+ "name": "stdout",
1515
+ "output_type": "stream",
1516
+ "text": [
1517
+ "\n",
1518
+ "Epoch 12/20\n"
1519
+ ]
1520
+ },
1521
+ {
1522
+ "name": "stderr",
1523
+ "output_type": "stream",
1524
+ "text": [
1525
+ "Training: 99%|▉| 1307/1317 [01:21<00:05, 1.87it/s, step=15795, loss=0.1155, tr"
1526
+ ]
1527
+ },
1528
+ {
1529
+ "name": "stdout",
1530
+ "output_type": "stream",
1531
+ "text": [
1532
+ "\n",
1533
+ "Step 15792 | Train Loss: 0.3863 | Val Loss: 0.3050 | True train loss: 2.7796 | True val loss: 3.0697\n",
1534
+ "Individual label losses (unscaled):\n",
1535
+ " Label 0: 59.8002\n",
1536
+ " Label 1: 0.0108\n",
1537
+ " Label 2: 0.0371\n",
1538
+ " Label 3: 0.0815\n",
1539
+ " Label 4: 3.1037\n"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "name": "stderr",
1544
+ "output_type": "stream",
1545
+ "text": [
1546
+ " \r"
1547
+ ]
1548
+ },
1549
+ {
1550
+ "name": "stdout",
1551
+ "output_type": "stream",
1552
+ "text": [
1553
+ "\n",
1554
+ "Epoch 13/20\n"
1555
+ ]
1556
+ },
1557
+ {
1558
+ "name": "stderr",
1559
+ "output_type": "stream",
1560
+ "text": [
1561
+ "Training: 99%|▉| 1307/1317 [01:21<00:05, 1.87it/s, step=17111, loss=0.2704, tr"
1562
+ ]
1563
+ },
1564
+ {
1565
+ "name": "stdout",
1566
+ "output_type": "stream",
1567
+ "text": [
1568
+ "\n",
1569
+ "Step 17108 | Train Loss: 0.3779 | Val Loss: 0.2881 | True train loss: 2.7442 | True val loss: 3.1636\n",
1570
+ "Individual label losses (unscaled):\n",
1571
+ " Label 0: 61.2941\n",
1572
+ " Label 1: 0.0102\n",
1573
+ " Label 2: 0.0361\n",
1574
+ " Label 3: 0.0836\n",
1575
+ " Label 4: 3.1077\n",
1576
+ "New best validation loss: 0.2881\n"
1577
+ ]
1578
+ },
1579
+ {
1580
+ "name": "stderr",
1581
+ "output_type": "stream",
1582
+ "text": [
1583
+ " \r"
1584
+ ]
1585
+ },
1586
+ {
1587
+ "name": "stdout",
1588
+ "output_type": "stream",
1589
+ "text": [
1590
+ "\n",
1591
+ "Epoch 14/20\n"
1592
+ ]
1593
+ },
1594
+ {
1595
+ "name": "stderr",
1596
+ "output_type": "stream",
1597
+ "text": [
1598
+ "Training: 99%|▉| 1305/1317 [01:21<00:06, 1.87it/s, step=18427, loss=0.4965, tr"
1599
+ ]
1600
+ },
1601
+ {
1602
+ "name": "stdout",
1603
+ "output_type": "stream",
1604
+ "text": [
1605
+ "\n",
1606
+ "Step 18424 | Train Loss: 0.3645 | Val Loss: 0.2822 | True train loss: 2.6844 | True val loss: 3.1494\n",
1607
+ "Individual label losses (unscaled):\n",
1608
+ " Label 0: 61.1663\n",
1609
+ " Label 1: 0.0100\n",
1610
+ " Label 2: 0.0365\n",
1611
+ " Label 3: 0.0743\n",
1612
+ " Label 4: 3.2309\n",
1613
+ "New best validation loss: 0.2822\n"
1614
+ ]
1615
+ },
1616
+ {
1617
+ "name": "stderr",
1618
+ "output_type": "stream",
1619
+ "text": [
1620
+ " \r"
1621
+ ]
1622
+ },
1623
+ {
1624
+ "name": "stdout",
1625
+ "output_type": "stream",
1626
+ "text": [
1627
+ "\n",
1628
+ "Epoch 15/20\n"
1629
+ ]
1630
+ },
1631
+ {
1632
+ "name": "stderr",
1633
+ "output_type": "stream",
1634
+ "text": [
1635
+ " \r"
1636
+ ]
1637
+ },
1638
+ {
1639
+ "ename": "KeyboardInterrupt",
1640
+ "evalue": "",
1641
+ "output_type": "error",
1642
+ "traceback": [
1643
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
1644
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
1645
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtqdm\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[32m 8\u001b[39m BATCH_SIZE = \u001b[32m128\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m train_losses, val_losses, best_loss = \u001b[43mrun_training\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43msmiles_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msmiles_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1e-4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msmiles_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m/\u001b[49m\u001b[43m/\u001b[49m\u001b[43m \u001b[49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m)\u001b[49m\n",
1646
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 532\u001b[39m, in \u001b[36mrun_training\u001b[39m\u001b[34m(smiles_train, smiles_test, labels_train, labels_test, model, tokenizer, scalers, num_epochs, learning_rate, batch_size, validation_steps, underrep_boost)\u001b[39m\n\u001b[32m 529\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mTotal training steps: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(train_dataloader)\u001b[38;5;250m \u001b[39m*\u001b[38;5;250m \u001b[39mnum_epochs\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Train the model\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m532\u001b[39m train_losses, val_losses, best_val_loss = \u001b[43mtrain_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 533\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 534\u001b[39m \u001b[43m \u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 535\u001b[39m \u001b[43m \u001b[49m\u001b[43mval_dataloader\u001b[49m\u001b[43m=\u001b[49m\u001b[43mval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 536\u001b[39m \u001b[43m \u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 537\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 538\u001b[39m \u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 539\u001b[39m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 540\u001b[39m \u001b[43m \u001b[49m\u001b[43mpatience\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 541\u001b[39m \u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 542\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 544\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mTraining completed.\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 545\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mNumber of validation checkpoints: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(val_losses)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m)\n",
1647
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 351\u001b[39m, in \u001b[36mtrain_model\u001b[39m\u001b[34m(model, train_dataloader, val_dataloader, scalers, num_epochs, learning_rate, device, patience, validation_steps)\u001b[39m\n\u001b[32m 345\u001b[39m scheduler.step()\n\u001b[32m 347\u001b[39m global_step += \u001b[32m1\u001b[39m\n\u001b[32m 349\u001b[39m train_progress.set_postfix({\n\u001b[32m 350\u001b[39m \u001b[33m'\u001b[39m\u001b[33mstep\u001b[39m\u001b[33m'\u001b[39m: global_step,\n\u001b[32m--> \u001b[39m\u001b[32m351\u001b[39m \u001b[33m'\u001b[39m\u001b[33mloss\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.4f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 352\u001b[39m \u001b[33m'\u001b[39m\u001b[33mtrue_loss\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtrue_loss\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.4f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 353\u001b[39m \u001b[33m'\u001b[39m\u001b[33mlr\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscheduler.get_last_lr()[\u001b[32m0\u001b[39m]\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.2e\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\n\u001b[32m 354\u001b[39m })\n\u001b[32m 356\u001b[39m \u001b[38;5;66;03m# Perform validation every validation_steps\u001b[39;00m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m global_step % validation_steps == \u001b[32m0\u001b[39m:\n\u001b[32m 358\u001b[39m \u001b[38;5;66;03m# Calculate average training losses since last validation\u001b[39;00m\n",
1648
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
1649
+ ]
1650
+ }
1651
+ ],
1652
+ "source": [
1653
+ "import numpy as np\n",
1654
+ "import torch\n",
1655
+ "from torch.optim import AdamW\n",
1656
+ "from torch.optim.lr_scheduler import LinearLR\n",
1657
+ "from torch.utils.data import DataLoader\n",
1658
+ "from tqdm import tqdm\n",
1659
+ "\n",
1660
+ "BATCH_SIZE = 128\n",
1661
+ "\n",
1662
+ "train_losses, val_losses, best_loss = run_training(\n",
1663
+ " smiles_train, smiles_test, labels_train, labels_test, \n",
1664
+ " model, tokenizer, scalers, num_epochs=20, learning_rate=1e-4, batch_size=BATCH_SIZE, validation_steps=len(smiles_train) // BATCH_SIZE,\n",
1665
+ ")"
1666
+ ]
1667
+ },
1668
+ {
1669
+ "cell_type": "code",
1670
+ "execution_count": null,
1671
+ "metadata": {},
1672
+ "outputs": [],
1673
+ "source": []
1674
+ }
1675
+ ],
1676
+ "metadata": {
1677
+ "kaggle": {
1678
+ "accelerator": "gpu",
1679
+ "dataSources": [
1680
+ {
1681
+ "databundleVersionId": 12966160,
1682
+ "sourceId": 74608,
1683
+ "sourceType": "competition"
1684
+ },
1685
+ {
1686
+ "datasetId": 7678100,
1687
+ "sourceId": 12189904,
1688
+ "sourceType": "datasetVersion"
1689
+ },
1690
+ {
1691
+ "datasetId": 7690162,
1692
+ "sourceId": 12207625,
1693
+ "sourceType": "datasetVersion"
1694
+ },
1695
+ {
1696
+ "datasetId": 7716502,
1697
+ "sourceId": 12322957,
1698
+ "sourceType": "datasetVersion"
1699
+ },
1700
+ {
1701
+ "datasetId": 7801155,
1702
+ "sourceId": 12372847,
1703
+ "sourceType": "datasetVersion"
1704
+ },
1705
+ {
1706
+ "datasetId": 7809006,
1707
+ "sourceId": 12525286,
1708
+ "sourceType": "datasetVersion"
1709
+ },
1710
+ {
1711
+ "datasetId": 7912957,
1712
+ "sourceId": 12668147,
1713
+ "sourceType": "datasetVersion"
1714
+ }
1715
+ ],
1716
+ "dockerImageVersionId": 31041,
1717
+ "isGpuEnabled": true,
1718
+ "isInternetEnabled": true,
1719
+ "language": "python",
1720
+ "sourceType": "notebook"
1721
+ },
1722
+ "kernelspec": {
1723
+ "display_name": "Python [conda env:.mlspace-bolgov_simson_training]",
1724
+ "language": "python",
1725
+ "name": "conda-env-.mlspace-bolgov_simson_training-py"
1726
+ },
1727
+ "language_info": {
1728
+ "codemirror_mode": {
1729
+ "name": "ipython",
1730
+ "version": 3
1731
+ },
1732
+ "file_extension": ".py",
1733
+ "mimetype": "text/x-python",
1734
+ "name": "python",
1735
+ "nbconvert_exporter": "python",
1736
+ "pygments_lexer": "ipython3",
1737
+ "version": "3.12.11"
1738
+ }
1739
+ },
1740
+ "nbformat": 4,
1741
+ "nbformat_minor": 4
1742
+ }
simson_modeling/kaggle_comp/simson_polymer_1m_uncompiled.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447522e6635568b0a9f5ca015910bcb1fc98a56e60cb6b90a10080b15611ef3f
3
+ size 93224872
simson_modeling/kaggle_comp/test.csv ADDED
@@ -0,0 +1,4 @@
1
+ id,SMILES
2
+ 1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1
3
+ 1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c5ccc(*)cc5)c4)cc3)cc2)cc1
4
+ 2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1
simson_modeling/kaggle_comp/train.csv ADDED
The diff for this file is too large to render. See raw diff
simson_modeling/kaggle_comp/train_supplement/dataset1.csv ADDED
@@ -0,0 +1,875 @@
1
+ SMILES,TC_mean
2
+ */C(=C(\c1ccccc1)c1ccc(*)cc1)c1ccccc1,0.338
3
+ */C(F)=C(\F)C(F)(C(*)(F)F)C(F)(F)F,0.102
4
+ */C=C(/*)C#CCCCCCCCCCCCCCCCCCCCCC(=O)O,0.4105
5
+ */C=C(/*)CCCCCCCCCCCCCCCCCCCCC(=O)O,0.403
6
+ */C=C/*,0.526
7
+ */C=C/*,0.262
8
+ */C=C/C(C)C(*)C(=O)OC,0.189
9
+ */C=C/C(C)CCCCC*,0.2486666666666666
10
+ */C=C/C(C*)C(C)CC,0.203
11
+ */C=C/C(C*)CCC,0.218
12
+ */C=C/C1CCC(*)C1,0.2603333333333333
13
+ */C=C/CC*,0.2486666666666666
14
+ */C=C/CC*,0.272
15
+ */C=C/CC*,0.244
16
+ */C=C/CCC*,0.2535
17
+ */C=C/CCC*,0.2796
18
+ */C=C/CCCCC*,0.3143333333333333
19
+ */C=C/CCCCCC*,0.33
20
+ */C=C/CCCCCCC(Cl)CCCCCC*,0.3405
21
+ */C=C/CCCCCCCC*,0.3571666666666666
22
+ */C=C/CCCCCCCC*,0.3804999999999999
23
+ */C=C/CCCCCCCCCC(CCCCCCCCC*)COCCOCCOCCOCCOCCCCCC,0.31
24
+ */C=C/CCCCCCCCCC(CCCCCCCCC*)COCCOCCOCCOCCOCCCCCCCCCCCCCC,0.332
25
+ */C=C/CCCCCCCCCC(Cl)CCCCCCCCC*,0.3419999999999999
26
+ */C=C/CCCCCCCCCC*,0.4142
27
+ */C=C/c1cc(-c2ccc3c(c2)C(CCCCCCCC)(CCCCCCCC)c2ccccc2-3)c(*)cc1-c1ccc(F)cc1F,0.198
28
+ */C=C/c1cc(CCCCCCCCCCC)c(/C=C/c2ccc(*)cc2)cc1CCCCCCCCCCC,0.368
29
+ */C=C/c1cc(CCCCCCCCCCCC)c(*)s1,0.368
30
+ */C=C/c1cc(CCCCCCCCCCCC)c(/C=C/c2ccc(*)cc2)cc1CCCCCCCCCCCC,0.315
31
+ */C=C/c1cc(OCC(CC)CCCC)c(*)cc1-c1ccc(F)c(C(F)(F)F)c1,0.185
32
+ */C=C/c1cc(OCC(CC)CCCC)c(*)cc1-c1ccc(N(c2ccc(OC)cc2)c2ccc(OC)cc2)cc1,0.242
33
+ */C=C/c1cc(OCC2CC3CC2C2CCCC32)c(*)cc1OC,0.258
34
+ */C=C/c1cc(OCC2CC3CC2C2CCCC32)c(*)cc1OCC1CC2CC1C1CCCC21,0.24
35
+ */C=C/c1cc(OCCCCCC)c(*)cc1OC,0.524
36
+ */C=C/c1cc(OCCCCCCCC)c(*)cc1OC,0.31
37
+ */C=C/c1cc(OCCCCCCCCCCCC)c(*)cc1OC,0.251
38
+ */C=C/c1cc(OCCCCCCCCCCCCCCCC)c(/C=C/c2ccc(*)cc2)cc1OCCCCCCCCCCCCCCCC,0.3329999999999999
39
+ */C=C/c1cc(OCCc2ccccc2)c(*)cc1OC,0.285
40
+ */C=C/c1ccc(*)c(-c2c(OCC(CC)CCCC)ccc3cc(-c4ccccc4)ccc23)c1,0.212
41
+ */C=C/c1ccc(*)c(-c2c(OCC(CC)CCCC)ccc3ccccc23)c1,0.257
42
+ */C=C/c1ccc(*)c(-c2cc(-c3ccccc3)c(OCC(CC)CCCC)cc2OCC(CC)CCCC)c1,0.256
43
+ */C=C/c1ccc(*)c(-c2cc(OCC(CC)CCCC)c(-c3ccccc3)cc2OCC(CC)CCCC)c1,0.237
44
+ */C=C/c1ccc(*)c(-c2cc(OCC(CC)CCCC)c(OCC(CC)CCCC)cc2-c2ccccc2)c1,0.196
45
+ */C=C/c1ccc(*)c(-c2cc(OCCC(C)C)c(OCCC(C)C)cc2-c2ccc(C(F)(F)F)cc2)c1,0.184
46
+ */C=C/c1ccc(*)c(-c2cc(OCCC(C)C)c(OCCC(C)C)cc2-c2ccc(F)cc2)c1,0.334
47
+ */C=C/c1ccc(*)c(-c2cc(OCCC(C)C)c(OCCC(C)C)cc2-c2ccc(OC)cc2)c1,0.308
48
+ */C=C/c1ccc(*)c(-c2cc(OCCC(C)C)c(OCCC(C)C)cc2-c2ccccc2)c1,0.307
49
+ */C=C/c1ccc(*)c(-c2ccc(OCC(CC)CCCC)c3ccccc23)c1,0.219
50
+ */C=C/c1ccc(*)c2ccc(CCCCCC)cc12,0.339
51
+ */C=C/c1ccc(*)c2ccc(CCCCCCCCCCC)cc12,0.27
52
+ */C=C/c1sc(*)c(OCCCCCCCCCCCC)c1OCCCCCCCCCCCC,0.365
53
+ */C=C/c1sc(/C=C/c2cc(CCCCCCCCCCCC)c(*)s2)cc1CCCCCCCCCCCC,0.291
54
+ *C(*)C,0.2033333333333333
55
+ *C(*)CC,0.206
56
+ *C(=O)N(*)CCCCCCCCCCCCCCCCCC,0.408
57
+ *C(=O)NCCCCCCCCCCNC(=O)c1ccc(*)o1,0.217
58
+ *C(=O)Nc1ccc(Oc2ccc(N3C(=O)c4ccc(*)cc4C3=O)cc2)cc1,0.297
59
+ *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(C(=O)c4cccc(C(=O)c5ccc(Oc6ccc(N7C(=O)c8ccc(*)cc8C7=O)cc6)cc5)c4)cc3)cc1)C2=O,0.328
60
+ *C(F)(F)C(*)(F)Cl,0.139
61
+ *C(F)(F)C(F)(F)C1(F)C(*)(F)OC(F)(F)C1(F)F,0.1265
62
+ *C([2H])([2H])C(*)([2H])c1c([2H])c([2H])c([2H])c([2H])c1[2H],0.199
63
+ *C1=NC2=CC(=CC=C2N1)C1=CC2=C(NC(O2)C2=CC=C(OC3=C4C5C6=C(C=CC=C6)C(C6=C5C=CC=C6)C4=C(OC4=CC=C(*)C=C4)C4=C3C3C5=C(C=CC=C5)C4C4=C3C=CC=C4)C=C2)C=C1,0.18
64
+ *C1=NC2=CC(=CC=C2N1)C1=CC2=C(NC(O2)C2=CC=C(OC3=CC=C(OC4=CC=C(*)C=C4)C4=C3C3C5=C(C=CC=C5)C4C4=C3C=CC=C4)C=C2)C=C1,0.252
65
+ *C1C(=O)N(C2CCCCC2)C(=O)C1*,0.1485
66
+ *C1C(=O)N(c2ccc(C)cc2)C(=O)C1*,0.1405
67
+ *C1C(=O)N(c2ccccc2)C(=O)C1*,0.143
68
+ *C1C(=O)N(c2ccccc2C(=O)OC)C(=O)C1*,0.143
69
+ *C1C(=O)OC(=O)C1*,0.1425
70
+ *C1C=CC(*)C1,0.2784999999999999
71
+ *C1CCC1*,0.163
72
+ *C1Cc2ccccc2C1*,0.13
73
+ *CC(*)(C#N)C(=O)OC,0.161
74
+ *CC(*)(C#N)C(=O)OCC,0.1805
75
+ *CC(*)(C#N)C(=O)OCCCC,0.179
76
+ *CC(*)(C#N)C(=O)OCCCCCC,0.217
77
+ *CC(*)(C(=O)OC)c1ccccc1,0.1426666666666667
78
+ *CC(*)(C)C,0.2125
79
+ *CC(*)(C)C#N,0.146
80
+ *CC(*)(C)C(=O)NC(=O)OC(C)COc1c(Br)cc(S(=O)(=O)c2cc(Br)c(OCC(C)O)c(Br)c2)cc1Br,0.15
81
+ *CC(*)(C)C(=O)NC(=O)Oc1c(Br)cc(S(=O)(=O)c2cc(Br)c(O)c(Br)c2)cc1Br,0.109
82
+ *CC(*)(C)C(=O)OC,0.1594999999999999
83
+ *CC(*)(C)C(=O)OC(C(F)(F)F)(C(F)(F)F)C(F)(F)F,0.0465
84
+ *CC(*)(C)C(=O)OC(C(F)(F)F)C(F)(F)F,0.0755
85
+ *CC(*)(C)C(=O)OC(C)(C)C,0.1525
86
+ *CC(*)(C)C(=O)OC(C)C(C)(C)C,0.1674999999999999
87
+ *CC(*)(C)C(=O)OC(C)C(F)(F)F,0.1139999999999999
88
+ *CC(*)(C)C(=O)OC(C)CC,0.182
89
+ *CC(*)(C)C(=O)OC(F)(C(F)(F)F)C(F)(F)Cl,0.069
90
+ *CC(*)(C)C(=O)OC(F)(C(F)(F)F)C(F)(F)F,0.069
91
+ *CC(*)(C)C(=O)OC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.084
92
+ *CC(*)(C)C(=O)OC12CC3CC(C)(CC(C)(C3)C1)C2,0.122
93
+ *CC(*)(C)C(=O)OC12CC3CC(CC(C3)C1)C2,0.1245
94
+ *CC(*)(C)C(=O)OC1CC(C)CC(C)(C)C1,0.146
95
+ *CC(*)(C)C(=O)OC1CC2CCC1(C)C2(C)C,0.143
96
+ *CC(*)(C)C(=O)OC1CCC(C(C)(C)C)CC1,0.153
97
+ *CC(*)(C)C(=O)OC1CCC(C)CC1,0.1709999999999999
98
+ *CC(*)(C)C(=O)OC1CCCC(C)C1,0.1545
99
+ *CC(*)(C)C(=O)OC1CCCCC1,0.2239999999999999
100
+ *CC(*)(C)C(=O)OC1CCCCC1C,0.153
101
+ *CC(*)(C)C(=O)OCC,0.1699999999999999
102
+ *CC(*)(C)C(=O)OCC(C)(C)C,0.1555
103
+ *CC(*)(C)C(=O)OCC(C)(C)C1OCC(C)(C)CO1,0.1619999999999999
104
+ *CC(*)(C)C(=O)OCC(CC)CCCC,0.1935
105
+ *CC(*)(C)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)F,0.09
106
+ *CC(*)(C)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.0735
107
+ *CC(*)(C)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)F,0.113
108
+ *CC(*)(C)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)F,0.095
109
+ *CC(*)(C)C(=O)OCC1(C)COC(C)(C)OC1,0.1525
110
+ *CC(*)(C)C(=O)OCC1(C)COCOC1,0.1305
111
+ *CC(*)(C)C(=O)OCC1CCCO1,0.172
112
+ *CC(*)(C)C(=O)OCC1CO1,0.1855
113
+ *CC(*)(C)C(=O)OCCBr,0.128
114
+ *CC(*)(C)C(=O)OCCC#N,0.138
115
+ *CC(*)(C)C(=O)OCCC(C)(C)C,0.1895
116
+ *CC(*)(C)C(=O)OCCC(C)CC(C)(C)C,0.198
117
+ *CC(*)(C)C(=O)OCCCC,0.203
118
+ *CC(*)(C)C(=O)OCCCCC,0.1895
119
+ *CC(*)(C)C(=O)OCCCCCC,0.184
120
+ *CC(*)(C)C(=O)OCCCCCCCC,0.2415
121
+ *CC(*)(C)C(=O)OCCCCCCCCCC,0.259
122
+ *CC(*)(C)C(=O)OCCCCCCCCCCCC,0.2665
123
+ *CC(*)(C)C(=O)OCCCCCCCCCCCCCC,0.3115
124
+ *CC(*)(C)C(=O)OCCCCCCCCCCCCCCCC,0.26
125
+ *CC(*)(C)C(=O)OCCCCCCCCCCCCCCCCCC,0.317
126
+ *CC(*)(C)C(=O)OCCCCCCOc1ccc(-c2ccc(OC)cc2)cc1,0.2175
127
+ *CC(*)(C)C(=O)OCCCCn1c2ccccc2c2ccccc21,0.1694999999999999
128
+ *CC(*)(C)C(=O)OCCCl,0.1325
129
+ *CC(*)(C)C(=O)OCCF,0.1679999999999999
130
+ *CC(*)(C)C(=O)OCCN(C)C,0.2005
131
+ *CC(*)(C)C(=O)OCCN(CC)CC,0.2375
132
+ *CC(*)(C)C(=O)OCCN(CC)S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.1095
133
+ *CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc([N+](=O)[O-])cc2)cc1,0.19
134
+ *CC(*)(C)C(=O)OCCNC(C)(C)C,0.184
135
+ *CC(*)(C)C(=O)OCCOC,0.1815
136
+ *CC(*)(C)C(=O)OCCOCC,0.212
137
+ *CC(*)(C)C(=O)OCCS(=O)CC,0.199
138
+ *CC(*)(C)C(=O)OCC[N+](=O)[O-],0.1895
139
+ *CC(*)(C)C(=O)OCCc1ccccc1,0.168
140
+ *CC(*)(C)C(=O)OCc1cccc([N+](=O)[O-])c1,0.158
141
+ *CC(*)(C)C(=O)OCc1ccccc1,0.1755
142
+ *CC(*)(C)C(=O)OCc1ccccc1Cl,0.159
143
+ *CC(*)(C)C(=O)OCc1ccco1,0.172
144
+ *CC(*)(C)C(=O)Oc1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,0.0829999999999999
145
+ *CC(*)(C)C(=O)Oc1ccc(C#N)cc1,0.1345
146
+ *CC(*)(C)C(=O)Oc1ccc(C(C)(C)C)cc1,0.181
147
+ *CC(*)(C)C(=O)Oc1ccc(C)cc1,0.1545
148
+ *CC(*)(C)C(=O)Oc1ccc(C)cc1C,0.157
149
+ *CC(*)(C)C(=O)Oc1ccc(CC#N)cc1,0.1655
150
+ *CC(*)(C)C(=O)Oc1ccc2ccccc2c1,0.1639999999999999
151
+ *CC(*)(C)C(=O)Oc1cccc(C)c1,0.151
152
+ *CC(*)(C)C(=O)Oc1ccccc1,0.184
153
+ *CC(*)(C)C(=O)Oc1ccccc1C,0.1905
154
+ *CC(*)(C)C(C)=O,0.1669999999999999
155
+ *CC(*)(C)C(N)=O,0.145
156
+ *CC(*)(C)CC,0.17
157
+ *CC(*)(C)CCC,0.174
158
+ *CC(*)(C)c1ccc(C(C)C)cc1,0.2013333333333333
159
+ *CC(*)(C)c1ccccc1,0.164
160
+ *CC(*)(CC(=O)OC)C(=O)OC,0.152
161
+ *CC(*)(CC(=O)OC1CCCCC1)C(=O)OC1CCCCC1,0.1694999999999999
162
+ *CC(*)(CC(=O)OCC)C(=O)OCC,0.1689999999999999
163
+ *CC(*)(CC(=O)OCC1CCCCC1)C(=O)OCC1CCCCC1,0.19
164
+ *CC(*)(CC(=O)OCCC)C(=O)OCCC,0.1845
165
+ *CC(*)(CC(=O)OCCC1CCCCC1)C(=O)OCCC1CCCCC1,0.213
166
+ *CC(*)(CC(=O)OCCCC)C(=O)OCCCC,0.2249999999999999
167
+ *CC(*)(CC(=O)OCCCC1CCCCC1)C(=O)OCCCC1CCCCC1,0.199
168
+ *CC(*)(CC(=O)OCCCCC)C(=O)OCCCCC,0.215
169
+ *CC(*)(CC(=O)OCCCCCC)C(=O)OCCCCCC,0.2319999999999999
170
+ *CC(*)(CC(=O)OCCCc1ccccc1)C(=O)OCCCc1ccccc1,0.2055
171
+ *CC(*)(CC(=O)OCCc1ccccc1)C(=O)OCCc1ccccc1,0.199
172
+ *CC(*)(CC(=O)OCc1ccccc1)C(=O)OCc1ccccc1,0.176
173
+ *CC(*)(CC(=O)Oc1ccc(C)cc1)C(=O)Oc1ccc(C)cc1,0.182
174
+ *CC(*)(CC(=O)Oc1cccc(C)c1)C(=O)Oc1cccc(C)c1,0.165
175
+ *CC(*)(CC(=O)Oc1ccccc1)C(=O)Oc1ccccc1,0.154
176
+ *CC(*)(CC(=O)Oc1ccccc1C)C(=O)Oc1ccccc1C,0.1635
177
+ *CC(*)(CC)C(=O)OCC,0.145
178
+ *CC(*)(CF)C(=O)OC,0.1365
179
+ *CC(*)(CF)C(=O)OCC,0.1684999999999999
180
+ *CC(*)(Cl)C(=O)OC(C)C,0.1465
181
+ *CC(*)(Cl)C(=O)OC(C)CC,0.174
182
+ *CC(*)(Cl)C(=O)OC1CCCCC1,0.1365
183
+ *CC(*)(Cl)C(=O)OCC,0.1985
184
+ *CC(*)(Cl)C(=O)OCCC,0.173
185
+ *CC(*)(Cl)C(=O)OCCCC,0.178
186
+ *CC(*)(F)C#N,0.154
187
+ *CC(*)(F)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.095
188
+ *CC(*)(F)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)F,0.108
189
+ *CC(*)(F)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)F,0.106
190
+ *CC(*)(F)C(=O)OCC(F)(F)C(F)(F)C(F)(F)F,0.097
191
+ *CC(*)(F)C(=O)OCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.121
192
+ *CC(*)C,0.2445
193
+ *CC(*)C#N,0.207
194
+ *CC(*)C(=O)C(C)C,0.203
195
+ *CC(*)C(=O)C1CCCCC1,0.204
196
+ *CC(*)C(=O)N(C(C)C)C(C)C,0.1865
197
+ *CC(*)C(=O)N(C)C,0.2125
198
+ *CC(*)C(=O)N(C)c1ccccc1,0.1775
199
+ *CC(*)C(=O)N(CCCC)CCCC,0.211
200
+ *CC(*)C(=O)N1CCCCC1,0.218
201
+ *CC(*)C(=O)N1CCOCC1,0.212
202
+ *CC(*)C(=O)NC(C)(C)C,0.199
203
+ *CC(*)C(=O)NC(C)C,0.203
204
+ *CC(*)C(=O)NC(C)CC,0.2259999999999999
205
+ *CC(*)C(=O)NCCCC,0.252
206
+ *CC(*)C(=O)NCCCCCC(=O)O,0.2495
207
+ *CC(*)C(=O)NCCCCCCCC/C=C/CCCCCCCC,0.3065
208
+ *CC(*)C(=O)NCCCCCCCCCCCC,0.3435
209
+ *CC(*)C(=O)NCCCCCCCCCCCCCC,0.3746666666666667
210
+ *CC(*)C(=O)NCCCCCCCCCCCCCCCC,0.3105
211
+ *CC(*)C(=O)NCCCCCCCCCCCCCCCCCC,0.383
212
+ *CC(*)C(=O)NCCCCCCCCCCCCCCCCCCCCCC,0.3665
213
+ *CC(*)C(=O)OC(C)CC(C)C,0.207
214
+ *CC(*)C(=O)OC(C)CCC(CC)CCCC,0.212
215
+ *CC(*)C(=O)OC(C)CCCCC,0.2405
216
+ *CC(*)C(=O)OC(C)CCCCCC,0.245
217
+ *CC(*)C(=O)OC(CC)CC,0.213
218
+ *CC(*)C(=O)OC(CCC(CC)CCCC)CC(C)C,0.2319999999999999
219
+ *CC(*)C(=O)OC(F)(C(F)(F)F)C(F)(F)F,0.1119999999999999
220
+ *CC(*)C(=O)OC12CC3CC(C)(CC(C)(C3)C1)C2,0.1635
221
+ *CC(*)C(=O)OC1CC(C)CC(C)(C)C1,0.2205
222
+ *CC(*)C(=O)OC1CCCCC1,0.201
223
+ *CC(*)C(=O)OCC,0.2305
224
+ *CC(*)C(=O)OCC(C)(C)C,0.2115
225
+ *CC(*)C(=O)OCC(C)CC,0.221
226
+ *CC(*)C(=O)OCC(C)CCC,0.217
227
+ *CC(*)C(=O)OCC(CC)CC,0.2095
228
+ *CC(*)C(=O)OCC(CC)CCCC,0.2309999999999999
229
+ *CC(*)C(=O)OCC(F)(F)C(F)(F)OC(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.086
230
+ *CC(*)C(=O)OCCC#N,0.195
231
+ *CC(*)C(=O)OCCC(C)C,0.224
232
+ *CC(*)C(=O)OCCC(C)OC,0.212
233
+ *CC(*)C(=O)OCCCC,0.2075
234
+ *CC(*)C(=O)OCCCCCC,0.2565
235
+ *CC(*)C(=O)OCCCCCC(=O)Oc1ccc(-c2ccc(C#N)cc2)cc1,0.2299999999999999
236
+ *CC(*)C(=O)OCCCCCCCCC,0.281
237
+ *CC(*)C(=O)OCCCCCCCCCCCCCC,0.3215
238
+ *CC(*)C(=O)OCCCCCCCCCCCCCCCC,0.3626666666666667
239
+ *CC(*)C(=O)OCCCCCCCCCCCCCCCCCC,0.3795
240
+ *CC(*)C(=O)OCCCCCCCCCCCCCCCCCCCCCC,0.3585
241
+ *CC(*)C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc(C#N)cc2)cc1,0.2245
242
+ *CC(*)C(=O)OCCCCCCSCC#N,0.2175
243
+ *CC(*)C(=O)OCCCCSC,0.2005
244
+ *CC(*)C(=O)OCCCSC,0.211
245
+ *CC(*)C(=O)OCCCSCC,0.2155
246
+ *CC(*)C(=O)OCCCSCCC#N,0.219
247
+ *CC(*)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc([N+](=O)[O-])cc2)cc1,0.2135
248
+ *CC(*)C(=O)OCCSC,0.181
249
+ *CC(*)C(=O)OCCSCC,0.2175
250
+ *CC(*)C(=O)OCCSCC#N,0.183
251
+ *CC(*)C(=O)OCCSCCC#N,0.1875
252
+ *CC(*)C(=O)OCCSCCCC#N,0.18
253
+ *CC(*)C(=O)OCCc1ccccc1,0.196
254
+ *CC(*)C(=O)OCc1ccc(C#N)cc1,0.182
255
+ *CC(*)C(=O)OCc1ccccc1,0.1985
256
+ *CC(*)C(=O)Oc1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,0.0849999999999999
257
+ *CC(*)C(=O)Oc1ccc(-c2ccccc2)cc1,0.2055
258
+ *CC(*)C(=O)Oc1ccc(C#N)cc1,0.1845
259
+ *CC(*)C(=O)Oc1ccc(C(=O)OCCCC)cc1,0.2329999999999999
260
+ *CC(*)C(=O)Oc1ccc(C(C)(C)C)cc1,0.2245
261
+ *CC(*)C(=O)Oc1ccc(C)cc1,0.193
262
+ *CC(*)C(=O)Oc1ccc(Cl)cc1,0.158
263
+ *CC(*)C(=O)Oc1ccc(Cl)cc1Cl,0.119
264
+ *CC(*)C(=O)Oc1ccc(OC)cc1,0.208
265
+ *CC(*)C(=O)Oc1cccc(C)c1,0.2055
266
+ *CC(*)C(=O)Oc1cccc(N(C)C)c1,0.221
267
+ *CC(*)C(=O)Oc1cccc2ccccc12,0.1644999999999999
268
+ *CC(*)C(=O)Oc1ccccc1,0.196
269
+ *CC(*)C(=O)Oc1ccccc1C,0.1735
270
+ *CC(*)C(=O)Oc1ccccc1C(C)(C)C,0.175
271
+ *CC(*)C(=O)Oc1ccccc1Cl,0.166
272
+ *CC(*)C(=O)c1ccc(C(C)(C)C)cc1,0.1915
273
+ *CC(*)C(=O)c1ccc(C)cc1,0.1975
274
+ *CC(*)C(C)(C)C,0.1363333333333333
275
+ *CC(*)C(C)=O,0.179
276
+ *CC(*)C(C)C,0.1868
277
+ *CC(*)C(C)CC,0.1897499999999999
278
+ *CC(*)C(C)c1ccccc1,0.1506666666666666
279
+ *CC(*)C(CC)CC,0.1883333333333333
280
+ *CC(*)C1CC1,0.2175
281
+ *CC(*)C1CC=CCC1,0.18075
282
+ *CC(*)C1CCC(C)CC1,0.1996666666666666
283
+ *CC(*)C1CCC1,0.209
284
+ *CC(*)C1CCCC(C)C1,0.1806666666666666
285
+ *CC(*)C1CCCC1,0.2016666666666666
286
+ *CC(*)C1CCCCC1,0.1906666666666666
287
+ *CC(*)C=O,0.2425
288
+ *CC(*)CC,0.2253333333333333
289
+ *CC(*)CC(C)(C)C,0.1735
290
+ *CC(*)CC(C)(C)CC,0.1863333333333333
291
+ *CC(*)CC(C)C,0.2083333333333333
292
+ *CC(*)CC(C)C(F)(F)F,0.148
293
+ *CC(*)CC(C)CC,0.202
294
+ *CC(*)CC(CC)CC,0.219
295
+ *CC(*)CC1CCCC1,0.2283333333333333
296
+ *CC(*)CC1CCCCC1,0.2256
297
+ *CC(*)CCC,0.2463333333333333
298
+ *CC(*)CCC(C)(C)C,0.1929999999999999
299
+ *CC(*)CCC(C)C,0.216
300
+ *CC(*)CCC(C)CC,0.229
301
+ *CC(*)CCC1CCCCC1,0.2176666666666666
302
+ *CC(*)CCCC,0.2306666666666666
303
+ *CC(*)CCCC(C)(C)C,0.216
304
+ *CC(*)CCCC(C)C,0.2573333333333333
305
+ *CC(*)CCCC1CCCCC1,0.2463333333333333
306
+ *CC(*)CCCCC,0.2583999999999999
307
+ *CC(*)CCCCCC,0.238
308
+ *CC(*)CCCCCCC,0.2669999999999999
309
+ *CC(*)CCCCCCCC,0.286
310
+ *CC(*)CCCCCCCCC,0.3193333333333333
311
+ *CC(*)CCCCCCCCCC,0.33575
312
+ *CC(*)CCCCCCCCCCC,0.3206666666666666
313
+ *CC(*)CCCCCCCCCCCC,0.339
314
+ *CC(*)CCCCCCCCCCCCC,0.3383333333333333
315
+ *CC(*)CCCCCCCCCCCCCC,0.3409999999999999
316
+ *CC(*)CCCCCCCCCCCCCCCC,0.375
317
+ *CC(*)CCCCCCCCCCCCCCCCCC,0.381
318
+ *CC(*)CCCCCCCCCCCCCCCCCCCC,0.4019999999999999
319
+ *CC(*)CCCN(CC(C)C)CC(C)C,0.201
320
+ *CC(*)CCc1ccccc1,0.1803333333333333
321
+ *CC(*)CNc1ccc([N+](=O)[O-])cn1,0.192
322
+ *CC(*)Cc1ccccc1,0.1963333333333333
323
+ *CC(*)Cl,0.139
324
+ *CC(*)O,0.424
325
+ *CC(*)OC,0.2614999999999999
326
+ *CC(*)OC(=O)C1(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C1(F)F,0.086
327
+ *CC(*)OC(=O)CC,0.212
328
+ *CC(*)OC(=O)CCCCCCCCCCC,0.261
329
+ *CC(*)OC(=O)CCCCCCCCCCCCCCC,0.319
330
+ *CC(*)OC(=O)CCCCCCCCCCCCCCCCC,0.305
331
+ *CC(*)OC(=O)CCCCCCCCCCCCCCCCCCCCC,0.371
332
+ *CC(*)OC(=O)c1ccc(OC(=O)CCC)cc1,0.223
333
+ *CC(*)OC(=O)c1ccccc1,0.1669999999999999
334
+ *CC(*)OC(C)C,0.2115
335
+ *CC(*)OCC,0.2465
336
+ *CC(*)OCCCCCCCCCCCCCCCCCC,0.3868
337
+ *CC(*)O[N+](=O)[O-],0.1585
338
+ *CC(*)c1c(C)cc(C)cc1C,0.1669999999999999
339
+ *CC(*)c1cc(Br)ccc1OC,0.132
340
+ *CC(*)c1cc(Br)ccc1OC(C)C,0.124
341
+ *CC(*)c1cc(Br)ccc1OCC,0.124
342
+ *CC(*)c1cc(Br)ccc1OCCC,0.136
343
+ *CC(*)c1cc(Br)ccc1OCCC(C)C,0.1674999999999999
344
+ *CC(*)c1cc(Br)ccc1OCCCC,0.1493333333333333
345
+ *CC(*)c1cc(Br)ccc1OCCCCC,0.1545
346
+ *CC(*)c1cc(C(C)(C)C)ccc1C,0.1636666666666666
347
+ *CC(*)c1cc(C(C)C)ccc1C(C)C,0.1616666666666666
348
+ *CC(*)c1cc(C)c(C)cc1C,0.2095
349
+ *CC(*)c1cc(C)ccc1C,0.173
350
+ *CC(*)c1cc(C)ccc1F,0.171
351
+ *CC(*)c1cc(Cl)ccc1Cl,0.1119999999999999
352
+ *CC(*)c1cc(F)ccc1F,0.1403333333333333
353
+ *CC(*)c1ccc(C(=O)CCCCCCC)cc1,0.2663333333333333
354
+ *CC(*)c1ccc(C(=O)CCN2CCCCC2)cc1,0.225
355
+ *CC(*)c1ccc(C(=O)N(C)C)cc1,0.246
356
+ *CC(*)c1ccc(C(=O)N(CC)CC)cc1,0.213
357
+ *CC(*)c1ccc(C(=O)N2CCOCC2)cc1,0.24
358
+ *CC(*)c1ccc(C(=O)O)c(C(=O)O)c1,0.171
359
+ *CC(*)c1ccc(C(=O)OC)cc1,0.2033333333333333
360
+ *CC(*)c1ccc(C(=O)OCCN(C)C)cc1,0.2314999999999999
361
+ *CC(*)c1ccc(C(C)(C)C)cc1,0.184
362
+ *CC(*)c1ccc(C(C)(C)O)cc1,0.2316666666666666
363
+ *CC(*)c1ccc(C(C)(O)CC)cc1,0.2266666666666666
364
+ *CC(*)c1ccc(C(C)=O)cc1,0.2296666666666666
365
+ *CC(*)c1ccc(C(C)C)cc1C(C)C,0.1979999999999999
366
+ *CC(*)c1ccc(C)c(C)c1,0.1905
367
+ *CC(*)c1ccc(C)cc1,0.1873333333333333
368
+ *CC(*)c1ccc(C)cc1C,0.1856666666666666
369
+ *CC(*)c1ccc(CC)cc1,0.2013333333333333
370
+ *CC(*)c1ccc(CCC)cc1,0.2345
371
+ *CC(*)c1ccc(CCCC)cc1,0.2096666666666666
372
+ *CC(*)c1ccc(CCCCCC)cc1,0.241
373
+ *CC(*)c1ccc(CCCCCCCC)cc1,0.2674999999999999
374
+ *CC(*)c1ccc(CCCCCCCCC)cc1,0.277
375
+ *CC(*)c1ccc(CCCCCCCCCC)cc1,0.2825
376
+ *CC(*)c1ccc(CCCCCCCCCCCC)cc1,0.3035
377
+ *CC(*)c1ccc(CCCCCCCCCCCCCC)cc1,0.3399999999999999
378
+ *CC(*)c1ccc(CCCCCCCCCCCCCCCC)cc1,0.3409999999999999
379
+ *CC(*)c1ccc(CCCCCCCCCCCCCCCCCC)cc1,0.3751666666666667
380
+ *CC(*)c1ccc(COCC(CC)CCCC)cc1,0.268
381
+ *CC(*)c1ccc(COCCCCCC)cc1,0.252
382
+ *CC(*)c1ccc(COCCCCCCOc2ccc(-c3ccc(OC)cc3)cc2)cc1,0.3
383
+ *CC(*)c1ccc(COCCOCCCC)cc1,0.262
384
+ *CC(*)c1ccc(COCCOCCCCCCCC)cc1,0.282
385
+ *CC(*)c1ccc(Cl)c(C)c1,0.154
386
+ *CC(*)c1ccc(Cl)c(Cl)c1,0.114
387
+ *CC(*)c1ccc(Cl)c(F)c1,0.1275
388
+ *CC(*)c1ccc(Cl)cc1C,0.1366666666666667
389
+ *CC(*)c1ccc(Cl)cc1Cl,0.1006666666666666
390
+ *CC(*)c1ccc(F)cc1C,0.1413333333333333
391
+ *CC(*)c1ccc(O)cc1,0.2073333333333333
392
+ *CC(*)c1ccc(OC(C)=O)cc1,0.2005
393
+ *CC(*)c1cccc(-c2ccc(-c3ccccc3)cc2)c1,0.1956666666666666
394
+ *CC(*)c1cccc(C)c1,0.1973333333333333
395
+ *CC(*)c1cccc(CC)c1,0.186
396
+ *CC(*)c1cccc(Cl)c1,0.14175
397
+ *CC(*)c1cccc(F)c1,0.1623333333333333
398
+ *CC(*)c1cccc2ccccc12,0.194
399
+ *CC(*)c1ccccc1,0.2045999999999999
400
+ *CC(*)c1ccccc1C,0.1903333333333333
401
+ *CC(*)c1ccccc1C(=O)N(C)C,0.165
402
+ *CC(*)c1ccccc1C(=O)NC,0.1836666666666666
403
+ *CC(*)c1ccccc1C(=O)Nc1ccccc1,0.1636666666666666
404
+ *CC(*)c1ccccc1C(=O)OC,0.1736666666666666
405
+ *CC(*)c1ccccc1C(=O)OC(C)C,0.1736666666666666
406
+ *CC(*)c1ccccc1C(=O)OCC,0.1816666666666666
407
+ *CC(*)c1ccccc1C(=O)OCC(C)C,0.1696666666666666
408
+ *CC(*)c1ccccc1C(=O)OCCC,0.1929999999999999
409
+ *CC(*)c1ccccc1C(=O)OCCC(C)C,0.1929999999999999
410
+ *CC(*)c1ccccc1C(=O)OCCCC,0.1953333333333333
411
+ *CC(*)c1ccccc1C(=O)OCCCCC,0.1923333333333333
412
+ *CC(*)c1ccccc1C(=O)OCCCCCC,0.2056666666666666
413
+ *CC(*)c1ccccc1C(=O)OCCN(C)C,0.1853333333333333
414
+ *CC(*)c1ccccc1C(=O)Oc1ccccc1,0.1693333333333333
415
+ *CC(*)c1ccccc1CC,0.1843333333333333
416
+ *CC(*)c1ccccc1COC,0.2066666666666666
417
+ *CC(*)c1ccccc1COC(C)C,0.171
418
+ *CC(*)c1ccccc1COCC,0.1846666666666666
419
+ *CC(*)c1ccccc1COCCC,0.217
420
+ *CC(*)c1ccccc1COCCC(C)C,0.1943333333333333
421
+ *CC(*)c1ccccc1COCCCC,0.2096666666666666
422
+ *CC(*)c1ccccc1COCCCCC,0.186
423
+ *CC(*)c1ccccc1COCCc1ccccc1,0.1883333333333333
424
+ *CC(*)c1ccccc1COCc1ccccc1,0.1913333333333333
425
+ *CC(*)c1ccccc1Cl,0.1436666666666667
426
+ *CC(*)c1ccccc1F,0.1746666666666666
427
+ *CC(*)c1ccccc1OC,0.1946666666666666
428
+ *CC(*)c1ccccn1,0.196
429
+ *CC(*)c1cccs1,0.1995
430
+ *CC(*)n1c2ccccc2c2ccccc21,0.147
431
+ *CC(*)n1cc2ccccc2n1,0.2075
432
+ *CC(=O)N*,0.277
433
+ *CC(=O)c1ccc(Oc2ccc(C(=O)COc3ccc(/C=C4\CC/C(=C\c5ccc(O*)c(OC)c5)C4=O)cc3OC)cc2)cc1,0.228
434
+ *CC(=O)c1ccc(Oc2ccc(C(=O)COc3ccc(/C=C4\CC/C(=C\c5ccc(O*)cc5)C4=O)cc3)cc2)cc1,0.259
435
+ *CC(C)(C)CS(=O)(=O)CC(C)(C)COC(=O)O*,0.186
436
+ *CC(C)(C)O*,0.1995
437
+ *CC(C)C(*)(C)C,0.2026666666666666
438
+ *CC(CC(*)(C#N)C(=O)OC)c1ccccc1,0.176
439
+ *CC(CCCCCCCCCCCCCCCC)C(CCCCCCCCCCCCCCCC)COC(=O)c1ccc(C(=O)O*)cc1,0.354
440
+ *CC(CCCCCCCCCCCCCCCC)C1C(=O)N(CCCCCCCCCCCC)C(=O)C1*,0.335
441
+ *CC(CCl)(CCl)O*,0.1159999999999999
442
+ *CC(F)(F)C1(F)CC(C(O)(C(F)(F)F)C(F)(F)F)CC1*,0.109
443
+ *CC(O)COc1c(Cl)cc(C(C)(C)c2cc(Cl)c(O*)c(Cl)c2)cc1Cl,0.1485
444
+ *CC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.198
445
+ *CC(O*)c1ccccc1,0.181
446
+ *CC(c1ccccn1)C(c1ccccc1)C(*)c1ccccn1,0.151
447
+ *CC/C=C(/*)C,0.256
448
+ *CC/C=C(/*)C,0.258
449
+ *CC/C=C(/*)C,0.235
450
+ *CC/C=C(/*)C(C)(C)C,0.20825
451
+ *CC/C=C(/*)C(C)C,0.21
452
+ *CC/C=C(/*)CCC,0.2233333333333333
453
+ *CC/C=C(/*)CCCCCCC,0.269
454
+ *CC1(*)CCC(C)CC1,0.164
455
+ *CC1(*)CCCCC1,0.1443333333333333
456
+ *CC1CCC(*)C1,0.3353333333333333
457
+ *CC1CCC(COC(=O)CCCCCCCCCCC(=O)O*)CC1,0.279
458
+ *CC1CCC(COC(=O)NCCSCCCCCCSCCNC(=O)O*)CC1,0.24
459
+ *CCC(*)(C)C,0.1956666666666666
460
+ *CCC(=O)OC(=O)CCc1ccc(*)o1,0.216
461
+ *CCC(=O)Oc1ccc(OC(=O)CCN2CCN(*)CC2)cc1,0.224
462
+ *CCC(C(=O)OCC)C(*)C(=O)OCC,0.2075
463
+ *CCC(C)C(*)C,0.2329999999999999
464
+ *CCC(C)CCC(=O)O*,0.253
465
+ *CCC(Cl)C(*)Cl,0.1795
466
+ *CCC/C=C(/*)c1ccc(Cl)cc1,0.184
467
+ *CCC1CCCC1*,0.2863333333333333
468
+ *CCCC(*)(C)C,0.236
469
+ *CCCC(*)(C)C(=O)O,0.1985
470
+ *CCCC(*)(C)CC,0.2339999999999999
471
+ *CCCC(=O)NCc1ccc(CNC(=O)CCCO*)cc1,0.259
472
+ *CCCC1CC(=O)N(*)C(=O)C1,0.23
473
+ *CCCC1CCN(C(=O)SSCCCCSSC(=O)N2CCC(*)CC2)CC1,0.223
474
+ *CCCCC(C)CC(=O)N*,0.254
475
+ *CCCCCC(*)CCCCCCCCCC,0.406
476
+ *CCCCCC(*)CCCCCCCCCCCC,0.381
477
+ *CCCCCC(*)CCCCCCCCCCCCCC,0.403
478
+ *CCCCCC(=O)N*,0.301
479
+ *CCCCCCC(=O)NCCCCCCNC(=O)CCCCCCO*,0.3215
480
+ *CCCCCCCC(*)C,0.371
481
+ *CCCCCCCC(=O)N*,0.329
482
+ *CCCCCCCCC(=O)NCCCCCCNC(=O)CCCCCCCCO*,0.3575
483
+ *CCCCCCCCC(=O)NCCCCCCNC(=O)CCCCO*,0.3185
484
+ *CCCCCCCCC(=O)NCCCCCCNC(=O)CCCO*,0.286
485
+ *CCCCCCCCC(=O)NCCCCCOCCCCCNC(=O)CCCCO*,0.287
486
+ *CCCCCCCCC(=O)NCCCCCOCCCCCNC(=O)CCCO*,0.289
487
+ *CCCCCCCCCC#CC#CCCCCCCCCCOC(=O)CCCCCCCCC(=O)O*,0.307
488
+ *CCCCCCCCCCC(=O)NCCCCCC(=O)N*,0.296
489
+ *CCCCCCCCCCC(=O)NCCCCCCNC(=O)CCCCO*,0.3565
490
+ *CCCCCCCCCCC(=O)NCCCCCCNC(=O)CCCO*,0.3245
491
+ *CCCCCCCCCCC(=O)NCCCCCOCCCCCNC(=O)CCCCO*,0.3349999999999999
492
+ *CCCCCCCCCCC(=O)NCCCCCOCCCCCNC(=O)CCCO*,0.303
493
+ *CCCCCCCCCCC(=O)NCCc1ccc(CCNC(=O)CCCCCCCCCCS*)cc1,0.296
494
+ *CCCCCCCCCCC(=O)NCc1ccc(CNC(=O)CCCCCCCCCCS*)cc1,0.285
495
+ *CCCCCCCCCCC(Cl)C(*)Cl,0.316
496
+ *CCCCCCCCCCCC(=O)N*,0.3409999999999999
497
+ *CCCCCCCCCCCCC(=O)N*,0.33
498
+ *CCCCCCCCCCCCCCC(*)Cl,0.3735
499
+ *CCCCCCCCCCCCCCC(=O)N*,0.389
500
+ *CCCCCCCCCCCCCCC(=O)O*,0.3595
501
+ *CCCCCCCCCCCCCCCC(=O)O*,0.3954
502
+ *CCCCCCCCCCCCCCCCC(=O)N*,0.379
503
+ *CCCCCCCCCCCCCCCCCCCCC(*)COCCOCCOCCOCCOCCCCCC,0.39925
504
+ *CCCCCCCCCCCCCCCCCCCCC(*)COCCOCCOCCOCCOCCCCCCCCCCCCCC,0.38425
505
+ *CCCCCCCCCCCCCCCCCCCCC(*)Cl,0.468
506
+ *CCCCCCCCCCCCCCCCCCCCCC(=O)N*,0.4325
507
+ *CCCCCCCCCCCCCCCCCCCCCCOC(=O)CC(CC(=O)O*)c1ccccc1,0.354
508
+ *CCCCCCCCCCCCCCCCCCCCCCOC(=O)CCC(=O)O*,0.387
509
+ *CCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCC(=O)O*,0.376
510
+ *CCCCCCCCCCCCCCCCCCCCCCOC(=O)Cc1ccccc1CC(=O)O*,0.367
511
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)C(=O)O*,0.3545
512
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CC(=O)O*,0.318
513
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCC(=O)O*,0.362
514
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCC(C)CC(=O)O*,0.362
515
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCC(=O)O*,0.302
516
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCC(=O)O*,0.3545
517
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCC(=O)O*,0.406
518
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.3379999999999999
519
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCC(=O)O*,0.507
520
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCCC(=O)O*,0.276
521
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCS(=O)(=O)CCCCC(=O)O*,0.322
522
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCSCCCCC(=O)O*,0.322
523
+ *CCCCCCCCCCCCCCCCCCCCOC(=O)COCC(=O)O*,0.3249999999999999
524
+ *CCCCCCCCCCCCCCCCCCCCc1nnc(*)o1,0.398
525
+ *CCCCCCCCCCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.369
526
+ *CCCCCCCCCCCCCCCCCCNC(=O)CCCCCCCCCCC(=O)N*,0.4337499999999999
527
+ *CCCCCCCCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.4533333333333333
528
+ *CCCCCCCCCCCCCCCCCCNC(=O)CCc1ccc(CCC(=O)N*)cc1,0.3755
529
+ *CCCCCCCCCCCCCCCCCCNC(=O)NCCCCCCCCCCNC(=O)N*,0.39
530
+ *CCCCCCCCCCCCCCCCCCNC(=O)NCCCCCCNC(=O)N*,0.383
531
+ *CCCCCCCCCCCCCCCCCCOC(=O)c1ccc(C(=O)O*)cc1,0.304
532
+ *CCCCCCCCCCCCCCCCCCOCO*,0.406
533
+ *CCCCCCCCCCCCCCCCOC(=O)C/C=C/CC(=O)O*,0.315
534
+ *CCCCCCCCCCCCCCCCOC(=O)CC/C=C/CCC(=O)O*,0.304
535
+ *CCCCCCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.308
536
+ *CCCCCCCCCCCCCCCCOC(=O)CCCCCCCCC(=O)O*,0.442
537
+ *CCCCCCCCCCCCCCCCOC(=O)CCCCCCCCCCCCC(=O)O*,0.297
538
+ *CCCCCCCCCCCCCCCCOC(=O)NCCCCCCCCCCNC(=O)O*,0.275
539
+ *CCCCCCCCCCCCCCCCOC(=O)NCCCCCCNC(=O)O*,0.348
540
+ *CCCCCCCCCCCCCCCC[N+](C)(C)CCCCCC[N+](*)(C)C,0.408
541
+ *CCCCCCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.3944999999999999
542
+ *CCCCCCCCCCCCCCNC(=O)CCCCCCCCCCC(=O)N*,0.346
543
+ *CCCCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.4341428571428571
544
+ *CCCCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*,0.3915714285714285
545
+ *CCCCCCCCCCCCCCNC(=O)NCCCCCCCCCCNC(=O)N*,0.344
546
+ *CCCCCCCCCCCCCCNC(=O)NCCCCCCNC(=O)N*,0.4
547
+ *CCCCCCCCCCCCCCOC(=O)CC/C=C/CCC(=O)O*,0.318
548
+ *CCCCCCCCCCCCCCOC(=O)CCCCC(=O)O*,0.3555
549
+ *CCCCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.3165
550
+ *CCCCCCCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.333
551
+ *CCCCCCCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.258
552
+ *CCCCCCCCCCCCCCOC(=O)c1ccc(C(=O)NCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.274
553
+ *CCCCCCCCCCCCCCS*,0.34125
554
+ *CCCCCCCCCCCCCNC(=O)CCCCCCCCCCCC(=O)N*,0.3835
555
+ *CCCCCCCCCCCCNC(=O)C(=O)N*,0.309
556
+ *CCCCCCCCCCCCNC(=O)CCCCC(=O)N*,0.3745
557
+ *CCCCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.374
558
+ *CCCCCCCCCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.3994999999999999
559
+ *CCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.396
560
+ *CCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.44
561
+ *CCCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*,0.3944285714285714
562
+ *CCCCCCCCCCCCNC(=O)NCCCCCCNC(=O)N*,0.3545
563
+ *CCCCCCCCCCCCO*,0.4307499999999999
564
+ *CCCCCCCCCCCCOC(=O)CC/C=C/CCC(=O)O*,0.274
565
+ *CCCCCCCCCCCCOC(=O)CCCCC(=O)O*,0.292
566
+ *CCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.284
567
+ *CCCCCCCCCCCCOC(=O)CCCCCCCCCCC(=O)O*,0.297
568
+ *CCCCCCCCCCCCOC(=O)CCCCCCCCCCCCC(=O)O*,0.294
569
+ *CCCCCCCCCCCCOC(=O)CCCCCNC(=O)CCCCC(=O)NCCCCCC(=O)O*,0.306
570
+ *CCCCCCCCCCCNC(=O)CCCCC(=O)N*,0.3295
571
+ *CCCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.314
572
+ *CCCCCCCCCCCNC(=O)CCCCCCCCC(=O)NCCCCCCCCCCCNC(=O)C(=O)N*,0.3195
573
+ *CCCCCCCCCCCNC(=O)CCCCCCCCCCC(=O)N*,0.389
574
+ *CCCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.37175
575
+ *CCCCCCCCCCCSCCCCCCCCCCS*,0.352
576
+ *CCCCCCCCCCCSCCCCCCS*,0.3145
577
+ *CCCCCCCCCCN/C(C)=N/*,0.298
578
+ *CCCCCCCCCCNC(=O)C(CCCCCCCCCCCC)C(=O)N*,0.35
579
+ *CCCCCCCCCCNC(=O)CC/C=C/CCC(=O)N*,0.3125
580
+ *CCCCCCCCCCNC(=O)CCCCCCC(=O)N*,0.422
581
+ *CCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.408
582
+ *CCCCCCCCCCNC(=O)CCCCCCCCC(=O)NCCCCCCCCCCNC(=O)C(=O)N*,0.4125
583
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCC(=O)N*,0.3685
584
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.342
585
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.368
586
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.381
587
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.3873333333333333
588
+ *CCCCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*,0.38825
589
+ *CCCCCCCCCCNC(=O)NCCCCCCCCNC(=O)N*,0.366
590
+ *CCCCCCCCCCO*,0.353
591
+ *CCCCCCCCCCOC(=O)CCC(=O)O*,0.285
592
+ *CCCCCCCCCCOC(=O)CCCC(=O)O*,0.263
593
+ *CCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.3445
594
+ *CCCCCCCCCCOC(=O)CCCCCCCC(=O)O*,0.3325
595
+ *CCCCCCCCCCOC(=O)CCCCCCCCC(=O)O*,0.281
596
+ *CCCCCCCCCCOC(=O)CCCCCCCCCCC(=O)O*,0.319
597
+ *CCCCCCCCCCOC(=O)CCCCCCCCCCCCCCCCC(=O)O*,0.3415
598
+ *CCCCCCCCCCOC(=O)CCCCCNC(=O)CCCCC(=O)NCCCCCC(=O)O*,0.2955
599
+ *CCCCCCCCCCOC(=O)CCCCS(=O)(=O)CCCCC(=O)O*,0.278
600
+ *CCCCCCCCCCOC(=O)CCCCSCCCCC(=O)O*,0.27
601
+ *CCCCCCCCCCOC(=O)NCCCCCCCCCCNC(=O)O*,0.316
602
+ *CCCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.307
603
+ *CCCCCCCCCCOCCCCCCCCCCOCCCCCCO*,0.269
604
+ *CCCCCCCCCCOCCCCCCCCCCOCCCCCO*,0.38525
605
+ *CCCCCCCCCCOCCCCCCCCCCOCCCCO*,0.329
606
+ *CCCCCCCCCCOCCCCCCOCCCCCCO*,0.36175
607
+ *CCCCCCCCCCOCO*,0.297
608
+ *CCCCCCCCCCOc1ccc(OC(=O)c2ccc(OCCCCCCOc3ccc(C(=O)Oc4ccc(O*)cc4)cc3)cc2)cc1,0.344
609
+ *CCCCCCCCCCSCCCCCCS*,0.3305
610
+ *CCCCCCCCCCSCCCCS*,0.282
611
+ *CCCCCCCCCCSSCCCCCCSS*,0.2295
612
+ *CCCCCCCCCCc1ccc(-c2c(-c3ccccc3)cc(-c3cccc(-c4cc(-c5ccccc5)c(-c5ccc(*)cc5)c(-c5ccccc5)c4-c4ccccc4)c3)c(-c3ccccc3)c2-c2ccccc2)cc1,0.21
613
+ *CCCCCCCCCNC(=O)C(CCCCCCCCCCCC)C(=O)N*,0.347
614
+ *CCCCCCCCCNC(=O)CCCCCCCC(=O)N*,0.361
615
+ *CCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.3485
616
+ *CCCCCCCCCNC(=O)CCCCCCCCC(=O)NCCCCCCCCCNC(=O)C(=O)N*,0.312
617
+ *CCCCCCCCCNC(=O)CCCCCCCCCC(=O)N*,0.312
618
+ *CCCCCCCCCNC(=O)CCCCCCCCCCCC(=O)N*,0.348
619
+ *CCCCCCCCCOC(=O)CCCCCCCC(=O)O*,0.294
620
+ *CCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.259
621
+ *CCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.315
622
+ *CCCCCCCCCOCO*,0.277
623
+ *CCCCCCCCNC(=O)C(CCCCCCCCCCCC)C(=O)N*,0.331
624
+ *CCCCCCCCNC(=O)CCCCC(=O)N*,0.291
625
+ *CCCCCCCCNC(=O)CCCCCCCC(=O)N*,0.308
626
+ *CCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.392
627
+ *CCCCCCCCNC(=O)CCCCCCCCC(=O)NCCCCCCCCNC(=O)C(=O)N*,0.3315
628
+ *CCCCCCCCNC(=O)CCCCCCCCCCC(=O)N*,0.3463333333333334
629
+ *CCCCCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.381
630
+ *CCCCCCCCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.355
631
+ *CCCCCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.3725
632
+ *CCCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*,0.425
633
+ *CCCCCCCCOC(=O)CCCCCNC(=O)CCCCC(=O)NCCCCCC(=O)O*,0.293
634
+ *CCCCCCCCOC(=O)CCCCCNC(=O)c1ccc(C(=O)NCCCCCC(=O)O*)cc1,0.276
635
+ *CCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCCNC(=O)c2ccc(C(=O)O*)cc2)cc1,0.261
636
+ *CCCCCCCCSSCCCCSS*,0.192
637
+ *CCCCCCCCc1nnc(*)o1,0.324
638
+ *CCCCCCCNC(=O)C(CCCCCCCCCCCC)C(=O)N*,0.3455
639
+ *CCCCCCCNC(=O)CCCCC(=O)N*,0.323
640
+ *CCCCCCCNC(=O)CCCCCC(=O)N*,0.334
641
+ *CCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.3235
642
+ *CCCCCCCNC(=O)CCCCCCCCCC(=O)N*,0.4065
643
+ *CCCCCCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.4137499999999999
644
+ *CCCCCCN(C)C(=O)CCCCCCCCCCCCCCC(=O)N(*)C,0.33
645
+ *CCCCCCN(C)C(=O)CCCCCCCCCCCCCCCCC(=O)N(*)C,0.3525
646
+ *CCCCCCN(CC)C(=O)CCCCCCCCCCCCCCCCC(=O)N(*)CC,0.333
647
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCC)C(=O)N*,0.2925
648
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCCC)C(=O)N*,0.319
649
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCCCC)C(=O)N*,0.3369999999999999
650
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCCCCC)C(=O)N*,0.3235
651
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCCCCCC)C(=O)N*,0.3505
652
+ *CCCCCCNC(=O)C(CCCCCCCCCCCCCCCCCC)C(=O)N*,0.354
653
+ *CCCCCCNC(=O)CC/C=C/CCC(=O)N*,0.278
654
+ *CCCCCCNC(=O)CCCCCC(=O)N*,0.278
655
+ *CCCCCCNC(=O)CCCCCCC(=O)N*,0.345
656
+ *CCCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.355
657
+ *CCCCCCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.367
658
+ *CCCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.3955
659
+ *CCCCCCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*,0.44
660
+ *CCCCCCNC(=O)N*,0.328
661
+ *CCCCCCOC(=O)CCCCSCCCCC(=O)O*,0.224
662
+ *CCCCCCOC(=O)OCCCCCCOC(=O)OCCCCCCOc1ccc(-c2ccc(O*)cc2)cc1,0.329
663
+ *CCCCCCOCO*,0.278
664
+ *CCCCCCSCCCCCS*,0.241
665
+ *CCCCCCSCCS*,0.234
666
+ *CCCCCCSSCCCCSS*,0.192
667
+ *CCCCCNC(=O)CCCCC(=O)N*,0.327
668
+ *CCCCCNC(=O)CCCCCCCC(=O)N*,0.301
669
+ *CCCCCNC(=O)CCCCCCCCCCCC(=O)N*,0.3545
670
+ *CCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.3415
671
+ *CCCCCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.4165
672
+ *CCCCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.3495
673
+ *CCCCCNC(=O)CCCCCCCCOCCCCCCCCC(=O)NCCCCCO*,0.267
674
+ *CCCCCNC(=O)CCCCCCOCCCCCCC(=O)NCCCCCO*,0.285
675
+ *CCCCCNC(=O)O*,0.268
676
+ *CCCCCOC(=O)CCCCCNC(=O)O*,0.317
677
+ *CCCCCOC(=O)CCCCSCCCCC(=O)O*,0.202
678
+ *CCCCCc1ccc(CCCCCOC(=O)c2ccc(C(=O)O*)cc2)cc1,0.276
679
+ *CCCCNC(=O)CC/C=C/CCC(=O)N*,0.258
680
+ *CCCCNC(=O)CCCCC(=O)N*,0.302
681
+ *CCCCNC(=O)CCCCCCC(=O)N*,0.2915
682
+ *CCCCNC(=O)CCCCCCCCC(=O)N*,0.355
683
+ *CCCCNC(=O)CCCCCCCCCC(=O)N*,0.3229999999999999
684
+ *CCCCNC(=O)CCCCCCCCCCC(=O)N*,0.332
685
+ *CCCCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.3735
686
+ *CCCCOC(=O)CCCCCCCC(=O)O*,0.241
687
+ *CCCCOC(=O)CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)O*,0.49475
688
+ *CCCCOC(=O)CCCCCNC(=O)CCCCC(=O)NCCCCCC(=O)O*,0.242
689
+ *CCCCOC(=O)c1ccc(-c2ccc(C(=O)O*)cc2)cc1,0.248
690
+ *CCCCOC(=O)c1cccc(-c2cccc(C(=O)O*)c2)c1,0.203
691
+ *CCCCOC(=O)c1ccccc1-c1ccccc1C(=O)O*,0.19
692
+ *CCCCSCCS*,0.226
693
+ *CCCCSS*,0.175
694
+ *CCCCSSCCCCO*,0.199
695
+ *CCCCSSCCCCOCO*,0.196
696
+ *CCCCSSCCCSS*,0.161
697
+ *CCCCc1ccc(CCCCOC(=O)c2ccc(C(=O)O*)cc2)cc1,0.241
698
+ *CCCN*,0.354
699
+ *CCCNC(=O)CCCCCCCCCCCCCCCCCCC(=O)N*,0.407
700
+ *CCCO*,0.296
701
+ *CCCOC(=O)C1CCC(C(=O)O*)CC1,0.222
702
+ *CCCOC(=O)CCCCCCCCCCCCCCCCC(=O)O*,0.364
703
+ *CCCc1ccc(CCCOC(=O)c2ccc(C(=O)O*)cc2)cc1,0.249
704
+ *CCN(*)C(=O)CCCCCCCCCCCCCC,0.309
705
+ *CCN(*)C(=O)CCCCCCCCCCCCCCC,0.3359999999999999
706
+ *CCN(*)C(=O)CCCCCCCCCCCCCCCCC,0.4005
707
+ *CCN*,0.351
708
+ *CCNC(=O)CCCCCCCCC(=O)N*,0.302
709
+ *CCNC(=O)CCCCCCCCCCCCCCC(=O)N*,0.412
710
+ *CCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*,0.359
711
+ *CCOC(=O)CCCCSCCCCC(=O)O*,0.212
712
+ *CCOCCOC(=O)C(CCCCCCCCC)C(=O)O*,0.224
713
+ *CCOCCOC(=O)CCCCCCCCC(=O)O*,0.229
714
+ *CCOCCOC(=O)CCCCCCCCCCCCCCCCC(=O)O*,0.3485
715
+ *CCOCCOCCOC(=O)CCCCCCCC(=O)O*,0.267
716
+ *CCS*,0.223
717
+ *CCc1c2ccccc2c(*)c2ccccc12,0.166
718
+ *CCc1cc(*)c(C)cc1C,0.194
719
+ *CCc1ccc(*)c(C(=O)OC)c1,0.245
720
+ *CCc1ccc(*)c(C(C)=O)c1,0.217
721
+ *CCc1ccc(*)o1,0.2784999999999999
722
+ *CCc1ccc(*)s1,0.288
723
+ *CCc1ccc(CCNC(=O)CCCCCCCCCCCCCCCC(=O)N*)cc1,0.3515
724
+ *CCc1ccc(CCNC(=O)CCCCCCCCCCCCCCCCC(=O)N*)cc1,0.368
725
+ *CCc1ccc(CCNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*)cc1,0.3195
726
+ *CCc1ccc(CCOC(=O)CCc2ccc(CCC(=O)O*)cc2)cc1,0.319
727
+ *CCc1ccc(CCOC(=O)c2ccc(C(=O)O*)cc2)cc1,0.23
728
+ *CO*,0.246
729
+ *CS*,0.172
730
+ *Cc1ccc(*)cc1,0.267
731
+ *Cc1ccc(CNC(=O)CCCCCCCCCCCCCCCCC(=O)N*)cc1,0.3299999999999999
732
+ *Cc1ccc(CNC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)N*)cc1,0.359
733
+ *Cc1ccc2nc(-c3cc(-c4nc5ccc(*)cc5c(=O)o4)cc(N4C(=O)c5ccccc5C4=O)c3)oc(=O)c2c1,0.227
734
+ *N1C(=O)C2=C(C=C(C=C2)C2=CC=C3C(=O)N(C(=O)C3=C2)C2=C3C=CC=C(*)C3=CC=C2)C1=O,0.314
735
+ *Nc1ccc(*)cc1CCCCCCCCCCCCCCC,0.314
736
+ *Nc1ccc(*)cc1OCCCCCCCCCCCCCCCC,0.3671666666666667
737
+ *OC(*)C(Cl)(Cl)Cl,0.066
738
+ *OC(*)CCC#N,0.1704999999999999
739
+ *OC(=O)C(Cc1ccccc1)NC(=O)CCCCCCCCCCC(=O)NC(Cc1ccccc1)C(=O)OC1COC2C(*)COC12,0.222
740
+ *OC(=O)CCCCCC(*)=O,0.237
741
+ *OC(C)COC(=O)CCCCSCCCCC(*)=O,0.214
742
+ *OC1CCCCC1*,0.1694999999999999
743
+ *ON(C(F)(F)F)C(F)(F)C(*)(F)F,0.0905
744
+ *OS(=O)(=O)c1cccc(S(=O)(=O)Oc2ccc(C3(c4ccc(*)cc4)CCCCC3)cc2)c1,0.151
745
+ *Oc1c(C)cc(*)cc1-c1ccccc1,0.1935
746
+ *Oc1c(Cl)cc(C2(c3cc(Cl)c(OC(*)=O)c(Cl)c3)CCCCC2)cc1Cl,0.109
747
+ *Oc1cc(CCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1,0.234
748
+ *Oc1ccc(-c2ccc(-c3cc(-c4ccccc4)c(-c4ccc(-c5ccc(OC(=O)c6ccc(C(*)=O)cc6-c6ccccc6)cc5)cc4)c(-c4ccccc4)c3-c3ccccc3)cc2)cc1,0.315
749
+ *Oc1ccc(/C=N/c2ccc(/N=C/c3ccc(OC(=O)CCCCC(*)=O)c(OC)c3)cc2)cc1OC,0.283
750
+ *Oc1ccc(/C=N/c2ccc(/N=C/c3ccc(OC(=O)CCCCCCCCC(*)=O)c(OC)c3)cc2)cc1OC,0.274
751
+ *Oc1ccc(/C=N/c2ccc(/N=C/c3ccc(OC(=O)c4cccc(C(*)=O)c4)c(OC)c3)cc2)cc1OC,0.37
752
+ *Oc1ccc(/N=C/C=N/c2ccc(OC(=O)NC3CC(C)(C)CC(C)(CNC(*)=O)C3)cc2)cc1,0.349
753
+ *Oc1ccc(/N=C/C=N/c2ccc(OC(=O)NCCCCCCNC(*)=O)cc2)cc1,0.325
754
+ *Oc1ccc(/N=C/C=N/c2ccc(OC(=O)Nc3cc(NC(*)=O)ccc3C)cc2)cc1,0.297
755
+ *Oc1ccc(/N=C/C=N/c2ccc(OC(=O)Nc3ccc(Cc4ccc(NC(*)=O)cc4)cc3)cc2)cc1,0.417
756
+ *Oc1ccc(/N=C/CCC/C=N/c2ccc(OC(=O)NC3CC(C)(C)CC(C)(CNC(*)=O)C3)cc2)cc1,0.207
757
+ *Oc1ccc(/N=C/CCC/C=N/c2ccc(OC(=O)NCCCCCCNC(*)=O)cc2)cc1,0.294
758
+ *Oc1ccc(/N=C/CCC/C=N/c2ccc(OC(=O)Nc3cc(NC(*)=O)ccc3C)cc2)cc1,0.252
759
+ *Oc1ccc(/N=C/CCC/C=N/c2ccc(OC(=O)Nc3ccc(Cc4ccc(NC(*)=O)cc4)cc3)cc2)cc1,0.326
760
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)cc2)cc1,0.2385
761
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)CN(CC(*)=O)c3ccc(/N=N/c4ccc([N+](=O)[O-])cc4)cc3)cc2)cc1,0.244
762
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)Nc4ccc(-c5ccc(NC(=O)c6ccc(*)cc6)cc5C(F)(F)F)c(C(F)(F)F)c4)cc3)cc2)cc1,0.2685
763
+ *Oc1ccc(C(C)(c2ccccc2)c2ccc(OC(*)=O)cc2)cc1,0.206
764
+ *Oc1ccc(C(CCC)(CCC)c2ccc(OC(*)=O)cc2)cc1,0.219
765
+ *Oc1ccc(C2(c3ccc(OC(*)=O)c(C)c3)CCCCC2)cc1C,0.183
766
+ *Oc1ccc(C2(c3ccc(OC(*)=O)c(Cl)c3)CCCCC2)cc1Cl,0.165
767
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)CCCC2)cc1,0.241
768
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)CCCCC2)cc1,0.185
769
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)c3ccccc3-c3ccccc32)cc1,0.235
770
+ *Oc1ccc(NC(=O)CCCCCCCCCC(=O)Nc2ccc(*)cc2)cc1,0.3745
771
+ *Oc1ccc(OC(=O)CCCCCCCCC(*)=O)cc1,0.257
772
+ *Oc1ccc(Oc2ccc(C(=O)c3ccccc3-c3ccccc3C(=O)c3ccc(*)cc3)cc2)cc1,0.2205
773
+ *Oc1ccc(S(=O)(=O)c2ccc(*)cc2)cc1,0.268
774
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C(C)(C)c4ccc(*)cc4)cc3)cc2)cc1,0.204
775
+ *Oc1cccc(NC(=O)CCCCCCCCCCC(=O)Nc2ccc(*)cc2)c1,0.319
776
+ *Oc1cccc(OC(=O)c2ccc(C(C)(C)c3ccc(C(*)=O)cc3)cc2)c1,0.222
777
+ *Oc1cccc(Oc2ccc(C(=O)Nc3ccc(Oc4ccc(NC(=O)c5ccc(*)cc5)cc4)cc3)cc2)c1C#N,0.296
778
+ *SC(*)(F)F,0.114
779
+ *Sc1c(C)cc(*)cc1C,0.145
780
+ *\N=C\c1ccc(C=Nc2c(cc(cc2-c2ccccc2)-c2ccc(cc2)-c2cc(-c3ccccc3)c(*)c(c2)-c2ccccc2)-c2ccccc2)cc1,0.219
781
+ *c1[nH]c(*)c(CC(=O)OCCCCCCCCCCCC)c1CC(=O)OCCCCCCCCCCCC,0.2794999999999999
782
+ *c1c(*)c2ccccc2c2ccccc12,0.091
783
+ *c1c(-c2ccccc2)c(-c2ccccc2)c(*)c2cc(-c3ccc(C#Cc4ccccc4)c(C#Cc4ccccc4)c3)ccc12,0.224
784
+ *c1c(-c2ccccc2)c(-c2ccccc2)c(*)c2cc(C(c3ccc(C#Cc4ccccc4)c(C#Cc4ccccc4)c3)(C(F)(F)F)C(F)(F)F)ccc12,0.103
785
+ *c1c(-c2ccccc2)c(-c2ccccc2)c(*)c2cc(Oc3ccc(C#Cc4ccccc4)c(C#Cc4ccccc4)c3)ccc12,0.16
786
+ *c1cc(*)c(O)c(/C=N/c2ccc(Cl)cc2)c1,0.141
787
+ *c1cc(*)c(O)c(/C=N/c2ccc(N3CCOCC3)cc2)c1,0.266
788
+ *c1cc(-c2sc(-c3cc(CCCCCCCCCCCCCC)c(*)s3)cc2CCCCCCCCCCCCCC)c2cccccc1-2,0.316
789
+ *c1cc(/C=N/c2ccc(C)cc2)c(*)c(OC)c1,0.215
790
+ *c1cc(/C=N/c2cccc(C)c2)cc(*)c1O,0.192
791
+ *c1cc(CCCCCCCCCCCC)c(*)cc1CCCCCCCCCCCC,0.356
792
+ *c1cc(CCCCCCCCCCCCC)c(*)s1,0.40475
793
+ *c1cc(CCCCCCCCCCCCCC)c(*)s1,0.4182499999999999
794
+ *c1cc(CCCCCCCCCCCCCCC)c(*)s1,0.3742499999999999
795
+ *c1cc(CCCCCCCCCCCCCCCC)c(*)[nH]1,0.4075
796
+ *c1cc(CCCCCCCCCCCCCCCC)c(*)s1,0.381
797
+ *c1cc(CCCCCCCCCCCCCCCCC)c(*)s1,0.396
798
+ *c1cc(CCCCCCCCCCCCCCCCCC)c(*)s1,0.36525
799
+ *c1cc(CCCCCCCCCCCCCCCCCCC)c(*)s1,0.34
800
+ *c1cc(CCCCCCCCCCCCCCCCCCCCCC)c(*)s1,0.39725
801
+ *c1cc(O)c(O)cc1*,0.183
802
+ *c1cc(OCCCCCCCCCC)c(*)cc1OCCCCCCCCCC,0.256
803
+ *c1cc(OCCCCCCCCCCCC)c(*)cc1O,0.309
804
+ *c1cc2cccccc-2c1*,0.163
805
+ *c1ccc(*)c2ccccc12,0.571
806
+ *c1ccc(-c2cc(-c3ccc(OCCCCCC)cc3)cc(-c3ccc(-c4ccc5c(c4)C(CCCCCC)(CCCCCC)c4cc(*)ccc4-5)cc3)c2-c2ccc(OCCCCCC)cc2)cc1,0.29
807
+ *c1ccc(-c2ccc(-c3ccc(C(*)(CC)C(F)(F)F)cc3)cc2)cc1,0.236
808
+ *c1ccc(-c2ccc(-c3ccc(C(*)(c4c(F)c(F)c(F)c(F)c4F)C(F)(F)F)cc3)cc2)cc1,0.172
809
+ *c1ccc(-c2ccc(-c3ccc(C(*)(c4ccc(F)cc4)C(F)(F)F)cc3)cc2)cc1,0.22
810
+ *c1ccc(-c2ccc(-c3ccc(C(*)(c4ccccc4)C(F)(F)F)cc3)cc2)cc1,0.278
811
+ *c1ccc(-c2ccc(-c3ccc(C(*)c4c(F)c(F)c(F)c(F)c4F)cc3)cc2)cc1,0.291
812
+ *c1ccc(-c2ccc(-c3sc(*)c(CCCCCCCCCCCC)c3CCCCCCCCCCCC)s2)s1,0.299
813
+ *c1ccc(-c2ccc(C(*)(C)C(F)(F)F)cc2)cc1,0.194
814
+ *c1ccc(-c2ccc(C(*)(C)C)cc2)cc1,0.247
815
+ *c1ccc(-c2ccc(C(*)(c3ccc(F)cc3)C(F)(F)F)cc2)cc1,0.185
816
+ *c1ccc(-c2ccc(C(*)(c3ccccc3)C(F)(F)F)cc2)cc1,0.233
817
+ *c1ccc(-c2ccc(C3(*)CCCCC3)cc2)cc1,0.252
818
+ *c1ccc(-c2ccc(C3(*)c4ccccc4-c4ccccc43)cc2)cc1,0.203
819
+ *c1ccc(-c2ccc3c(c2)C(CCCCCC)(CCCCCC)c2cc(*)ccc2-3)cc1,0.376
820
+ *c1ccc(-c2ccc3c(c2)C(CCCCCCBr)(CCCCCCBr)c2cc(*)ccc2-3)cc1,0.236
821
+ *c1ccc(-c2ccc3c(c2)C(CCCCCCC#N)(CCCCCCC#N)c2cc(*)ccc2-3)cc1,0.487
822
+ *c1ccc(-c2nc3ccc(Oc4ccc5nc(*)c(-c6ccccc6)c(-c6ccccc6)c5c4)cc3c(-c3ccccc3)c2-c2ccccc2)cc1,0.27
823
+ *c1ccc(-c2nnc(-c3cccc(-c4nnc(*)n4-c4ccccc4)c3)n2-c2ccccc2)cc1,0.389
824
+ *c1ccc(-c2sc(-c3cc(CCCCCCCCCCCC)c(*)s3)cc2CCCCCCCCCCCC)cc1,0.413
825
+ *c1ccc(-c2sc(-c3cc(CCCCCCCCCCCC)c(*)s3)cc2CCCCCCCCCCCC)s1,0.34
826
+ *c1ccc(-c2sc(-c3cc(SCCCCCCCCCCCC)c(*)s3)cc2SCCCCCCCCCCCC)cc1,0.349
827
+ *c1ccc(C(Cl)(Cl)C(*)Cl)cc1,0.115
828
+ *c1ccc(C(F)(F)C(*)(F)F)cc1,0.101
829
+ *c1ccc(C2C(C#N)(C(=O)OCCC)C(*)C2(C#N)C(=O)OCCC)cc1,0.258
830
+ *c1ccc(C2C(C(=O)Oc3ccccc3)C(*)C2C(=O)Oc2ccccc2)cc1,0.225
831
+ *c1ccc(N2C(=O)c3ccc(-c4cccc5c4C(=O)N(*)C5=O)cc3C2=O)cc1,0.233
832
+ *c1ccc(N2C(=O)c3ccc(C(=O)c4ccc5c(c4)C(=O)N(c4ccc(-c6nc7cc(-c8ccc9nc(-c%10ccccc%10)c(*)nc9c8)ccc7nc6-c6ccccc6)cc4)C5=O)cc3C2=O)cc1,0.224
833
+ *c1ccc(NC(=O)Nc2ccc(NC(=O)Nc3ccc(-c4nc(-c5ccc([N+](=O)[O-])cc5)[nH]c4*)cc3)cc2)cc1,0.269
834
+ *c1ccc(Oc2ccc(-c3csc(/N=C/c4ccc(OCCCCCCCCCCOc5ccc(/C=N/c6nc(*)cs6)cc5OC)c(OC)c4)n3)cc2)cc1,0.383
835
+ *c1ccc(Oc2ccc(-c3csc(/N=C/c4ccc(OCCCCCCCCOc5ccc(/C=N/c6nc(*)cs6)cc5OC)c(OC)c4)n3)cc2)cc1,0.228
836
+ *c1ccc(Oc2ccc(-c3csc(/N=C/c4ccc(OCCCCCCOc5ccc(/C=N/c6nc(*)cs6)cc5OC)c(OC)c4)n3)cc2)cc1,0.253
837
+ *c1ccc(Oc2ccc(-c3csc(/N=C/c4ccc(OCCCCOc5ccc(/C=N/c6nc(*)cs6)cc5OC)c(OC)c4)n3)cc2)cc1,0.218
838
+ *c1ccc(Oc2ccc(-c3csc(/N=C/c4ccc(OCCOc5ccc(/C=N/c6nc(*)cs6)cc5OC)c(OC)c4)n3)cc2)cc1,0.243
839
+ *c1ccc(Oc2ccc(C(c3ccc(Oc4ccc(-n5c(=O)c6cc7c(=O)n(*)c(=O)c7cc6c5=O)cc4C(F)(F)F)cc3)(C(F)(F)F)C(F)(F)F)cc2)c(C(F)(F)F)c1,0.213
840
+ *c1ccc(Oc2cccc(Oc3ccc(N4C(=O)c5cccc(Oc6cccc(Oc7cccc8c7C(=O)N(*)C8=O)c6)c5C4=O)cc3)c2)cc1,0.165
841
+ *c1ccc(Sc2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.252
842
+ *c1ccc2c(c1)C(=O)N(c1cc(OCCN(CC)c3ccc(/N=N/c4ccc([N+](=O)[O-])cc4)cc3)cc(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)c1)C2=O,0.154
843
+ *c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)(C(F)(F)F)C(F)(F)F)cc1)C2=O,0.151
844
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(C(c4ccc(Oc5ccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)cc5C(F)(F)F)cc4)(C(F)(F)F)C(F)(F)F)cc3)c(C(F)(F)F)c1)C2=O,0.152
845
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)cc1)C2=O,0.165
846
+ *c1ccc2c(c1)C(=O)N(c1cccc(C(c3cccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)c3)(C(F)(F)F)C(F)(F)F)c1)C2=O,0.111
847
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3cc(CCCCCC)c(*)cc3CCCCCC)ccc1-2,0.46
848
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3cc(CCCCCCCC)c(*)cc3CCCCCCCC)ccc1-2,0.349
849
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3cc(CCCCCCCCCC)c(*)cc3CCCCCCCCCC)ccc1-2,0.216
850
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3cc(OCCCCCCCC)c(*)cc3OCCCCCCCC)ccc1-2,0.291
851
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3cc(OCc4ccccc4)c(*)cc3OCc3ccccc3)ccc1-2,0.184
852
+ *c1ccc2c(c1)SC1=Nc3cc(-c4ccc5c(c4)N=C4Sc6cc(*)ccc6N=C4N5)ccc3NC1=N2,0.506
853
+ *c1ccc2cc(*)ccc2c1,1.59
854
+ *c1ccc2ccc(*)cc2c1,0.221
855
+ *c1ccc2ccc3c(*)cc(C#C)c4ccc1c2c34,0.582
856
+ *c1ccc2ccc3c(*)cc(C#CC=C)c4ccc1c2c34,0.429
857
+ *c1cccc(-c2nc3ccc(-c4ccc5oc(*)nc5c4)cc3o2)c1,0.3795
858
+ *c1cccc2c(*)cccc12,0.685
859
+ *c1nc(-c2ccccc2)nc(N(c2ccccc2)c2ncnc(N(*)c3ccccc3)n2)n1,0.185
860
+ *c1nc(C)nc(N(CCCCCCN(*)c2ccccc2)c2ccccc2)n1,0.21
861
+ *c1nc2cc(-c3ccc4[nH]c(-c5ccc(*)o5)nc4c3)ccc2[nH]1,0.446
862
+ *c1nc2cc3sc(-c4cc(OCCCCCCCCCCCC)c(*)cc4OCCCCCCCCCCCC)nc3cc2s1,0.3647499999999999
863
+ *c1sc(*)c(OCCCCCCCCCCCC)c1C,0.37875
864
+ *c1sc(*)c(OCCCCCCCCCCCCCC)c1C,0.43675
865
+ *c1sc(*)c(OCCCCCCCCCCCCCCCC)c1C,0.388
866
+ *c1sc(*)c(OCCCCCCCCCCCCCCCCCCCC)c1C,0.3747499999999999
867
+ *c1sc(*)c2c1OCC(CCCCCCCCCCCCCCCC)O2,0.44475
868
+ *c1sc(*)c2sc(CCCCCCCCC)nc12,0.482
869
+ *c1sc(-c2cc(CCCCCCCCCC)c(*)s2)cc1CCCCCCCCCC,0.307
870
+ CC1(C)CC(CC(C)(C*)C1)N1C(=O)C2=CC=C(C=C2C1=O)C1=CC2=C(C=C1)C(=O)N(*)C2=O,0.231
871
+ CCNC(=O)OCCCCC(*)=C=C=C(*)CCCCOC(=O)NCC,0.26
872
+ Cc1ccc(cc1)S(=O)(=O)OCCCCC(*)=C=C=C(*)CCCCOS(=O)(=O)c1ccc(C)cc1,0.21
873
+ FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=C4C5C6=C(C=CC=C6)C(C6=C5C=CC=C6)C4=C(OC4=CC=C(*)C=C4)C4=C3C3C5=C(C=CC=C5)C4C4=C3C=CC=C4)C=C2)C=C1)(C1=CC=C2OC(*)=NC2=C1)C(F)(F)F,0.109
874
+ FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=C4C5C6=C(C=CC=C6)C(C6=C5C=CC=C6)C4=C(OC4=CC=C(*)C=C4)C=C3)C=C2)C=C1)(C1=CC=C2OC(*)=NC2=C1)C(F)(F)F,0.176
875
+ FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=CC=C(OC4=CC=C(*)C=C4)C4=C3C3C5=C(C=CC=C5)C4C4=C3C=CC=C4)C=C2)C=C1)(C1=CC=C2OC(*)=NC2=C1)C(F)(F)F,0.155
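The train_supplement files all share the same two-column layout: a polymer SMILES string, with `*` dummy atoms marking the repeat-unit attachment points, and a numeric target (dataset3 uses `Tg`, dataset4 uses `FFV`; the header of each file names its own target). A minimal loading sketch, assuming pandas is available; it is not part of the uploaded scripts, and it reads the column names from each file's header rather than hard-coding them:

```python
# Minimal sketch (assumption: run from the repo root, pandas installed).
import pandas as pd

df = pd.read_csv("simson_modeling/kaggle_comp/train_supplement/dataset1.csv")
smiles_col, target_col = df.columns[0], df.columns[1]  # e.g. "SMILES", "Tg"/"FFV"
df = df.dropna(subset=[smiles_col, target_col])  # drop incomplete rows
print(f"{len(df)} rows; target '{target_col}' spans "
      f"[{df[target_col].min():.3f}, {df[target_col].max():.3f}]")
```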
simson_modeling/kaggle_comp/train_supplement/dataset2.csv ADDED
The diff for this file is too large to render. See raw diff
simson_modeling/kaggle_comp/train_supplement/dataset3.csv ADDED
@@ -0,0 +1,47 @@
1
+ SMILES,Tg
2
+ *=Nc1ccc(N=C(C)Nc2ccc(-c3ccc(NC(=*)C)c(C(=O)O)c3)cc2C(=O)O)cc1,89.38045943
3
+ *C(=O)OC(=O)COc1ccc(OCC(=O)OC(=O)c2ccc(*)nc2)cc1,155.9709567
4
+ *C(=O)c1ccc(C(=O)c2ccc(C=C3CCC(=Cc4ccc(*)cc4)C3=O)cc2)cc1,192.2096838
5
+ *C=C(*)c1ccc(OCCCCCC(=O)Oc2c(F)c(F)c(F)c(F)c2F)cc1,73.83198457
6
+ *C=CC1C=CC(*)c2ccc(CCCCCC)cc21,9.70407322
7
+ *CC#CC#CCOc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,48.046331
8
+ *CC(CCCCCCOc1ccc(-c2ccc(C#N)cc2)cc1)COC(=O)N(C)c1ccc(N(C)C(=O)O*)cc1,78.11061669
9
+ *CC1CCCC(*)(CO)C1=O,71.70468573
10
+ *CCC(=O)NNC(=O)c1cccc(C(=O)NNC(=O)c2cccc(O*)c2)c1,234.2133464
11
+ *CCCC(=O)NCc1ccc(CNC(=O)CCCOC(=O)c2ccc(C(=O)O*)cc2)cc1,53.55726117
12
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccc(C)cc4)cc3)cc2)cc1,5.81702489
13
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccc(F)cc4)cc3)cc2)cc1,-7.2122695
14
+ *CCCCCCCOc1ccc(C=CC=Cc2ccc(O*)c(C)c2)cc1C,41.89270134
15
+ *CCCCCCOC(=O)c1ccc(S(=O)(=O)CCCCCCS(=O)(=O)c2ccc(C(=O)O*)cc2)cc1,35.86441642
16
+ *CCCCSSC(=O)N(C)c1ccc(Cc2ccc(N(C)C(=O)SS*)cc2)cc1,20.68923822
17
+ *CCCCc1nc2cc(NC(=NO)C(=NO)Nc3ccc4nc(*)[nH]c4c3)ccc2[nH]1,170.1130329
18
+ *CCNC(=O)c1ccc(C(=O)NCCOC(=O)c2cccc(C(=O)O*)c2)cc1,121.124261
19
+ *CCNC(=S)C=Cc1ccc(Cc2ccc(C=CC(=S)N*)cc2)cc1,99.87529352
20
+ *Cc1cc(C=O)c(O)c(Cc2cc(C=O)c(O)c(CNC(=O)NC(=O)N*)c2)c1,190.6072154
21
+ *Cc1ccc(COC(=O)c2ccccc2C(=O)O*)cc1,95.33964688
22
+ *Cc1ccc(CSC(=O)Nc2ccc(Cc3ccc(NC(=O)S*)cc3)cc2)cc1,90.77725081
23
+ *Cc1ccc2nc(-c3cc(-c4nc5ccc(*)cc5c(=O)o4)cc(N4C(=O)c5c(Cl)c(Cl)c(Cl)c(Cl)c5C4=O)c3)oc(=O)c2c1,378.8956296
24
+ *N=C1C=CC(=NC2=CC(=O)C(*)=CC2=O)C=C1,212.9967182
25
+ *N=Cc1ccc(C=Nc2sc(*)c(C#N)c2C#N)cc1,204.4271705
26
+ *Nc1cc(NC(=O)c2cc(OCCN(C)c3ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc3)cc(C(*)=O)c2)cc(C(=O)OCCN(C)c2ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc2)c1,172.7344272
27
+ *Nc1ccc(NC(=O)c2cc(NC(=O)C(CCSC)N3C(=O)c4ccccc4C3=O)cc(C(*)=O)c2)cc1,230.3191562
28
+ *OC(C)CCC(C)OC(=O)Nc1ccc(Cc2ccc(NC(*)=O)cc2)cc1,63.5893379
29
+ *OS(=O)(=O)c1ccc(S(=O)(=O)c2ccc(S(=O)(=O)Oc3ccc(S(=O)(=O)c4ccc(*)cc4)cc3)cc2)cc1,194.2678014
30
+ *OS(=O)(=O)c1ccc(Sc2ccc(S(=O)(=O)Oc3ccc(C4(c5ccc(*)cc5)CCCCC4)cc3)cc2)cc1,157.8104834
31
+ *Oc1c(C(=O)Nc2ccc(NC(=O)c3ccc4ccccc4c3Oc3nc(*)nc(N4CCCCC4)n3)cc2)ccc2ccccc12,162.6671353
32
+ *Oc1c([2H])c([2H])c(C(c2c([2H])c([2H])c(OC(*)=O)c([2H])c2[2H])(C([2H])([2H])[2H])C([2H])([2H])[2H])c([2H])c1[2H],230.9913822
33
+ *Oc1cc(OC(=O)c2ccc(C=Nc3ccc(OCCCCCCOc4ccc(N=Cc5ccc(C(*)=O)cc5)cc4Cl)c(Cl)c3)cc2)c(Cl)cc1Cl,82.06188226
34
+ *Oc1ccc(C(=O)CNc2ccc(NCC(=O)c3ccc(*)cc3)cc2)cc1,158.6944649
35
+ *Oc1ccc(C(=O)OCCCCOC(=O)c2ccc(OC(=O)c3ccc(C(*)=O)cc3)cc2)cc1,68.79315579
36
+ *Oc1ccc(C(C)(CCCC)c2ccc(OC(*)=O)cc2)cc1,118.8361345
37
+ *Oc1ccc(CNC(=O)CCCCCCC(=O)NCc2ccc(OC3COC4C(*)COC34)cc2)cc1,21.58173134
38
+ *Oc1ccc2c(c1)nc1n2C(=O)c2ccc(Oc3ccc4c(c3)C(=O)n3c-4nc4ccc(*)cc43)cc2-1,291.281809
39
+ *Oc1ccc2cc(C(=O)Oc3ccccc3OC(=O)c3ccc4cc(Oc5nc(*)nc(N6CCN(C)CC6)n5)ccc4c3)ccc2c1,54.90728348
40
+ *Oc1cccc(OC(=O)c2ccc(C=Nc3ccc(OCCCCCCOc4ccc(N=Cc5ccc(C(*)=O)cc5)cc4)cc3)cc2)c1,83.83402436
41
+ *c1cc(CCCCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)F)c(-c2cc(CCCCCCCC)c(*)s2)s1,6.47608023
42
+ *c1cc(CCCCCC)c(C=Cc2sc(C=Cc3sc(-c4ccc(*)c5nsnc45)cc3CCCCCC)c(CCCCCC)c2CCCCCC)s1,17.9542177
43
+ *c1ccc(*)c2nccnc12,216.378732
44
+ *c1ccc(-c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,421.9822435
45
+ *c1ccc(C(=O)CNc2ccc(NCC(=O)c3ccc(S(*)(=O)=O)cc3)cc2)cc1,198.9026743
46
+ *c1ccc2c(c1)C(=O)N(c1c(F)c(F)c(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)c(F)c1F)C2=O,388.0137456
47
+ *c1cccc(N2C(=O)c3ccc(Oc4ccc5c(c4)C(C)(c4ccc(Oc6ccc7c(c6)C(=O)N(c6cccc(N8C(=O)c9ccc(Oc%10ccc(C%11(C)CC(C)(C)c%12cc(Oc%13ccc%14c(c%13)C(=O)N(*)C%14=O)ccc%12%11)cc%10)cc9C8=O)c6)C7=O)cc4)CC5(C)C)cc3C2=O)c1,231.956044
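dataset3.csv pairs each SMILES with a glass-transition temperature `Tg` (the file does not state units; the value range of roughly -7 to 422 is consistent with °C). Before merging a supplement file into the main training set, a quick RDKit parse catches malformed rows; a sketch assuming RDKit is installed (RDKit accepts the `*` dummy atoms used here):

```python
# Minimal sketch (assumption: rdkit and pandas installed, run from repo root).
import pandas as pd
from rdkit import Chem

df = pd.read_csv("simson_modeling/kaggle_comp/train_supplement/dataset3.csv")
df["mol"] = df["SMILES"].map(Chem.MolFromSmiles)  # None for unparsable SMILES
bad = df["mol"].isna().sum()
df = df.dropna(subset=["mol"]).drop(columns="mol")
print(f"dataset3: kept {len(df)} rows, dropped {bad} unparsable SMILES")
```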
simson_modeling/kaggle_comp/train_supplement/dataset4.csv ADDED
@@ -0,0 +1,863 @@
1
+ SMILES,FFV
2
+ *C(=O)NNC(=O)c1ccc([Si](c2ccccc2)(c2ccccc2)c2ccc(C(=O)NNC(=O)c3ccc(*)nc3)cc2)cc1,0.37272461
3
+ *C(=O)NNC(=O)c1ccc([Si](c2ccccc2)(c2ccccc2)c2ccc(C(=O)NNC(=O)c3cncc(*)c3)cc2)cc1,0.36547823
4
+ *C(=O)Nc1cc(NC(=O)c2ccc3[nH]c(-c4cc(-c5nc6cc(*)ccc6[nH]5)cc(N5C(=O)c6ccccc6C5=O)c4)nc3c2)cc(-c2nc3ccccc3[nH]2)c1,0.37637691
5
+ *C(=O)Nc1ccc(-c2cc(-c3ccccc3)cc(-c3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4cccc(P(=O)(c6ccccc6)c6cccc(N7C(=O)c8ccc(*)cc8C7=O)c6)c4)C5=O)cc3)n2)cc1,0.37693882
6
+ *C(=O)Nc1ccc(-c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(Oc5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc3)C4=O)cc2)cc1,0.35523486
7
+ *C(=O)Nc1ccc(-c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(S(=O)(=O)c5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc3)C4=O)cc2)cc1,0.35424303
8
+ *C(=O)Nc1ccc(-c2sc(-c3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(Oc6cccc7c(Oc8ccc(N9C(=O)c%10ccc(*)cc%10C9=O)cc8)cccc67)cc4)C5=O)cc3)c(-c3ccccc3)c2-c2ccccc2)cc1,0.38574846
9
+ *C(=O)Nc1ccc(C(c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3cccc5c(N6C(=O)c7ccc(*)cc7C6=O)cccc35)C4=O)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.36906594
10
+ *C(=O)Nc1ccc(C2(c3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(-c6ccc(N7C(=O)c8ccc(*)cc8C7=O)c(OC)c6)cc4OC)C5=O)cc3)c3ccccc3-c3ccccc32)cc1,0.37058306
11
+ *C(=O)Nc1ccc(C2(c3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(Oc6ccc(N7C(=O)c8ccc(*)cc8C7=O)cc6)cc4)C5=O)cc3)c3ccccc3-c3ccccc32)cc1,0.37211646
12
+ *C(=O)Nc1ccc(N=Nc2ccc([N+](=O)[O-])cc2)c(NC(=O)c2ccc3c(c2)C(=O)N(c2c(C)cc(Cc4cc(C)c(N5C(=O)c6ccc(*)cc6C5=O)c(C)c4)cc2C)C3=O)c1,0.37657696
13
+ *C(=O)Nc1ccc(Oc2ccc(-c3ccc(Oc4ccc(NC(=O)c5ccc6c(c5)C(=O)N(c5ccc(C)c(N7C(=O)c8ccc(*)cc8C7=O)c5)C6=O)cc4)cc3)cc2)cc1,0.36178667
14
+ *C(=O)Nc1ccc(Oc2ccc(-c3ccc(Oc4ccc(NC(=O)c5ccc6c(c5)C(=O)N(c5ccc(Oc7ccc(C8(c9ccc(Oc%10ccc(N%11C(=O)c%12ccc(*)cc%12C%11=O)cc%10)cc9)CC9CC8C8CCCC98)cc7)cc5)C6=O)cc4)cc3C)c(C)c2)cc1,0.37683611
15
+ *C(=O)Nc1ccc(Oc2ccc(-c3ccc(Oc4ccc(NC(=O)c5ccc6c(c5)C(=O)N(c5cccc(N7C(=O)c8ccc(*)cc8C7=O)c5)C6=O)cc4)cc3)cc2)cc1,0.35193775
16
+ *C(=O)Nc1ccc(Oc2ccc(C(C)(C)c3ccc(Oc4ccc(NC(=O)c5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc4)cc3)cc2)cc1,0.36224354
17
+ *C(=O)Nc1ccc(Oc2ccc(C(C)(C)c3ccc(Oc4ccc(NC(=O)c5ccc6c(c5)C(=O)N(c5ccc(Oc7cccc(N8C(=O)c9ccc(*)cc9C8=O)c7)cc5)C6=O)cc4)cc3)cc2)cc1,0.36079128
18
+ *C(=O)Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(-c6sc(-c7ccc(N8C(=O)c9ccc(*)cc9C8=O)cc7)c(-c7ccccc7)c6-c6ccccc6)cc4)C5=O)cc3)cc2)cc1,0.38060777
19
+ *C(=O)Nc1ccc(Oc2cccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(Oc5ccc(C6(c7ccc(Oc8ccc(N9C(=O)c%10ccc(*)cc%10C9=O)cc8)cc7)NC(=O)c7ccccc76)cc5)cc3)C4=O)c2)cc1,0.36268788
20
+ *C(=O)Nc1ccc(Oc2cccc(Oc3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4cccc(Oc6cccc(Oc7cccc(N8C(=O)c9ccc(*)cc9C8=O)c7)c6C#N)c4)C5=O)cc3)c2)cc1,0.35558372
21
+ *C(=O)Nc1ccc(S(=O)(=O)c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(-c5sc(-c6ccc(N7C(=O)c8ccc(*)cc8C7=O)cc6)c(-c6ccccc6)c5-c5ccccc5)cc3)C4=O)cc2)cc1,0.39133819
22
+ *C(=O)Nc1ccc(S(=O)(=O)c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(NC(=O)Nc5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc3)C4=O)cc2)cc1,0.34470216
23
+ *C(=O)Nc1ccc(S(=O)(=O)c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3cccc(N5C(=O)c6ccc(*)cc6C5=O)c3)C4=O)cc2)cc1,0.35285588
24
+ *C(=O)Nc1ccc(Sc2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3ccc(Oc5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc3)C4=O)cc2)cc1,0.35778072
25
+ *C(=O)Nc1cccc(C=CC(=O)C=Cc2cccc(NC(=O)c3ccc(N4C(=O)c5ccc(*)cc5C4=O)cc3)c2)c1,0.34241742
26
+ *C(=O)Nc1cccc(NC(=O)c2ccc3[nH]c(-c4cccc(-c5nc6cc(*)ccc6[nH]5)c4)nc3c2)n1,0.36871452
27
+ *C(=O)Nc1cccc(Oc2ccc(C(=O)c3ccc(Oc4cccc(NC(=O)c5ccc(*)nc5)c4)cc3)cc2)c1,0.3539953
28
+ *C(=O)Nc1cccc(Oc2cccc(Oc3cccc(Oc4cccc(Oc5cccc(NC(=O)c6ccc7c(c6)C(=O)N(c6ccc(Oc8cccc(Oc9ccc(N%10C(=O)c%11ccc(*)cc%11C%10=O)cc9)c8C#N)cc6)C7=O)c5)c4C#N)c3)c2C#N)c1,0.3612591
29
+ *C(=O)Oc1ccc(C(C)(C)c2ccc(OC(=O)c3ccc4c(c3)C(=O)N(c3ccc(Oc5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cc3)C4=O)cc2)cc1,0.36144044
30
+ *C(=O)Oc1ccc(OC(=O)c2ccc3c(c2)C(=O)N(c2ccc(Cc4ccc(N5C(=O)c6ccc(*)cc6C5=O)cc4)cc2)C3=O)c(C(C)(C)C)c1,0.36805651
31
+ *C(=O)Oc1ccc([Si](c2ccccc2)(c2ccccc2)c2ccc(OC(=O)c3ccc(*)s3)cc2)cc1,0.38403914
32
+ *C(=O)c1cc(C(=O)c2ccc3c(c2)C(=O)N(c2ccc(Oc4ccc(N5C(=O)c6ccc(*)cc6C5=O)cc4)cc2)C3=O)cc(C(C)(C)C)c1,0.38251574
33
+ *C(=O)c1cc2c(cc1Cl)C(=O)N(c1cc(Cl)c(N3C(=O)c4cc(*)c(Cl)cc4C3=O)cc1Cl)C2=O,0.40503796
34
+ *C(=O)c1ccc(-c2ccc(C(C)(C)c3ccc(-c4ccc(C(=O)c5ccc6c(c5)C(=O)N(c5ccc(Cc7ccc(N8C(=O)c9ccc(*)cc9C8=O)cc7)cc5)C6=O)cc4)cc3)cc2)cc1,0.38251752
35
+ *C(=O)c1ccc(C(=O)N2CC(C)N(*)CC2C)cc1,0.36842025
36
+ *C(=O)c1ccc(Oc2ccc(Oc3ccc(C(=O)c4ccc5c(c4)C(=O)N(c4ccc(N6C(=O)c7ccc(*)cc7C6=O)cc4)C5=O)cc3)cc2)cc1,0.36785413
37
+ *C(=O)c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(C(=O)c5ccc6c(c5)C(=O)N(c5ccc(Cc7ccc(N8C(=O)c9ccc(*)cc9C8=O)cc7)cc5)C6=O)cc4)cc3)cc2)cc1,0.37320978
38
+ *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(N3C(=O)c4ccc(*)cc4C3=O)cc1)C2=O,0.37740339
39
+ *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(NC(=O)c3cccc(C(=O)Nc4ccc(N5C(=O)c6ccc(*)cc6C5=O)cc4)c3)cc1)C2=O,0.35291653
40
+ *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(C(=O)c4ccc(C(=O)c5ccc(Oc6ccc(N7C(=O)c8ccc(*)cc8C7=O)cc6)cc5)cc4)cc3)cc1)C2=O,0.37475153
41
+ *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3cccc4c(Oc5ccc(N6C(=O)c7ccc(*)cc7C6=O)cc5)cccc34)cc1)C2=O,0.3770815
42
+ *C(=O)c1ccc2c(c1)C(=O)N(c1cccc(NC(=O)c3cccc(C(=O)Nc4cccc(N5C(=O)c6ccc(*)cc6C5=O)c4)c3)c1)C2=O,0.3482104
43
+ *C(=O)c1cccc(C(=O)N2CCN(*)CC2)c1,0.34759038
44
+ *C(=O)c1cccc(C(=O)c2ccc(C=C3CCC(=Cc4ccc(*)cc4)C3=O)cc2)c1,0.37124566
45
+ *C(=O)c1cccc(C(=O)c2ccc3c(c2)C(=O)N(c2ccc(Oc4ccc(N5C(=O)c6ccc(*)cc6C5=O)cc4)cc2)C3=O)c1,0.36604695
46
+ *C1C(=O)N(C2CCCCC2)C(=O)C1*,0.39675252
47
+ *C1C(=O)N(CCCOc2ccc(-c3ccc(C#N)cc3)cc2)C(=O)C1*,0.34332558
48
+ *C1C(=O)N(c2cc(Br)c(O[Si](c3ccccc3)(c3ccccc3)c3ccccc3)c(Br)c2)C(=O)C1*,0.40469568
49
+ *C1C(=O)N(c2ccc(O[Si](c3ccccc3)(c3ccccc3)c3ccccc3)cc2)C(=O)C1*,0.40006119
50
+ *C1C=C(CCCCCC)C(*)S1,0.42189435
51
+ *C=CC1CC(*)C(C(=O)OCCCCCCCCOc2ccc(C=Cc3ccc([N+](=O)[O-])cc3)cc2)C1,0.36024408
52
+ *C=Cc1cc(OCCc2ccccc2)c(*)cc1OC,0.36576533
53
+ *C=Cc1ccc(C=Cc2ccc3c(c2)C(CCCCCCOc2ccc4ccc(=O)oc4c2)(CCCCCCOc2ccc4ccc(=O)oc4c2)c2cc(*)ccc2-3)cc1,0.35196182
54
+ *C=Cc1ccc(C=Cc2ccc3c(c2)Sc2cc(*)ccc2N3c2ccc(OCCCCCCCCCCCC)cc2)s1,0.39219852
55
+ *C=Cc1sc(-c2ccc(-c3sc(C=CC4=CC(=C(C#N)C#N)C=C(*)O4)c(CCCCCC)c3CCCCCC)s2)c(CCCCCC)c1CCCCCC,0.41455567
56
+ *CC#CC#CCOc1ccc(C(=O)OCCN(CCOC(=O)c2ccc(O*)cc2)c2ccc(N=Nc3ccc(C#N)cc3)cc2)cc1,0.36879149
57
+ *CC#CC#CCOc1ccc(C(=O)OCCN(CCOC(=O)c2ccc(O*)cc2)c2ccc(N=Nc3ccc([N+](=O)[O-])cc3)cc2)cc1,0.35856006
58
+ *CC#CC#CCOc1cccc(C(=O)OCCN(CCOC(=O)c2cccc(O*)c2)c2ccc(N=Nc3ccc(C#N)cc3)cc2)c1,0.36722522
59
+ *CC(*)(C)C(=O)OC(COc1cccc2ccccc12)COc1cccc2ccccc12,0.35180615
60
+ *CC(*)(C)C(=O)OC1CCCCCCCCC1,0.40440294
61
+ *CC(*)(C)C(=O)OCC1CO1,0.33062914
62
+ *CC(*)(C)C(=O)OCC1OC(n2ccc(=O)[nH]c2=O)C(O)C1O,0.28114011
63
+ *CC(*)(C)C(=O)OCCCCCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3C)cc2)cc1,0.36171157
64
+ *CC(*)(C)C(=O)OCCCCCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3CCCC)cc2)cc1,0.36405968
65
+ *CC(*)(C)C(=O)OCCCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3C)cc2)cc1,0.35752069
66
+ *CC(*)(C)C(=O)OCCCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3CCCC)cc2)cc1,0.36425846
67
+ *CC(*)(C)C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3C)cc2)cc1,0.35645942
68
+ *CC(*)(C)C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc(C(=O)C=Cc3c(C)c4ccccc4n3CCCC)cc2)cc1,0.36019365
69
+ *CC(*)(C)C(=O)OCCC[Si]12O[Si]3(CC(C)C)O[Si]4(CC(C)C)O[Si](CC(C)C)(O1)O[Si]1(CC(C)C)O[Si](CC(C)C)(O2)O[Si](CC(C)C)(O3)O[Si](CC(C)C)(O4)O1,0.40617403
70
+ *CC(*)(CC(=O)OC1CCCCCCC1)C(=O)OC1CCCCCCC1,0.36818557
71
+ *CC(*)(CC(=O)OCCC1CCCCC1)C(=O)OCCC1CCCCC1,0.3743525
72
+ *CC(*)(F)C(=O)OCC(F)(F)C(F)(F)C(F)(F)F,0.32178633
73
+ *CC(*)C(=O)N1CCCCC1,0.36828508
74
+ *CC(*)C(=O)Nc1ccc2c(c1)C(=O)c1ccccc1C2=O,0.32709343
75
+ *CC(*)C(=O)OCC1(CC)COC(c2ccccc2)OC1,0.34648032
76
+ *CC(*)C(=O)OCCOc1ccc(C(=O)Oc2ccc(OC(=O)c3ccc(OCCCC)cc3)cc2)cc1,0.3436788
77
+ *CC(*)C(=O)OCCOc1ccc(C(=O)Oc2ccc(OC(=O)c3ccc(OCCCCC)cc3)cc2)cc1,0.34788271
78
+ *CC(*)C(=O)OCCOc1ccc(C(C)(C)c2ccc(OCCO)cc2)cc1,0.33916196
79
+ *CC(*)C(=O)Oc1ccc(C(=O)OCc2ccccc2)cc1,0.33941879
80
+ *CC(*)C(=O)Oc1ccc(C(=O)Oc2ccc(OC(=O)c3ccc(OCCCCC)cc3)cc2)cc1,0.34978241
81
+ *CC(*)N1CCCCCC1=O,0.36209846
82
+ *CC(*)c1ccc(C(=O)CCN2CCCCC2)cc1,0.37311156
83
+ *CC(*)c1ccc(C(=O)N(CC)CC)cc1,0.36711341
84
+ *CC(*)c1ccc(C(=O)N2CCOCC2)cc1,0.35986197
85
+ *CC(*)c1ccc(COCCOCCCC)cc1,0.37958253
86
+ *CC(*)c1ccc(COc2ccc(-c3ccc(-c4ccc(C)s4)c4nsnc34)cc2)cc1,0.36773
87
+ *CC(*)n1cncn1,0.33624313
88
+ *CC(=O)Nc1ccc(Oc2cccc(Oc3ccc(NC(=O)CN4C(=O)c5ccc(C(c6ccc7c(c6)C(=O)N(*)C7=O)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)c2)cc1,0.34064566
89
+ *CC(=O)OC(=O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.33458907
90
+ *CC(C)(C)C1C(=O)N(C2CCCCC2)C(=O)C1*,0.36558232
91
+ *CC(C)(C)COC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.3487602
92
+ *CC(C)(C)CS(=O)(=O)CC(C)(C)COC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.35233663
93
+ *CC(C)N1C(=O)C2C3C=CC(C4C(=O)N(*)C(=O)C34)C2C1=O,0.3634472
94
+ *CC(CCCCCCCCCCCCCCCC)C(CCCCCCCCCCCCCCCC)COC(=O)c1ccc(C(=O)O*)cc1,0.39720442
95
+ *CC(CO)(CCl)COc1ccc(C(c2ccc(O*)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.35329338
96
+ *CC(COc1c(Cl)cc(C(C)(C)c2cc(Cl)c(O*)c(Cl)c2)cc1Cl)OC(C)=O,0.36697598
97
+ *CC(COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1)OC(=O)C=Cc1ccccc1,0.35543793
98
+ *CC(COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1)OC(=O)CC,0.35519812
99
+ *CC(COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1)OC(=O)CCl,0.35599023
100
+ *CC(COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1)OC(C)=O,0.35142781
101
+ *CC(F)(F)C1(F)CC(C(O)(C(F)(F)F)C(F)(F)F)CC1*,0.32084568
102
+ *CC(O)CN(C)S(=O)(=O)c1cccc(S(=O)(=O)N(C)CC(O)COc2ccc(C(C)(C)c3ccc(O*)cc3)cc2)c1,0.34239354
103
+ *CC(O)CN(C)S(=O)(=O)c1cccc(S(=O)(=O)N(C)CC(O)COc2ccc(O*)cc2)c1,0.33491941
104
+ *CC(O)CN(CC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1)c1ccc(N=Nc2ccc([N+](=O)[O-])cc2)cc1,0.35223599
105
+ *CC(O)CN(CCO)CC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.33466845
106
+ *CC(O)COC(=O)/C=C\C(=O)Oc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.33533983
107
+ *CC(O)COC(=O)CCCCC(=O)OCC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.33384251
108
+ *CC(O)COC(=O)CCCCCCC(=O)OCC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.34100579
109
+ *CC(O)COC(=O)CCCCCCCCC(=O)OCC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.3435317
110
+ *CC(O)COC(=O)CCCCCCCCCCC(=O)OCC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.34941747
111
+ *CC(O)COc1c(C)cc(C(C)(C)c2cc(C)c(O*)c(C)c2)cc1C,0.37509406
112
+ *CC(O)COc1c(Cl)cc(C(C)(C)c2cc(Cl)c(O*)c(Cl)c2)cc1Cl,0.36746955
113
+ *CC(O)COc1ccc(C(C)(C)c2ccc(O*)c(Cl)c2)cc1Cl,0.36275869
114
+ *CC(O)COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.35095055
115
+ *CC(O)COc1ccc(C(C)(CC)c2ccc(O*)cc2)cc1,0.35394579
116
+ *CC(O)COc1ccc(Cc2ccc(O*)cc2)cc1,0.34455061
117
+ *CC(O)COc1ccc(S(=O)(=O)c2ccc(OCC(O)COc3c(C)cc(S(=O)(=O)c4cc(C)c(O*)c(C)c4)cc3C)cc2)cc1,0.35617654
118
+ *CC(O)COc1ccc(S(=O)(=O)c2ccc(OCC(O)COc3ccc(S(=O)(=O)c4ccc(O*)c(C)c4)cc3C)cc2)cc1,0.34891923
119
+ *CC(OC(=O)Oc1ccc(C(=O)OC)cc1)C(COC(=O)O*)OC(=O)Oc1ccc(C(=O)OC)cc1,0.3223452
120
+ *CC(c1ccccn1)C(c1ccccc1)C(*)c1ccccn1,0.39661394
121
+ *CC1(C)CC(N2C(=O)c3ccc(-c4ccc5c(c4)C(=O)N(*)C5=O)cc3C2=O)CC(C)(C)C1,0.37465828
122
+ *CC1CC(CNC(=O)c2cccc(C(=O)N*)c2)CC(C(C)(C)C)C1,0.3514179
123
+ *CC1CCC(CNC(=O)c2cc(C(=O)N*)cc(C(C)(C)C)c2)CC1,0.3576586
124
+ *CC1CCC(CNC(=O)c2cccc(C(=O)N*)c2)CC1,0.34399294
125
+ *CC1CCC(COC(=O)C2CCC(C(=O)O*)CC2)CC1,0.35005916
126
+ *CC1CCC(COC(C)OC(=O)c2ccc(C(=O)OC(C)O*)c3ccccc23)CC1,0.35502936
127
+ *CC1CCCC(CNC(=O)c2cc(C(=O)N*)cc(C(C)(C)C)c2)C1,0.35610431
128
+ *CC1CCCC(CNC(=O)c2cccc(C(=O)N*)c2)C1,0.33940105
129
+ *CC=CCOC(=O)C(Cc1ccccc1)NC(=O)C=CC(=O)NC(Cc1ccccc1)C(=O)O*,0.33875482
130
+ *CCC1CC2C(CC(*)C2OC(=O)CCCCCCCCCCOc2ccc(-c3ccc(C#N)cc3)cc2)C1OC(=O)CCCCCCCCCCOc1ccc(-c2ccc(C#N)cc2)cc1,0.36522309
131
+ *CCCC(C)CN1C(=O)c2ccc(C(=O)Oc3ccc4cc(OC(=O)c5ccc6c(c5)C(=O)N(*)C6=O)ccc4c3)cc2C1=O,0.34177327
132
+ *CCCC1(CCCNC(=O)CCC2(CCC(=O)N*)c3ccccc3-c3ccccc32)c2ccccc2-c2ccccc21,0.35305757
133
+ *CCCCCC(=O)NNC(=O)c1ccc(C(=O)NNC(=O)CCCCCOc2ccc(O*)c(C)c2)cc1,0.34281562
134
+ *CCCCCCCCCCC(=O)Nc1ccc(Cc2ccc(NC(=O)CCCCCCCCCCN3C(=O)c4ccc(C(=O)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.35192407
135
+ *CCCCCCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.37544747
136
+ *CCCCCCCCCCCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.36655412
137
+ *CCCCCCCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.35988995
138
+ *CCCCCCCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.3550053
139
+ *CCCCCCCCCCCNC(=O)CCCCCCCCC(=O)N*,0.37104926
140
+ *CCCCCCCCCCN1C(=O)C2C3C=CC(C4C(=O)N(*)C(=O)C34)C2C1=O,0.34804179
141
+ *CCCCCCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.34969426
142
+ *CCCCCCCCCCNC(=O)c1ccc(Cc2ccc(C(=O)N*)cc2)cc1,0.35614349
143
+ *CCCCCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.35710365
144
+ *CCCCCCCCCCOc1c(OC)cc(C=C2CCCC(=Cc3cc(OC)c(O*)c(OC)c3)C2=O)cc1OC,0.3627628
145
+ *CCCCCCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)c(OC)c3)C2=O)cc1OC,0.36585975
146
+ *CCCCCCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)cc3)C2=O)cc1,0.37076764
147
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccc(Cl)cc4)cc3)cc2)cc1,0.37552981
148
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccc(OC)cc4)cc3)cc2)cc1,0.374166
149
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccc([N+](=O)[O-])cc4)cc3)cc2)cc1,0.36876796
150
+ *CCCCCCCCCCOc1ccc(C=Cc2ccc(OCCCCCCCCCCOP(=O)(O*)OCCCCCCCCCCOc3ccc(N=Nc4ccccc4)cc3)cc2)cc1,0.37909933
151
+ *CCCCCCCCCCc1ccc(-c2c(-c3ccccc3)cc(-c3cccc(-c4cc(-c5ccccc5)c(-c5ccc(*)cc5)c(-c5ccccc5)c4-c4ccccc4)c3)c(-c3ccccc3)c2-c2ccccc2)cc1,0.39826604
152
+ *CCCCCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.34911033
153
+ *CCCCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.35367211
154
+ *CCCCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.34901426
155
+ *CCCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.35257805
156
+ *CCCCCCCCOC(=O)Nc1ccc(NC(=O)OCCCCCCCCOc2ccc(-c3ccc(O*)cc3)cc2)c(C)c1,0.3579984
157
+ *CCCCCCCCOc1c(OC)cc(C=C2CCCC(=Cc3cc(OC)c(O*)c(OC)c3)C2=O)cc1OC,0.35816729
158
+ *CCCCCCCCOc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.36752126
159
+ *CCCCCCCCOc1ccc(C(c2ccc(O*)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.36265916
160
+ *CCCCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)c(OC)c3)C2=O)cc1OC,0.36354284
161
+ *CCCCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)cc3)C2=O)cc1,0.37127154
162
+ *CCCCCCCCc1nc2cc(NC(=O)CCCCC(=O)Nc3ccc4oc(*)nc4c3)ccc2o1,0.35552869
163
+ *CCCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.3473574
164
+ *CCCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.34774213
165
+ *CCCCCCN1C(=O)C(=O)N(c2ccc(C)c(N3C(=O)C(=O)N(*)C3=O)c2)C1=O,0.33065809
166
+ *CCCCCCN1C(=O)C(=O)N(c2ccc(Oc3ccc(N4C(=O)C(=O)N(*)C4=O)cc3)cc2)C1=O,0.33381102
167
+ *CCCCCCN1C(=O)c2ccc(Oc3ccc4c(c3)C(=O)N(*)C4=O)cc2C1=O,0.34767207
168
+ *CCCCCCNC(=O)c1cc(NC(=O)c2ccc(NC(=O)C(CC(C)C)N3C(=O)c4ccccc4C3=O)cc2)cc(C(=O)N*)c1,0.33657316
169
+ *CCCCCCNC(=O)c1ccc(C(C)(CC)c2ccc(C(=O)N*)cc2)cc1,0.35362781
170
+ *CCCCCCOC(=O)C(CCCCCOc1cc(-c2ccccn2)nc(-c2ccccn2)c1)C(=O)OCCCCCCOc1ccc(-c2ccc(O*)cc2)cc1,0.36437086
171
+ *CCCCCCOC(=O)C(Cc1ccccc1)NC(=O)C=CC(=O)NC(Cc1ccccc1)C(=O)O*,0.34396826
172
+ *CCCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.34648888
173
+ *CCCCCCOc1c(OC)cc(C=C2CCCC(=Cc3cc(OC)c(O*)c(OC)c3)C2=O)cc1OC,0.35644797
174
+ *CCCCCCOc1ccc(C(=O)N(C(=O)c2ccc(O*)cc2)c2ccc(Oc3ccccc3)cc2)cc1,0.36783158
175
+ *CCCCCCOc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.36632136
176
+ *CCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)c(OC)c3)C2=O)cc1OC,0.35895367
177
+ *CCCCCCOc1ccc(C=C2CCCC(=Cc3ccc(O*)cc3)C2=O)cc1,0.36783098
178
+ *CCCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.34273288
179
+ *CCCCN1C(=O)C2C3C=CC(C4C(=O)N(*)C(=O)C34)C2C1=O,0.34543609
180
+ *CCCCNC(=O)c1ccc(C(C)(CC)c2ccc(C(=O)N*)cc2)cc1,0.34798992
181
+ *CCCCOC(=O)C(Cc1ccccc1)NC(=O)C=CC(=O)NC(Cc1ccccc1)C(=O)O*,0.33930122
182
+ *CCCCOC(=O)CCCCCNC(=O)CCCCC(=O)NCCCCCC(=O)O*,0.34021952
183
+ *CCCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.34189824
184
+ *CCCCOC(=O)c1ccc(-c2ccc(C(=O)O*)cc2)cc1,0.33930265
185
+ *CCCCOC(=O)c1ccc(C(C)(C)c2ccc(C(=O)O*)cc2)cc1,0.35363706
186
+ *CCCCOCCCCOCCCCOC(=O)c1ccc(N=Cc2cc(OCCCCCC)c(C=Cc3cc(OCCCCCC)c(C=Cc4cc(OCCCCCC)c(C=Nc5ccc(C(=O)O*)cc5)cc4OCCCCCC)cc3OCCCCCC)cc2OCCCCCC)cc1,0.38309201
187
+ *CCCN1C(=O)C2C3C=CC(C4C(=O)N(*)C(=O)C34)C2C1=O,0.34806971
188
+ *CCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.33832602
189
+ *CCCOCCCCOCCCN1C(=O)c2ccc(C(=O)Oc3ccc(OC(=O)c4ccc5c(c4)C(=O)N(*)C5=O)cc3)cc2C1=O,0.33583724
190
+ *CCCOCCCCOCCCN1C(=O)c2ccc(C(=O)Oc3ccc4cc(OC(=O)c5ccc6c(c5)C(=O)N(*)C6=O)ccc4c3)cc2C1=O,0.33819968
191
+ *CCN(CCOC(=O)NCC1(C)CC(NC(=O)O*)CC(C)(C)C1)c1ccc(N=Nc2ccc(C#N)cc2)cc1,0.36385839
192
+ *CCN(CCOC(=O)NCC1(C)CC(NC(=O)O*)CC(C)(C)C1)c1ccc(N=Nc2ccc([N+](=O)[O-])cc2)cc1,0.34598899
193
+ *CCN(CCOC(=O)Nc1ccc(-c2ccc(NC(=O)O*)c(C)c2)cc1C)c1ccc(N=Nc2ccc([N+](=O)[O-])cc2)cc1,0.34910461
194
+ *CCN(CCOC(=O)Nc1ccc(-c2ccc(NC(=O)O*)c(OC)c2)cc1OC)c1ccc(N=Nc2ccc([N+](=O)[O-])cc2)cc1,0.34489269
195
+ *CCN(CCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1)c1ccccc1,0.34479226
196
+ *CCN(CCOC(=O)OCc1ccc(COC(=O)O*)cc1)c1ccc(N=Nc2ccc([N+](=O)[O-])cc2)cc1,0.34805417
197
+ *CCN(CCOC(=O)c1cc(OCCN(C)c2ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc(C(=O)O*)c1)c1ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc1,0.34325537
198
+ *CCN(CCOC(=O)c1cc(OCCN(C)c2ccc(S(=O)(=O)CCCCCCCC)cc2)cc(C(=O)O*)c1)c1ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc1,0.35536738
199
+ *CCN(CCOC(=O)c1cc(OCc2c(F)c(F)c(OC)c(F)c2F)cc(C(=O)O*)c1)c1ccc(C=Cc2ccc(C=CC3=C(C#N)C(=C(C#N)C#N)OC3(c3ccccc3)C(F)(F)F)s2)cc1,0.36032332
200
+ *CCNC(=O)Nc1ccc(Cc2ccc(NC(=O)NCCOCCO*)cc2)cc1,0.33154374
201
+ *CCNC(=O)c1ccc([Si](C)(C)c2ccc(C(=O)NCCN(*)c3ccc(/C=C/c4ccc([N+](=O)[O-])cc4)cc3)cc2)cc1,0.35386775
202
+ *CCNC(=O)c1ccc([Si](CCCC)(CCCC)c2ccc(C(=O)NCCN(*)c3ccc(/C=C/c4ccc([N+](=O)[O-])cc4)cc3)cc2)cc1,0.35921769
203
+ *CCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.33628572
204
+ *CCOC(=O)c1ccc(C(C)(CC)c2ccc(C(=O)O*)cc2)cc1,0.35349899
205
+ *CCOCCOCCN1C(=O)c2ccc(C(=O)Oc3ccc(OC(=O)c4ccc5c(c4)C(=O)N(*)C5=O)cc3)cc2C1=O,0.33052641
206
+ *CCOCCOCCN1C(=O)c2ccc(C(=O)Oc3ccc4cc(OC(=O)c5ccc6c(c5)C(=O)N(*)C6=O)ccc4c3)cc2C1=O,0.3345641
207
+ *CCOCCOCCOCCOC(=O)Nc1ccc(Cc2ccc(NC(=O)O*)cc2)cc1,0.33889511
208
+ *CCOCCOCCOCCOCCOCCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.34429063
209
+ *CCOCCOCCOCCOCCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.34659235
210
+ *CCOCCOCCOCCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.3483589
211
+ *CCOCCOCCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.34154325
212
+ *CCOCCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.34638057
213
+ *CCOc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(O*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.35204064
214
+ *CN1C(=O)c2ccc(C(c3ccc4c(c3)C(=O)N(Cc3nnc(-c5ccc(-c6nnc(*)o6)cc5)o3)C4=O)(C(F)(F)F)C(F)(F)F)cc2C1=O,0.37085226
215
+ *CNC(=O)OCc1cocc1COC(=O)NCc1ccc(C(C)(C)c2ccc(*)o2)o1,0.33476958
216
+ *COc1ccc(C(C)(C)c2ccc(O*)cc2)cc1,0.35264631
217
+ *COc1ccc(C(c2ccc(O*)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.34402643
218
+ *Cc1cc2cc(C(=O)Nc3ccc(-c4ccc(NC(=O)c5cc6cc(*)c(OC(=O)COc7ccc8ccc(=O)oc8c7)cc6oc5=O)cc4)cc3)c(=O)oc2cc1OC(=O)COc1ccc2ccc(=O)oc2c1,0.32178966
219
+ *Cc1ccc(COC(=O)c2cccc(C(=O)O*)c2)cc1,0.34174571
220
+ *Cc1ccc(COP(=O)(N=Nc2ccc(-c3ccc(N=NP(=O)(O*)OC)cc3)cc2)OC)cc1,0.35769989
221
+ *Cc1ccc(C[n+]2ccc(-c3cc[n+](*)cc3)cc2)cc1,0.36770634
222
+ *Cc1ccc2nc(-c3cc(-c4nc5ccc(*)cc5c(=O)o4)cc(N4C(=O)c5ccccc5C4=O)c3)oc(=O)c2c1,0.37326662
223
+ *Cc1cccc(C[n+]2ccc(-c3cc[n+](*)cc3)cc2)c1,0.36232627
224
+ *Cc1ccccc1C[n+]1ccc(-c2cc[n+](*)cc2)cc1,0.35795758
225
+ *N/C(C=C)=C/C=C(\C)C1(c2ccc(N*)cc2)CCCCC1,0.35375342
226
+ *N/N=N\c1ccc(N*)c2c1C(=O)c1c(N)ccc(N)c1C2=O,0.34005083
227
+ *N=P(*)(Oc1ccc2ccccc2c1)Oc1ccc2ccccc2c1,0.3648512
228
+ *NC(=C)/C=C\C(=C/C)C1(c2ccc(N*)cc2)CCCCC1,0.3442031
229
+ *NC/C=C(/c1cccc(-c2ccccc2)c1)c1ccccc1-c1ccccc1-c1ccc(C2(c3ccccc3)c3ccccc3-c3ccccc32)cc1N*,0.37006353
230
+ *NC1=C(N)N[C@H](N*)NC1=O,0.28322193
231
+ *NC1=C(c2c(N*)ccc3ccccc23)c2ccccc2CC1,0.35718606
232
+ *NC1=C(c2ccccc2)[C@@](O)(N*)[C@H](N)C=C1,0.32796087
233
+ *NC1=CC=C(c2ccc(N*)cc2)C(C(C)(C)C)(C(C)(C)C)C1,0.34704926
234
+ *NC1=NC(=S)N=C(N)C1N*,0.30874827
235
+ *NCC1(C)CC(N*)CC(C)(C)C1,0.34576246
236
+ *NCC1CCC(CN*)CC1,0.33576539
237
+ *NCCCc1ccc2ccc3ccc(N*)cc3c2c1,0.33634808
238
+ *NCCc1ccc2ccc3ccc(N*)cc3c2c1,0.33458647
239
+ *NC[C@@H]1CCC[C@@H](CN*)C1,0.32836534
240
+ *NNC(=O)C=CC(=O)Nc1cccc(C=C2CCCC(=Cc3cccc(NC(=O)C=CC(*)=O)c3)C2=O)c1,0.3376016
241
+ *NNC(=O)c1cc(NC(=O)c2ccc3c(c2)C(=O)N(c2ccc(C)cn2)C3=O)cc(C(=O)NNC(=O)c2ccc(C(*)=O)cc2)c1,0.33692653
242
+ *NNC(=O)c1cc(NC(=O)c2ccc3c(c2)C(=O)N(c2ncccc2C)C3=O)cc(C(=O)NNC(=O)c2ccc(C(*)=O)cc2)c1,0.33234263
243
+ *NNC(=O)c1ccccc1C(=O)Nc1cccc(C=C2CCCC(=Cc3cccc(NC(=O)c4ccccc4C(*)=O)c3)C2=O)c1,0.35281266
244
+ *N[C@H]1C(=O)NC(=O)[C@@](N*)(n2c(C)nc3c(N)c(N)cc(N)c3c2=O)[C@H]1N,0.35197979
245
+ *Nc1c(C)cc(C(c2cc(C)c(N*)c(C)c2)C(c2ccc(N)c(C(C)(C)C)c2)c2ccc(N)c(C(C)(C)C)c2)cc1C,0.39156416
246
+ *Nc1c(C)cc(Cc2cc(C)c(N*)c(CC)c2)cc1CC,0.36832
247
+ *Nc1c(CC)cc(Cc2cc(CC)c(N*)c(CC)c2)cc1CC,0.37359767
248
+ *Nc1c(CC)cc(Cc2cc(CC)c(N*)c(CC)c2Cl)c(Cl)c1CC,0.37886544
249
+ *Nc1c(N)c2c(c(N*)c1NC)C(=O)c1cccc(N)c1C2=O,0.34496829
250
+ *Nc1c(N*)c(-c2ccccc2-c2ccccc2)c(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1,0.37334739
251
+ *Nc1cc(C(c2ccc(C)c(N*)c2)(C(F)(F)F)C(F)(F)F)ccc1C,0.34739951
252
+ *Nc1cc(C(c2ccc(O)c(N*)c2)(C(F)(F)F)C(F)(F)F)ccc1O,0.32241826
253
+ *Nc1cc(C)c(-c2c(C)cc(N*)cc2-c2ccccc2)c(-c2ccccc2)c1,0.36600387
254
+ *Nc1cc(C)c(C2(c3c(C)cc(N*)c4ccccc34)c3ccccc3-c3ccccc32)c2ccccc12,0.38470754
255
+ *Nc1cc(C)c(Cc2ccc(CCCCCCCCCCCCCc3ccc(Cc4c(C)cc(N*)cc4C)cc3)cc2)c(C)c1,0.35997455
256
+ *Nc1cc(C)c(Cc2ccc(CCCCCCCCCCCCc3ccc(Cc4c(C)cc(N*)cc4C)cc3)cc2)c(C)c1,0.35798452
257
+ *Nc1cc(C)c(Cc2ccc(CCCCCCCCCc3ccc(Cc4c(C)cc(N*)cc4C)cc3)cc2)c(C)c1,0.36147124
258
+ *Nc1cc(C)c(Cc2ccc(CCCCCCCCc3ccc(Cc4c(C)cc(N*)cc4C)cc3)cc2)c(C)c1,0.35897643
259
+ *Nc1cc(C)c(N*)c2c1C(=O)c1c(N)c(C)cc(N)c1C2=O,0.34660788
260
+ *Nc1cc(C2CCCCC2)c2ccc3c(N*)c(-c4ccc(C5(c6ccccc6)c6ccccc6-c6ccccc65)cc4)c(C4CCCCC4)c4ccc1c2c34,0.39403941
261
+ *Nc1cc(N)c(N)c(C)c1C(=O)Oc1cc(N)c(C(=O)Oc2cc(N)c(N)c(C)c2N)c(N)c1N*,0.32137021
262
+ *Nc1cc(N*)c(Cc2cc(C)c(N)cc2N)cc1C,0.35659354
263
+ *Nc1cc(N*)c2c(ccc3ccccc32)c1,0.33406138
264
+ *Nc1cc(N*)c2ccc3ccccc3c2c1,0.33145622
265
+ *Nc1cc(NC(=O)Nc2ccc(Cc3ccc(NC(*)=O)cc3)cc2)cc(C(=O)Nc2ccc3c(c2)C(=O)c2ccccc2C3=O)c1,0.33705899
266
+ *Nc1cc(NC(=O)Nc2ccc(NC(*)=O)cc2)cc(C(=O)Nc2cccc3c2C(=O)c2ccccc2C3=O)c1,0.32684433
267
+ *Nc1cc(NC(=O)c2cc(NC(=O)C(C(C)CC)N3C(=O)c4ccccc4C3=O)cc(C(*)=O)c2)ccc1C,0.34847575
268
+ *Nc1cc(NC(=O)c2cc(OCCN(C)c3ccc(C#N)cc3)cc(C(*)=O)c2)cc(C(=O)OCCN(C)c2ccc(C#N)cc2)c1,0.35581869
269
+ *Nc1cc(NC(=O)c2cc(OCCN(C)c3ccc(C#N)cc3)cc(C(*)=O)c2)cc(C(=O)OCCN(C)c2ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc2)c1,0.34405633
270
+ *Nc1cc(NC(=O)c2cc(OCCN(C)c3ccc(S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)cc3)cc(C(*)=O)c2)cc(C(=O)OCCN(C)c2ccc(C#N)cc2)c1,0.3482804
271
+ *Nc1cc(NC(=O)c2ccc(C(*)=O)cc2)cc(C(=O)Oc2cccc3ccccc23)c1,0.34233379
272
+ *Nc1cc(NC(=O)c2ccc3cc(C(*)=O)ccc3c2)cc(C(=O)OCCOc2ccc(C=CC(=O)c3ccccc3)cc2)c1,0.34387133
273
+ *Nc1cc2c(-c3ccc4c(c3)c3ccccc3c3c(N*)c(N)ccc43)cc3c4ccccc4ccc3c2cc1N,0.3459392
274
+ *Nc1cc2c(N*)cccc2c2ccccc12,0.333837
275
+ *Nc1cc2c(cc(N*)c3c(-c4ccc5ccccc5c4)c(-c4ccc5ccccc5c4)c(-c4ccc5ccccc5c4)c(-c4ccc5ccccc5c4)c32)c2ccccc12,0.37064494
276
+ *Nc1cc2c(cc1N*)C(=O)c1cc(N)c(N)cc1C2=O,0.34542857
277
+ *Nc1cc2c3cccc(-c4ccc(C)c(C)c4)c3c(N*)cc2c2ccccc12,0.35695006
278
+ *Nc1cc2cccc(N*)c2c2ccccc12,0.33765318
279
+ *Nc1cc2ccccc2c(N*)c1-c1ccccc1,0.3508717
280
+ *Nc1cc2ccccc2c2c(N*)cccc12,0.33679998
281
+ *Nc1cc2ccccc2c2c1ccc1ccc3c(N*)cc4ccccc4c3c12,0.36566777
282
+ *Nc1ccc(*)cc1OCCCCCCCCCCOc1ccc(C2CCC(CCCCC)CC2)cc1,0.3832015
283
+ *Nc1ccc(-c2c(-c3ccccc3)cc(-c3ccc(-c4cc(-c5ccccc5)c(-c5ccc(NC(=O)c6ccc(C(*)=O)cc6)cc5)c(-c5ccccc5)c4)cc3)cc2-c2ccccc2)cc1,0.38336728
284
+ *Nc1ccc(-c2c(C(C)C)cc(C)cc2C2(c3cc(C)cc(C(C)C)c3-c3ccc(N*)cc3)c3ccccc3-c3ccccc32)cc1,0.39756786
285
+ *Nc1ccc(-c2cc(-c3ccc(-c4ccccc4)cc3)cc(-c3ccc(-c4cc(-c5ccc(N*)cc5)cc(-c5ccc(-c6ccccc6)cc5)c4)cc3)c2)cc1,0.35431916
286
+ *Nc1ccc(-c2cc(-c3ccc(N*)cc3)c3ccc4c(-c5ccc(N)cc5)cc(-c5ccc(N)cc5)c5ccc2c3c54)cc1,0.36303147
287
+ *Nc1ccc(-c2ccc(-c3ccc(N*)c(-c4ccc5ccccc5c4)c3-c3ccc4ccccc4c3)c(-c3ccc4ccccc4c3)c2)c(-c2ccc3ccccc3c2)c1,0.36605712
288
+ *Nc1ccc(-c2ccc(-c3ccc(N*)cc3)c3c2CC2(CCCC2)C3)cc1,0.35946418
289
+ *Nc1ccc(-c2ccc(N*)c(-c3ccc(-c4ccccc4)cc3)c2-c2ccc(-c3ccccc3)cc2)cc1,0.35925281
290
+ *Nc1ccc(-c2ccc(N*)c(-c3cccc(C)c3-c3ccccc3)c2)cc1,0.35748308
291
+ *Nc1ccc(-c2ccc(N*)c(/C=C/c3ccccc3)c2)cc1,0.34484999
292
+ *Nc1ccc(-c2ccc(N*)c(Cc3ccccc3)c2Cc2ccccc2)c(Cc2ccccc2)c1Cc1ccccc1,0.35710982
293
+ *Nc1ccc(-c2ccc(N*)c(N)c2-c2ccccc2)c(-c2ccccc2)c1N,0.35339596
294
+ *Nc1ccc(-c2ccc(N*)cc2-c2cc(N)ccc2-c2ccc(N)cc2)cc1,0.36187072
295
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccc(C)cc2)c(-c2ccc(C)cc2)c1,0.36702255
296
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccc(C)cc2C)c(-c2ccc(C)cc2C)c1,0.37675264
297
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccc(C=C)cc2)c(-c2ccc(C=C)cc2)c1,0.35833444
298
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccc(CC)cc2)c(-c2ccc(CC)cc2)c1,0.36900944
299
+ *Nc1ccc(-c2ccc(N*)cc2-c2cccc(C)c2)c(-c2cccc(C)c2)c1,0.36515839
300
+ *Nc1ccc(-c2ccc(N*)cc2-c2cccc(CC)c2)c(-c2cccc(CC)c2)c1,0.36267578
301
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2)c(-c2ccccc2)c1,0.35575719
302
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2-c2ccc(C)cc2)cc1,0.35953405
303
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2-c2ccccc2)cc1,0.35741644
304
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2-c2ccccc2C)cc1,0.36316601
305
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2C(C)(C)C)c(-c2ccccc2C(C)(C)C)c1,0.37633983
306
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2C(C)C)c(-c2ccccc2C(C)C)c1,0.37695577
307
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2C)c(-c2ccccc2C)c1,0.36761088
308
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2CC)c(-c2ccccc2CC)c1,0.3676166
309
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2CC=C)c(-c2ccccc2CC=C)c1,0.35805994
310
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2CCC)c(-c2ccccc2CCC)c1,0.36819448
311
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2CCCC)c(-c2ccccc2CCCC)c1,0.36989526
312
+ *Nc1ccc(-c2ccc(N*)cc2-c2ccccc2[C@@H](C)CC)c(-c2ccccc2[C@@H](C)CC)c1,0.37601542
313
+ *Nc1ccc(-c2ccc(N*)cc2/C(N)=C/c2ccccc2)c(/C(N)=C/c2ccccc2)c1,0.35009849
314
+ *Nc1ccc(-c2ccccc2)c(C2CCCCC2)c1N*,0.37359962
315
+ *Nc1ccc(-c2ccccc2-c2ccccc2-c2ccc(N*)cc2)cc1,0.35490612
316
+ *Nc1ccc(/C(=C(/c2ccc3ccccc3c2)c2ccc(N*)c(-c3ccc4ccccc4c3)c2-c2ccc3ccccc3c2)c2ccc3ccccc3c2)cc1,0.37988811
317
+ *Nc1ccc(/C(C)=C/C(C)(C)c2ccc(N*)cc2)cc1,0.35500937
318
+ *Nc1ccc(/C=C/c2ccc(N*)cc2-c2ccccc2)c(-c2ccccc2)c1,0.35626419
319
+ *Nc1ccc(/C=C/c2ccc3ccccc3c2/C=C/c2ccc(N*)cc2)cc1,0.35372743
320
+ *Nc1ccc(C(=C)C(C)(C)C(C)(C)c2ccc(N*)cc2)cc1,0.34980902
321
+ *Nc1ccc(C(=C)CC(C)(C)c2ccc(N*)cc2)cc1,0.34510572
322
+ *Nc1ccc(C(=C2CCCCC2)c2ccc(N*)cc2)cc1,0.37173273
323
+ *Nc1ccc(C(C)(C)c2cc(C(C)(C)c3ccc(N)cc3)cc(C(C)(C)c3ccc(N*)cc3)c2)cc1,0.36915657
324
+ *Nc1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N*)cc3)cc2)cc1,0.35733218
325
+ *Nc1ccc(C(C)(C)c2ccc(C(C)(C)c3cccc(N*)c3)cc2)cc1,0.35860249
326
+ *Nc1ccc(C(C)(C)c2ccc(N*)cc2)cc1,0.34514842
327
+ *Nc1ccc(C(C)(C=C)c2ccc(N*)cc2)cc1,0.34310285
328
+ *Nc1ccc(C(C)(CC(C)C)c2ccc(N*)cc2)cc1,0.35303022
329
+ *Nc1ccc(C(C)(CC)c2ccc(N*)cc2)cc1,0.35054839
330
+ *Nc1ccc(C(C)(CC=C)c2ccc(N*)cc2)cc1,0.34950176
331
+ *Nc1ccc(C(C)(CCC(C)C)c2ccc(N*)cc2)cc1,0.3593959
332
+ *Nc1ccc(C(C)(CCC)c2ccc(N*)cc2)cc1,0.35256265
333
+ *Nc1ccc(C(C)(CCCC)c2ccc(N*)cc2)cc1,0.35842613
334
+ *Nc1ccc(C(C)(CCCCC)c2ccc(N*)cc2)cc1,0.35858625
335
+ *Nc1ccc(C(C)(CCCCCC)c2ccc(N*)cc2)cc1,0.36233629
336
+ *Nc1ccc(C(C)(c2ccc(N*)cc2)C(C)C)cc1,0.35465545
337
+ *Nc1ccc(C(C)(c2ccc(N*)cc2)[C@H](C)C(=C)C)cc1,0.35652932
338
+ *Nc1ccc(C(CC(C)(C)c2ccc(N*)cc2)=C(C)C)cc1,0.35535977
339
+ *Nc1ccc(C(CC(C)C)(CC(C)C)c2ccc(N*)cc2)cc1,0.36637357
340
+ *Nc1ccc(C(CC)(C/C=C/[C@@H](C)CCC)c2ccc(N*)cc2)cc1,0.36522046
341
+ *Nc1ccc(C(CC)(CC)c2ccc(N*)cc2)cc1,0.3582996
342
+ *Nc1ccc(C(CC)(CCC(C)C)c2ccc(N*)cc2)cc1,0.36819424
343
+ *Nc1ccc(C(CC)(CCC)c2ccc(N*)cc2)cc1,0.35892089
344
+ *Nc1ccc(C(CC)(CCCC)c2ccc(N*)cc2)cc1,0.36309206
345
+ *Nc1ccc(C(CC)(CCCCC)c2ccc(N*)cc2)cc1,0.36534914
346
+ *Nc1ccc(C(CC)(CCCCCC)c2ccc(N*)cc2)cc1,0.36353044
347
+ *Nc1ccc(C(CC)(C[C@H](C)CC)c2ccc(N*)cc2)cc1,0.35671255
348
+ *Nc1ccc(C(CC)c2ccc(N*)c(Cc3ccccc3)c2Cc2ccccc2)c(Cc2ccccc2)c1Cc1ccccc1,0.36855173
349
+ *Nc1ccc(C(CC=C)(CC=C)c2ccc(N*)cc2)cc1,0.34731572
350
+ *Nc1ccc(C(CC=C)c2ccc(N*)cc2)cc1,0.34719504
351
+ *Nc1ccc(C(CCC)(CCC)c2ccc(N*)cc2)cc1,0.36239191
352
+ *Nc1ccc(C(CCCC)(CCCC)c2ccc(N*)cc2)cc1,0.3699291
353
+ *Nc1ccc(C(C[C@@H]2CCC[C@@H](C(c3ccc(N)cc3)c3ccc(N*)cc3)C2)c2ccc(N)cc2)cc1,0.36461546
354
+ *Nc1ccc(C(c2ccc(N*)c(C(C)(C)C)c2)C(c2ccc(N)c(C(C)(C)C)c2)c2ccc(N)c(C(C)(C)C)c2)cc1C(C)(C)C,0.3977767
355
+ *Nc1ccc(C(c2ccc(N*)c(C)c2)C(c2ccc(N)c(C)c2)c2ccc(N)c(C)c2)cc1C,0.38150109
356
+ *Nc1ccc(C(c2ccc(N*)cc2)(C(C)C)C(C)C)cc1,0.36562181
357
+ *Nc1ccc(C(c2ccc(N*)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.34186008
358
+ *Nc1ccc(C(c2ccc(N*)cc2)([C@H](C)CC)[C@H](C)CC)cc1,0.37291553
359
+ *Nc1ccc(C(c2ccc(N*)cc2)C(C)(C)C)cc1,0.3599413
360
+ *Nc1ccc(C(c2ccc(N*)cc2)C(c2ccc(N)cc2)c2ccc(N)cc2)cc1,0.36559722
361
+ *Nc1ccc(C(c2ccc(N*)cc2)C(c2cccc(N)c2)c2cccc(N)c2)cc1,0.35897099
362
+ *Nc1ccc(C(c2ccc(N*)cc2)C(c2ccccc2N)(c2ccccc2N)[C@H](c2ccc(N)cc2)c2ccccc2N)cc1,0.36452149
363
+ *Nc1ccc(C(c2ccc(NC(=O)c3cc(NC(=O)CCCN4C(=O)c5ccccc5C4=O)cc(C(*)=O)c3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.33372764
364
+ *Nc1ccc(C2(c3ccc(N*)c(-c4ccc(C)cc4)c3-c3ccc(C)cc3)CCCCC2)c(-c2ccc(C)cc2)c1-c1ccc(C)cc1,0.39222234
365
+ *Nc1ccc(C2(c3ccc(N*)c(C)c3)c3ccccc3Cc3ccccc32)cc1C,0.37721583
366
+ *Nc1ccc(C2(c3ccc(N*)cc3)C=CC=C3C2=Cc2ccccc23)cc1,0.36753725
367
+ *Nc1ccc(C2(c3ccc(N*)cc3)CCCC2)cc1,0.34564076
368
+ *Nc1ccc(C2(c3ccc(N*)cc3)CCc3ccccc32)cc1,0.35743146
369
+ *Nc1ccc(C2(c3ccc(N*)cc3)[C@H]3C[C@@H]4C[C@@H](C[C@H]2C4)C3)cc1,0.36122631
370
+ *Nc1ccc(C2(c3ccc(N*)cc3)c3ccccc3Cc3ccccc32)cc1,0.36815633
371
+ *Nc1ccc(C2=CC=C(c3ccc(N*)cc3)C2(C)C)cc1,0.35499932
372
+ *Nc1ccc(C2=CC[C@](N*)(c3ccccc3-c3ccccc3C)C=C2)cc1,0.35437873
373
+ *Nc1ccc(C2=C[C@@H]3CC[C@H]2C=C3c2ccc(N*)cc2)cc1,0.35798661
374
+ *Nc1ccc(CC[C@@H](Cc2ccc(N*)cc2)[C@@H](Cc2ccc(N)cc2)c2ccccc2)cc1,0.35442602
375
+ *Nc1ccc(CCc2ccc(C3(c4ccc(CCc5ccc(N*)cc5)cc4)CCC(c4ccccc4)CC3)cc2)cc1,0.362202
376
+ *Nc1ccc(Cc2cc(Cc3ccc(N)cc3)cc(Cc3ccc(N*)cc3)c2)cc1,0.36046154
377
+ *Nc1ccc(Cc2ccc(Cc3ccc(N*)c(C)c3C)cc2)c(C)c1C,0.35499508
378
+ *Nc1ccc(Cc2ccc(Cc3ccc(N*)cc3)cc2)cc1,0.34845951
379
+ *Nc1ccc(Cc2ccc(N*)c(C)c2)c(CC)c1,0.3496197
380
+ *Nc1ccc(Cc2ccc(N*)c(C)c2)cc1C,0.35117058
381
+ *Nc1ccc(Cc2ccc(N*)c(C)c2C)c(C)c1,0.35276469
382
+ *Nc1ccc(Cc2ccc(N*)c(C)c2C)c(C)c1C,0.35444913
383
+ *Nc1ccc(Cc2ccc(N*)c(CC)c2CC)c(CC)c1CC,0.37391243
384
+ *Nc1ccc(Cc2ccc(N*)c(CC=C)c2CC=C)c(CC=C)c1CC=C,0.35378076
385
+ *Nc1ccc(Cc2ccc(N*)c(Cl)c2)cc1Cl,0.3514118
386
+ *Nc1ccc(Cc2ccc(N*)cc2)cc1,0.34181931
387
+ *Nc1ccc(Cc2ccc(NC(=O)CCCCCCCC(*)=O)cc2)cc1,0.34905905
388
+ *Nc1ccc(Cc2ccc(NC(=O)CCCCCCCCC(*)=O)cc2)cc1,0.35151514
389
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCCCCCCCCCCCCCNC(*)=O)cc2)cc1,0.37009241
390
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCCCCCCCNC(*)=O)cc2)cc1,0.35680733
391
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCCCCCNC(*)=O)cc2)cc1,0.35544532
392
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCCCCNC(*)=O)cc2)cc1,0.35319219
393
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCCCNC(*)=O)cc2)cc1,0.35239715
394
+ *Nc1ccc(Cc2ccc(NC(=O)NCCCCCCNC(*)=O)cc2)cc1,0.34117859
395
+ *Nc1ccc(Cc2ccc(NC(=O)Nc3ccccc3CCc3ccc(NC(*)=O)cc3)cc2)cc1,0.35103259
396
+ *Nc1ccc(Cc2ccc(NC(=O)c3cc(C(*)=O)cc(N4C(=O)C5C6C=CC(C6)C5C4=O)c3)cc2)cc1,0.35689692
397
+ *Nc1ccc(Cc2ccc(NC(=O)c3cc(NC(=O)CCCN4C(=O)c5ccccc5C4=O)cc(C(*)=O)c3)cc2)cc1,0.33667621
398
+ *Nc1ccc(Cc2ccc(NC(=O)c3cc(NC(=O)c4ccc(NC(=O)C(CC(C)C)N5C(=O)c6ccccc6C5=O)cc4)cc(C(*)=O)c3)cc2)cc1,0.34741181
399
+ *Nc1ccc(Cc2cccc(N*)c2)cc1,0.33815386
400
+ *Nc1ccc(N*)c(-c2ccc3ccccc3c2)c1,0.34047659
401
+ *Nc1ccc(N*)c(C(c2ccc3ccccc3c2)c2ccc3ccccc3c2)c1,0.36285479
402
+ *Nc1ccc(N*)c2c1ccc1ccccc12,0.33816871
403
+ *Nc1ccc(NC(=O)Cc2cc(C)c(CC(*)=O)cc2C)cc1,0.3373035
404
+ *Nc1ccc(NC(=O)Cc2cc(CC(*)=O)c(C)cc2C)cc1,0.33730529
405
+ *Nc1ccc(NC(=O)c2cc(C(*)=O)cc(S(=O)(=O)c3ccccc3)c2)cc1,0.34297424
406
+ *Nc1ccc(NC(=O)c2ccc(C(=O)c3ccc(C(*)=O)c(C(=O)O)c3)cc2C(=O)O)cc1,0.31794387
407
+ *Nc1ccc(NC(=O)c2ccc(C(=O)c3ccc(C(*)=O)c(C(=O)OCC)c3)cc2C(=O)OCC)cc1,0.34534308
408
+ *Nc1ccc(NC(=O)c2ccc(NC(=O)CCCCCCC(=O)Nc3ccc(C(*)=O)cc3)cc2)cc1C(=O)OCCCCCCCCCCCCCCCCCC,0.35844067
409
+ *Nc1ccc(NC(=O)c2ccc(NC(=O)CCCCCCCCCCCCC(=O)Nc3ccc(C(*)=O)cc3)cc2)cc1C(=O)OCCCCCCCCCCCC,0.35882755
410
+ *Nc1ccc(NC(=O)c2ccc(NC(=O)CCCCCCCCCCCCC(=O)Nc3ccc(C(*)=O)cc3)cc2)cc1C(=O)OCCCCCCCCCCCCCC,0.3595526
411
+ *Nc1ccc(NC(=O)c2ccc(NC(=O)CCCCCCCCCCCCC(=O)Nc3ccc(C(*)=O)cc3)cc2)cc1C(=O)OCCCCCCCCCCCCCCCC,0.36201115
412
+ *Nc1ccc(NC(=O)c2ccc(NC(=O)CCCCCCCCCCCCC(=O)Nc3ccc(C(*)=O)cc3)cc2)cc1C(=O)OCCCCCCCCCCCCCCCCCC,0.36659861
413
+ *Nc1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N*)cc4)cc3)cc2)cc1,0.34442362
414
+ *Nc1ccc(Oc2cccc(Oc3ccc(N*)cc3)c2)cc1,0.34111608
415
+ *Nc1ccc(SSc2ccc(N*)cc2)cc1,0.35710288
416
+ *Nc1ccc([C@@H](C)c2ccc(C(C)(C)c3ccc(N*)cc3)cc2)cc1,0.35607415
417
+ *Nc1ccc([C@@H](c2ccccc2)[C@H](c2ccccc2)c2ccc(N*)cc2)cc1,0.36377948
418
+ *Nc1ccc([C@@H]2CC(C)(C)c3cc(N*)ccc32)cc1,0.36237935
419
+ *Nc1ccc([C@@H]2CC(C)(C)c3ccc(N*)cc32)cc1,0.3624745
420
+ *Nc1ccc([C@H](CC)c2ccc([C@@H](CC)c3ccc(N*)cc3)cc2)cc1,0.3623964
421
+ *Nc1ccc([C@H](CCCC)c2ccc(C3(c4ccc([C@@H](CCCC)c5ccc(N*)cc5C)cc4)CCC(C)CC3)cc2)c(C)c1,0.38081893
422
+ *Nc1ccc([C@H](CCCCCC)c2ccc(C3(c4ccc([C@@H](CCCCCC)c5ccc(N*)cc5)cc4)CCC(CCCCC)CC3)cc2)cc1,0.37108303
423
+ *Nc1ccc([C@H](c2ccc(N*)c(C)c2)C(c2ccc(N)c(C)c2)c2ccc(N)c(C)c2)cc1,0.37242979
424
+ *Nc1ccc([C@H](c2cccc(N*)c2)C(c2cccc(N)c2)c2cccc(N)c2)cc1,0.35918801
425
+ *Nc1ccc([C@H](c2ccccc2N*)C(c2ccc(N)cc2)c2ccc(N)cc2)cc1,0.37002899
426
+ *Nc1ccc([C@H]2CC[C@H](c3ccc(N*)cc3)CC2)cc1,0.35770572
427
+ *Nc1ccc2c(N*)cc3ccccc3c2c1,0.334748
428
+ *Nc1ccc2c(c1)-c1ccc3c4ccc5c6c(ccc(c7ccc(c1c73)C2)c64)-c1cc(N*)ccc1C5,0.34349672
429
+ *Nc1ccc2c(c1)C(=O)c1ccc(N*)cc1C2=O,0.33028392
430
+ *Nc1ccc2c(c1)[C@]1(C)CC[C@@]2(C)c2cc(N*)ccc21,0.36507821
431
+ *Nc1ccc2c(c1)c(C)c(C)c1cc(N*)ccc12,0.33627798
432
+ *Nc1ccc2c(c1)c(N*)cc1ccccc12,0.33211721
433
+ *Nc1ccc2c(c1)cc(N*)c1ccccc12,0.33817655
434
+ *Nc1ccc2c(c1N*)CCc1ccccc1-2,0.341292
435
+ *Nc1ccc2c(ccc3c(N*)cccc32)c1,0.3358041
436
+ *Nc1ccc2c(ccc3cc(N*)ccc32)c1,0.3363309
437
+ *Nc1ccc2c(ccc3ccc(N*)cc32)c1,0.33807688
438
+ *Nc1ccc2c(ccc3ccc4ccc5ccc6cc(N*)ccc6c5c4c32)c1,0.3586391
439
+ *Nc1ccc2c(ccc3cccc(N*)c32)c1,0.3331587
440
+ *Nc1ccc2c(ccc3ccccc32)c1N*,0.3361781
441
+ *Nc1ccc2c3cc(-c4ccc(-c5ccccc5)cc4)c(-c4ccc(-c5ccccc5)cc4)c4c(N*)c(-c5ccc(-c6ccccc6)cc5)c(-c5ccc(-c6ccccc6)cc5)c(c5cccc1c25)c43,0.3619219
442
+ *Nc1ccc2cc(N*)c3ccccc3c2c1,0.33919037
443
+ *Nc1ccc2ccc3ccc(N*)cc3c2c1,0.33497194
444
+ *Nc1ccc2ccc3cccc(N*)c3c2c1,0.32965144
445
+ *Nc1ccc2ccc3ccccc3c2c1N*,0.33674811
446
+ *Nc1cccc(-c2cc(N*)ccc2-c2ccccc2-c2ccccc2)c1,0.35340744
447
+ *Nc1cccc(-c2ccccc2-c2cc(N*)ccc2-c2ccccc2)c1,0.35219454
448
+ *Nc1cccc(-c2ccccc2-c2ccccc2-c2cccc(N*)c2)c1,0.35392163
449
+ *Nc1cccc(-c2ccccc2-c2ccccc2-c2ccccc2)c1N*,0.35659206
450
+ *Nc1cccc(C(c2cccc(N*)c2)(C(F)(F)F)C(F)(F)F)c1,0.34059386
451
+ *Nc1cccc(Cc2cccc(N*)c2)c1,0.33697796
452
+ *Nc1cccc(NC(=C(C#N)C#N)c2cccc(C(*)=C(C#N)C#N)c2)c1,0.41003738
453
+ *Nc1cccc(NC(=O)Cc2cc(C)c(CC(*)=O)cc2C)c1,0.3367393
454
+ *Nc1cccc(NC(=O)Cc2cc(CC(*)=O)c(C)cc2C)c1,0.33294467
455
+ *Nc1cccc(NC(=O)c2cc(C(*)=O)cc(S(=O)(=O)c3ccccc3)c2)c1,0.34798034
456
+ *Nc1cccc(NC(=O)c2cc(NC(=O)C(CCSC)N3C(=O)c4ccccc4C3=O)cc(C(*)=O)c2)c1,0.33770588
457
+ *Nc1cccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4cccc(N*)c4)cc3)cc2)c1,0.34393632
458
+ *Nc1cccc(Oc2cccc(Oc3cccc(N*)c3)c2)c1,0.33787009
459
+ *Nc1cccc2c(NC(=O)c3ccc(C(*)=O)cc3)cccc12,0.34599686
460
+ *Nc1cccc2c(NC(=O)c3ccc(NC(=O)c4ccc([Si](C)(C)c5ccc(C(=O)Nc6ccc(C(*)=O)cc6)cc5)cc4)cc3)cccc12,0.3612299
461
+ *Nc1cccc2c1C(=O)c1cccc(N[Se]*)c1C2=O,0.3481788
462
+ *Nc1cccc2c1C1=C(C[C@](N*)(c3ccccc3-c3ccccc3)C=C1)C2(c1ccccc1)c1ccccc1,0.37540911
463
+ *Nc1cccc2c1c(N*)cc1ccccc12,0.33290552
464
+ *Nc1cccc2c1ccc1c(N*)cccc12,0.33410164
465
+ *Nc1cccc2c1ccc1cccc(N*)c12,0.33883964
466
+ *Nc1cccc2ccc3cccc(N*)c3c12,0.34248529
467
+ *Nc1ccccc1-c1cccc(-c2ccccc2-c2cccc(-c3ccccc3)c2)c1N*,0.35563081
468
+ *Nc1ccccc1-c1ccccc1-c1ccccc1-c1ccccc1N*,0.35892837
469
+ *Nc1ccccc1/C=C/c1ccc2ccccc2c1/C=C/c1ccccc1N*,0.35648517
470
+ *Nc1ccccc1CCc1ccccc1NC(=O)Nc1ccc(CCc2ccc(NC(*)=O)cc2)cc1,0.35411182
471
+ *Nc1ccccc1SSc1ccccc1N*,0.35017586
472
+ *Nc1nc(=O)n(C)c(N*)c1N,0.31039992
473
+ *OC(=O)C(Cc1ccccc1)NC(=O)CCCCCCC(=O)NC(Cc1ccccc1)C(=O)OC1COC2C(*)COC12,0.34161233
474
+ *OC(=O)C(Cc1ccccc1)NC(=O)CCCCCCCCC(=O)NC(Cc1ccccc1)C(=O)OC1COC2C(*)COC12,0.34522868
475
+ *OC(=O)C(Cc1ccccc1)NC(=O)CCCCCCCCCCC(=O)NC(Cc1ccccc1)C(=O)OC1COC2C(*)COC12,0.35207165
476
+ *OC(=O)Oc1ccc(C(=O)Oc2ccc(OC(=O)OC3COC4C(*)COC34)cc2)cc1,0.33078823
477
+ *OC(=O)c1ccc(C(=O)OC2COC3C(*)COC23)cc1,0.33842265
478
+ *OC(=O)c1ccc(Cc2ccc(C(*)=O)cc2)cc1,0.34410161
479
+ *OC(COC(=O)c1ccc2cc(C(*)=O)ccc2c1)COc1ccc(N=Nc2ccc(C#N)cc2)cc1,0.35895878
480
+ *OC1C(C)(C)C(OC(=O)C2CCC(C(*)=O)CC2)C1(C)C,0.35756125
481
+ *OS(=O)(=O)c1cccc(S(=O)(=O)Oc2ccc(C(C)(C)c3ccc(*)cc3)cc2)c1,0.36372411
482
+ *OS(=O)(=O)c1cccc(S(=O)(=O)Oc2ccc(C(C)(CC)c3ccc(*)cc3)cc2)c1,0.36537348
483
+ *OS(=O)(=O)c1cccc(S(=O)(=O)Oc2ccc(C3(c4ccc(*)cc4)CCCCC3)cc2)c1,0.36029922
484
+ *O[Si](*)(C)CCCOc1ccc(C(=O)Oc2ccc(C(=O)Oc3ccc(OC)cc3)cc2)cc1,0.34401737
485
+ *O[Si](C)(C)CCCC(=O)Oc1ccc(C=Nc2ccc(Cc3ccc(N=Cc4ccc(OC(=O)CCC[Si](*)(C)C)cc4)cc3)cc2)cc1,0.38299785
486
+ *O[Si](C)(C)CCCN=Cc1cc(Cc2ccc(O)c(C=NCCC[Si](*)(C)C)c2)ccc1O,0.39347596
487
+ *O[Si](C)(C)OC(CCl)COc1ccc(C(C)(C)c2ccc(OCC(*)CCl)cc2)cc1,0.3750933
488
+ *O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)Oc1c(*)c2ccccc2c2ccccc12,0.41772931
489
+ *O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)Oc1c(*)c2ccccc2c2ccccc12,0.41127913
490
+ *O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)Oc1c(*)c2ccccc2c2ccccc12,0.40298758
491
+ *O[Si](C)(C)Oc1ccc(C(C)(C)c2ccc(*)cc2)cc1,0.40918281
492
+ *O[Si](C)(C)c1cccc2c1ccc1c([Si](*)(C)C)cccc12,0.40173269
493
+ *O[Si](C)(Oc1ccc(*)cc1)c1ccccc1,0.40342541
494
+ *Oc1c(-c2ccccc2)cc(*)cc1-c1ccccc1-c1ccccc1,0.4050477
495
+ *Oc1c(-c2ccccc2)cc(Cc2cc(-c3ccccc3)c(OC(=O)CCCCC(*)=O)c(-c3ccccc3)c2)cc1-c1ccccc1,0.37195156
496
+ *Oc1c(-c2ccccc2)cc(Cc2cc(-c3ccccc3)c(OC(=O)CCCCCC(*)=O)c(-c3ccccc3)c2)cc1-c1ccccc1,0.37107521
497
+ *Oc1c(-c2ccccc2)cc(Cc2cc(-c3ccccc3)c(OC(=O)CCCCCCCCC(*)=O)c(-c3ccccc3)c2)cc1-c1ccccc1,0.3718338
498
+ *Oc1c(Br)cc(C(C)(C)c2cc(Br)c(OC(*)=O)c(Br)c2)cc1Br,0.4290399
499
+ *Oc1c(Br)cc(C(C)(C)c2cc(Br)c(OC(=O)c3ccc(OCCCCCCCCCCOc4ccc(C(*)=O)cc4)cc3)c(Br)c2)cc1Br,0.39126002
500
+ *Oc1c(Br)cc(C(c2cc(Br)c(OC(*)=O)c(Br)c2)(C(F)(F)F)C(F)(F)F)cc1Br,0.41928552
501
+ *Oc1c(C(C)C)cc(C(=O)c2cccc(C(=O)c3ccc(*)cc3)c2)cc1C(C)C,0.40376992
502
+ *Oc1c(C)cc(C(C)(C)c2cc(C)c(OC(*)=O)c(C)c2)cc1C,0.39805007
503
+ *Oc1c(C)cc(C(C)(C)c2cc(C)c(OC(*)=O)c(Cl)c2)cc1Cl,0.3979854
504
+ *Oc1c(C)cc(C(C)(C)c2cc(C)c(OC(=O)CCCCC(*)=O)c(C)c2)cc1C,0.37611741
505
+ *Oc1c(C)cc(C(C)(C)c2cc(C)c(OC(=O)CCCCCC(*)=O)c(C)c2)cc1C,0.37560672
506
+ *Oc1c(C)cc(C(C)(C)c2cc(C)c(OC(=O)CCCCCCCCC(*)=O)c(C)c2)cc1C,0.38134315
507
+ *Oc1c(C)cc(C(c2cc(C)c(OC(*)=O)c(C)c2)(C(F)(F)F)C(F)(F)F)cc1C,0.39598526
508
+ *Oc1c(Cl)cc(C(C)(C)c2cc(Cl)c(OC(*)=O)c(Cl)c2)cc1Cl,0.39482335
509
+ *Oc1c(Cl)cc(C(C)(C)c2cc(Cl)c(OC(=O)CCCCC(*)=O)c(Cl)c2)cc1Cl,0.37453429
510
+ *Oc1c(Cl)cc(C(C)(C)c2cc(Cl)c(OC(=O)c3ccc(OCCCCCCCCCCOc4ccc(C(*)=O)cc4)cc3)c(Cl)c2)cc1Cl,0.37760348
511
+ *Oc1c(Cl)cc(C(c2cc(Cl)c(OC(*)=O)c(Cl)c2)C(Cl)(Cl)Cl)cc1Cl,0.39461971
512
+ *Oc1c(Cl)cc(C2(c3cc(Cl)c(OC(*)=O)c(Cl)c3)CCCCC2)cc1Cl,0.39600978
513
+ *Oc1c(F)c(C#N)c(Oc2ccc(C(C)(C)c3cccc(C(C)(C)c4ccc(*)cc4)c3)cc2)c(F)c1C#N,0.38046147
514
+ *Oc1cc(Br)c(C(C)(C)c2c(Br)cc(OC(*)=O)cc2Br)c(Br)c1,0.3939151
515
+ *Oc1cc(Cl)c(C(C)(C)c2c(Cl)cc(OC(*)=O)cc2Cl)c(Cl)c1,0.37057341
516
+ *Oc1cc(OC(=O)c2ccc(C)cc2)c(OC(=O)CCCCCCCCCCCCCCC(*)=O)cc1OC(=O)c1ccc(C)cc1,0.36849104
517
+ *Oc1cc(OC(=O)c2ccc(OC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OC)cc1,0.33574151
518
+ *Oc1cc(OC(=O)c2ccc(OC)cc2)c(OC(=O)CCCCCCCCCCCCCCC(*)=O)cc1OC(=O)c1ccc(OC)cc1,0.35345176
519
+ *Oc1cc(OC(=O)c2ccc(OCC(C)CC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCC(C)CC)cc1,0.35730159
520
+ *Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCC)cc1,0.33981747
521
+ *Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCCCCCCCCCCCCC(*)=O)cc1OC(=O)c1ccc(OCC)cc1,0.36137547
522
+ *Oc1cc(OC(=O)c2ccc(OCCC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCCC)cc1,0.34814804
523
+ *Oc1cc(OC(=O)c2ccc(OCCC)cc2)c(OC(=O)CCCCCCCCCCCCCCC(*)=O)cc1OC(=O)c1ccc(OCCC)cc1,0.36365256
524
+ *Oc1cc(OC(=O)c2ccc(OCCCC)cc2)c(OC(=O)C(C)(C)CCC(*)=O)cc1OC(=O)c1ccc(OCCCC)cc1,0.35709055
525
+ *Oc1cc(OC(=O)c2ccc(OCCCC)cc2)c(OC(=O)C(C)CCC(*)=O)cc1OC(=O)c1ccc(OCCCC)cc1,0.35328505
526
+ *Oc1cc(OC(=O)c2ccc(OCCCC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCCCC)cc1,0.35109057
527
+ *Oc1cc(OC(=O)c2ccc(OCCCC)cc2)c(OC(=O)CCCCCC(*)=O)cc1OC(=O)c1ccc(OCCCC)cc1,0.35336805
528
+ *Oc1cc(OC(=O)c2ccc(OCCCC)cc2)c(OC(=O)CCCCCCCC(*)=O)cc1OC(=O)c1ccc(OCCCC)cc1,0.35540563
529
+ *Oc1cc(OC(=O)c2ccc(OCCCCC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCCCCC)cc1,0.35445196
530
+ *Oc1cc(OC(=O)c2ccc(OCCCCCCCCCCCC)cc2)c(OC(=O)CCCC(*)=O)cc1OC(=O)c1ccc(OCCCCCCCCCCCC)cc1,0.37044637
531
+ *Oc1cc(OC(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(C(*)=O)cc6)cc5)CCC(C(C)(C)C)CC4)cc3)cc2)ccc1C12CC3CC(CC(C3)C1)C2,0.38278757
532
+ *Oc1ccc(-c2ccc(OC(=O)OC3C(C)(C)C(OC(*)=O)C3(C)C)cc2)cc1,0.35906343
533
+ *Oc1ccc(-c2ccc(OC3(F)C(*)(F)C(F)(F)C3(F)F)cc2)cc1,0.34511864
534
+ *Oc1ccc(-c2ccc(Oc3cccc(*)n3)cc2)cc1,0.34884407
535
+ *Oc1ccc(C(=O)NNC(=O)c2ccc(*)cc2)cc1,0.34717894
536
+ *Oc1ccc(C(=O)Nc2cc(NC(=O)c3ccc(*)cc3)cc(-c3nc4ccccc4[nH]3)c2)cc1,0.36445259
537
+ *Oc1ccc(C(=O)Nc2ccc(S(=O)(=O)c3ccc(NC(=O)c4ccc(Oc5ccc(C(=O)c6ccc(S(=O)(=O)c7ccc(C(=O)c8ccc(*)cc8)cc7)cc6)cc5)cc4)cc3)cc2)cc1,0.36195969
538
+ *Oc1ccc(C(=O)Nc2ccc(S(=O)(=O)c3ccc(NC(=O)c4ccc(Oc5nc(*)nc(Sc6ccccc6)n5)cc4)cc3)cc2)cc1,0.35315957
539
+ *Oc1ccc(C(=O)OCC(C)(C)COC(=O)c2ccc(*)cc2)cc1,0.35526052
540
+ *Oc1ccc(C(=O)OCCCCCOC(=O)c2ccc(*)cc2)cc1,0.34696395
541
+ *Oc1ccc(C(=O)OCCCOC(=O)c2ccc(*)cc2)cc1,0.34520298
542
+ *Oc1ccc(C(=O)c2ccc(C(=O)c3ccc(Oc4cccc(Cc5cccc(*)c5)c4)cc3)cc2)cc1,0.36639351
543
+ *Oc1ccc(C(=O)c2cccc(C(=O)c3ccc(*)c(C(C)C)c3)c2)cc1C,0.39318082
544
+ *Oc1ccc(C(=O)c2cccc(C(=O)c3ccc(*)c(C(C)C)c3)c2)cc1CC,0.39239857
545
+ *Oc1ccc(C(=O)c2cccc(C(=O)c3ccc(*)c(CC)c3)c2)cc1CC,0.38536105
546
+ *Oc1ccc(C(=O)c2cccc(C(=O)c3ccc(*)cc3)c2)cc1,0.36581437
547
+ *Oc1ccc(C(=O)c2cccc(C(=O)c3ccc(Oc4ccc(Cc5ccc(*)cc5)cc4)cc3)c2)cc1,0.37058113
548
+ *Oc1ccc(C(C)(C)c2cc(Cl)c(OC(*)=O)c(Cl)c2)cc1,0.37520677
549
+ *Oc1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(OC(*)=O)cc3)cc2)cc1,0.36640421
550
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)(Oc3ccccc3)Oc3ccccc3)cc2)cc1,0.37316612
551
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(C(C)C)c2)cc1C(C)C,0.40309692
552
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(C)c2)cc1C,0.37267711
553
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(CC)c2)cc1CC,0.38097605
554
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(Cl)c2)cc1,0.3646685
555
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(Cl)c2)cc1C,0.37412442
556
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=O)c(Cl)c2)cc1Cl,0.37372084
557
+ *Oc1ccc(C(C)(C)c2ccc(OC(*)=S)cc2)cc1,0.35877547
558
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)CC(C)CCC(*)=O)cc2)cc1,0.35041329
559
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)CCCCC(*)=O)c(C)c2)cc1C,0.35992444
560
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)CCCCC(*)=O)cc2)cc1,0.34517826
561
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)CCCCCCCCC(*)=O)cc2)cc1,0.35872609
562
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)OCCCCCOC(*)=O)cc2)cc1,0.34414164
563
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)OCCCCOC(*)=O)cc2)cc1,0.34176051
564
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)OCCCOC(*)=O)cc2)cc1,0.33934743
565
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)OCCN(CCOC(*)=O)c3ccc(OC)cc3)cc2)cc1,0.34363634
566
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)OCCN(CCOC(*)=O)c3ccccc3)cc2)cc1,0.34756362
567
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)SCCCCCCSC(*)=O)cc2)cc1,0.3614361
568
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)SCCCSC(*)=O)cc2)cc1,0.35558207
569
+ *Oc1ccc(C(C)(C)c2ccc(OC(=O)c3ccc(OCCCCCCCCCCOc4ccc(C(*)=O)cc4)cc3)cc2)cc1,0.35755108
570
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)C(=O)c4ccc(*)cc4)cc3)cc2)cc1,0.37690328
571
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)Nc4ccc(NC(=O)c5ccc(*)cc5)cc4)cc3)cc2)cc1,0.35980678
572
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4c(C(=O)c5ccc(*)cc5)c(-c5ccccc5)c(-c5ccc6ccccc6c5)c(-c5ccc6ccccc6c5)c4-c4ccccc4)cc3)cc2)cc1,0.39617794
573
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c5ccc(*)cc5)c4)cc3)cc2)cc1,0.37447472
574
+ *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4ccccc4-c4ccccc4C(=O)c4ccc(*)cc4)cc3)cc2)cc1,0.37964806
575
+ *Oc1ccc(C(C)(CC(C)C)c2ccc(OC(*)=S)cc2)cc1,0.36650609
576
+ *Oc1ccc(C(C)(CC)c2ccc(OC(*)=O)c(C)c2)cc1C,0.37939809
577
+ *Oc1ccc(C(C)(CC)c2ccc(OC(*)=O)c(Cl)c2)cc1Cl,0.37781492
578
+ *Oc1ccc(C(C)(CC)c2ccc(OC(*)=O)cc2)cc1,0.3639544
579
+ *Oc1ccc(C(C)(CC)c2ccc(OC(*)=S)cc2)cc1,0.36451292
580
+ *Oc1ccc(C(C)(CCC#N)c2ccc(OC(*)=O)c(C)c2)cc1C,0.38107835
581
+ *Oc1ccc(C(C)(CCC#N)c2ccc(OC(*)=O)cc2)cc1,0.36182275
582
+ *Oc1ccc(C(C)(CCC)c2ccc(OC(*)=O)cc2)cc1,0.37045396
583
+ *Oc1ccc(C(C)(c2ccccc2)c2ccc(OC(*)=O)cc2)cc1,0.36147298
584
+ *Oc1ccc(C(C)(c2ccccc2)c2ccc(OC(=O)c3ccccc3-c3ccccc3C(*)=O)cc2)cc1,0.36060662
585
+ *Oc1ccc(C(C)c2ccc(OC(*)=O)cc2)cc1,0.35421995
586
+ *Oc1ccc(C(CC)(CC)c2ccc(OC(*)=S)cc2)cc1,0.36838966
587
+ *Oc1ccc(C(CC)c2ccc(OC(*)=O)cc2)cc1,0.36281879
588
+ *Oc1ccc(C(CC)c2ccc(OC(*)=S)cc2)cc1,0.36420995
589
+ *Oc1ccc(C(CCC)(CCC)c2ccc(OC(*)=O)c(C)c2)cc1C,0.39582423
590
+ *Oc1ccc(C(CCC)(CCC)c2ccc(OC(*)=O)cc2)cc1,0.38249055
591
+ *Oc1ccc(C(CCC)c2ccc(OC(*)=O)cc2)cc1,0.36627071
592
+ *Oc1ccc(C(CCC)c2ccc(OC(*)=S)cc2)cc1,0.3689351
593
+ *Oc1ccc(C(CCCC)(CCCC)c2ccc(OC(*)=O)cc2)cc1,0.38524755
594
+ *Oc1ccc(C(c2ccc(OC(*)=O)cc2)(C(F)(F)Cl)C(F)(F)Cl)cc1,0.36111109
595
+ *Oc1ccc(C(c2ccc(OC(*)=O)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.3466164
596
+ *Oc1ccc(C(c2ccc(OC(*)=O)cc2)C(C)C)cc1,0.36947312
597
+ *Oc1ccc(C(c2ccc(OC(*)=O)cc2)C(Cl)(Cl)Cl)cc1,0.36652065
598
+ *Oc1ccc(C(c2ccc(OC(*)=O)cc2)C(Cl)Cl)cc1,0.36363286
599
+ *Oc1ccc(C(c2ccc(OC(*)=S)cc2)C(C)C)cc1,0.37198799
600
+ *Oc1ccc(C(c2ccc(OC(*)=S)cc2)C(CC)CC)cc1,0.37352636
601
+ *Oc1ccc(C(c2ccc(OC(=O)c3ccc(OCCCCCCCCCCOc4ccc(C(*)=O)cc4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.35492275
602
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)C(=O)c4ccc(*)cc4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.37168595
603
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)Nc4ccc(NC(=O)c5ccc(*)cc5)cc4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.35746728
604
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4c(C(=O)c5ccc(*)cc5)c(-c5ccccc5)c(-c5ccc6ccccc6c5)c(-c5ccc6ccccc6c5)c4-c4ccccc4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.39580075
605
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4cc(C(=O)c5ccc(*)cc5)cc(C(C)(C)C)c4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.38577086
606
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c5ccc(*)cc5)c4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.37070417
607
+ *Oc1ccc(C(c2ccc(Oc3ccc(C(=O)c4ccccc4-c4ccccc4C(=O)c4ccc(*)cc4)cc3)cc2)(C(F)(F)F)C(F)(F)F)cc1,0.37602202
608
+ *Oc1ccc(C(c2ccccc2)(c2ccccc2)c2ccc(OC(=O)OC3C(C)(C)C(OC(*)=O)C3(C)C)cc2)cc1,0.37906621
609
+ *Oc1ccc(C2(c3ccc(OC(*)=O)c(C)c3)CCCCC2)cc1C,0.38771377
610
+ *Oc1ccc(C2(c3ccc(OC(*)=O)c(Cl)c3)CCCCC2)cc1Cl,0.37361861
611
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)CCCC2)cc1,0.35187363
612
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)CCCCC2)cc1,0.35918735
613
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)CCc3ccccc32)cc1,0.36699475
614
+ *Oc1ccc(C2(c3ccc(OC(*)=O)cc3)c3ccccc3-c3ccccc32)cc1,0.38475968
615
+ *Oc1ccc(C2(c3ccc(OC(*)=S)cc3)CCCC2)cc1,0.35616229
616
+ *Oc1ccc(C2(c3ccc(OC(=O)CCCCCCCCC(*)=O)cc3)c3ccccc3Cc3ccccc32)cc1,0.36332436
617
+ *Oc1ccc(C2(c3ccc(OC(=O)OC4C(C)(C)C(OC(*)=O)C4(C)C)cc3)CC3CCC2C3)cc1,0.3705045
618
+ *Oc1ccc(C2(c3ccc(OC(=O)c4ccc(C(*)=O)cc4)cc3)c3ccccc3C(=O)c3ccccc32)cc1,0.36880796
619
+ *Oc1ccc(C2(c3ccc(OC(=O)c4ccc(Oc5ccc(C6(c7ccc(Oc8ccc(C(*)=O)cc8)cc7)CCC(C(C)(C)C)CC6)cc5)cc4)cc3)CC3CC2C2CCCC32)cc1,0.37986369
620
+ *Oc1ccc(C2(c3ccc(Oc4ccc(C(=O)c5c(C(=O)c6ccc(*)cc6)c(-c6ccccc6)c(-c6ccc7ccccc7c6)c(-c6ccc7ccccc7c6)c5-c5ccccc5)cc4)cc3)c3ccccc3-c3ccccc32)cc1,0.40813978
621
+ *Oc1ccc(C2(c3ccc(Oc4ccc5c(=O)n6c7cc(Oc8ccc9c(c8)nc8c%10ccc(*)c%11cccc(c(=O)n98)c%11%10)ccc7nc6c6cccc4c56)cc3)c3ccccc3-c3ccccc32)cc1,0.396728
622
+ *Oc1ccc(C=C2CCCC(=Cc3ccc(OC(=O)CCCCCCCCC(*)=O)c(OC)c3)C2=O)cc1OC,0.35776264
623
+ *Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(C)(C)c4ccc(*)cc4)cc3)cc2)cc1,0.3809024
624
+ *Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1,0.37282962
625
+ *Oc1ccc(CCNC(=O)c2cccc(C(=O)NCCc3ccc(OC(=O)c4cccc(C(*)=O)c4)cc3)c2)cc1,0.33853882
626
+ *Oc1ccc(CNC(=O)CCCCC(=O)NCc2ccc(OC3COC4C(*)COC34)cc2)cc1,0.33099906
627
+ *Oc1ccc(Cc2ccc(Cc3ccc(OC(*)=O)cc3)cc2)cc1,0.36126118
628
+ *Oc1ccc(Cc2ccc(OC(*)=O)c(C)c2)cc1C,0.36953235
629
+ *Oc1ccc(Cc2ccc(OC(*)=O)cc2)cc1,0.34889783
630
+ *Oc1ccc(Cc2ccc(OC(*)=S)cc2)cc1,0.35541969
631
+ *Oc1ccc(Cc2ccc(OC(=O)CCCCC(*)=O)cc2)cc1,0.34469429
632
+ *Oc1ccc(NC(=O)CCCCCCC(=O)Nc2ccc(OC3COC4C(*)COC34)cc2)cc1,0.33802231
633
+ *Oc1ccc(NC(=O)NC2CC(C)(C)CC(C)(CNC(=O)Nc3ccc(*)cc3)C2)cc1,0.34918835
634
+ *Oc1ccc(NC(=O)c2cc(C(=O)Nc3ccc(Oc4ccc(C(C)(C)c5ccc(*)cc5)cc4)cc3)cc(C(C)(C)C)c2)cc1,0.36926959
635
+ *Oc1ccc(NC(=O)c2cc(C(=O)Nc3ccc(Oc4ccc(C(c5ccc(*)cc5)(C(F)(F)F)C(F)(F)F)cc4)cc3)cc(C(C)(C)C)c2)cc1,0.36861613
636
+ *Oc1ccc(NC(=O)c2cc(C(=O)Nc3ccc(Oc4ccc(C(c5ccc(*)cc5)(C(F)(F)F)C(F)(F)F)cc4)cc3)cc([N+](=O)[O-])c2)cc1,0.3558116
637
+ *Oc1ccc(NC(=O)c2cc(NC(=O)CCCCCCCCCCN3C(=O)c4ccccc4C3=O)cc(C(=O)Nc3ccc(*)cc3)c2)cc1,0.34139919
638
+ *Oc1ccc(NC(=O)c2cc(NC(=O)c3ccc(OC(C)=O)cc3)cc(C(=O)Nc3ccc(*)cc3)c2)cc1,0.33796697
639
+ *Oc1ccc(NC(=O)c2cc(NC(=O)c3ccccc3)cc(C(=O)Nc3ccc(*)cc3)c2)cc1,0.34264944
640
+ *Oc1ccc(NC(=O)c2ccc(NC(=O)c3ccc([Si](C)(C)c4ccc(C(=O)Nc5ccc(C(=O)Nc6ccc(*)cc6)cc5)cc4)cc3)cc2)cc1,0.35582732
641
+ *Oc1ccc(NC(=O)c2cccc(C(=O)Nc3ccc(Oc4ccc(C(c5ccc(*)cc5)(C(F)(F)F)C(F)(F)F)cc4)cc3)c2)cc1,0.35683545
642
+ *Oc1ccc(NC(=O)c2cccc(C(=O)Nc3ccc(Oc4ccc(C5(c6ccc(*)cc6)C6CC7CC(C6)CC5C7)cc4)cc3)c2)cc1,0.36643837
643
+ *Oc1ccc(OC(=O)c2cc(OCCCc3ccccc3)c(C(*)=O)cc2OCCCc2ccccc2)cc1C,0.35359821
644
+ *Oc1ccc(OC(=O)c2ccc(-c3ccc(C(*)=O)cc3)cc2-c2ccccc2)cc1,0.35277091
645
+ *Oc1ccc(OC(=O)c2ccc(-c3ccc(C(*)=O)cc3)cc2-c2ccccc2)cc1C,0.36137424
646
+ *Oc1ccc(OC(=O)c2ccc(C(*)=O)cc2)cc1C1CCCCCCC1,0.36680265
647
+ *Oc1ccc(OC(=O)c2ccc(C(*)=O)cc2Sc2ccc(C)cc2)cc1,0.36164456
648
+ *Oc1ccc(OC(=O)c2ccc(C(*)=O)cc2Sc2ccc3ccccc3c2)cc1,0.35895279
649
+ *Oc1ccc(OC(=O)c2ccc(C(*)=O)cc2Sc2ccccc2)cc1,0.35449521
650
+ *Oc1ccc(OC(=O)c2ccc(C(*)=O)cc2Sc2ccccc2)cc1C,0.36029743
651
+ *Oc1ccc(OC(=O)c2ccc(OCCCCCCCCCCCCOc3ccc(C(*)=O)cc3)cc2)cc1C=Cc1ccncc1,0.36385478
652
+ *Oc1ccc(OC(=O)c2ccc([Si](C)(C)c3ccc(C(*)=O)cc3)cc2)cc1,0.36637923
653
+ *Oc1ccc(OC(=O)c2cccc(Oc3cccc(C(*)=O)c3)c2)cc1,0.34914243
654
+ *Oc1ccc(Oc2ccc(C(=O)c3cccc(-c4cccc(C(=O)c5ccc(*)cc5)c4)c3)cc2)cc1,0.36870915
655
+ *Oc1ccc(Oc2ccc(C(=O)c3cccc(NC(=O)c4ccc(C(=O)Nc5cccc(C(=O)c6ccc(*)cc6)c5)cc4)c3)cc2)cc1,0.35637997
656
+ *Oc1ccc(Oc2ccc(NC(=C(C#N)C#N)c3cccc(C(Nc4ccc(*)cc4)=C(C#N)C#N)c3)cc2)cc1,0.39273786
657
+ *Oc1ccc(Oc2ccc(OC(=O)NC(=O)c3cc(C(=O)NC(*)=O)cc(C(C)(C)C)c3)cc2)cc1,0.33792975
658
+ *Oc1ccc(Oc2ccc(P(C)(=O)c3ccc(*)cc3)cc2)cc1,0.37537227
659
+ *Oc1ccc(Oc2ccc(S(=O)(=O)c3ccc(-c4ccc(-c5ccc(S(=O)(=O)c6ccc(*)cc6)cc5)cc4)cc3)cc2)cc1,0.3744368
660
+ *Oc1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(C(=O)c5ccc(*)cc5)cc4)cc3)cc2)cc1,0.37202073
661
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C(=O)c4ccc(*)cc4)cc3)cc2)cc1,0.37219019
662
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C(C)(C)c4ccc(OCCCCOc5ccc(C(C)(C)c6ccc(*)cc6)cc5)cc4)cc3)cc2)cc1,0.36776278
663
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(*)c(C)c5)CCCCC4)cc3C)cc2)cc1,0.38723397
664
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(*)cc5)CC5CCC4C5)cc3)cc2)cc1,0.38350802
665
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(*)cc5)CCCCC4)cc3)cc2)cc1,0.37562068
666
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(*)cc5)CCCCCC4)cc3)cc2)cc1,0.37649846
667
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C=C4CCCCC(=Cc5ccc(*)cc5)C4=O)cc3)cc2)cc1,0.3771408
668
+ *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(Sc4ccc(*)cc4)cc3)cc2)cc1,0.37748733
669
+ *Oc1ccc(SSc2ccc(*)cc2)cc1,0.3974058
670
+ *Oc1ccc2ccc(Oc3ccc(C(=O)Nc4ccc(C(C)(C)c5ccc(C(C)(C)c6ccc(NC(=O)c7ccc(*)cc7)cc6)cc5)cc4)cc3)cc2c1,0.36959735
671
+ *Oc1cccc(C(=O)Nc2ccc(-c3ccc(NC(=O)c4cccc(Oc5nc(*)nc(Sc6ccccc6)n5)c4)cc3)cc2)c1,0.35084083
672
+ *Oc1cccc(C(=O)OCC(F)(F)C(F)(F)C(F)(F)COC(=O)c2cccc(*)c2)c1,0.33298625
673
+ *Oc1cccc(NC(=O)c2cc(NC(=O)c3ccc(OC(C)=O)cc3)cc(C(=O)Nc3ccc(*)cc3)c2)c1,0.3340158
674
+ *Oc1cccc(NC(=O)c2ccc(C(=O)c3cccc(C(=O)Nc4ccc(*)cc4)c3)cc2)c1,0.35065699
675
+ *Oc1cccc(NC(=O)c2ccc(P(=O)(c3ccccc3)c3ccc(C(=O)Nc4cccc(Oc5ccc(P(=O)(c6ccccc6)c6ccc(*)cc6)cc5)c4)cc3)cc2)c1,0.36861977
676
+ *Oc1cccc(OC(=O)c2ccc([Si](C)(C)c3ccc(C(*)=O)cc3)cc2)c1,0.3731365
677
+ *Oc1cccc(Oc2ccc(NC(=O)c3ccc(Oc4cccc(Oc5ccc(C(=O)Nc6ccc(*)cc6)cc5)c4)cc3)cc2)c1,0.3505917
678
+ *Oc1cccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(C(=O)c5ccc(*)cc5)cc4)cc3)cc2)c1,0.36939797
679
+ *Oc1cccc(Oc2ccc3c(=O)n4c5cc(-c6ccc7c(c6)nc6c8ccc(*)c9cccc(c(=O)n76)c98)ccc5nc4c4cccc2c34)c1,0.3689235
680
+ *Oc1cccc2ccc(Oc3ccc(NC(=O)c4ccc(Oc5cccc(Oc6ccc(C(=O)Nc7ccc(*)cc7)cc6)c5)cc4)cc3)cc12,0.35508677
681
+ *Oc1cccc2ccc(Oc3ccc(NC(=O)c4ccc(S(=O)(=O)c5ccc(C(=O)Nc6ccc(*)cc6)cc5)cc4)cc3)cc12,0.35780365
682
+ *Sc1ccc(Cc2ccc(SC(*)=O)cc2)cc1,0.37184413
683
+ *c1c(C)cc(-c2cc(C)c(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)c(C)c2)cc1C,0.41438034
684
+ *c1c(C)cc(C(C)(C)c2cc(C)c(S(*)(=O)=O)c(C)c2)cc1C,0.39361578
685
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCC)c(C(=O)OCCCC)c1,0.40136201
686
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCC)c(C(=O)OCCCCCC)c1,0.40008637
687
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCC)c(C(=O)OCCCCCCCC)c1,0.39346173
688
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCCCC)c(C(=O)OCCCCCCCCCC)c1,0.39569707
689
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCCCCCC)c(C(=O)OCCCCCCCCCCCC)c1,0.39501882
690
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCCCCCCCC)c(C(=O)OCCCCCCCCCCCCCC)c1,0.39219199
691
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCCCCCCCCCC)c(C(=O)OCCCCCCCCCCCCCCCC)c1,0.38889387
692
+ *c1cc(Br)c(-c2c(Br)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(=O)OCCCCCCCCCCCCCCCCCC)c(C(=O)OCCCCCCCCCCCCCCCCCC)c1,0.39124104
693
+ *c1cc(C(C)(C)c2ccc(O)cc2)cc(*)c1O,0.36895404
694
+ *c1cc(C)c(-c2c(C)cc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C)c(C)c1,0.42456606
695
+ *c1cc(C)c(Cc2cc(C)c(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2C)cc1C,0.38884791
696
+ *c1cc(CCCCCCOC(=O)Cc2ccccc2)c(*)s1,0.36920522
697
+ *c1cc(Oc2c(C)cc(-c3cc(C)c(Oc4cc(N5C(=O)c6ccc(-c7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc(C(F)(F)F)c4)c(C)c3)cc2C)cc(C(F)(F)F)c1,0.3863168
698
+ *c1ccc(-c2cc(-c3ccc(OCCCC#N)cc3)cc(-c3ccc(N4C(=O)c5ccc(-c6ccc7c(c6)C(=O)N(*)C7=O)cc5C4=O)cc3)n2)cc1,0.38426678
699
+ *c1ccc(-c2cc(C(C)(C)C)c(Oc3ccc(N4C(=O)c5ccc(S(=O)(=O)c6ccc7c(c6)C(=O)N(*)C7=O)cc5C4=O)cc3)c(C(C)(C)C)c2)cc1,0.40835599
700
+ *c1ccc(-c2cc(C(C)(C)C)c(Oc3ccc(N4C(=O)c5ccc(S(=O)(=O)c6ccc7c(c6)C(=O)N(*)C7=O)cc5C4=O)cc3C(F)(F)F)c(C(C)(C)C)c2)cc1,0.40507785
701
+ *c1ccc(-c2cc(CCCCCCBr)c(*)s2)s1,0.4247369
702
+ *c1ccc(-c2cc3c(ccc4ccccc43)cc2-c2ccc(N(*)c3ccc(C)cc3)cc2)cc1,0.39961477
703
+ *c1ccc(-c2ccc(-c3cc(-c4ccccc4)c4cc(-c5ccc6nc(*)cc(-c7ccccc7)c6c5)ccc4n3)cc2)cc1,0.41687752
704
+ *c1ccc(-c2ccc(-c3ccc(-c4ccc(-c5ccc(*)n5CCCCCCCCCCCC)s4)s3)s2)s1,0.43060128
705
+ *c1ccc(-c2ccc(-c3ccc(-c4ccc(-c5ccc6c(c5)C(=O)N(c5ccc(Oc7ccc(-c8ccc(Oc9ccc(N%10C(=O)c%11ccc(*)cc%11C%10=O)cc9)cc8)cc7)cc5)C6=O)cc4)cc3)cc2)cc1,0.36472216
706
+ *c1ccc(-c2ccc(-c3ccc([Si](*)(C)C)s3)s2)s1,0.45230424
707
+ *c1ccc(-c2ccc(-c3ccc([Si](*)(CCCC)CCCC)s3)s2)s1,0.43066164
708
+ *c1ccc(-c2ccc(-c3ccc([Si](C)(C)[Si](*)(C)C)s3)s2)s1,0.45398953
709
+ *c1ccc(-c2ccc(-c3ccc([Si](CCCC)(CCCC)[Si](*)(CCCC)CCCC)s3)s2)s1,0.42687929
710
+ *c1ccc(-c2ccc(-c3nnc(-c4ccc([Si](c5ccccc5)(c5ccccc5)c5ccc(-c6nnc(*)o6)cc5)cc4)o3)cc2)cc1,0.46586821
711
+ *c1ccc(-c2ccc(C3(*)c4ccccc4-c4ccccc43)cc2)cc1,0.41877221
712
+ *c1ccc(-c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)c(C)c2)cc1C,0.39090779
713
+ *c1ccc(-c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(F)(F)F)c(C(F)(F)F)c1,0.40447598
714
+ *c1ccc(-c2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)c(OC)c2)cc1OC,0.36448396
715
+ *c1ccc(-c2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2)cc1,0.36285392
716
+ *c1ccc(-c2ccc(NC(=O)c3cccc(C(=O)Nc4ccc(-c5ccc(N6C(=O)c7ccc(Oc8ccc9c(c8)C(=O)N(*)C9=O)cc7C6=O)cc5)cc4)c3)cc2)cc1,0.35467611
717
+ *c1ccc(-c2ccc([Si](*)(C)C)s2)s1,0.43527879
718
+ *c1ccc(-c2ccc([Si](*)(CCCC)CCCC)s2)s1,0.4246241
719
+ *c1ccc(-c2ccc([Si](CCCC)(CCCC)[Si](*)(CCCC)CCCC)s2)s1,0.42578552
720
+ *c1ccc(-c2cnc3cc(-c4ccc5nc(*)cnc5c4)ccc3n2)cc1,0.43314968
721
+ *c1ccc(-c2cnc3ccc(-c4ccc5nc(*)cnc5c4)cc3n2)cc1,0.42108918
722
+ *c1ccc(-c2nc3cc(C(=O)c4ccc5nc(-c6ccccc6)c(-c6ccc(N7C(=O)C8OC9C(=O)N(*)C(=O)C9C8C7=O)cc6)nc5c4)ccc3nc2-c2ccccc2)cc1,0.41616021
723
+ *c1ccc(-c2nc3cc(Oc4ccc5nc(-c6ccccc6)c(*)nc5c4)ccc3nc2-c2ccccc2)cc1,0.4245693
724
+ *c1ccc(/C=C(\C#N)C(=O)NC2CCCCC2NC(=O)/C(C#N)=C/c2ccc(N(c3ccccc3)c3ccc(N(*)c4ccccc4)cc3)cc2)cc1,0.39541341
725
+ *c1ccc(C(=O)NCCCCCCCCNC(=O)c2ccc(-c3nc4ccccc4nc3*)cc2)cc1,0.36159736
726
+ *c1ccc(C(=O)OCCCCCCCCCCOc2ccc(C=C3CCC(=Cc4ccc(OCCCCCCCCCCOC(=O)c5ccc(-c6nnc(*)o6)cc5)cc4)C3=O)cc2)cc1,0.36684911
727
+ *c1ccc(C(=O)OCCCCCCOc2ccc(C=C3CCCCC(=Cc4ccc(OCCCCCCOC(=O)c5ccc(-c6nnc(*)o6)cc5)cc4)C3=O)cc2)cc1,0.36570631
728
+ *c1ccc(C(=O)Oc2ccc3ccc(OC(=O)c4ccc(N5C(=O)CC(SCCOCCSC6CC(=O)N(*)C6=O)C5=O)cc4)cc3c2)cc1,0.33882496
729
+ *c1ccc(C(=O)c2cccc(C(=O)c3ccc(S(*)(=O)=O)cc3)c2)cc1,0.3680485
730
+ *c1ccc(C(C)(C)c2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2)cc1,0.38365681
731
+ *c1ccc(C(Cl)(Cl)C(*)Cl)cc1,0.39641701
732
+ *c1ccc(C(F)(F)C(*)(F)F)cc1,0.37735155
733
+ *c1ccc(C(c2ccccc2)(c2ccc(-c3nc4ccc(-c5ccc6nc(*)oc6c5)cc4o3)cc2)C(F)(F)F)cc1,0.42450788
734
+ *c1ccc(C(c2ccccc2)c2ccc(N(c3ccc(C)cc3)c3ccc(-c4ccc(N(*)c5ccc(C)cc5)cc4)cc3)cc2)cc1,0.43608862
735
+ *c1ccc(C2(c3ccc(-c4cc(-c5ccccc5)c5cc(Oc6ccc7nc(*)cc(-c8ccccc8)c7c6)ccc5n4)cc3)c3ccccc3-c3ccccc32)cc1,0.42387068
736
+ *c1ccc(C2C3C(=O)N(c4ccc(N5C(=O)C6ON(C)C(*)C6C5=O)cc4)C(=O)C3ON2C)cc1,0.38700602
737
+ *c1ccc(Cc2ccc(N3C(=O)CC(Nc4ccc(N(c5ccc(NC6CC(=O)N(*)C6=O)cc5)c5ccc([N+](=O)[O-])cc5)cc4)C3=O)cc2)cc1,0.37849239
738
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.38180524
739
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)c(C)c2)cc1C,0.37895775
740
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)c(C)c2C)c(C)c1C,0.38199636
741
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)c(OC)c2)cc1OC,0.37197016
742
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2)cc1,0.36838096
743
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2C)c(C)c1,0.37649357
744
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(S(=O)(=O)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)c(C(C)(C)C)c2)cc1C(C)(C)C,0.40289103
745
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(S(=O)(=O)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.38388245
746
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Sc5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)c(C(C)(C)C)c2)cc1C(C)(C)C,0.40667097
747
+ *c1ccc(Cc2ccc(N3C(=O)c4ccc(Sc5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.37611089
748
+ *c1ccc(Cc2ccc(NC(=O)c3ccc(Oc4cccc5c4C(=O)N(*)C5=O)cc3)cc2)cc1,0.36075451
749
+ *c1ccc(Cc2ccc(S(*)(=O)=O)cc2)cc1,0.37585986
750
+ *c1ccc(N2C(=O)c3ccc(C(c4ccc5c(c4)C(=O)N(c4ccc(N6C(=O)c7ccc(-c8ccc9c(c8)C(=O)N(*)C9=O)cc7C6=O)cc4)C5=O)(C(F)(F)F)C(F)(F)F)cc3C2=O)cc1,0.39383584
751
+ *c1ccc(N2C(=O)c3ccc(Oc4ccc(Sc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4)cc3C2=O)cc1,0.36725031
752
+ *c1ccc(NC(=O)Cc2ccc(-c3sc(-c4ccc(CC(=O)Nc5ccc(-c6sc(*)c(-c7ccccc7)c6-c6ccccc6)cc5)cc4)c(-c4ccccc4)c3-c3ccccc3)cc2)cc1,0.40560913
753
+ *c1ccc(NC(=O)c2ccc(NC(=O)c3ccc([Si](C)(C)c4ccc(C(=O)Nc5ccc(C(=O)Nc6ccc(-c7sc(*)c(-c8ccccc8)c7-c7ccccc7)cc6)cc5)cc4)cc3)cc2)cc1,0.38057891
754
+ *c1ccc(NC(=O)c2ccc(NC(=O)c3ccc([Si](C)(C)c4ccc(C(=O)Nc5ccc(C(=O)Nc6ccc(S(*)(=O)=O)cc6)cc5)cc4)cc3)cc2)cc1,0.3556199
755
+ *c1ccc(NC(=O)c2ccc(OCCOCCOc3ccc(C(=O)Nc4ccc5[nH]c(*)nc5c4)cc3)cc2)cc1,0.34897445
756
+ *c1ccc(OC(=O)CCCCCCCCCCCCCCCCCCCCC(=O)Oc2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.35924634
757
+ *c1ccc(OC(=O)c2cccc(C(=O)Oc3ccc(C4(*)NC(=O)c5ccccc54)cc3)c2)cc1,0.35684437
758
+ *c1ccc(OC(=O)c2cccc(C(=O)Oc3ccc(C4(*)c5ccccc5C(=O)N4C)cc3)c2)cc1,0.36854686
759
+ *c1ccc(OCCCCOc2ccc(-c3nc4ccc(-c5ccc6nc(*)c(-c7ccccc7)nc6c5)cc4nc3-c3ccccc3)cc2)cc1,0.38981232
760
+ *c1ccc(OCCN(CC)c2ccc(-c3ccc(C(=C(C#N)C#N)c4ccc(-c5ccc(N(CC)CC)cc5)s4)s3)cc2)c(-c2cc(-c3ccccc3)c3cc(Oc4ccc5nc(*)cc(-c6ccccc6)c5c4)ccc3n2)c1,0.40222587
761
+ *c1ccc(Oc2cc3ccccc3cc2Oc2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.36809048
762
+ *c1ccc(Oc2ccc(-c3c(-c4ccccc4)c(-c4ccc(-c5nc6ccccc6c(*)c5-c5ccccc5)cc4)nc4ccccc34)cc2)cc1,0.40153318
763
+ *c1ccc(Oc2ccc(-c3cc(-c4ccccc4)c4cc5c(-c6ccccc6)cc(*)nc5cc4n3)cc2)cc1,0.41795495
764
+ *c1ccc(Oc2ccc(-c3cc(OCCCCCC)c(-c4ccc(Oc5ccc(-c6nnc(*)o6)cc5)cc4)cc3OCCCCCC)cc2)cc1,0.39000442
765
+ *c1ccc(Oc2ccc(-c3ccc(Oc4ccc(-c5cnc6cc(-c7ccc8nc(*)cnc8c7)ccc6n5)cc4)cc3)cc2)cc1,0.38709806
766
+ *c1ccc(Oc2ccc(-c3ccc(Oc4ccc(-c5nc(-c6ccccn6)nnc5*)cc4)cc3)cc2)cc1,0.39038278
767
+ *c1ccc(Oc2ccc(-c3ccc(Oc4ccc(N5C(=O)c6ccc(Oc7ccc(C8(c9ccc(Oc%10ccc%11c(c%10)C(=O)N(*)C%11=O)cc9)CCC(c9ccccc9)CC8)cc7)cc6C5=O)cc4)cc3C)c(C)c2)cc1,0.38011618
768
+ *c1ccc(Oc2ccc(-c3nc4cc(Oc5ccc6nc(-c7ccccc7)c(*)nc6c5)ccc4nc3-c3ccccc3)cc2)cc1,0.41406386
769
+ *c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(-c6ncc(*)s6)c(=O)c5cc4c3=O)cc2)cc1,0.39753647
770
+ *c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(-c5c6c(c(*)c7ccccc57)C(=O)N(C)C6=O)cc4)cc3)cc2)cc1,0.37568068
771
+ *c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(-c5c6c(c(*)c7ccccc57)C(=O)N(CCCCCCCCCCCC)C6=O)cc4)cc3)cc2)cc1,0.37658949
772
+ *c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(-c5cc(*)n(-c6ccccc6)n5)cc4)cc3)cc2)cc1,0.39170081
773
+ *c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(-c5nnc(*)c6c(-c7ccc(F)cc7)c(-c7ccc(F)cc7)c(-c7ccc(F)cc7)c(-c7ccc(F)cc7)c56)cc4)cc3)cc2)cc1,0.4007497
774
+ *c1ccc(Oc2ccc(C(=O)c3ccc4cc(C(=O)c5ccc(Oc6ccc(-c7nc(-c8ccccc8)[nH]c7*)cc6)cc5)ccc4c3)cc2)cc1,0.38999587
775
+ *c1ccc(Oc2ccc(C(=O)c3cccc(C(=O)c4ccc(Oc5ccc(-c6cc(*)n(-c7ccccc7)n6)cc5)cc4)c3)cc2)cc1,0.38437007
776
+ *c1ccc(Oc2ccc(C(C)(C)c3ccc(Oc4ccc(-c5nnc(*)c6c(-c7ccccc7)c(-c7ccc8ccccc8c7)c(-c7ccc8ccccc8c7)c(-c7ccccc7)c56)cc4)cc3)cc2)cc1,0.40019815
777
+ *c1ccc(Oc2ccc(C(C)(C)c3ccc(Oc4ccc(N5C(=O)C6CCC7C(=O)N(*)C(=O)C7C6C5=O)cc4)cc3)cc2)cc1,0.37215765
778
+ *c1ccc(Oc2ccc(C(C)(C)c3ccc(Oc4ccc(N5C(=O)c6ccc(Oc7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc4)cc3)cc2)cc1,0.37435414
779
+ *c1ccc(Oc2ccc(C(c3ccc(Oc4ccc(-c5nnc(*)c6c(-c7ccccc7)c(-c7ccc8ccccc8c7)c(-c7ccc8ccccc8c7)c(-c7ccccc7)c56)cc4)cc3)(C(F)(F)F)C(F)(F)F)cc2)cc1,0.39876705
780
+ *c1ccc(Oc2ccc(C(c3ccc(Oc4ccc(N5C(=O)c6ccc(-c7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc4)cc3)(C(F)(F)F)C(F)(F)F)cc2)cc1,0.37203901
781
+ *c1ccc(Oc2ccc(C(c3ccc(Oc4ccc(N5C(=O)c6ccc(Oc7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc4)cc3)(C(F)(F)F)C(F)(F)F)cc2)cc1,0.37553925
782
+ *c1ccc(Oc2ccc(C3(c4ccc(Oc5ccc(N6C(=O)c7ccc(Oc8ccccc8Oc8ccc9c(c8)C(=O)N(*)C9=O)cc7C6=O)cc5C(F)(F)F)cc4)c4ccccc4-c4ccccc43)cc2)c(C(F)(F)F)c1,0.3879662
783
+ *c1ccc(Oc2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.37648982
784
+ *c1ccc(Oc2ccc(N3C(=O)c4ccc(Oc5cc6ccccc6cc5Oc5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.36464157
785
+ *c1ccc(Oc2ccc(N3C(=O)c4ccc(P(=O)(c5ccccc5)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.39239064
786
+ *c1ccc(Oc2ccc(N3C(=O)c4ccc([Si](C)(C)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.38889031
787
+ *c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5c(C)cc(-c6cc(C)c(Oc7cccc8c7C(=O)N(*)C8=O)c(C)c6)cc5C)c4C3=O)cc2)cc1,0.38403589
788
+ *c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5c(C)cc(Cc6cc(C)c(Oc7cccc8c7C(=O)N(*)C8=O)c(C)c6)cc5C)c4C3=O)cc2)cc1,0.38048862
789
+ *c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5ccc(-c6ccc(Oc7cccc8c7C(=O)N(*)C8=O)cc6)cc5)c4C3=O)cc2)cc1,0.36710709
790
+ *c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5ccc(C(C)(C)c6ccc(Oc7cccc8c7C(=O)N(*)C8=O)cc6)cc5)c4C3=O)cc2)cc1,0.37037622
791
+ *c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5ccc(Sc6ccc(Oc7cccc8c7C(=O)N(*)C8=O)cc6)cc5)c4C3=O)cc2)cc1,0.36952098
792
+ *c1ccc(Oc2ccc(NC(=O)CN3C(=O)c4ccc(C(c5ccc6c(c5)C(=O)N(CC(=O)Nc5ccc(Oc7ccc(-c8nnc(*)o8)cc7)cc5)C6=O)(C(F)(F)F)C(F)(F)F)cc4C3=O)cc2)cc1,0.3576037
793
+ *c1ccc(Oc2ccc(NC(=O)c3cc(NC(=O)c4ccc(OC(C)=O)cc4)cc(C(=O)Nc4ccc(Oc5ccc(-c6nnc(*)o6)cc5)cc4)c3)cc2)cc1,0.35432779
794
+ *c1ccc(Oc2ccc(NC(=O)c3cccc(Oc4ccc5c(c4)C(=O)N(*)C5=O)c3)cc2)cc1,0.35195763
795
+ *c1ccc(Oc2ccc(Oc3ccc(-c4nnc(*)c5c(-c6ccc(F)cc6)c(-c6ccc(F)cc6)c(-c6ccc(F)cc6)c(-c6ccc(F)cc6)c45)cc3)cc2)cc1,0.40913388
796
+ *c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc(Oc7ccc(Oc8ccc9c(c8)C(=O)N(*)C9=O)cc7)cc6)cc5C4=O)cc3)cc2)cc1,0.36392805
797
+ *c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5cccc(Oc6c(C)cc(Cc7cc(C)c(Oc8cccc9c8C(=O)N(*)C9=O)c(C)c7)cc6C)c5C4=O)c(C)c3)cc2)cc1C,0.38135415
798
+ *c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5cccc(Oc6ccc(C(C)(C)c7ccc(Oc8cccc9c8C(=O)N(*)C9=O)cc7)cc6)c5C4=O)c(C)c3)cc2)cc1C,0.37551052
799
+ *c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(-c5nc(-c6ccc(-c7nc(*)c(-c8ccccc8)[nH]7)cc6)[nH]c5-c5ccccc5)cc4)cc3)cc2)cc1,0.40109727
800
+ *c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(-c5nc6ccccc6n5-c5ccc(-n6c(*)nc7ccccc76)cc5)cc4)cc3)cc2)cc1,0.37966679
801
+ *c1ccc(Oc2ccc(Sc3ccc(Oc4ccc(-c5nc(*)nc(-c6ccccc6)n5)cc4)cc3)cc2)cc1,0.39963304
802
+ *c1ccc(Oc2ccc3c(c2)C(=O)N(*)C3=O)cc1,0.37183519
803
+ *c1ccc(Oc2ccc3ccccc3c2-c2c(Oc3ccc(N4C(=O)c5ccc(NC(=O)Nc6cccc7c(NC(=O)Nc8ccc9c(c8)C(=O)N(*)C9=O)cccc67)cc5C4=O)cc3)ccc3ccccc23)cc1,0.36092398
804
+ *c1ccc(Oc2cccc(N3C(=O)c4ccc(Oc5cccc6c(Oc7ccc8c(c7)C(=O)N(*)C8=O)cccc56)cc4C3=O)c2)cc1,0.37487112
805
+ *c1ccc(Oc2cccc(NC(=O)c3ccc(C(=O)Nc4cccc(Oc5ccc(-c6nnc(*)o6)cc5)c4)c(Oc4ccccc4)c3)c2)cc1,0.36258207
806
+ *c1ccc(Oc2cccc3c(NC(=O)c4ccc([Si](c5ccccc5)(c5ccccc5)c5ccc(C(=O)Nc6cccc7c(Oc8ccc(-c9nnc(*)o9)cc8)cccc67)cc5)cc4)cccc23)cc1,0.37704837
807
+ *c1ccc(Oc2ccccc2Oc2ccc(N3C(=O)c4ccc(Oc5ccccc5Oc5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.35809599
808
+ *c1ccc(Oc2ccccc2Oc2ccc(N3C(=O)c4ccc(Oc5ccccc5Oc5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2C(F)(F)F)c(C(F)(F)F)c1,0.36055432
809
+ *c1ccc(S(=O)(=O)c2ccc(N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)cc2)cc1,0.37525452
810
+ *c1ccc(Sc2ccc(Oc3ccc(Sc4ccc(N5C(=O)c6ccc(Oc7ccc(C(C)(C)c8ccc(Oc9ccc%10c(c9)C(=O)N(*)C%10=O)cc8)cc7)cc6C5=O)cc4)cc3)cc2)cc1,0.37448846
811
+ *c1ccc(Sc2ccc(Sc3ccc(N4C(=O)c5ccc(Sc6ccc(Sc7ccc(Sc8ccc9c(c8)C(=O)N(*)C9=O)cc7)cc6)cc5C4=O)cc3)cc2)cc1,0.37717954
812
+ *c1ccc(Sc2ccc(Sc3ccc(Sc4ccc(N5C(=O)c6ccc(-c7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc4)cc3)cc2)cc1,0.37722426
813
+ *c1ccc([Si](c2ccccc2)(c2ccccc2)c2ccc(-c3nnc(-c4ccc(-c5nnc(*)o5)nc4)o3)cc2)cc1,0.46577218
814
+ *c1ccc2c(c1)C(=O)N(C1CCC(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)CC1)C2=O,0.37690953
815
+ *c1ccc2c(c1)C(=O)N(c1c(C)c(C)c(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)c(C)c1C)C2=O,0.41737364
816
+ *c1ccc2c(c1)C(=O)N(c1c(C)cc(-c3cc(C)c(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)c4ccccc34)c3ccccc13)C2=O,0.41291503
817
+ *c1ccc2c(c1)C(=O)N(c1cc(-c3ccc(O)c(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)c3)ccc1O)C2=O,0.37301864
818
+ *c1ccc2c(c1)C(=O)N(c1cc(C(=O)Nc3ccc(C(c4ccc(NC(=O)c5ccc(C)c(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)c5)cc4)(C(F)(F)F)C(F)(F)F)cc3)ccc1C)C2=O,0.36683082
819
+ *c1ccc2c(c1)C(=O)N(c1cc(C)c(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)cc1C)C2=O,0.40933634
820
+ *c1ccc2c(c1)C(=O)N(c1cc(OCCN(CC)c3ccc(N=Nc4ccc([N+](=O)[O-])cc4)cc3)cc(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)c1)C2=O,0.37866227
821
+ *c1ccc2c(c1)C(=O)N(c1ccc(-c3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)c(OCCOc4ccc5c(C)cc(=O)oc5c4)c3)cc1OCCOc1ccc3c(C)cc(=O)oc3c1)C2=O,0.34983336
822
+ *c1ccc2c(c1)C(=O)N(c1ccc(C(=O)Nc3ccc(Oc4cccc(Oc5ccc(NC(=O)c6ccc(N7C(=O)c8ccc(C(*)(C(F)(F)F)C(F)(F)F)cc8C7=O)cc6)cc5)c4)cc3)cc1)C2=O,0.35654385
823
+ *c1ccc2c(c1)C(=O)N(c1ccc(C(c3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)(C(F)(F)F)C(F)(F)F)cc1)C2=O,0.39524704
824
+ *c1ccc2c(c1)C(=O)N(c1ccc(N3C(=O)c4ccc(C(*)(C(F)(F)F)C(F)(F)F)cc4C3=O)cc1)C2=O,0.39320221
825
+ *c1ccc2c(c1)C(=O)N(c1ccc(NC(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)Nc3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)cc1)C2=O,0.34222618
826
+ *c1ccc2c(c1)C(=O)N(c1ccc(OCCOCCOCCOCCOc3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)cc1)C2=O,0.34187588
827
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(C(c4ccc(Oc5ccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)cc5C(F)(F)F)cc4)(C(F)(F)F)C(F)(F)F)cc3)c(C(F)(F)F)c1)C2=O,0.38326805
828
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)cc1)C2=O,0.38674054
829
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(S(=O)(=O)c4ccc(Oc5ccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)cc5)cc4)cc3)cc1)C2=O,0.37337767
830
+ *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccccc3-c3ccccc3Oc3ccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)cc3)cc1)C2=O,0.37740671
831
+ *c1ccc2c(c1)C(=O)N(c1cccc(C(=O)Nc3ccc(C(c4ccc(NC(=O)c5cccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)c5)cc4)(C(F)(F)F)C(F)(F)F)cc3)c1)C2=O,0.36118518
832
+ *c1ccc2c(c1)C(=O)N(c1cccc(C(=O)Nc3ccc(C(c4ccc(NC(=O)c5cccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)c5C)cc4)(C(F)(F)F)C(F)(F)F)cc3)c1C)C2=O,0.36895419
833
+ *c1ccc2c(c1)C(=O)N(c1cccc(C(=O)Nc3ccc(Oc4ccc(NC(=O)c5cccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)c5)cc4)cc3)c1)C2=O,0.35705395
834
+ *c1ccc2c(c1)C(=O)N(c1cccc(C(c3cccc(N4C(=O)c5ccc(C(*)(C(F)(F)F)C(F)(F)F)cc5C4=O)c3)(C(F)(F)F)C(F)(F)F)c1)C2=O,0.38091589
835
+ *c1ccc2c(c1)C(=O)N(c1cccc(Oc3cc(Oc4cccc(N5C(=O)c6ccc(C(*)(C(F)(F)F)C(F)(F)F)cc6C5=O)c4)ccc3P(=O)(c3ccccc3)c3ccccc3)c1)C2=O,0.37529286
836
+ *c1ccc2c(c1)C(=O)N(c1cccc(Oc3ccc(N4C(=O)c5ccc(-c6c(-c7ccccc7)c(-c7ccccc7)c(-c7ccc(Sc8ccc(-c9c(-c%10ccccc%10)c(-c%10ccccc%10)c(*)c(-c%10ccccc%10)c9-c9ccccc9)cc8)cc7)c(-c7ccccc7)c6-c6ccccc6)cc5C4=O)cc3)c1)C2=O,0.4011794
837
+ *c1ccc2c(c1)C(=O)N(c1cccc(Oc3ccc(Oc4cccc(N5C(=O)c6ccc(C(*)(C(F)(F)F)C(F)(F)F)cc6C5=O)c4)cc3)c1)C2=O,0.37037326
838
+ *c1ccc2c(c1)C(=O)N(c1cccc(Oc3ccc(P(=O)(c4ccccc4)c4ccc(Oc5cccc(N6C(=O)c7ccc(C(*)(C(F)(F)F)C(F)(F)F)cc7C6=O)c5)cc4)cc3)c1)C2=O,0.37783134
839
+ *c1ccc2c(c1)C(CCCCCC)(CCCCCC)c1cc(-c3ccc4c(c3)C(CC3=NC(Cc5ccccc5)CO3)(CC3=NC(Cc5ccccc5)CO3)c3cc(*)ccc3-4)ccc1-2,0.40982823
840
+ *c1ccc2c(c1)Sc1cc(-c3sc(C=CC4=CC(=C(C#N)C#N)C=C(C=Cc5sc(*)c(CCCCCC)c5CCCCCC)O4)c(CCCCCC)c3CCCCCC)ccc1N2CCCCCC,0.41634717
841
+ *c1ccc2c(c1)Sc1cc(-c3sc4cc(*)sc4c3CCCCC)ccc1N2CCCCCC,0.52516398
842
+ *c1ccc2c(c1)Sc1cc(-c3sc4cc(*)sc4c3CCCCC)ccc1N2CCCCCCCCCC,0.50803301
843
+ *c1ccc2c(c1)Sc1cc(-c3sc4cc(*)sc4c3CCCCC)ccc1N2CCCCCCCCCCCC,0.4916283
844
+ *c1ccc2c3ccc(-c4c5ccc(C=Cc6ccc(N(CCCCCC)CCCCCC)cc6)cc5c(*)c5ccc(C=Cc6ccc(N(CCCCCC)CCCCCC)cc6)cc45)cc3n(CCCCCCCC)c2c1,0.40341568
845
+ *c1ccc2oc(-c3cccc(-c4nc5cc(C(*)(C(F)(F)F)C(F)(F)F)ccc5o4)n3)nc2c1,0.45901468
846
+ *c1ccc2oc(C3CCC(c4nc5cc(C(*)(C(F)(F)F)C(F)(F)F)ccc5o4)CC3)nc2c1,0.38067376
847
+ *c1cccc(-c2nc3cc(Oc4ccc5nc(-c6ccccc6)c(*)nc5c4)ccc3nc2-c2ccccc2)c1,0.41748276
848
+ *c1cccc(C(=O)Nc2ccc(Cc3ccc(N4C(=O)c5ccc(S(=O)(=O)c6ccc7c(c6)C(=O)N(c6ccc(Cc8ccc(NC(=O)c9cccc(-c%10nc%11cc(-c%12ccc%13[nH]c(*)nc%13c%12)ccc%11[nH]%10)c9)cc8)cc6)C7=O)cc5C4=O)cc3)cc2)c1,0.37929385
849
+ *c1cccc(C(=O)Nc2ccc(NC(=O)c3cccc(N4C(=O)c5ccc(-c6cccc7c6C(=O)N(*)C7=O)cc5C4=O)c3)cc2)c1,0.34929698
850
+ *c1cccc(C(=O)Nc2ccc(Oc3ccc(-c4ccc(Oc5ccc(NC(=O)c6cccc(N7C(=O)c8ccc(-c9cccc%10c9C(=O)N(*)C%10=O)cc8C7=O)c6)cc5)cc4)cc3)cc2)c1,0.35563189
851
+ *c1cccc(C(=O)Nc2ccc(Oc3ccc(Oc4ccc(NC(=O)c5cccc(N6C(=O)c7ccc(-c8ccc9c(c8)C(=O)N(*)C9=O)cc7C6=O)c5)cc4)cc3)cc2)c1,0.35757013
852
+ *c1cccc(C(=O)Nc2ccc(Oc3ccc(Oc4ccc(Oc5ccc(NC(=O)c6cccc(N7C(=O)c8ccc(-c9ccc%10c(c9)C(=O)N(*)C%10=O)cc8C7=O)c6)cc5)cc4)cc3)cc2)c1,0.35608701
853
+ *c1cccc(C(=O)Nc2cccc(S(=O)(=O)c3cccc(NC(=O)c4cccc(N5C(=O)c6ccc(-c7cccc8c7C(=O)N(*)C8=O)cc6C5=O)c4)c3)c2)c1,0.35520964
854
+ *c1cccc(Cc2cccc(N3C(=O)c4ccc([Si](C)(C)O[Si](C)(C)O[Si](C)(C)c5ccc6c(c5)C(=O)N(*)C6=O)cc4C3=O)c2)c1,0.38675185
855
+ *c1cccc(N2C(=O)c3ccc(Oc4ccc(Sc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4)cc3C2=O)c1,0.36600422
856
+ *c1cccc(NC(=O)c2ccc(-c3ccc(C(=O)Nc4cccc(S(*)(=O)=O)c4)c(C)c3)cc2C)c1,0.3574533
857
+ *c1cccc(NC(=O)c2ccc(OCCOCCOc3ccc(C(=O)Nc4ccc5[nH]c(*)nc5c4)cc3)cc2)c1,0.34893481
858
+ *c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1,0.35046399
859
+ *c1cccc(OCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1,0.34909456
860
+ *c1cccc(OCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1,0.35089152
861
+ *c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1,0.34538642
862
+ *c1cccc(Oc2cccc(Oc3cccc(N4C(=O)c5ccc(Oc6ccc(Sc7ccc(Oc8ccc9c(c8)C(=O)N(*)C9=O)cc7)cc6)cc5C4=O)c3)c2)c1,0.36222427
863
+ *c1cccc(P(C)(=O)c2cccc(N3C(=O)c4ccc(Oc5ccc(C(C)(C)c6ccc(Oc7ccc8c(c7)C(=O)N(*)C8=O)cc6)cc5)cc4C3=O)c2)c1,0.36957365
simson_modeling/moleculenet_eval/.ipynb_checkpoints/better_eval-checkpoint.py ADDED
@@ -0,0 +1,671 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import BertConfig, BertModel, AutoTokenizer
8
+ from rdkit import Chem, RDLogger
9
+ from rdkit.Chem.Scaffolds import MurckoScaffold
10
+ import copy
11
+ from tqdm import tqdm
12
+ import os
13
+ from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
14
+ from itertools import compress
15
+ from collections import defaultdict
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
18
+ import optuna
19
+ import warnings
20
+ warnings.filterwarnings("ignore")
21
+ RDLogger.DisableLog('rdApp.*')
22
+
23
+ torch.set_float32_matmul_precision('high')
24
+
25
+ # --- 0. Pre-computed Contrastive SMILES Dataset ---
26
+ class PrecomputedContrastiveSmilesDataset(Dataset):
27
+ """
28
+ A Dataset class that reads pre-augmented SMILES pairs from a Parquet file.
29
+ This is significantly faster as it offloads the expensive SMILES randomization
30
+ to a one-time preprocessing step.
31
+ """
32
+ def __init__(self, tokenizer, file_path: str, max_length: int = 512):
33
+ self.tokenizer = tokenizer
34
+ self.max_length = max_length
35
+
36
+ # Load the entire dataset from the Parquet file into memory.
37
+ # This is fast and efficient for subsequent access.
38
+ print(f"Loading pre-computed data from {file_path}...")
39
+ self.data = pd.read_parquet(file_path)
40
+ print("Data loaded successfully.")
41
+
42
+ def __len__(self):
43
+ """Returns the total number of pairs in the dataset."""
44
+ return len(self.data)
45
+
46
+ def __getitem__(self, idx):
47
+ """
48
+ Retrieves a pre-augmented pair, tokenizes it, and returns it
49
+ in the format expected by the DataCollator.
50
+ """
51
+ # Retrieve the pre-augmented pair from the DataFrame
52
+ row = self.data.iloc[idx]
53
+ smiles_1 = row['smiles_1']
54
+ smiles_2 = row['smiles_2']
55
+
56
+ # Tokenize the pair. This operation is fast and remains in the data loader.
57
+ tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
58
+ tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
59
+
60
+ return {
61
+ 'input_ids_1': torch.tensor(tokens_1['input_ids']),
62
+ 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
63
+ 'input_ids_2': torch.tensor(tokens_2['input_ids']),
64
+ 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
65
+ }
66
+
67
+ # --- 0a. SMILES enumeration for preprocessing ---
68
+ class SmilesEnumerator:
69
+ """Generates randomized SMILES strings for data augmentation."""
70
+ def randomize_smiles(self, smiles):
71
+ try:
72
+ mol = Chem.MolFromSmiles(smiles)
73
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
74
+ except Exception: # fall back to the input SMILES if RDKit fails to parse it
75
+ return smiles
76
+
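A quick sanity check of the enumerator above, as a hedged sketch (assumes RDKit is installed; the aspirin SMILES is just an example input, not from the training data): every randomized string should round-trip to the same canonical molecule.

    from rdkit import Chem

    enum = SmilesEnumerator()
    aspirin = 'CC(=O)Oc1ccccc1C(=O)O'  # example molecule
    variants = {enum.randomize_smiles(aspirin) for _ in range(10)}
    # Different surface strings, one underlying molecule:
    assert all(Chem.CanonSmiles(v) == Chem.CanonSmiles(aspirin) for v in variants)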
77
+ def compute_embedding_similarity_precomputed(encoder, dataset, device):
78
+ """
79
+ Compute embedding similarity using pre-computed augmented SMILES pairs
80
+ """
81
+ encoder.eval()
82
+ similarities = []
83
+
84
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
85
+
86
+ with torch.no_grad():
87
+ for batch in dataloader:
88
+ input_ids_1 = batch['input_ids_1'].to(device)
89
+ attention_mask_1 = batch['attention_mask_1'].to(device)
90
+ input_ids_2 = batch['input_ids_2'].to(device)
91
+ attention_mask_2 = batch['attention_mask_2'].to(device)
92
+
93
+ emb_1 = encoder(input_ids_1, attention_mask_1).cpu().numpy()
94
+ emb_2 = encoder(input_ids_2, attention_mask_2).cpu().numpy()
95
+
96
+ # Compute cosine similarity for each pair in the batch
97
+ batch_similarities = []
98
+ for i in range(len(emb_1)):
99
+ sim = cosine_similarity([emb_1[i]], [emb_2[i]])[0][0]
100
+ batch_similarities.append(sim)
101
+
102
+ similarities.extend(batch_similarities)
103
+
104
+ return np.array(similarities)
105
+
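The per-pair loop above calls sklearn's cosine_similarity once per row; an equivalent vectorized form (a sketch, numerically identical up to float error) normalizes both embedding matrices and takes a row-wise dot product:

    import numpy as np

    def rowwise_cosine(emb_1: np.ndarray, emb_2: np.ndarray) -> np.ndarray:
        # L2-normalize each row, then dot corresponding rows.
        a = emb_1 / np.linalg.norm(emb_1, axis=1, keepdims=True)
        b = emb_2 / np.linalg.norm(emb_2, axis=1, keepdims=True)
        return np.sum(a * b, axis=1)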
106
+ def create_augmented_smiles_file(smiles_list, output_path, num_augmentations=1):
107
+ """
108
+ Create a parquet file with pre-computed augmented SMILES pairs
109
+ """
110
+ enumerator = SmilesEnumerator()
111
+ pairs = []
112
+
113
+ print(f"Generating {num_augmentations} augmentations for {len(smiles_list)} SMILES...")
114
+
115
+ for smiles in tqdm(smiles_list):
116
+ for _ in range(num_augmentations):
117
+ augmented = enumerator.randomize_smiles(smiles)
118
+ pairs.append({
119
+ 'smiles_1': smiles,
120
+ 'smiles_2': augmented
121
+ })
122
+
123
+ df = pd.DataFrame(pairs)
124
+ df.to_parquet(output_path, index=False)
125
+ print(f"Saved {len(pairs)} augmented pairs to {output_path}")
126
+ return output_path
127
+
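Composed together, the helpers above implement the pre-computation workflow that PrecomputedContrastiveSmilesDataset consumes. A minimal sketch; the tokenizer checkpoint and output file name are placeholders, not the ones used in this repository:

    from transformers import AutoTokenizer
    from torch.utils.data import DataLoader

    toy_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
    path = create_augmented_smiles_file(toy_smiles, 'pairs.parquet', num_augmentations=2)

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # placeholder checkpoint
    dataset = PrecomputedContrastiveSmilesDataset(tokenizer, path, max_length=512)
    loader = DataLoader(dataset, batch_size=2, shuffle=True)
    batch = next(iter(loader))  # two tokenized views per molecule: *_1 and *_2 tensors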
128
+ # --- 1. Data Loading ---
129
+ def load_lists_from_url(data):
130
+ # MoleculeNet benchmark loaders: each branch downloads the dataset CSV and returns (smiles, labels)
131
+ if data == 'bbbp':
132
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
133
+ smiles, labels = df.smiles, df.p_np
134
+ elif data == 'clintox':
135
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
136
+ smiles = df.smiles
137
+ labels = df.drop(['smiles'], axis=1)
138
+ elif data == 'hiv':
139
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
140
+ smiles, labels = df.smiles, df.HIV_active
141
+ elif data == 'sider':
142
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
143
+ smiles = df.smiles
144
+ labels = df.drop(['smiles'], axis=1)
145
+ elif data == 'esol':
146
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
147
+ smiles = df.smiles
148
+ # NOTE: this selects ESOL's model-predicted column; MoleculeNet's usual target is 'measured log solubility in mols per litre'
+ labels = df['ESOL predicted log solubility in mols per litre']
149
+ elif data == 'freesolv':
150
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv')
151
+ smiles = df.smiles
152
+ labels = df.calc # NOTE: 'calc' is the calculated hydration free energy; SAMPL's experimental target column is 'expt'
153
+ elif data == 'lipophicility':
154
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv')
155
+ smiles, labels = df.smiles, df['exp']
156
+ elif data == 'tox21':
157
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
158
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
159
+ smiles = df.smiles
160
+ labels = df.drop(['mol_id', 'smiles'], axis=1)
161
+ elif data == 'bace':
162
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
163
+ smiles, labels = df.mol, df.Class
164
+ elif data == 'qm8':
165
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
166
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
167
+ smiles = df.smiles
168
+ labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)
169
+ else:
+ raise ValueError(f"Unknown dataset name: {data}")
+ return smiles, labels
170
+
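get_criterion (defined below) needs to know whether a dataset is a classification or a regression task; that mapping is a MoleculeNet convention rather than something this loader encodes. For reference, as a sketch (key spellings deliberately match the loader above, including 'lipophicility'):

    TASK_TYPES = {
        'bbbp': 'classification', 'clintox': 'classification',
        'hiv': 'classification', 'sider': 'classification',
        'tox21': 'classification', 'bace': 'classification',
        'esol': 'regression', 'freesolv': 'regression',
        'lipophicility': 'regression', 'qm8': 'regression',
    }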
171
+ # --- 2. Scaffold Splitting ---
172
+ class ScaffoldSplitter:
173
+ def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
174
+ self.data = data
175
+ self.seed = seed
176
+ self.include_chirality = include_chirality
177
+ self.train_frac = train_frac
178
+ self.val_frac = val_frac
179
+ self.test_frac = test_frac
180
+
181
+ def generate_scaffold(self, smiles):
182
+ mol = Chem.MolFromSmiles(smiles)
183
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=self.include_chirality)
184
+ return scaffold
185
+
186
+ def scaffold_split(self):
187
+ smiles, labels = load_lists_from_url(self.data)
188
+ non_null = np.zeros(len(smiles), dtype=bool) # all False initially; valid rows are flagged True below
189
+
190
+ if self.data in {'tox21', 'sider', 'clintox'}:
191
+ for i in range(len(smiles)):
192
+ if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
193
+ non_null[i] = 1
194
+ else:
195
+ for i in range(len(smiles)):
196
+ if Chem.MolFromSmiles(smiles[i]):
197
+ non_null[i] = 1
198
+
199
+ smiles_list = list(compress(enumerate(smiles), non_null))
200
+ rng = np.random.RandomState(self.seed)
201
+
202
+ scaffolds = defaultdict(list)
203
+ for i, sms in smiles_list:
204
+ scaffold = self.generate_scaffold(sms)
205
+ scaffolds[scaffold].append(i)
206
+
207
+ scaffold_sets = list(scaffolds.values())
208
+ rng.shuffle(scaffold_sets)
209
+ n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
210
+ n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
211
+ train_idx, val_idx, test_idx = [], [], []
212
+
213
+ for scaffold_set in scaffold_sets:
214
+ if len(val_idx) + len(scaffold_set) <= n_total_val:
215
+ val_idx.extend(scaffold_set)
216
+ elif len(test_idx) + len(scaffold_set) <= n_total_test:
217
+ test_idx.extend(scaffold_set)
218
+ else:
219
+ train_idx.extend(scaffold_set)
220
+ return train_idx, val_idx, test_idx
221
+
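Usage sketch for the splitter: whole Bemis-Murcko scaffold groups are assigned to a single split, so validation and test molecules carry scaffolds unseen during training (assumes network access for the CSV download):

    splitter = ScaffoldSplitter('bace', seed=0)
    train_idx, val_idx, test_idx = splitter.scaffold_split()

    # Indices are positions into the original SMILES series and never overlap:
    assert not (set(train_idx) & set(val_idx)) and not (set(val_idx) & set(test_idx))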
222
+ # --- 2a. Normal Random Split ---
223
+ def random_split_indices(n, seed=42, train_frac=0.8, val_frac=0.1, test_frac=0.1):
224
+ np.random.seed(seed)
225
+ indices = np.random.permutation(n)
226
+ n_train = int(n * train_frac)
227
+ n_val = int(n * val_frac)
228
+ train_idx = indices[:n_train]
229
+ val_idx = indices[n_train:n_train+n_val]
230
+ test_idx = indices[n_train+n_val:]
231
+ return train_idx.tolist(), val_idx.tolist(), test_idx.tolist()
232
+
233
+ # --- 3. PyTorch Dataset ---
234
+ class MoleculeDataset(Dataset):
235
+ def __init__(self, smiles_list, labels, tokenizer, max_len=512):
236
+ self.smiles_list = smiles_list
237
+ self.labels = labels
238
+ self.tokenizer = tokenizer
239
+ self.max_len = max_len
240
+
241
+ def __len__(self):
242
+ return len(self.smiles_list)
243
+
244
+ def __getitem__(self, idx):
245
+ smiles = self.smiles_list[idx]
246
+ label = self.labels.iloc[idx]
247
+
248
+ encoding = self.tokenizer(
249
+ smiles,
250
+ truncation=True,
251
+ padding='max_length',
252
+ max_length=self.max_len,
253
+ return_tensors='pt'
254
+ )
255
+ item = {key: val.squeeze(0) for key, val in encoding.items()}
256
+ if isinstance(label, pd.Series):
257
+ label_values = label.values.astype(np.float32)
258
+ else:
259
+ label_values = np.array([label], dtype=np.float32)
260
+ item['labels'] = torch.tensor(label_values, dtype=torch.float)
261
+ return item
262
+
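Wiring the split indices into MoleculeDataset is then direct; a sketch reusing the 'bace' indices and the placeholder tokenizer from the earlier sketches:

    smiles, labels = load_lists_from_url('bace')
    train_ds = MoleculeDataset(
        smiles.iloc[train_idx].tolist(),
        labels.iloc[train_idx].reset_index(drop=True),
        tokenizer,
    )
    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)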
263
+ # --- 4. Model Architecture ---
264
+ def global_ap(x):
265
+ # Global average pooling over the sequence dimension: (batch, seq_len, hidden) -> (batch, hidden)
+ return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
266
+
267
+ class SimSonEncoder(nn.Module):
268
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
269
+ super(SimSonEncoder, self).__init__()
270
+ self.config = config
271
+ self.max_len = max_len
272
+ self.bert = BertModel(config, add_pooling_layer=False)
273
+ self.linear = nn.Linear(config.hidden_size, max_len)
274
+ self.dropout = nn.Dropout(dropout)
275
+ def forward(self, input_ids, attention_mask=None):
276
+ if attention_mask is None:
277
+ attention_mask = input_ids.ne(self.config.pad_token_id)
278
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
279
+ hidden_states = self.dropout(outputs.last_hidden_state)
280
+ pooled = global_ap(hidden_states)
281
+ return self.linear(pooled)
282
+
283
+ class SimSonClassifier(nn.Module):
284
+ def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):
285
+ super(SimSonClassifier, self).__init__()
286
+ self.encoder = encoder
287
+ self.clf = nn.Linear(encoder.max_len, num_labels)
288
+ self.relu = nn.ReLU()
289
+ self.dropout = nn.Dropout(dropout)
290
+ def forward(self, input_ids, attention_mask=None):
291
+ x = self.encoder(input_ids, attention_mask)
292
+ x = self.relu(self.dropout(x))
293
+ logits = self.clf(x)
294
+ return logits
295
+
296
+ def load_encoder_params(self, state_dict_path):
297
+ self.encoder.load_state_dict(torch.load(state_dict_path))
298
+
299
+ # --- 5. Training, Validation, and Testing Loops ---
300
+ def get_criterion(task_type, num_labels):
301
+ if task_type == 'classification':
302
+ return nn.BCEWithLogitsLoss()
303
+ elif task_type == 'regression':
304
+ return nn.MSELoss()
305
+ else:
306
+ raise ValueError(f"Unknown task type: {task_type}")
307
+
308
+ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
309
+ model.train()
310
+ total_loss = 0
311
+ for batch in dataloader:
312
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
313
+ labels = batch['labels'].to(device)
314
+ optimizer.zero_grad()
315
+ outputs = model(**inputs)
316
+ loss = criterion(outputs, labels)
317
+ loss.backward()
318
+ optimizer.step()
319
+ if scheduler is not None:
320
+ scheduler.step()
321
+ total_loss += loss.item()
322
+ return total_loss / len(dataloader)
323
+
324
+ def calc_val_metrics(model, dataloader, criterion, device, task_type):
325
+ model.eval()
326
+ all_labels, all_preds = [], []
327
+ total_loss = 0
328
+ with torch.no_grad():
329
+ for batch in dataloader:
330
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
331
+ labels = batch['labels'].to(device)
332
+ outputs = model(**inputs)
333
+ loss = criterion(outputs, labels)
334
+ total_loss += loss.item()
335
+ if task_type == 'classification':
336
+ pred_probs = torch.sigmoid(outputs).cpu().numpy()
337
+ all_preds.append(pred_probs)
338
+ all_labels.append(labels.cpu().numpy())
339
+ else:
340
+ # Regression
341
+ preds = outputs.cpu().numpy()
342
+ all_preds.append(preds)
343
+ all_labels.append(labels.cpu().numpy())
344
+ avg_loss = total_loss / len(dataloader)
345
+ if task_type == 'classification':
346
+ y_true = np.concatenate(all_labels)
347
+ y_pred = np.concatenate(all_preds)
348
+ try:
349
+ score = roc_auc_score(y_true, y_pred, average='macro')
350
+ except Exception:
351
+ score = 0.0
352
+ return avg_loss, score
353
+ else:
354
+ return avg_loss, None
355
+
356
+ def test_model(model, dataloader, device, task_type):
357
+ model.eval()
358
+ all_preds, all_labels = [], []
359
+ with torch.no_grad():
360
+ for batch in dataloader:
361
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
362
+ labels = batch['labels']
363
+ outputs = model(**inputs)
364
+ if task_type == 'classification':
365
+ preds = torch.sigmoid(outputs)
366
+ else:
367
+ preds = outputs
368
+ all_preds.append(preds.cpu().numpy())
369
+ all_labels.append(labels.numpy())
370
+ return np.concatenate(all_preds), np.concatenate(all_labels)
371
+
372
+ # --- 6. Optuna Objective Function ---
373
+ def create_objective(name, info, train_smiles, train_labels, val_smiles, val_labels,
374
+ test_smiles, test_labels, scaler, tokenizer, encoder_config, device):
375
+ """Creates objective function for Optuna optimization"""
376
+
377
+ def objective(trial):
378
+ # Suggest hyperparameters
379
+ lr = trial.suggest_float('lr', 1e-6, 1e-4, log=True)
380
+ batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
381
+ dropout = trial.suggest_float('dropout', 0.1, 0.5)
382
+ weight_decay = trial.suggest_float('weight_decay', 0.0, 0.1)
383
+ scheduler_type = trial.suggest_categorical('scheduler', ['plateau', 'cosine', 'step'])
384
+
385
+ # Additional hyperparameters for optimization
386
+ patience_lr = trial.suggest_int('patience_lr', 3, 10)
387
+ gamma = trial.suggest_float('gamma', 0.5, 0.9) if scheduler_type == 'step' else 0.1
388
+
389
+ try:
390
+ # Create datasets and dataloaders
391
+ train_dataset = MoleculeDataset(train_smiles, train_labels, tokenizer, 512)
392
+ val_dataset = MoleculeDataset(val_smiles, val_labels, tokenizer, 512)
393
+ test_dataset = MoleculeDataset(test_smiles, test_labels, tokenizer, 512)
394
+
395
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
396
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
397
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
398
+
399
+ # Create model
400
+ encoder = SimSonEncoder(encoder_config, 512, dropout=dropout)
401
+ encoder = torch.compile(encoder)
402
+ model = SimSonClassifier(encoder, num_labels=info['num_labels'], dropout=dropout).to(device)
403
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
404
+
405
+ criterion = get_criterion(info['task_type'], info['num_labels'])
406
+ optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
407
+
408
+ # Create scheduler based on trial suggestion
409
+ if scheduler_type == 'plateau':
410
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
411
+ optimizer, mode='max', factor=gamma, patience=patience_lr
412
+ )
413
+ elif scheduler_type == 'cosine':
414
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
415
+ else: # step
416
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=gamma)
417
+
418
+ # Training loop
419
+ best_val_metric = -np.inf
420
+ patience_counter = 0
421
+ patience = 15
422
+
423
+ for epoch in range(50): # Max epochs
424
+ train_loss = train_epoch(model, train_loader, optimizer,
425
+ scheduler if scheduler_type == 'cosine' else None,
426
+ criterion, device)
427
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, device, info['task_type'])
428
+
429
+ # Update scheduler
430
+ if scheduler_type == 'plateau':
431
+ scheduler.step(val_metric if val_metric is not None else -val_loss)  # mode='max': step on the maximized quantity
432
+ elif scheduler_type == 'step':
433
+ scheduler.step()
434
+
435
+ # Determine metric to optimize
436
+ if info['task_type'] == 'classification':
437
+ current_metric = val_metric if val_metric is not None else 0.0  # optimize ROC AUC, not loss
438
+ else:
439
+ current_metric = -val_loss # For regression, minimize loss
440
+
441
+ # Early stopping and best model tracking
442
+ if current_metric > best_val_metric:
443
+ best_val_metric = current_metric
444
+ patience_counter = 0
445
+ else:
446
+ patience_counter += 1
447
+ if patience_counter >= patience:
448
+ break
449
+
450
+ # Optuna pruning
451
+ trial.report(current_metric, epoch)
452
+ if trial.should_prune():
453
+ raise optuna.TrialPruned()
454
+
455
+ return best_val_metric
456
+
457
+ except Exception as e:
458
+ print(f"Trial failed with error: {e}")
459
+ return -np.inf # Return worst possible score for failed trials
460
+
461
+ return objective
462
+
463
+ # --- 7. Main Execution Block ---
464
+ def main():
465
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
466
+ print(f"Using device: {DEVICE}")
467
+
468
+ DATASETS_TO_RUN = {
469
+ #'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
470
+ # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
471
+ #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
472
+ #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
473
+ 'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'scaffold'},
474
+ 'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
475
+ 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
476
+ 'hiv': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
477
+ }
478
+
479
+ MAX_LEN = 512
480
+ N_TRIALS = 100 # Number of Optuna trials to run
481
+
482
+ TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
483
+ ENCODER_CONFIG = BertConfig(
484
+ vocab_size=TOKENIZER.vocab_size,
485
+ hidden_size=768,
486
+ num_hidden_layers=4,
487
+ num_attention_heads=12,
488
+ intermediate_size=2048,
489
+ max_position_embeddings=512
490
+ )
491
+
492
+ aggregated_results = {}
493
+
494
+ for name, info in DATASETS_TO_RUN.items():
495
+ print(f"\n{'='*20} Processing Dataset: {name.upper()} ({info['split']} split) {'='*20}")
496
+ smiles, labels = load_lists_from_url(name)
497
+
498
+ # For regression tasks, scale labels and remember scaling transform
499
+ scaler = None
500
+ if info["task_type"] == "regression":
501
+ scaler = StandardScaler()
502
+ all_labels = labels.values.reshape(-1, 1)
503
+ scaler.fit(all_labels)
504
+ labels = pd.Series(scaler.transform(all_labels).flatten(), index=labels.index)
505
+
506
+ # Data split
507
+ if info.get('split', 'scaffold') == 'scaffold':
508
+ splitter = ScaffoldSplitter(data=name, seed=42)
509
+ train_idx, val_idx, test_idx = splitter.scaffold_split()
510
+ elif info['split'] == 'random':
511
+ train_idx, val_idx, test_idx = random_split_indices(len(smiles), seed=42)
512
+ else:
513
+ raise ValueError(f"Unknown split type for {name}: {info['split']}")
514
+
515
+ train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
516
+ train_labels = labels.iloc[train_idx].reset_index(drop=True)
517
+ val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
518
+ val_labels = labels.iloc[val_idx].reset_index(drop=True)
519
+ test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
520
+ test_labels = labels.iloc[test_idx].reset_index(drop=True)
521
+ print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
522
+
523
+ # Create Optuna study
524
+ study = optuna.create_study(
525
+ direction='maximize',
526
+ pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
527
+ )
528
+
529
+ # Create objective function
530
+ objective_func = create_objective(
531
+ name, info, train_smiles, train_labels, val_smiles, val_labels,
532
+ test_smiles, test_labels, scaler, TOKENIZER, ENCODER_CONFIG, DEVICE
533
+ )
534
+
535
+ # Run optimization
536
+ print(f"Starting Optuna optimization with {N_TRIALS} trials...")
537
+ study.optimize(objective_func, n_trials=N_TRIALS, timeout=None)
538
+
539
+ # Get best parameters
540
+ best_params = study.best_params
541
+ best_score = study.best_value
542
+ print(f"Best parameters: {best_params}")
543
+ print(f"Best validation score: {0:.4f}")
544
+
545
+ # Train final model with best parameters
546
+ print("Training final model with best parameters...")
547
+ train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
548
+ val_dataset = MoleculeDataset(val_smiles, val_labels, TOKENIZER, MAX_LEN)
549
+ test_dataset = MoleculeDataset(test_smiles, test_labels, TOKENIZER, MAX_LEN)
550
+
551
+ train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
552
+ val_loader = DataLoader(val_dataset, batch_size=best_params['batch_size'], shuffle=False)
553
+ test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)
554
+
555
+ # Final model training
556
+ encoder = SimSonEncoder(ENCODER_CONFIG, 512, dropout=best_params['dropout'])
557
+ encoder = torch.compile(encoder)
558
+ model = SimSonClassifier(encoder, num_labels=info['num_labels'], dropout=best_params['dropout']).to(DEVICE)
559
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
560
+
561
+ criterion = get_criterion(info['task_type'], info['num_labels'])
562
+ optimizer = optim.Adam(model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])
563
+
564
+ # Set up best scheduler
565
+ if best_params['scheduler'] == 'plateau':
566
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
567
+ optimizer, mode='max', factor=best_params.get('gamma', 0.7),
568
+ patience=best_params.get('patience_lr', 5)
569
+ )
570
+ elif best_params['scheduler'] == 'cosine':
571
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
572
+ else:
573
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=best_params.get('gamma', 0.1))
574
+
575
+ # Train with best parameters
576
+ best_val_metric = -np.inf
577
+ best_model_state = None
578
+ patience_counter = 0
579
+ patience = 15
580
+
581
+ for epoch in range(50):
582
+ train_loss = train_epoch(model, train_loader, optimizer,
583
+ scheduler if best_params['scheduler'] == 'cosine' else None,
584
+ criterion, DEVICE)
585
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, DEVICE, info['task_type'])
586
+
587
+ if best_params['scheduler'] == 'plateau':
588
+ scheduler.step(val_metric if val_metric is not None else -val_loss)  # mode='max': step on the maximized quantity
589
+ elif best_params['scheduler'] == 'step':
590
+ scheduler.step()
591
+
592
+ if info['task_type'] == 'classification':
593
+ print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ROC AUC: {val_metric:.4f}")
594
+ current_metric = val_metric if val_metric is not None else 0.0
595
+ else:
596
+ print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
597
+ current_metric = -val_loss
598
+
599
+ if current_metric > best_val_metric:
600
+ best_val_metric = current_metric
601
+ best_model_state = copy.deepcopy(model.state_dict())
602
+ patience_counter = 0
603
+ else:
604
+ patience_counter += 1
605
+ if patience_counter >= patience:
606
+ print(f'Early stopping at epoch {epoch+1}')
607
+ break
608
+
609
+ # Test final model
610
+ if best_model_state is not None:
611
+ model.load_state_dict(best_model_state)
612
+
613
+ test_preds, test_true = test_model(model, test_loader, DEVICE, info['task_type'])
614
+
615
+ # Calculate final metrics
616
+ if info['task_type'] == 'regression' and scaler is not None:
617
+ test_preds = scaler.inverse_transform(test_preds.reshape(-1, 1)).flatten()
618
+ test_true = scaler.inverse_transform(test_true.reshape(-1, 1)).flatten()
619
+ rmse = root_mean_squared_error(test_true, test_preds)
620
+ mae = mean_absolute_error(test_true, test_preds)
621
+ final_score = -rmse
622
+ print(f"Test RMSE: {rmse:.4f}, MAE: {mae:.4f}")
623
+ else:
624
+ try:
625
+ final_score = roc_auc_score(test_true, test_preds, average='macro')
626
+ print(f"Test ROC AUC: {final_score:.4f}")
627
+ except Exception:
628
+ final_score = 0.0
629
+
630
+ # Compute embedding similarity using pre-computed augmented SMILES
631
+ print("Creating pre-computed augmented SMILES for similarity computation...")
632
+ test_smiles_list = list(test_smiles)
633
+ similarity_file_path = f"{name}_test_augmented.parquet"
634
+ create_augmented_smiles_file(test_smiles_list, similarity_file_path, num_augmentations=1)
635
+
636
+ # Load pre-computed dataset for similarity computation
637
+ similarity_dataset = PrecomputedContrastiveSmilesDataset(
638
+ TOKENIZER, similarity_file_path, max_length=MAX_LEN
639
+ )
640
+
641
+ similarities = compute_embedding_similarity_precomputed(
642
+ model.encoder, similarity_dataset, DEVICE
643
+ )
644
+ print(f"Similarity score: {similarities.mean():.4f}")
645
+
646
+ # Clean up temporary file
647
+ if os.path.exists(similarity_file_path):
648
+ os.remove(similarity_file_path)
649
+
650
+ aggregated_results[name] = {
651
+ 'best_score': final_score,
652
+ 'best_params': best_params,
653
+ 'optuna_trials': len(study.trials),
654
+ 'study': study,
655
+ 'similarity_score': similarities.mean()
656
+ }
657
+
658
+ if name == 'do_not_save':
659
+ torch.save(model.encoder.state_dict(), 'moleculenet_clintox_encoder.bin')
660
+
661
+ print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
662
+ for name, result in aggregated_results.items():
663
+ print(f"{name}: Best score: {result['best_score']:.4f}")
664
+ print(f" Best parameters: {result['best_params']}")
665
+ print(f" Total trials: {result['optuna_trials']}")
666
+ print(f" Similarity score: {result['similarity_score']:.4f}")
667
+
668
+ print("\nScript finished.")
669
+
670
+ if __name__ == '__main__':
671
+ main()
simson_modeling/moleculenet_eval/.ipynb_checkpoints/eval-checkpoint.py ADDED
@@ -0,0 +1,457 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import BertConfig, BertModel, AutoTokenizer
8
+ from rdkit import Chem, RDLogger
9
+ from rdkit.Chem.Scaffolds import MurckoScaffold
10
+ import copy
11
+ from tqdm import tqdm
12
+ import os
13
+ from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
14
+ from itertools import compress
15
+ from collections import defaultdict
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ RDLogger.DisableLog('rdApp.*')
18
+
19
+
20
+ torch.set_float32_matmul_precision('high')
21
+
22
+ # --- 0. SMILES Enumeration ---
23
+ class SmilesEnumerator:
24
+ """Generates randomized SMILES strings for data augmentation."""
25
+ def randomize_smiles(self, smiles):
26
+ try:
27
+ mol = Chem.MolFromSmiles(smiles)
28
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
29
+ except Exception:
30
+ return smiles
31
+
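Randomized SMILES are syntactic variants of the same molecule; a quick round-trip check (aspirin as a toy input):

    from rdkit import Chem
    enum = SmilesEnumerator()
    smi = 'CC(=O)Oc1ccccc1C(=O)O'  # aspirin
    variants = {enum.randomize_smiles(smi) for _ in range(5)}
    # Every variant canonicalizes back to the same structure.
    assert {Chem.CanonSmiles(v) for v in variants} == {Chem.CanonSmiles(smi)}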
32
+
33
+ def compute_embedding_similarity(encoder, smiles_list, tokenizer, device, max_len=256):
34
+ encoder.eval()
35
+ enumerator = SmilesEnumerator()
36
+
37
+ embeddings_orig = []
38
+ embeddings_aug = []
39
+
40
+ with torch.no_grad():
41
+ for smi in smiles_list:
42
+ # Original SMILES encoding
43
+ encoding_orig = tokenizer(
44
+ smi,
45
+ truncation=True,
46
+ padding='max_length',
47
+ max_length=max_len,
48
+ return_tensors='pt'
49
+ )
50
+ # Augmented SMILES encoding
51
+ smi_aug = enumerator.randomize_smiles(smi)
52
+ encoding_aug = tokenizer(
53
+ smi_aug,
54
+ truncation=True,
55
+ padding='max_length',
56
+ max_length=max_len,
57
+ return_tensors='pt'
58
+ )
59
+
60
+ input_ids_orig = encoding_orig.input_ids.to(device)
61
+ attention_mask_orig = encoding_orig.attention_mask.to(device)
62
+ input_ids_aug = encoding_aug.input_ids.to(device)
63
+ attention_mask_aug = encoding_aug.attention_mask.to(device)
64
+
65
+ emb_orig = encoder(input_ids_orig, attention_mask_orig).cpu().numpy().flatten()
66
+ emb_aug = encoder(input_ids_aug, attention_mask_aug).cpu().numpy().flatten()
67
+
68
+ embeddings_orig.append(emb_orig)
69
+ embeddings_aug.append(emb_aug)
70
+
71
+ embeddings_orig = np.array(embeddings_orig)
72
+ embeddings_aug = np.array(embeddings_aug)
73
+
74
+ # Cosine similarity between each original and its augmented version
75
+ similarities = np.array([cosine_similarity([embeddings_orig[i]], [embeddings_aug[i]])[0][0] for i in range(len(embeddings_orig))])
76
+ return similarities
77
+
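As a sanity check on the metric above, the per-pair score reduces to plain cosine similarity between matched rows (made-up 3-d embeddings; a mean near 1.0 indicates augmentation-invariant embeddings):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    emb_orig = np.array([[1.0, 0.0, 0.0], [0.6, 0.8, 0.0]])  # hypothetical embeddings
    emb_aug = np.array([[0.9, 0.1, 0.0], [0.0, 1.0, 0.0]])
    sims = np.array([cosine_similarity([emb_orig[i]], [emb_aug[i]])[0][0]
                     for i in range(len(emb_orig))])
    print(sims, sims.mean())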
78
+ # --- 1. Data Loading ---
79
+ def load_lists_from_url(data):
80
+ if data == 'bbbp':
81
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
82
+ smiles, labels = df.smiles, df.p_np
83
+ elif data == 'clintox':
84
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
85
+ smiles = df.smiles
86
+ labels = df.drop(['smiles'], axis=1)
87
+ elif data == 'hiv':
88
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
89
+ smiles, labels = df.smiles, df.HIV_active
90
+ elif data == 'sider':
91
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
92
+ smiles = df.smiles
93
+ labels = df.drop(['smiles'], axis=1)
94
+ elif data == 'esol':
95
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
96
+ smiles = df.smiles
97
+ labels = df['ESOL predicted log solubility in mols per litre']
98
+ elif data == 'freesolv':
99
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv')
100
+ smiles = df.smiles
101
+ labels = df.calc
102
+ elif data == 'lipophicility':
103
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv')
104
+ smiles, labels = df.smiles, df['exp']
105
+ elif data == 'tox21':
106
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
107
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
108
+ smiles = df.smiles
109
+ labels = df.drop(['mol_id', 'smiles'], axis=1)
110
+ elif data == 'bace':
111
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
112
+ smiles, labels = df.mol, df.Class
113
+ elif data == 'qm8':
114
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
115
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
116
+ smiles = df.smiles
117
+ labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)
118
+ return smiles, labels
119
+
120
+ # --- 2. Scaffold Splitting ---
121
+ class ScaffoldSplitter:
122
+ def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
123
+ self.data = data
124
+ self.seed = seed
125
+ self.include_chirality = include_chirality
126
+ self.train_frac = train_frac
127
+ self.val_frac = val_frac
128
+ self.test_frac = test_frac
129
+
130
+ def generate_scaffold(self, smiles):
131
+ mol = Chem.MolFromSmiles(smiles)
132
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=self.include_chirality)
133
+ return scaffold
134
+
135
+ def scaffold_split(self):
136
+ smiles, labels = load_lists_from_url(self.data)
137
+ non_null = np.zeros(len(smiles), dtype=bool)
138
+
139
+ if self.data in {'tox21', 'sider', 'clintox'}:
140
+ for i in range(len(smiles)):
141
+ if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
142
+ non_null[i] = 1
143
+ else:
144
+ for i in range(len(smiles)):
145
+ if Chem.MolFromSmiles(smiles[i]):
146
+ non_null[i] = 1
147
+
148
+ smiles_list = list(compress(enumerate(smiles), non_null))
149
+ rng = np.random.RandomState(self.seed)
150
+
151
+ scaffolds = defaultdict(list)
152
+ for i, sms in smiles_list:
153
+ scaffold = self.generate_scaffold(sms)
154
+ scaffolds[scaffold].append(i)
155
+
156
+ scaffold_sets = list(scaffolds.values())
157
+ rng.shuffle(scaffold_sets)
158
+ n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
159
+ n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
160
+ train_idx, val_idx, test_idx = [], [], []
161
+
162
+ for scaffold_set in scaffold_sets:
163
+ if len(val_idx) + len(scaffold_set) <= n_total_val:
164
+ val_idx.extend(scaffold_set)
165
+ elif len(test_idx) + len(scaffold_set) <= n_total_test:
166
+ test_idx.extend(scaffold_set)
167
+ else:
168
+ train_idx.extend(scaffold_set)
169
+ return train_idx, val_idx, test_idx
170
+
171
+ # --- 2a. Normal Random Split ---
172
+ def random_split_indices(n, seed=42, train_frac=0.8, val_frac=0.1, test_frac=0.1):
173
+ np.random.seed(seed)
174
+ indices = np.random.permutation(n)
175
+ n_train = int(n * train_frac)
176
+ n_val = int(n * val_frac)
177
+ train_idx = indices[:n_train]
178
+ val_idx = indices[n_train:n_train+n_val]
179
+ test_idx = indices[n_train+n_val:]
180
+ return train_idx.tolist(), val_idx.tolist(), test_idx.tolist()
181
+
182
+ # --- 3. PyTorch Dataset ---
183
+ class MoleculeDataset(Dataset):
184
+ def __init__(self, smiles_list, labels, tokenizer, max_len=512):
185
+ self.smiles_list = smiles_list
186
+ self.labels = labels
187
+ self.tokenizer = tokenizer
188
+ self.max_len = max_len
189
+
190
+ def __len__(self):
191
+ return len(self.smiles_list)
192
+
193
+ def __getitem__(self, idx):
194
+ smiles = self.smiles_list[idx]
195
+ label = self.labels.iloc[idx]
196
+
197
+ encoding = self.tokenizer(
198
+ smiles,
199
+ truncation=True,
200
+ padding='max_length',
201
+ max_length=self.max_len,
202
+ return_tensors='pt'
203
+ )
204
+ item = {key: val.squeeze(0) for key, val in encoding.items()}
205
+ if isinstance(label, pd.Series):
206
+ label_values = label.values.astype(np.float32)
207
+ else:
208
+ label_values = np.array([label], dtype=np.float32)
209
+ item['labels'] = torch.tensor(label_values, dtype=torch.float)
210
+ return item
211
+
212
+ # --- 4. Model Architecture ---
213
+ def global_ap(x):
214
+ return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
215
+
216
+ class SimSonEncoder(nn.Module):
217
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
218
+ super(SimSonEncoder, self).__init__()
219
+ self.config = config
220
+ self.max_len = max_len
221
+ self.bert = BertModel(config, add_pooling_layer=False)
222
+ self.linear = nn.Linear(config.hidden_size, max_len)
223
+ self.dropout = nn.Dropout(dropout)
224
+ def forward(self, input_ids, attention_mask=None):
225
+ if attention_mask is None:
226
+ attention_mask = input_ids.ne(self.config.pad_token_id)
227
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
228
+ hidden_states = self.dropout(outputs.last_hidden_state)
229
+ pooled = global_ap(hidden_states)
230
+ return self.linear(pooled)
231
+
232
+ class SimSonClassifier(nn.Module):
233
+ def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):
234
+ super(SimSonClassifier, self).__init__()
235
+ self.encoder = encoder
236
+ self.clf = nn.Linear(encoder.max_len, num_labels)
237
+ self.relu = nn.ReLU()
238
+ self.dropout = nn.Dropout(dropout)
239
+ def forward(self, input_ids, attention_mask=None):
240
+ x = self.encoder(input_ids, attention_mask)
241
+ x = self.relu(self.dropout(x))
242
+ logits = self.clf(x)
243
+ return logits
244
+
245
+ def load_encoder_params(self, state_dict_path):
246
+ self.encoder.load_state_dict(torch.load(state_dict_path))
247
+ print("Pretrained encoder parameters loaded.")
248
+
249
+ # --- 5. Training, Validation, and Testing Loops ---
250
+ def get_criterion(task_type, num_labels):
251
+ if task_type == 'classification':
252
+ return nn.BCEWithLogitsLoss()
253
+ elif task_type == 'regression':
254
+ return nn.MSELoss()
255
+ else:
256
+ raise ValueError(f"Unknown task type: {task_type}")
257
+
258
+ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
259
+ model.train()
260
+ total_loss = 0
261
+ for batch in dataloader:
262
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
263
+ labels = batch['labels'].to(device)
264
+ optimizer.zero_grad()
265
+ outputs = model(**inputs)
266
+ loss = criterion(outputs, labels)
267
+ loss.backward()
268
+ optimizer.step()
269
+ #scheduler.step()
270
+ total_loss += loss.item()
271
+ return total_loss / len(dataloader)
272
+
273
+ def eval_epoch(model, dataloader, criterion, device):
274
+ model.eval()
275
+ total_loss = 0
276
+ with torch.no_grad():
277
+ for batch in dataloader:
278
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
279
+ labels = batch['labels'].to(device)
280
+ outputs = model(**inputs)
281
+ loss = criterion(outputs, labels)
282
+ total_loss += loss.item()
283
+ return total_loss / len(dataloader)
284
+
285
+ def test_model(model, dataloader, device):
286
+ model.eval()
287
+ all_preds, all_labels = [], []
288
+ with torch.no_grad():
289
+ for batch in dataloader:
290
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
291
+ labels = batch['labels']
292
+ outputs = model(**inputs)
293
+ preds = torch.sigmoid(outputs)
294
+ all_preds.append(preds.cpu().numpy())
295
+ all_labels.append(labels.numpy())
296
+ return np.concatenate(all_preds), np.concatenate(all_labels)
297
+
298
+ def calc_val_metrics(model, dataloader, criterion, device, task_type):
299
+ model.eval()
300
+ all_labels, all_preds = [], []
301
+ total_loss = 0
302
+ with torch.no_grad():
303
+ for batch in dataloader:
304
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
305
+ labels = batch['labels'].to(device)
306
+ outputs = model(**inputs)
307
+ loss = criterion(outputs, labels)
308
+ total_loss += loss.item()
309
+ if task_type == 'classification':
310
+ pred_probs = torch.sigmoid(outputs).cpu().numpy()
311
+ all_preds.append(pred_probs)
312
+ all_labels.append(labels.cpu().numpy())
313
+ else:
314
+ # Regression
315
+ preds = outputs.cpu().numpy()
316
+ all_preds.append(preds)
317
+ all_labels.append(labels.cpu().numpy())
318
+ avg_loss = total_loss / len(dataloader)
319
+ if task_type == 'classification':
320
+ y_true = np.concatenate(all_labels)
321
+ y_pred = np.concatenate(all_preds)
322
+ try:
323
+ score = roc_auc_score(y_true, y_pred, average='macro')
324
+ except Exception:
325
+ score = 0.0
326
+ return avg_loss, score
327
+ else:
328
+ return avg_loss, None
329
+
330
+ # --- 6. Main Execution Block ---
331
+ def main():
332
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
333
+ print(f"Using device: {DEVICE}")
334
+
335
+ DATASETS_TO_RUN = {
336
+ # 'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
337
+ #'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
338
+ #'hiv': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
339
+ # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
340
+ #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
341
+ #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
342
+ 'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'random'},
343
+ #'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'}
344
+ }
345
+ PATIENCE = 15
346
+ EPOCHS = 50
347
+ LEARNING_RATE = 1e-4
348
+ BATCH_SIZE = 16
349
+ MAX_LEN = 512
350
+
351
+ TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
352
+ ENCODER_CONFIG = BertConfig(
353
+ vocab_size=TOKENIZER.vocab_size,
354
+ hidden_size=768,
355
+ num_hidden_layers=4,
356
+ num_attention_heads=12,
357
+ intermediate_size=2048,
358
+ max_position_embeddings=512
359
+ )
360
+
361
+ aggregated_results = {}
362
+
363
+ for name, info in DATASETS_TO_RUN.items():
364
+ print(f"\n{'='*20} Processing Dataset: {name.upper()} ({info['split']} split) {'='*20}")
365
+ smiles, labels = load_lists_from_url(name)
366
+
367
+ # Split selection
368
+ if info.get('split', 'scaffold') == 'scaffold':
369
+ splitter = ScaffoldSplitter(data=name, seed=42)
370
+ train_idx, val_idx, test_idx = splitter.scaffold_split()
371
+ elif info['split'] == 'random':
372
+ train_idx, val_idx, test_idx = random_split_indices(len(smiles), seed=42)
373
+ else:
374
+ raise ValueError(f"Unknown split type for {name}: {info['split']}")
375
+
376
+ train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
377
+ train_labels = labels.iloc[train_idx].reset_index(drop=True)
378
+ val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
379
+ val_labels = labels.iloc[val_idx].reset_index(drop=True)
380
+ test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
381
+ test_labels = labels.iloc[test_idx].reset_index(drop=True)
382
+ print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
383
+
384
+ train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
385
+ val_dataset = MoleculeDataset(val_smiles, val_labels, TOKENIZER, MAX_LEN)
386
+ test_dataset = MoleculeDataset(test_smiles, test_labels, TOKENIZER, MAX_LEN)
387
+
388
+ train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
389
+ val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
390
+ test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
391
+
392
+ encoder = SimSonEncoder(ENCODER_CONFIG, 512)
393
+ encoder = torch.compile(encoder)
394
+ model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
395
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
396
+ criterion = get_criterion(info['task_type'], info['num_labels'])
397
+ optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0024)
398
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.59298)
399
+
400
+ best_val_metric = float('-inf')
401
+ best_model_state = None
402
+ current_patience = 0
403
+ for epoch in range(EPOCHS):
404
+ train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, DEVICE)
405
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, DEVICE, info['task_type'])
406
+ print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ROC AUC: {val_metric:.4f}")
407
+
408
+ if val_metric > best_val_metric:
409
+ best_val_metric = val_metric
410
+ best_model_state = copy.deepcopy(model.state_dict())
411
+ print(f" -> New best model saved with validation loss: {best_val_loss:.4f}")
412
+ current_patience = 0
413
+ else:
414
+ current_patience += 1
415
+ if current_patience >= PATIENCE:
416
+ print(f'Early stopping after {PATIENCE} epochs without improvement')
417
+ break
418
+
419
+ print("\nTesting with the best model...")
420
+ if best_model_state is not None:
421
+ model.load_state_dict(best_model_state)
422
+ test_loss = eval_epoch(model, test_loader, criterion, DEVICE)
423
+ print(f'Test loss: {test_loss}')
424
+ test_preds, test_true = test_model(model, test_loader, DEVICE)
425
+
426
+ aggregated_results[name] = {
427
+ 'best_val_metric': best_val_metric,
428
+ 'test_predictions': test_preds,
429
+ 'test_labels': test_true
430
+ }
431
+ print(f"Finished testing for {name}.")
432
+ test_smiles_list = list(test_smiles)
433
+ similarities = compute_embedding_similarity(
434
+ model.encoder, test_smiles_list, TOKENIZER, DEVICE, MAX_LEN
435
+ )
436
+ print(f"Similarity score: {similarities.mean():.4f}")
437
+ if name == 'do_not_save':
438
+ torch.save(model.encoder.state_dict(), 'moleculenet_clintox_encoder.bin')
439
+
440
+
441
+
442
+ print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
443
+ for name, result in aggregated_results.items():
444
+ if name in ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']:
445
+ auc = roc_auc_score(result['test_labels'], result['test_predictions'], average='macro')
446
+ print(f'{name} ROC AUC: {auc}')
447
+
448
+ if name in ['lipophicility', 'esol', 'qm8']:
449
+ rmse = root_mean_squared_error(result['test_labels'], result['test_predictions'])
450
+ mae = mean_absolute_error(result['test_labels'], result['test_predictions'])
451
+ print(f'{name} MAE: {mae}')
452
+ print(f'{name} RMSE: {rmse}')
453
+
454
+ print("\nScript finished.")
455
+
456
+ if __name__ == '__main__':
457
+ main()
simson_modeling/moleculenet_eval/.ipynb_checkpoints/showcase-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
simson_modeling/moleculenet_eval/.ipynb_checkpoints/visualizations-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
simson_modeling/moleculenet_eval/__pycache__/better_eval.cpython-312.pyc ADDED
Binary file (36 kB). View file
 
simson_modeling/moleculenet_eval/__pycache__/eval.cpython-312.pyc ADDED
Binary file (27.2 kB). View file
 
simson_modeling/moleculenet_eval/better_eval.py ADDED
@@ -0,0 +1,671 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import BertConfig, BertModel, AutoTokenizer
8
+ from rdkit import Chem, RDLogger
9
+ from rdkit.Chem.Scaffolds import MurckoScaffold
10
+ import copy
11
+ from tqdm import tqdm
12
+ import os
13
+ from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
14
+ from itertools import compress
15
+ from collections import defaultdict
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
18
+ import optuna
19
+ import warnings
20
+ warnings.filterwarnings("ignore")
21
+ RDLogger.DisableLog('rdApp.*')
22
+
23
+ torch.set_float32_matmul_precision('high')
24
+
25
+ # --- 0. Pre-computed Contrastive SMILES Dataset ---
26
+ class PrecomputedContrastiveSmilesDataset(Dataset):
27
+ """
28
+ A Dataset class that reads pre-augmented SMILES pairs from a Parquet file.
29
+ This is significantly faster as it offloads the expensive SMILES randomization
30
+ to a one-time preprocessing step.
31
+ """
32
+ def __init__(self, tokenizer, file_path: str, max_length: int = 512):
33
+ self.tokenizer = tokenizer
34
+ self.max_length = max_length
35
+
36
+ # Load the entire dataset from the Parquet file into memory.
37
+ # This is fast and efficient for subsequent access.
38
+ print(f"Loading pre-computed data from {file_path}...")
39
+ self.data = pd.read_parquet(file_path)
40
+ print("Data loaded successfully.")
41
+
42
+ def __len__(self):
43
+ """Returns the total number of pairs in the dataset."""
44
+ return len(self.data)
45
+
46
+ def __getitem__(self, idx):
47
+ """
48
+ Retrieves a pre-augmented pair, tokenizes it, and returns it
49
+ in the format expected by the DataCollator.
50
+ """
51
+ # Retrieve the pre-augmented pair from the DataFrame
52
+ row = self.data.iloc[idx]
53
+ smiles_1 = row['smiles_1']
54
+ smiles_2 = row['smiles_2']
55
+
56
+ # Tokenize the pair. This operation is fast and remains in the data loader.
57
+ tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
58
+ tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
59
+
60
+ return {
61
+ 'input_ids_1': torch.tensor(tokens_1['input_ids']),
62
+ 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
63
+ 'input_ids_2': torch.tensor(tokens_2['input_ids']),
64
+ 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
65
+ }
66
+
67
+ # --- 0a. SMILES enumeration for preprocessing ---
68
+ class SmilesEnumerator:
69
+ """Generates randomized SMILES strings for data augmentation."""
70
+ def randomize_smiles(self, smiles):
71
+ try:
72
+ mol = Chem.MolFromSmiles(smiles)
73
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
74
+ except Exception:
75
+ return smiles
76
+
77
+ def compute_embedding_similarity_precomputed(encoder, dataset, device):
78
+ """
79
+ Compute embedding similarity using pre-computed augmented SMILES pairs
80
+ """
81
+ encoder.eval()
82
+ similarities = []
83
+
84
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
85
+
86
+ with torch.no_grad():
87
+ for batch in dataloader:
88
+ input_ids_1 = batch['input_ids_1'].to(device)
89
+ attention_mask_1 = batch['attention_mask_1'].to(device)
90
+ input_ids_2 = batch['input_ids_2'].to(device)
91
+ attention_mask_2 = batch['attention_mask_2'].to(device)
92
+
93
+ emb_1 = encoder(input_ids_1, attention_mask_1).cpu().numpy()
94
+ emb_2 = encoder(input_ids_2, attention_mask_2).cpu().numpy()
95
+
96
+ # Compute cosine similarity for each pair in the batch
97
+ batch_similarities = []
98
+ for i in range(len(emb_1)):
99
+ sim = cosine_similarity([emb_1[i]], [emb_2[i]])[0][0]
100
+ batch_similarities.append(sim)
101
+
102
+ similarities.extend(batch_similarities)
103
+
104
+ return np.array(similarities)
105
+
106
+ def create_augmented_smiles_file(smiles_list, output_path, num_augmentations=1):
107
+ """
108
+ Create a parquet file with pre-computed augmented SMILES pairs
109
+ """
110
+ enumerator = SmilesEnumerator()
111
+ pairs = []
112
+
113
+ print(f"Generating {num_augmentations} augmentations for {len(smiles_list)} SMILES...")
114
+
115
+ for smiles in tqdm(smiles_list):
116
+ for _ in range(num_augmentations):
117
+ augmented = enumerator.randomize_smiles(smiles)
118
+ pairs.append({
119
+ 'smiles_1': smiles,
120
+ 'smiles_2': augmented
121
+ })
122
+
123
+ df = pd.DataFrame(pairs)
124
+ df.to_parquet(output_path, index=False)
125
+ print(f"Saved {len(pairs)} augmented pairs to {output_path}")
126
+ return output_path
127
+
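Putting the three pieces above together, a minimal end-to-end sketch (toy SMILES and a hypothetical file name; assumes a tokenizer and a trained encoder are already in scope):

    toy = ['CCO', 'Oc1ccccc1']  # hypothetical inputs
    path = create_augmented_smiles_file(toy, 'toy_pairs.parquet', num_augmentations=2)
    ds = PrecomputedContrastiveSmilesDataset(tokenizer, path, max_length=128)
    sims = compute_embedding_similarity_precomputed(encoder, ds, device='cuda')
    print(f"Mean original/augmented cosine similarity: {sims.mean():.4f}")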
128
+ # --- 1. Data Loading ---
129
+ def load_lists_from_url(data):
130
+ # Datasets and their splits, all configurations carried over
131
+ if data == 'bbbp':
132
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
133
+ smiles, labels = df.smiles, df.p_np
134
+ elif data == 'clintox':
135
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
136
+ smiles = df.smiles
137
+ labels = df.drop(['smiles'], axis=1)
138
+ elif data == 'hiv':
139
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
140
+ smiles, labels = df.smiles, df.HIV_active
141
+ elif data == 'sider':
142
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
143
+ smiles = df.smiles
144
+ labels = df.drop(['smiles'], axis=1)
145
+ elif data == 'esol':
146
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
147
+ smiles = df.smiles
148
+ labels = df['ESOL predicted log solubility in mols per litre']
149
+ elif data == 'freesolv':
150
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv')
151
+ smiles = df.smiles
152
+ labels = df.calc
153
+ elif data == 'lipophicility':
154
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv')
155
+ smiles, labels = df.smiles, df['exp']
156
+ elif data == 'tox21':
157
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
158
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
159
+ smiles = df.smiles
160
+ labels = df.drop(['mol_id', 'smiles'], axis=1)
161
+ elif data == 'bace':
162
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
163
+ smiles, labels = df.mol, df.Class
164
+ elif data == 'qm8':
165
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
166
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
167
+ smiles = df.smiles
168
+ labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)
169
+ return smiles, labels
170
+
171
+ # --- 2. Scaffold Splitting ---
172
+ class ScaffoldSplitter:
173
+ def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
174
+ self.data = data
175
+ self.seed = seed
176
+ self.include_chirality = include_chirality
177
+ self.train_frac = train_frac
178
+ self.val_frac = val_frac
179
+ self.test_frac = test_frac
180
+
181
+ def generate_scaffold(self, smiles):
182
+ mol = Chem.MolFromSmiles(smiles)
183
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=self.include_chirality)
184
+ return scaffold
185
+
186
+ def scaffold_split(self):
187
+ smiles, labels = load_lists_from_url(self.data)
188
+ non_null = np.zeros(len(smiles), dtype=bool)
189
+
190
+ if self.data in {'tox21', 'sider', 'clintox'}:
191
+ for i in range(len(smiles)):
192
+ if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
193
+ non_null[i] = 1
194
+ else:
195
+ for i in range(len(smiles)):
196
+ if Chem.MolFromSmiles(smiles[i]):
197
+ non_null[i] = 1
198
+
199
+ smiles_list = list(compress(enumerate(smiles), non_null))
200
+ rng = np.random.RandomState(self.seed)
201
+
202
+ scaffolds = defaultdict(list)
203
+ for i, sms in smiles_list:
204
+ scaffold = self.generate_scaffold(sms)
205
+ scaffolds[scaffold].append(i)
206
+
207
+ scaffold_sets = list(scaffolds.values())
208
+ rng.shuffle(scaffold_sets)
209
+ n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
210
+ n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
211
+ train_idx, val_idx, test_idx = [], [], []
212
+
213
+ for scaffold_set in scaffold_sets:
214
+ if len(val_idx) + len(scaffold_set) <= n_total_val:
215
+ val_idx.extend(scaffold_set)
216
+ elif len(test_idx) + len(scaffold_set) <= n_total_test:
217
+ test_idx.extend(scaffold_set)
218
+ else:
219
+ train_idx.extend(scaffold_set)
220
+ return train_idx, val_idx, test_idx
221
+
222
+ # --- 2a. Normal Random Split ---
223
+ def random_split_indices(n, seed=42, train_frac=0.8, val_frac=0.1, test_frac=0.1):
224
+ np.random.seed(seed)
225
+ indices = np.random.permutation(n)
226
+ n_train = int(n * train_frac)
227
+ n_val = int(n * val_frac)
228
+ train_idx = indices[:n_train]
229
+ val_idx = indices[n_train:n_train+n_val]
230
+ test_idx = indices[n_train+n_val:]
231
+ return train_idx.tolist(), val_idx.tolist(), test_idx.tolist()
232
+
233
+ # --- 3. PyTorch Dataset ---
234
+ class MoleculeDataset(Dataset):
235
+ def __init__(self, smiles_list, labels, tokenizer, max_len=512):
236
+ self.smiles_list = smiles_list
237
+ self.labels = labels
238
+ self.tokenizer = tokenizer
239
+ self.max_len = max_len
240
+
241
+ def __len__(self):
242
+ return len(self.smiles_list)
243
+
244
+ def __getitem__(self, idx):
245
+ smiles = self.smiles_list[idx]
246
+ label = self.labels.iloc[idx]
247
+
248
+ encoding = self.tokenizer(
249
+ smiles,
250
+ truncation=True,
251
+ padding='max_length',
252
+ max_length=self.max_len,
253
+ return_tensors='pt'
254
+ )
255
+ item = {key: val.squeeze(0) for key, val in encoding.items()}
256
+ if isinstance(label, pd.Series):
257
+ label_values = label.values.astype(np.float32)
258
+ else:
259
+ label_values = np.array([label], dtype=np.float32)
260
+ item['labels'] = torch.tensor(label_values, dtype=torch.float)
261
+ return item
262
+
263
+ # --- 4. Model Architecture ---
264
+ def global_ap(x):
265
+ return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
266
+
267
+ class SimSonEncoder(nn.Module):
268
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
269
+ super(SimSonEncoder, self).__init__()
270
+ self.config = config
271
+ self.max_len = max_len
272
+ self.bert = BertModel(config, add_pooling_layer=False)
273
+ self.linear = nn.Linear(config.hidden_size, max_len)
274
+ self.dropout = nn.Dropout(dropout)
275
+ def forward(self, input_ids, attention_mask=None):
276
+ if attention_mask is None:
277
+ attention_mask = input_ids.ne(self.config.pad_token_id)
278
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
279
+ hidden_states = self.dropout(outputs.last_hidden_state)
280
+ pooled = global_ap(hidden_states)
281
+ return self.linear(pooled)
282
+
283
+ class SimSonClassifier(nn.Module):
284
+ def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):
285
+ super(SimSonClassifier, self).__init__()
286
+ self.encoder = encoder
287
+ self.clf = nn.Linear(encoder.max_len, num_labels)
288
+ self.relu = nn.ReLU()
289
+ self.dropout = nn.Dropout(dropout)
290
+ def forward(self, input_ids, attention_mask=None):
291
+ x = self.encoder(input_ids, attention_mask)
292
+ x = self.relu(self.dropout(x))
293
+ logits = self.clf(x)
294
+ return logits
295
+
296
+ def load_encoder_params(self, state_dict_path):
297
+ self.encoder.load_state_dict(torch.load(state_dict_path))
298
+
299
+ # --- 5. Training, Validation, and Testing Loops ---
300
+ def get_criterion(task_type, num_labels):
301
+ if task_type == 'classification':
302
+ return nn.BCEWithLogitsLoss()
303
+ elif task_type == 'regression':
304
+ return nn.MSELoss()
305
+ else:
306
+ raise ValueError(f"Unknown task type: {task_type}")
307
+
308
+ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
309
+ model.train()
310
+ total_loss = 0
311
+ for batch in dataloader:
312
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
313
+ labels = batch['labels'].to(device)
314
+ optimizer.zero_grad()
315
+ outputs = model(**inputs)
316
+ loss = criterion(outputs, labels)
317
+ loss.backward()
318
+ optimizer.step()
319
+ if scheduler is not None:
320
+ scheduler.step()
321
+ total_loss += loss.item()
322
+ return total_loss / len(dataloader)
323
+
324
+ def calc_val_metrics(model, dataloader, criterion, device, task_type):
325
+ model.eval()
326
+ all_labels, all_preds = [], []
327
+ total_loss = 0
328
+ with torch.no_grad():
329
+ for batch in dataloader:
330
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
331
+ labels = batch['labels'].to(device)
332
+ outputs = model(**inputs)
333
+ loss = criterion(outputs, labels)
334
+ total_loss += loss.item()
335
+ if task_type == 'classification':
336
+ pred_probs = torch.sigmoid(outputs).cpu().numpy()
337
+ all_preds.append(pred_probs)
338
+ all_labels.append(labels.cpu().numpy())
339
+ else:
340
+ # Regression
341
+ preds = outputs.cpu().numpy()
342
+ all_preds.append(preds)
343
+ all_labels.append(labels.cpu().numpy())
344
+ avg_loss = total_loss / len(dataloader)
345
+ if task_type == 'classification':
346
+ y_true = np.concatenate(all_labels)
347
+ y_pred = np.concatenate(all_preds)
348
+ try:
349
+ score = roc_auc_score(y_true, y_pred, average='macro')
350
+ except Exception:
351
+ score = 0.0
352
+ return avg_loss, score
353
+ else:
354
+ return avg_loss, None
355
+
356
+ def test_model(model, dataloader, device, task_type):
357
+ model.eval()
358
+ all_preds, all_labels = [], []
359
+ with torch.no_grad():
360
+ for batch in dataloader:
361
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
362
+ labels = batch['labels']
363
+ outputs = model(**inputs)
364
+ if task_type == 'classification':
365
+ preds = torch.sigmoid(outputs)
366
+ else:
367
+ preds = outputs
368
+ all_preds.append(preds.cpu().numpy())
369
+ all_labels.append(labels.numpy())
370
+ return np.concatenate(all_preds), np.concatenate(all_labels)
371
+
372
+ # --- 6. Optuna Objective Function ---
373
+ def create_objective(name, info, train_smiles, train_labels, val_smiles, val_labels,
374
+ test_smiles, test_labels, scaler, tokenizer, encoder_config, device):
375
+ """Creates objective function for Optuna optimization"""
376
+
377
+ def objective(trial):
378
+ # Suggest hyperparameters
379
+ lr = trial.suggest_float('lr', 1e-6, 1e-4, log=True)
380
+ batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
381
+ dropout = trial.suggest_float('dropout', 0.1, 0.5)
382
+ weight_decay = trial.suggest_float('weight_decay', 0.0, 0.1)
383
+ scheduler_type = trial.suggest_categorical('scheduler', ['plateau', 'cosine', 'step'])
384
+
385
+ # Additional hyperparameters for optimization
386
+ patience_lr = trial.suggest_int('patience_lr', 3, 10)
387
+ gamma = trial.suggest_float('gamma', 0.5, 0.9) if scheduler_type == 'step' else 0.1
388
+
389
+ try:
390
+ # Create datasets and dataloaders
391
+ train_dataset = MoleculeDataset(train_smiles, train_labels, tokenizer, 512)
392
+ val_dataset = MoleculeDataset(val_smiles, val_labels, tokenizer, 512)
393
+ test_dataset = MoleculeDataset(test_smiles, test_labels, tokenizer, 512)
394
+
395
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
396
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
397
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
398
+
399
+ # Create model
400
+ encoder = SimSonEncoder(encoder_config, 512, dropout=dropout)
401
+ encoder = torch.compile(encoder)
402
+ model = SimSonClassifier(encoder, num_labels=info['num_labels'], dropout=dropout).to(device)
403
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
404
+
405
+ criterion = get_criterion(info['task_type'], info['num_labels'])
406
+ optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
407
+
408
+ # Create scheduler based on trial suggestion
409
+ if scheduler_type == 'plateau':
410
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
411
+ optimizer, mode='max', factor=gamma, patience=patience_lr
412
+ )
413
+ elif scheduler_type == 'cosine':
414
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
415
+ else: # step
416
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=gamma)
417
+
418
+ # Training loop
419
+ best_val_metric = -np.inf
420
+ patience_counter = 0
421
+ patience = 15
422
+
423
+ for epoch in range(50): # Max epochs
424
+ train_loss = train_epoch(model, train_loader, optimizer,
425
+ scheduler if scheduler_type == 'cosine' else None,
426
+ criterion, device)
427
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, device, info['task_type'])
428
+
429
+ # Update scheduler
430
+ if scheduler_type == 'plateau':
431
+ scheduler.step(val_metric if val_metric is not None else -val_loss)  # mode='max': step on the maximized quantity
432
+ elif scheduler_type == 'step':
433
+ scheduler.step()
434
+
435
+ # Determine metric to optimize
436
+ if info['task_type'] == 'classification':
437
+ current_metric = val_metric if val_metric is not None else 0.0  # optimize ROC AUC, not loss
438
+ else:
439
+ current_metric = -val_loss # For regression, minimize loss
440
+
441
+ # Early stopping and best model tracking
442
+ if current_metric > best_val_metric:
443
+ best_val_metric = current_metric
444
+ patience_counter = 0
445
+ else:
446
+ patience_counter += 1
447
+ if patience_counter >= patience:
448
+ break
449
+
450
+ # Optuna pruning
451
+ trial.report(current_metric, epoch)
452
+ if trial.should_prune():
453
+ raise optuna.TrialPruned()
454
+
455
+ return best_val_metric
456
+
457
+ except Exception as e:
458
+ print(f"Trial failed with error: {e}")
459
+ return -np.inf # Return worst possible score for failed trials
460
+
461
+ return objective
462
+
463
+ # --- 7. Main Execution Block ---
464
+ def main():
465
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
466
+ print(f"Using device: {DEVICE}")
467
+
468
+ DATASETS_TO_RUN = {
469
+ #'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
470
+ # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
471
+ #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
472
+ #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
473
+ 'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'scaffold'},
474
+ 'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
475
+ 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
476
+ 'hiv': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
477
+ }
478
+
479
+ MAX_LEN = 512
480
+ N_TRIALS = 100 # Number of Optuna trials to run
481
+
482
+ TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
483
+ ENCODER_CONFIG = BertConfig(
484
+ vocab_size=TOKENIZER.vocab_size,
485
+ hidden_size=768,
486
+ num_hidden_layers=4,
487
+ num_attention_heads=12,
488
+ intermediate_size=2048,
489
+ max_position_embeddings=512
490
+ )
491
+
492
+ aggregated_results = {}
493
+
494
+ for name, info in DATASETS_TO_RUN.items():
495
+ print(f"\n{'='*20} Processing Dataset: {name.upper()} ({info['split']} split) {'='*20}")
496
+ smiles, labels = load_lists_from_url(name)
497
+
498
+ # For regression tasks, scale labels and remember scaling transform
499
+ scaler = None
500
+ if info["task_type"] == "regression":
501
+ scaler = StandardScaler()
502
+ all_labels = labels.values.reshape(-1, 1)
503
+ scaler.fit(all_labels)
504
+ labels = pd.Series(scaler.transform(all_labels).flatten(), index=labels.index)
505
+
506
+ # Data split
507
+ if info.get('split', 'scaffold') == 'scaffold':
508
+ splitter = ScaffoldSplitter(data=name, seed=42)
509
+ train_idx, val_idx, test_idx = splitter.scaffold_split()
510
+ elif info['split'] == 'random':
511
+ train_idx, val_idx, test_idx = random_split_indices(len(smiles), seed=42)
512
+ else:
513
+ raise ValueError(f"Unknown split type for {name}: {info['split']}")
514
+
515
+ train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
516
+ train_labels = labels.iloc[train_idx].reset_index(drop=True)
517
+ val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
518
+ val_labels = labels.iloc[val_idx].reset_index(drop=True)
519
+ test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
520
+ test_labels = labels.iloc[test_idx].reset_index(drop=True)
521
+ print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
522
+
523
+ # Create Optuna study
524
+ study = optuna.create_study(
525
+ direction='maximize',
526
+ pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
527
+ )
528
+
529
+ # Create objective function
530
+ objective_func = create_objective(
531
+ name, info, train_smiles, train_labels, val_smiles, val_labels,
532
+ test_smiles, test_labels, scaler, TOKENIZER, ENCODER_CONFIG, DEVICE
533
+ )
534
+
535
+ # Run optimization
536
+ print(f"Starting Optuna optimization with {N_TRIALS} trials...")
537
+ study.optimize(objective_func, n_trials=N_TRIALS, timeout=None)
538
+
539
+ # Get best parameters
540
+ best_params = study.best_params
541
+ best_score = study.best_value
542
+ print(f"Best parameters: {best_params}")
543
+ print(f"Best validation score: {0:.4f}")
544
+
545
+ # Train final model with best parameters
546
+ print("Training final model with best parameters...")
547
+ train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
548
+ val_dataset = MoleculeDataset(val_smiles, val_labels, TOKENIZER, MAX_LEN)
549
+ test_dataset = MoleculeDataset(test_smiles, test_labels, TOKENIZER, MAX_LEN)
550
+
551
+ train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
552
+ val_loader = DataLoader(val_dataset, batch_size=best_params['batch_size'], shuffle=False)
553
+ test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)
554
+
555
+ # Final model training
556
+ encoder = SimSonEncoder(ENCODER_CONFIG, 512, dropout=best_params['dropout'])
557
+ encoder = torch.compile(encoder)
558
+ model = SimSonClassifier(encoder, num_labels=info['num_labels'], dropout=best_params['dropout']).to(DEVICE)
559
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
560
+
561
+ criterion = get_criterion(info['task_type'], info['num_labels'])
562
+ optimizer = optim.Adam(model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])
563
+
564
+ # Set up best scheduler
565
+ if best_params['scheduler'] == 'plateau':
566
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
567
+ optimizer, mode='max', factor=best_params.get('gamma', 0.7),
568
+ patience=best_params.get('patience_lr', 5)
569
+ )
570
+ elif best_params['scheduler'] == 'cosine':
571
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
572
+ else:
573
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=best_params.get('gamma', 0.1))
574
+
575
+ # Train with best parameters
576
+ best_val_metric = -np.inf
577
+ best_model_state = None
578
+ patience_counter = 0
579
+ patience = 15
580
+
581
+ for epoch in range(50):
582
+ train_loss = train_epoch(model, train_loader, optimizer,
583
+ scheduler if best_params['scheduler'] == 'cosine' else None,
584
+ criterion, DEVICE)
585
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, DEVICE, info['task_type'])
586
+
587
+ if best_params['scheduler'] == 'plateau':
588
+ scheduler.step(val_metric if val_metric is not None else -val_loss)
589
+ elif best_params['scheduler'] == 'step':
590
+ scheduler.step()
591
+
592
+ if info['task_type'] == 'classification':
593
+ print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ROC AUC: {val_metric:.4f}")
594
+ current_metric = val_metric if val_metric is not None else 0.0
595
+ else:
596
+ print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
597
+ current_metric = -val_loss
598
+
599
+ if current_metric > best_val_metric:
600
+ best_val_metric = current_metric
601
+ best_model_state = copy.deepcopy(model.state_dict())
602
+ patience_counter = 0
603
+ else:
604
+ patience_counter += 1
605
+ if patience_counter >= patience:
606
+ print(f'Early stopping at epoch {epoch+1}')
607
+ break
608
+
609
+ # Test final model
610
+ if best_model_state is not None:
611
+ model.load_state_dict(best_model_state)
612
+
613
+ test_preds, test_true = test_model(model, test_loader, DEVICE, info['task_type'])
614
+
615
+ # Calculate final metrics
616
+ if info['task_type'] == 'regression' and scaler is not None:
617
+ test_preds = scaler.inverse_transform(test_preds.reshape(-1, 1)).flatten()
618
+ test_true = scaler.inverse_transform(test_true.reshape(-1, 1)).flatten()
619
+ rmse = root_mean_squared_error(test_true, test_preds)
620
+ mae = mean_absolute_error(test_true, test_preds)
621
+ final_score = -rmse
622
+ print(f"Test RMSE: {rmse:.4f}, MAE: {mae:.4f}")
623
+ else:
624
+ try:
625
+ final_score = roc_auc_score(test_true, test_preds, average='macro')
626
+ print(f"Test ROC AUC: {final_score:.4f}")
627
+ except Exception:
628
+ final_score = 0.0
629
+
630
+ # Compute embedding similarity using pre-computed augmented SMILES
631
+ print("Creating pre-computed augmented SMILES for similarity computation...")
632
+ test_smiles_list = list(test_smiles)
633
+ similarity_file_path = f"{name}_test_augmented.parquet"
634
+ create_augmented_smiles_file(test_smiles_list, similarity_file_path, num_augmentations=1)
635
+
636
+ # Load pre-computed dataset for similarity computation
637
+ similarity_dataset = PrecomputedContrastiveSmilesDataset(
638
+ TOKENIZER, similarity_file_path, max_length=MAX_LEN
639
+ )
640
+
641
+ similarities = compute_embedding_similarity_precomputed(
642
+ model.encoder, similarity_dataset, DEVICE
643
+ )
644
+ print(f"Similarity score: {similarities.mean():.4f}")
645
+
646
+ # Clean up temporary file
647
+ if os.path.exists(similarity_file_path):
648
+ os.remove(similarity_file_path)
649
+
650
+ aggregated_results[name] = {
651
+ 'best_score': final_score,
652
+ 'best_params': best_params,
653
+ 'optuna_trials': len(study.trials),
654
+ 'study': study,
655
+ 'similarity_score': similarities.mean()
656
+ }
657
+
658
+ if name == 'do_not_save':
659
+ torch.save(model.encoder.state_dict(), 'moleculenet_clintox_encoder.bin')
660
+
661
+ print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
662
+ for name, result in aggregated_results.items():
663
+ print(f"{name}: Best score: {result['best_score']:.4f}")
664
+ print(f" Best parameters: {result['best_params']}")
665
+ print(f" Total trials: {result['optuna_trials']}")
666
+ print(f" Similarity score: {result['similarity_score']:.4f}")
667
+
668
+ print("\nScript finished.")
669
+
670
+ if __name__ == '__main__':
671
+ main()
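For reference, the report-and-prune pattern used by create_objective above reduces to the following minimal sketch. The toy val_metric curve stands in for real training and the search space is a placeholder; only the trial.report / trial.should_prune / MedianPruner wiring mirrors the script.

import numpy as np
import optuna

def objective(trial):
    # Placeholder search space in the spirit of the one above.
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    best_metric = -np.inf
    for epoch in range(20):
        # Stand-in for one epoch of training + validation.
        val_metric = 1.0 - 1.0 / (epoch + 1) - 100 * lr
        best_metric = max(best_metric, val_metric)
        # Report intermediate values so the pruner can stop bad trials early.
        trial.report(val_metric, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return best_metric

study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10),
)
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)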
simson_modeling/moleculenet_eval/eval.py ADDED
@@ -0,0 +1,457 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import BertConfig, BertModel, AutoTokenizer
8
+ from rdkit import Chem, RDLogger
9
+ from rdkit.Chem.Scaffolds import MurckoScaffold
10
+ import copy
11
+ from tqdm import tqdm
12
+ import os
13
+ from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
14
+ from itertools import compress
15
+ from collections import defaultdict
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ RDLogger.DisableLog('rdApp.*')
18
+
19
+
20
+ torch.set_float32_matmul_precision('high')
21
+
22
+ # --- 0. SMILES Enumeration ---
23
+ class SmilesEnumerator:
24
+ """Generates randomized SMILES strings for data augmentation."""
25
+ def randomize_smiles(self, smiles):
26
+ try:
27
+ mol = Chem.MolFromSmiles(smiles)
28
+ return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
29
+ except Exception:
30
+ return smiles
31
+
32
+
33
+ def compute_embedding_similarity(encoder, smiles_list, tokenizer, device, max_len=256):
34
+ encoder.eval()
35
+ enumerator = SmilesEnumerator()
36
+
37
+ embeddings_orig = []
38
+ embeddings_aug = []
39
+
40
+ with torch.no_grad():
41
+ for smi in smiles_list:
42
+ # Original SMILES encoding
43
+ encoding_orig = tokenizer(
44
+ smi,
45
+ truncation=True,
46
+ padding='max_length',
47
+ max_length=max_len,
48
+ return_tensors='pt'
49
+ )
50
+ # Augmented SMILES encoding
51
+ smi_aug = enumerator.randomize_smiles(smi)
52
+ encoding_aug = tokenizer(
53
+ smi_aug,
54
+ truncation=True,
55
+ padding='max_length',
56
+ max_length=max_len,
57
+ return_tensors='pt'
58
+ )
59
+
60
+ input_ids_orig = encoding_orig.input_ids.to(device)
61
+ attention_mask_orig = encoding_orig.attention_mask.to(device)
62
+ input_ids_aug = encoding_aug.input_ids.to(device)
63
+ attention_mask_aug = encoding_aug.attention_mask.to(device)
64
+
65
+ emb_orig = encoder(input_ids_orig, attention_mask_orig).cpu().numpy().flatten()
66
+ emb_aug = encoder(input_ids_aug, attention_mask_aug).cpu().numpy().flatten()
67
+
68
+ embeddings_orig.append(emb_orig)
69
+ embeddings_aug.append(emb_aug)
70
+
71
+ embeddings_orig = np.array(embeddings_orig)
72
+ embeddings_aug = np.array(embeddings_aug)
73
+
74
+ # Cosine similarity between each original and its augmented version
75
+ similarities = np.array([cosine_similarity([embeddings_orig[i]], [embeddings_aug[i]])[0][0] for i in range(len(embeddings_orig))])
76
+ return similarities
77
+
78
+ # --- 1. Data Loading ---
79
+ def load_lists_from_url(data):
80
+ if data == 'bbbp':
81
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
82
+ smiles, labels = df.smiles, df.p_np
83
+ elif data == 'clintox':
84
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
85
+ smiles = df.smiles
86
+ labels = df.drop(['smiles'], axis=1)
87
+ elif data == 'hiv':
88
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
89
+ smiles, labels = df.smiles, df.HIV_active
90
+ elif data == 'sider':
91
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
92
+ smiles = df.smiles
93
+ labels = df.drop(['smiles'], axis=1)
94
+ elif data == 'esol':
95
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
96
+ smiles = df.smiles
97
+ labels = df['ESOL predicted log solubility in mols per litre']
98
+ elif data == 'freesolv':
99
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv')
100
+ smiles = df.smiles
101
+ labels = df.calc
102
+ elif data == 'lipophilicity':
103
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv')
104
+ smiles, labels = df.smiles, df['exp']
105
+ elif data == 'tox21':
106
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
107
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
108
+ smiles = df.smiles
109
+ labels = df.drop(['mol_id', 'smiles'], axis=1)
110
+ elif data == 'bace':
111
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
112
+ smiles, labels = df.mol, df.Class
113
+ elif data == 'qm8':
114
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
115
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
116
+ smiles = df.smiles
117
+ labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)
118
+ return smiles, labels
119
+
120
+ # --- 2. Scaffold Splitting ---
121
+ class ScaffoldSplitter:
122
+ def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
123
+ self.data = data
124
+ self.seed = seed
125
+ self.include_chirality = include_chirality
126
+ self.train_frac = train_frac
127
+ self.val_frac = val_frac
128
+ self.test_frac = test_frac
129
+
130
+ def generate_scaffold(self, smiles):
131
+ mol = Chem.MolFromSmiles(smiles)
132
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=self.include_chirality)
133
+ return scaffold
134
+
135
+ def scaffold_split(self):
136
+ smiles, labels = load_lists_from_url(self.data)
137
+ non_null = np.zeros(len(smiles), dtype=bool)
138
+
139
+ if self.data in {'tox21', 'sider', 'clintox'}:
140
+ for i in range(len(smiles)):
141
+ if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
142
+ non_null[i] = 1
143
+ else:
144
+ for i in range(len(smiles)):
145
+ if Chem.MolFromSmiles(smiles[i]):
146
+ non_null[i] = 1
147
+
148
+ smiles_list = list(compress(enumerate(smiles), non_null))
149
+ rng = np.random.RandomState(self.seed)
150
+
151
+ scaffolds = defaultdict(list)
152
+ for i, sms in smiles_list:
153
+ scaffold = self.generate_scaffold(sms)
154
+ scaffolds[scaffold].append(i)
155
+
156
+ scaffold_sets = list(scaffolds.values())
157
+ rng.shuffle(scaffold_sets)
158
+ n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
159
+ n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
160
+ train_idx, val_idx, test_idx = [], [], []
161
+
162
+ for scaffold_set in scaffold_sets:
163
+ if len(val_idx) + len(scaffold_set) <= n_total_val:
164
+ val_idx.extend(scaffold_set)
165
+ elif len(test_idx) + len(scaffold_set) <= n_total_test:
166
+ test_idx.extend(scaffold_set)
167
+ else:
168
+ train_idx.extend(scaffold_set)
169
+ return train_idx, val_idx, test_idx
170
+
171
+ # --- 2a. Normal Random Split ---
172
+ def random_split_indices(n, seed=42, train_frac=0.8, val_frac=0.1, test_frac=0.1):
173
+ np.random.seed(seed)
174
+ indices = np.random.permutation(n)
175
+ n_train = int(n * train_frac)
176
+ n_val = int(n * val_frac)
177
+ train_idx = indices[:n_train]
178
+ val_idx = indices[n_train:n_train+n_val]
179
+ test_idx = indices[n_train+n_val:]
180
+ return train_idx.tolist(), val_idx.tolist(), test_idx.tolist()
181
+
182
+ # --- 3. PyTorch Dataset ---
183
+ class MoleculeDataset(Dataset):
184
+ def __init__(self, smiles_list, labels, tokenizer, max_len=512):
185
+ self.smiles_list = smiles_list
186
+ self.labels = labels
187
+ self.tokenizer = tokenizer
188
+ self.max_len = max_len
189
+
190
+ def __len__(self):
191
+ return len(self.smiles_list)
192
+
193
+ def __getitem__(self, idx):
194
+ smiles = self.smiles_list[idx]
195
+ label = self.labels.iloc[idx]
196
+
197
+ encoding = self.tokenizer(
198
+ smiles,
199
+ truncation=True,
200
+ padding='max_length',
201
+ max_length=self.max_len,
202
+ return_tensors='pt'
203
+ )
204
+ item = {key: val.squeeze(0) for key, val in encoding.items()}
205
+ if isinstance(label, pd.Series):
206
+ label_values = label.values.astype(np.float32)
207
+ else:
208
+ label_values = np.array([label], dtype=np.float32)
209
+ item['labels'] = torch.tensor(label_values, dtype=torch.float)
210
+ return item
211
+
212
+ # --- 4. Model Architecture ---
213
+ def global_ap(x):
214
+ return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
215
+
216
+ class SimSonEncoder(nn.Module):
217
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
218
+ super(SimSonEncoder, self).__init__()
219
+ self.config = config
220
+ self.max_len = max_len
221
+ self.bert = BertModel(config, add_pooling_layer=False)
222
+ self.linear = nn.Linear(config.hidden_size, max_len)
223
+ self.dropout = nn.Dropout(dropout)
224
+ def forward(self, input_ids, attention_mask=None):
225
+ if attention_mask is None:
226
+ attention_mask = input_ids.ne(self.config.pad_token_id)
227
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
228
+ hidden_states = self.dropout(outputs.last_hidden_state)
229
+ pooled = global_ap(hidden_states)
230
+ return self.linear(pooled)
231
+
232
+ class SimSonClassifier(nn.Module):
233
+ def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):
234
+ super(SimSonClassifier, self).__init__()
235
+ self.encoder = encoder
236
+ self.clf = nn.Linear(encoder.max_len, num_labels)
237
+ self.relu = nn.ReLU()
238
+ self.dropout = nn.Dropout(dropout)
239
+ def forward(self, input_ids, attention_mask=None):
240
+ x = self.encoder(input_ids, attention_mask)
241
+ x = self.relu(self.dropout(x))
242
+ logits = self.clf(x)
243
+ return logits
244
+
245
+ def load_encoder_params(self, state_dict_path):
246
+ self.encoder.load_state_dict(torch.load(state_dict_path))
247
+ print("Pretrained encoder parameters loaded.")
248
+
249
+ # --- 5. Training, Validation, and Testing Loops ---
250
+ def get_criterion(task_type, num_labels):
251
+ if task_type == 'classification':
252
+ return nn.BCEWithLogitsLoss()
253
+ elif task_type == 'regression':
254
+ return nn.MSELoss()
255
+ else:
256
+ raise ValueError(f"Unknown task type: {task_type}")
257
+
258
+ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
259
+ model.train()
260
+ total_loss = 0
261
+ for batch in dataloader:
262
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
263
+ labels = batch['labels'].to(device)
264
+ optimizer.zero_grad()
265
+ outputs = model(**inputs)
266
+ loss = criterion(outputs, labels)
267
+ loss.backward()
268
+ optimizer.step()
269
+ # scheduler.step()  # disabled: stepping StepLR every batch would decay the LR far too fast
270
+ total_loss += loss.item()
271
+ return total_loss / len(dataloader)
272
+
273
+ def eval_epoch(model, dataloader, criterion, device):
274
+ model.eval()
275
+ total_loss = 0
276
+ with torch.no_grad():
277
+ for batch in dataloader:
278
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
279
+ labels = batch['labels'].to(device)
280
+ outputs = model(**inputs)
281
+ loss = criterion(outputs, labels)
282
+ total_loss += loss.item()
283
+ return total_loss / len(dataloader)
284
+
285
+ def test_model(model, dataloader, device):
286
+ model.eval()
287
+ all_preds, all_labels = [], []
288
+ with torch.no_grad():
289
+ for batch in dataloader:
290
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
291
+ labels = batch['labels']
292
+ outputs = model(**inputs)
293
+ preds = torch.sigmoid(outputs)
294
+ all_preds.append(preds.cpu().numpy())
295
+ all_labels.append(labels.numpy())
296
+ return np.concatenate(all_preds), np.concatenate(all_labels)
297
+
298
+ def calc_val_metrics(model, dataloader, criterion, device, task_type):
299
+ model.eval()
300
+ all_labels, all_preds = [], []
301
+ total_loss = 0
302
+ with torch.no_grad():
303
+ for batch in dataloader:
304
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
305
+ labels = batch['labels'].to(device)
306
+ outputs = model(**inputs)
307
+ loss = criterion(outputs, labels)
308
+ total_loss += loss.item()
309
+ if task_type == 'classification':
310
+ pred_probs = torch.sigmoid(outputs).cpu().numpy()
311
+ all_preds.append(pred_probs)
312
+ all_labels.append(labels.cpu().numpy())
313
+ else:
314
+ # Regression
315
+ preds = outputs.cpu().numpy()
316
+ all_preds.append(preds)
317
+ all_labels.append(labels.cpu().numpy())
318
+ avg_loss = total_loss / len(dataloader)
319
+ if task_type == 'classification':
320
+ y_true = np.concatenate(all_labels)
321
+ y_pred = np.concatenate(all_preds)
322
+ try:
323
+ score = roc_auc_score(y_true, y_pred, average='macro')
324
+ except Exception:
325
+ score = 0.0
326
+ return avg_loss, score
327
+ else:
328
+ return avg_loss, None
329
+
330
+ # --- 6. Main Execution Block ---
331
+ def main():
332
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
333
+ print(f"Using device: {DEVICE}")
334
+
335
+ DATASETS_TO_RUN = {
336
+ # 'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
337
+ #'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
338
+ #'hiv': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
339
+ # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
340
+ #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
341
+ #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
342
+ 'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'random'},
343
+ #'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'}
344
+ }
345
+ PATIENCE = 15
346
+ EPOCHS = 50
347
+ LEARNING_RATE = 1e-4
348
+ BATCH_SIZE = 16
349
+ MAX_LEN = 512
350
+
351
+ TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
352
+ ENCODER_CONFIG = BertConfig(
353
+ vocab_size=TOKENIZER.vocab_size,
354
+ hidden_size=768,
355
+ num_hidden_layers=4,
356
+ num_attention_heads=12,
357
+ intermediate_size=2048,
358
+ max_position_embeddings=512
359
+ )
360
+
361
+ aggregated_results = {}
362
+
363
+ for name, info in DATASETS_TO_RUN.items():
364
+ print(f"\n{'='*20} Processing Dataset: {name.upper()} ({info['split']} split) {'='*20}")
365
+ smiles, labels = load_lists_from_url(name)
366
+
367
+ # Split selection
368
+ if info.get('split', 'scaffold') == 'scaffold':
369
+ splitter = ScaffoldSplitter(data=name, seed=42)
370
+ train_idx, val_idx, test_idx = splitter.scaffold_split()
371
+ elif info['split'] == 'random':
372
+ train_idx, val_idx, test_idx = random_split_indices(len(smiles), seed=42)
373
+ else:
374
+ raise ValueError(f"Unknown split type for {name}: {info['split']}")
375
+
376
+ train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
377
+ train_labels = labels.iloc[train_idx].reset_index(drop=True)
378
+ val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
379
+ val_labels = labels.iloc[val_idx].reset_index(drop=True)
380
+ test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
381
+ test_labels = labels.iloc[test_idx].reset_index(drop=True)
382
+ print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
383
+
384
+ train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
385
+ val_dataset = MoleculeDataset(val_smiles, val_labels, TOKENIZER, MAX_LEN)
386
+ test_dataset = MoleculeDataset(test_smiles, test_labels, TOKENIZER, MAX_LEN)
387
+
388
+ train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
389
+ val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
390
+ test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
391
+
392
+ encoder = SimSonEncoder(ENCODER_CONFIG, 512)
393
+ encoder = torch.compile(encoder)
394
+ model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
395
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
396
+ criterion = get_criterion(info['task_type'], info['num_labels'])
397
+ optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0024)
398
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.59298)
399
+
400
+ best_val_metric = float('-inf')
401
+ best_model_state = None
402
+ current_patience = 0
403
+ for epoch in range(EPOCHS):
404
+ train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, DEVICE)
405
+ val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, DEVICE, info['task_type'])
406
+ print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ROC AUC: {val_metric:.4f}")
407
+
408
+ if val_metric > best_val_metric:
409
+ best_val_metric = val_metric
410
+ best_model_state = copy.deepcopy(model.state_dict())
411
+ print(f" -> New best model saved with validation loss: {best_val_loss:.4f}")
412
+ current_patience = 0
413
+ else:
414
+ current_patience += 1
415
+ if current_patience >= PATIENCE:
416
+ print(f'Early stopping at epoch {epoch+1}')
417
+ break
418
+
419
+ print("\nTesting with the best model...")
420
+ if best_model_state is not None:
421
+ model.load_state_dict(best_model_state)
422
+ test_loss = eval_epoch(model, test_loader, criterion, DEVICE)
423
+ print(f'Test loss: {test_loss}')
424
+ test_preds, test_true = test_model(model, test_loader, DEVICE)
425
+
426
+ aggregated_results[name] = {
427
+ 'best_val_loss': best_val_loss,
428
+ 'test_predictions': test_preds,
429
+ 'test_labels': test_true
430
+ }
431
+ print(f"Finished testing for {name}.")
432
+ test_smiles_list = list(test_smiles)
433
+ similarities = compute_embedding_similarity(
434
+ model.encoder, test_smiles_list, TOKENIZER, DEVICE, MAX_LEN
435
+ )
436
+ print(f"Similarity score: {similarities.mean():.4f}")
437
+ if name == 'do_not_save':
438
+ torch.save(model.encoder.state_dict(), 'moleculenet_clintox_encoder.bin')
439
+
440
+
441
+
442
+ print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
443
+ for name, result in aggregated_results.items():
444
+ if name in ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']:
445
+ auc = roc_auc_score(result['test_labels'], result['test_predictions'], average='macro')
446
+ print(f'{name} ROC AUC: {auc}')
447
+
448
+ if name in ['lipophilicity', 'esol', 'qm8']:
449
+ rmse = root_mean_squared_error(result['test_labels'], result['test_predictions'])
450
+ mae = mean_absolute_error(result['test_labels'], result['test_predictions'])
451
+ print(f'{name} MAE: {mae}')
452
+ print(f'{name} RMSE: {rmse}')
453
+
454
+ print("\nScript finished.")
455
+
456
+ if __name__ == '__main__':
457
+ main()
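The embedding-similarity check above encodes each test SMILES and a randomized (atom-reordered) SMILES of the same molecule, then takes the cosine similarity of the two embeddings. A minimal self-contained sketch of that idea follows; embed() here is a crude character-histogram stand-in for the trained SimSonEncoder, which in the real script tokenizes the string and runs the BERT encoder instead.

import numpy as np
from rdkit import Chem
from sklearn.metrics.pairwise import cosine_similarity

def randomize_smiles(smiles):
    # Same augmentation as SmilesEnumerator above: a random atom ordering,
    # i.e. a different but chemically equivalent SMILES string.
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles

def embed(smiles):
    # Placeholder embedding (character counts); NOT the trained encoder.
    vec = np.zeros(128)
    for ch in smiles:
        vec[ord(ch) % 128] += 1.0
    return vec

smiles = 'CC(=O)Oc1ccccc1C(=O)O'  # aspirin
aug = randomize_smiles(smiles)
sim = cosine_similarity([embed(smiles)], [embed(aug)])[0][0]
print(f'{smiles} vs {aug}: cosine similarity {sim:.4f}')

A well-trained encoder should map both strings to nearly identical embeddings (similarity close to 1.0), which is exactly what the per-dataset "Similarity score" printout reports.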
simson_modeling/moleculenet_eval/eval.py.save ADDED
@@ -0,0 +1,360 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.utils.data import Dataset, DataLoader
5
+ import pandas as pd
6
+ import numpy as np
7
+ from sklearn.metrics import roc_auc_score, average_precision_score
8
+ from transformers import BertModel, BertConfig
9
+ import os
10
+ import json
11
+ from collections import defaultdict
12
+ from rdkit import Chem
13
+ from rdkit.Chem import Scaffolds
14
+ import warnings
15
+ warnings.filterwarnings('ignore')
16
+ from transformers import AutoTokenizer
17
+
18
+ # Global average pooling function (assuming this exists in your codebase)
19
+ def global_ap(x, dim=1):
20
+ return torch.mean(x, dim=dim)
21
+
22
+ class SimSonClassifier(nn.Module):
23
+ def __init__(self, config: BertConfig, max_len: int, num_labels: int, dropout: float = 0.1):
24
+ super(SimSonClassifier, self).__init__()
25
+ self.config = config
26
+ self.max_len = max_len
27
+ self.num_labels = num_labels
28
+
29
+ # BERT encoder (same as SimSonEncoder)
30
+ self.bert = BertModel(config, add_pooling_layer=False)
31
+ self.dropout = nn.Dropout(dropout)
32
+
33
+ # Classification head
34
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
35
+
36
+ def forward(self, input_ids, attention_mask=None):
37
+ if attention_mask is None:
38
+ attention_mask = input_ids.ne(0)
39
+
40
+ outputs = self.bert(
41
+ input_ids=input_ids,
42
+ attention_mask=attention_mask
43
+ )
44
+
45
+ hidden_states = outputs.last_hidden_state
46
+ hidden_states = self.dropout(hidden_states)
47
+
48
+ # Global average pooling
49
+ pooled = global_ap(hidden_states)
50
+
51
+ # Classification output
52
+ logits = self.classifier(pooled)
53
+
54
+ return logits
55
+
56
+ def load_encoder_weights(self, encoder_path):
57
+ """Load pretrained SimSonEncoder weights into the classifier"""
58
+ encoder_state = torch.load(encoder_path, map_location='cpu')
59
+
60
+ # Create mapping from encoder to classifier state dict
61
+ classifier_state = {}
62
+ for key, value in encoder_state.items():
63
+ if key.startswith('bert.') or key.startswith('dropout.'):
64
+ classifier_state[key] = value
65
+
66
+ # Load only the matching weights
67
+ self.load_state_dict(classifier_state, strict=False)
68
+ print(f"Loaded encoder weights from {encoder_path}")
69
+
70
+
71
+
72
+ def load_moleculenet_data(dataset_name):
73
+ """Load MoleculeNet dataset and return SMILES and labels"""
74
+ if dataset_name == 'bbbp':
75
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
76
+ smiles, labels = df.smiles, df.p_np
77
+ elif dataset_name == 'clintox':
78
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
79
+ smiles = df.smiles
80
+ labels = df.drop(['smiles'], axis=1)
81
+ elif dataset_name == 'hiv':
82
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
83
+ smiles, labels = df.smiles, df.HIV_active
84
+ elif dataset_name == 'sider':
85
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
86
+ smiles = df.smiles
87
+ labels = df.drop(['smiles'], axis=1)
88
+ elif dataset_name == 'tox21':
89
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
90
+ df = df.dropna(axis=0, how='any').reset_index(drop=True)
91
+ smiles = df.smiles
92
+ labels = df.drop(['mol_id', 'smiles'], axis=1)
93
+ elif dataset_name == 'bace':
94
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
95
+ smiles, labels = df.mol, df.Class
96
+ else:
97
+ raise ValueError(f"Dataset {dataset_name} not supported")
98
+
99
+ return smiles, labels
100
+
101
+ class MoleculeDataset(Dataset):
102
+ def __init__(self, smiles_list, labels, tokenizer, max_length=512):
103
+ self.smiles = smiles_list
104
+ self.labels = labels
105
+ self.tokenizer = tokenizer
106
+ self.max_length = max_length
107
+
108
+ def __len__(self):
109
+ return len(self.smiles)
110
+
111
+ def __getitem__(self, idx):
112
+ smiles = self.smiles[idx]
113
+
114
+ # Tokenize SMILES
115
+ encoding = self.tokenizer(
116
+ smiles,
117
+ truncation=True,
118
+ padding='max_length',
119
+ max_length=self.max_length,
120
+ return_tensors='pt'
121
+ )
122
+
123
+ # Handle labels
124
+ if isinstance(self.labels, pd.Series):
125
+ label = torch.tensor(self.labels.iloc[idx], dtype=torch.float32)
126
+ else: # DataFrame (multi-label)
127
+ label = torch.tensor(self.labels.iloc[idx].values, dtype=torch.float32)
128
+
129
+ return {
130
+ 'input_ids': encoding['input_ids'].flatten(),
131
+ 'attention_mask': encoding['attention_mask'].flatten(),
132
+ 'labels': label
133
+ }
134
+
135
+ def get_loss_fn(num_labels):
136
+ """Get appropriate loss function based on number of labels"""
137
+ if num_labels == 1:
138
+ return nn.BCEWithLogitsLoss()
139
+ else:
140
+ return nn.BCEWithLogitsLoss() # Multi-label classification
141
+
142
+ def compute_metrics(predictions, labels, num_labels):
143
+ """Compute ROC-AUC for single or multi-label classification"""
144
+ predictions = torch.sigmoid(predictions).cpu().numpy()
145
+ labels = labels.cpu().numpy()
146
+
147
+ if num_labels == 1:
148
+ # Single label
149
+ try:
150
+ auc = roc_auc_score(labels, predictions)
151
+ return {'roc_auc': auc}
152
+ except Exception:
153
+ return {'roc_auc': 0.5}
154
+ else:
155
+ # Multi-label
156
+ aucs = []
157
+ for i in range(num_labels):
158
+ try:
159
+ auc = roc_auc_score(labels[:, i], predictions[:, i])
160
+ aucs.append(auc)
161
+ except Exception:
162
+ aucs.append(0.5)
163
+ return {'roc_auc': np.mean(aucs), 'individual_aucs': aucs}
164
+
165
+ def train_epoch(model, dataloader, optimizer, loss_fn, device):
166
+ model.train()
167
+ total_loss = 0
168
+
169
+ for batch in dataloader:
170
+ input_ids = batch['input_ids'].to(device)
171
+ attention_mask = batch['attention_mask'].to(device)
172
+ labels = batch['labels'].to(device)
173
+
174
+ optimizer.zero_grad()
175
+
176
+ outputs = model(input_ids, attention_mask)
177
+ loss = loss_fn(outputs, labels)
178
+
179
+ loss.backward()
180
+ optimizer.step()
181
+
182
+ total_loss += loss.item()
183
+
184
+ return total_loss / len(dataloader)
185
+
186
+ def evaluate(model, dataloader, loss_fn, num_labels, device):
187
+ model.eval()
188
+ total_loss = 0
189
+ all_predictions = []
190
+ all_labels = []
191
+
192
+ with torch.no_grad():
193
+ for batch in dataloader:
194
+ input_ids = batch['input_ids'].to(device)
195
+ attention_mask = batch['attention_mask'].to(device)
196
+ labels = batch['labels'].to(device)
197
+
198
+ outputs = model(input_ids, attention_mask)
199
+ loss = loss_fn(outputs, labels)
200
+
201
+ total_loss += loss.item()
202
+ all_predictions.append(outputs)
203
+ all_labels.append(labels)
204
+
205
+ all_predictions = torch.cat(all_predictions)
206
+ all_labels = torch.cat(all_labels)
207
+
208
+ metrics = compute_metrics(all_predictions, all_labels, num_labels)
209
+ avg_loss = total_loss / len(dataloader)
210
+
211
+ return avg_loss, metrics
212
+
213
+ def run_experiment(dataset_name, config, tokenizer, encoder_path=None,
214
+ batch_size=32, learning_rate=1e-4, epochs=50, device='cuda'):
215
+ """Run complete experiment for one dataset"""
216
+ print(f"\n=== Running experiment for {dataset_name.upper()} ===")
217
+
218
+ # Load data
219
+ smiles, labels = load_moleculenet_data(dataset_name)
220
+ print(f"Loaded {len(smiles)} samples")
221
+
222
+ # Determine number of labels
223
+ if isinstance(labels, pd.Series):
224
+ num_labels = 1
225
+ else:
226
+ num_labels = labels.shape[1]
227
+ print(f"Number of labels: {num_labels}")
228
+
229
+ # Scaffold split (note: scaffold_split is not defined in this file; a possible implementation is sketched after this listing)
230
+ smiles_list = smiles.tolist()
231
+ train_idx, valid_idx, test_idx = scaffold_split(smiles_list)
232
+
233
+ print(f"Split sizes - Train: {len(train_idx)}, Valid: {len(valid_idx)}, Test: {len(test_idx)}")
234
+ # Create datasets
235
+ train_smiles = [smiles_list[i] for i in train_idx]
236
+ valid_smiles = [smiles_list[i] for i in valid_idx]
237
+ test_smiles = [smiles_list[i] for i in test_idx]
238
+
239
+ if isinstance(labels, pd.Series):
240
+ train_labels = labels.iloc[list(train_idx)]
241
+ valid_labels = labels.iloc[list(valid_idx)]
242
+ test_labels = labels.iloc[list(test_idx)]
243
+ else:
244
+ train_labels = labels.iloc[list(train_idx)]
245
+ valid_labels = labels.iloc[list(valid_idx)]
246
+ test_labels = labels.iloc[list(test_idx)]
247
+
248
+ # Create data loaders
249
+ train_dataset = MoleculeDataset(train_smiles, train_labels, tokenizer)
250
+ valid_dataset = MoleculeDataset(valid_smiles, valid_labels, tokenizer)
251
+ test_dataset = MoleculeDataset(test_smiles, test_labels, tokenizer)
252
+
253
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
254
+ valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
255
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
256
+
257
+ # Initialize model
258
+ model = SimSonClassifier(config, max_len=512, num_labels=num_labels).to(device)
259
+
260
+ # Load encoder weights if provided
261
+ if encoder_path:
262
+ model.load_encoder_weights(encoder_path)
263
+
264
+ # Setup training
265
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
266
+ loss_fn = get_loss_fn(num_labels)
267
+
268
+ best_valid_loss = float('inf')
269
+ best_model_path = f'best_{dataset_name}_model.pth'
270
+
271
+ # Training loop
272
+ for epoch in range(epochs):
273
+ train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
274
+ valid_loss, valid_metrics = evaluate(model, valid_loader, loss_fn, num_labels, device)
275
+
276
+ # Save best model
277
+ if valid_loss < best_valid_loss:
278
+ best_valid_loss = valid_loss
279
+ torch.save(model.state_dict(), best_model_path)
280
+
281
+ if epoch % 10 == 0:
282
+ print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, "
283
+ f"Valid Loss = {valid_loss:.4f}, Valid AUC = {valid_metrics['roc_auc']:.4f}")
284
+
285
+ # Load best model and test
286
+ model.load_state_dict(torch.load(best_model_path))
287
+ test_loss, test_metrics = evaluate(model, test_loader, loss_fn, num_labels, device)
288
+
289
+ print(f"Final Test Results - Loss: {test_loss:.4f}, ROC-AUC: {test_metrics['roc_auc']:.4f}")
290
+
291
+ # Cleanup
292
+ os.remove(best_model_path)
293
+
294
+ return {
295
+ 'dataset': dataset_name,
296
+ 'num_labels': num_labels,
297
+ 'test_loss': test_loss,
298
+ 'test_roc_auc': test_metrics['roc_auc'],
299
+ 'individual_aucs': test_metrics.get('individual_aucs', None)
300
+ }
301
+
302
+ def main():
303
+ """Main function to run all experiments"""
304
+ # Setup
305
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
306
+ print(f"Using device: {device}")
307
+
308
+ # Initialize tokenizer and config (you need to provide these)
309
+ # tokenizer = your_tokenizer # Replace with your tokenizer
310
+ # config = BertConfig(...) # Your config from above
311
+ tokenizer_path = 'DeepChem/ChemBERTa-77M-MTR'
312
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
313
+
314
+ # Only the hidden size is slightly larger, everything else is the same
315
+ config = BertConfig(
316
+ vocab_size=tokenizer.vocab_size,
317
+ hidden_size=768,
318
+ num_hidden_layers=4,
319
+ num_attention_heads=12,
320
+ intermediate_size=2048,
321
+ max_position_embeddings=512
322
+ )
323
+ # Datasets to test
324
+ datasets = ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']
325
+
326
+ # Path to your pretrained encoder (optional)
327
+ encoder_path = 'simson_checkpoints_small/simson_model_single_gpu.bin'
328
+
329
+ # Run experiments
330
+ all_results = []
331
+ for dataset in datasets:
332
+ try:
333
+ result = run_experiment(
334
+ dataset,
335
+ config,
336
+ tokenizer,
337
+ encoder_path=encoder_path,
338
+ device=device
339
+ )
340
+ all_results.append(result)
341
+ except Exception as e:
342
+ print(f"Error with {dataset}: {e}")
343
+
344
+ # Aggregate and display results
345
+ print("\n" + "="*60)
346
+ print("FINAL RESULTS SUMMARY")
347
+ print("="*60)
348
+
349
+ results_df = pd.DataFrame(all_results)
350
+ print(results_df.to_string(index=False))
351
+
352
+ # Save results
353
+ results_df.to_csv('moleculenet_results.csv', index=False)
354
+ print(f"\nResults saved to moleculenet_results.csv")
355
+
356
+ return results_df
357
+
358
+ if __name__ == "__main__":
359
+
360
+ results = main()
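Note that eval.py.save calls scaffold_split(smiles_list), which is never defined in the file, so it cannot run as saved. A minimal sketch consistent with the ScaffoldSplitter in eval.py above (group by Murcko scaffold, assign whole scaffold groups to splits); the fraction defaults and seed are assumptions:

from collections import defaultdict
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

def scaffold_split(smiles_list, val_frac=0.1, test_frac=0.1, seed=42):
    """Group molecules by Murcko scaffold and assign whole groups to splits,
    so no scaffold appears in more than one split."""
    scaffolds = defaultdict(list)
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue  # drop unparsable SMILES, as eval.py's splitter does
        scaffolds[MurckoScaffold.MurckoScaffoldSmiles(mol=mol)].append(i)

    scaffold_sets = list(scaffolds.values())
    rng = np.random.RandomState(seed)
    rng.shuffle(scaffold_sets)

    n = sum(len(group) for group in scaffold_sets)
    n_val = int(np.floor(val_frac * n))
    n_test = int(np.floor(test_frac * n))
    train_idx, val_idx, test_idx = [], [], []
    for group in scaffold_sets:
        # Fill validation first, then test; everything else goes to train.
        if len(val_idx) + len(group) <= n_val:
            val_idx.extend(group)
        elif len(test_idx) + len(group) <= n_test:
            test_idx.extend(group)
        else:
            train_idx.extend(group)
    return train_idx, val_idx, test_idx

Because whole scaffold groups are assigned together, the test set contains ring systems the model never saw in training, which is the harder generalization setting these benchmarks are meant to probe.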