gbyuvd commited on
Commit
70ecb45
·
verified ·
1 Parent(s): 98bb8f5

Upload benchmark script and set

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ latent_space_plots/ChemBERTa_latent_interpolation.png filter=lfs diff=lfs merge=lfs -text
37
+ latent_space_plots/FastChemTokenizerHF_latent_interpolation.png filter=lfs diff=lfs merge=lfs -text
benchmark/FastChemTokenizer.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+ from typing import List, Union, Optional, Tuple
5
+ from transformers.tokenization_utils_base import BatchEncoding
6
+ from functools import lru_cache
7
+
8
+ # Copyright 2025 Genta Pramillean Bayu (@gbyuvd)
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
class TrieNode:
    """Single node of the token trie.

    ``children`` maps one character to the next node; ``token_id`` holds the
    vocabulary id when the path from the root to this node spells a complete
    token, else None.
    """
    __slots__ = ['children', 'token_id']

    def __init__(self):
        self.children = {}
        self.token_id = None  # If set, this node completes a valid token


class FastChemTokenizer:
    """Trie-based longest-match tokenizer for SMILES strings.

    Exposes a minimal Hugging Face-compatible surface: ``__call__``,
    ``encode``/``decode``, ``encode_plus``/``batch_encode_plus`` (returning
    ``BatchEncoding``) and ``save_pretrained``/``from_pretrained``.
    """

    def __init__(self, token_to_id, model_max_length=512):
        """
        Args:
            token_to_id: mapping of token string -> integer id; must contain
                "<s>", "</s>", "<pad>", "<unk>" and "<mask>".
            model_max_length: default maximum length used by ``encode_plus``
                when no explicit ``max_length`` is given.

        Raises:
            KeyError: if any required special token is missing from the vocab.
        """
        self.token_to_id = token_to_id
        self.id_to_token = {v: k for k, v in token_to_id.items()}
        self.model_max_length = model_max_length

        # Precompute max token length for possible use & clarity
        self.max_token_len = max(len(t) for t in token_to_id.keys())

        # Build trie for fast longest-match lookup
        self.trie_root = self._build_trie(token_to_id)

        # Validate required special tokens
        required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
        for tok in required_special_tokens:
            if tok not in token_to_id:
                raise KeyError(f"Required special token '{tok}' not found in vocab.")

        # Special token IDs
        self.bos_token_id = token_to_id["<s>"]
        self.eos_token_id = token_to_id["</s>"]
        self.pad_token_id = token_to_id["<pad>"]
        self.unk_token_id = token_to_id["<unk>"]
        self.mask_token_id = token_to_id["<mask>"]

        # Special tokens for convenience
        self.bos_token = "<s>"
        self.eos_token = "</s>"
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.mask_token = "<mask>"

        # Per-instance encode cache. A class-level @lru_cache on a method
        # keys on `self` and keeps every tokenizer instance alive for the
        # lifetime of the cache (ruff B019); binding the cached callable
        # per instance avoids that leak while keeping the same call shape:
        # self._cached_encode_str(s) -> tuple of token ids.
        self._cached_encode_str = lru_cache(maxsize=10000)(
            lambda s: tuple(self._encode_core(s))
        )

    def _build_trie(self, token_to_id):
        """Build a character trie; terminal nodes carry the token id."""
        root = TrieNode()
        for token, tid in token_to_id.items():
            node = root
            for char in token:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.token_id = tid
        return root

    def __len__(self):
        """Return vocab size — REQUIRED for HF compatibility."""
        return len(self.token_to_id)

    def __call__(self, text: Union[str, List[str]], text_pair: Optional[Union[str, List[str]]] = None, **kwargs) -> BatchEncoding:
        """Encode a single text/pair, or a batch when *text* is a list."""
        if isinstance(text, list):
            # Pair each text with its counterpart when text_pair is provided.
            batch = [(t, p) if p is not None else t for t, p in zip(text, text_pair)] if text_pair else text
            return self.batch_encode_plus(batch, **kwargs)
        else:
            return self.encode_plus(text=text, text_pair=text_pair, **kwargs)

    def _encode_core(self, text: str) -> List[int]:
        """Greedy longest-match tokenization using the trie — no caching.

        Falls back to a single-character lookup (mapping to <unk> on a miss)
        whenever no vocab token matches at the current position.
        """
        result_ids = []
        i = 0
        n = len(text)

        while i < n:
            node = self.trie_root
            j = i
            last_match_id = None
            last_match_end = i

            # Walk the trie as far as the characters allow, remembering the
            # last position that completed a valid token (longest match).
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.token_id is not None:
                    last_match_id = node.token_id
                    last_match_end = j

            if last_match_id is not None:
                result_ids.append(last_match_id)
                i = last_match_end
            else:
                # Fallback: encode single char
                result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
                i += 1

        return result_ids

    def encode(self, text: str) -> List[int]:
        """Public encode method — strips input and uses the per-instance cache."""
        return list(self._cached_encode_str(text.strip()))

    def decode(self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = False) -> str:
        """Map ids back to tokens and concatenate them.

        Note: <unk> is not in the skip set, so unknown ids stay visible in
        the decoded string even with skip_special_tokens=True.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        if skip_special_tokens:
            special_ids = {
                self.bos_token_id,
                self.eos_token_id,
                self.pad_token_id,
                self.mask_token_id,
            }
        else:
            special_ids = set()

        tokens = [
            self.id_to_token.get(tid, self.unk_token)
            for tid in token_ids
            if tid not in special_ids
        ]
        return "".join(tokens)

    def decode_with_trace(self, token_ids: List[int]) -> None:
        """Print a per-token decoding trace (debugging helper)."""
        print(f"\n🔍 Decoding {len(token_ids)} tokens:")
        for i, tid in enumerate(token_ids):
            token = self.id_to_token.get(tid, self.unk_token)
            print(f" [{i:03d}] ID={tid:5d} → '{token}'")

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Map ids to token strings; unknown ids become <unk>."""
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map token strings to ids; unknown tokens map to the <unk> id."""
        return [self.token_to_id.get(t, self.unk_token_id) for t in tokens]

    def encode_plus(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_attention_mask: bool = True,
        return_token_type_ids: bool = True,
    ) -> BatchEncoding:
        """Encode one text (optionally with a pair) into model inputs.

        Layout with special tokens: ``<s> A </s>`` for a single text and
        ``<s> A </s> B </s>`` for a pair (token_type_ids 0 for A, 1 for B).

        Notes:
            - ``padding`` (any truthy value) pads up to ``max_length``, not
              to the longest item in a batch.
            - the attention mask is always included regardless of
              ``return_attention_mask`` (parameter kept for interface
              compatibility).
        """
        if max_length is None:
            max_length = self.model_max_length

        ids_a = self.encode(text)
        ids_b = self.encode(text_pair) if text_pair is not None else None

        input_ids = []
        token_type_ids = []

        if add_special_tokens:
            input_ids.append(self.bos_token_id)
            token_type_ids.append(0)
            if ids_b is not None:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)

                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(1)
            else:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)
        else:
            input_ids = ids_a
            token_type_ids = [0] * len(input_ids)
            if ids_b is not None:
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))

        if truncation and len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            token_type_ids = token_type_ids[:max_length]

        if padding:
            pad_len = max_length - len(input_ids)
            if pad_len > 0:
                input_ids.extend([self.pad_token_id] * pad_len)
                token_type_ids.extend([0] * pad_len)

        attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]

        encoded_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        if return_token_type_ids:
            encoded_dict["token_type_ids"] = token_type_ids

        if return_tensors == "pt":
            # Single examples come back as (1, L) tensors.
            output = {}
            for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)
                if tensor.ndim == 1:
                    tensor = tensor.unsqueeze(0)
                output[k] = tensor
        else:
            output = encoded_dict

        return BatchEncoding(output, tensor_type=return_tensors)

    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
        **kwargs
    ) -> BatchEncoding:
        """Encode a batch of texts or (text, text_pair) tuples.

        Fix: tensor conversion now happens once at batch level. Previously
        ``return_tensors="pt"`` was forwarded to ``encode_plus``, producing
        (1, L) tensors that ``pad_sequence`` either rejects (unequal L) or
        mis-stacks into a wrong (B, 1, L) shape.
        """
        return_tensors = kwargs.pop("return_tensors", None)

        all_input_ids = []
        all_attention_masks = []
        all_token_type_ids = []

        for item in batch_text_or_text_pairs:
            if isinstance(item, tuple):
                text, text_pair = item
            else:
                text, text_pair = item, None

            # Always encode to plain lists here; tensorize below.
            encoded = self.encode_plus(
                text=text,
                text_pair=text_pair,
                return_tensors=None,
                **kwargs
            )
            all_input_ids.append(encoded["input_ids"])
            all_attention_masks.append(encoded["attention_mask"])
            if "token_type_ids" in encoded:
                all_token_type_ids.append(encoded["token_type_ids"])

        batched = {
            "input_ids": all_input_ids,
            "attention_mask": all_attention_masks,
        }
        if all_token_type_ids:
            batched["token_type_ids"] = all_token_type_ids

        if return_tensors == "pt":
            batched = {
                k: torch.nn.utils.rnn.pad_sequence(
                    [torch.tensor(seq, dtype=torch.long) for seq in v],
                    batch_first=True,
                    # Only input_ids pad with the pad token; masks/types pad with 0.
                    padding_value=self.pad_token_id if k == "input_ids" else 0
                )
                for k, v in batched.items()
            }
            # Values are already tensors; skip a second conversion pass.
            return BatchEncoding(batched)

        return BatchEncoding(batched, tensor_type=return_tensors)

    # Save vocab to directory
    def save_pretrained(self, save_directory: str):
        """
        Save tokenizer vocab as `vocab.json` in target directory.
        Mimics Hugging Face convention.
        """
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        vocab_file = os.path.join(save_directory, "vocab.json")

        # Keys are strings, values are ints — JSON-safe
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)

        print(f"✅ Tokenizer vocab saved to: {vocab_file}")

    # Load from pretrained directory
    @classmethod
    def from_pretrained(cls, pretrained_directory: str, model_max_length=512):
        """
        Load tokenizer from directory containing `vocab.json`.

        Raises:
            FileNotFoundError: if `vocab.json` is missing.
        """
        vocab_file = os.path.join(pretrained_directory, "vocab.json")

        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"Vocab file not found: {vocab_file}")

        with open(vocab_file, "r", encoding="utf-8") as f:
            token_to_id = json.load(f)

        # Convert keys to str (JSON loads as str anyway), values to int
        token_to_id = {str(k): int(v) for k, v in token_to_id.items()}

        return cls(token_to_id=token_to_id, model_max_length=model_max_length)
323
+
324
class FastChemTokenizerSelfies:
    """Trie-based longest-match tokenizer for space-separated SELFIES strings.

    Same interface as ``FastChemTokenizer``; differs in that encoding skips
    separator whitespace and decoding re-joins tokens with spaces.
    """

    def __init__(self, token_to_id, model_max_length=512):
        """
        Args:
            token_to_id: mapping of token string -> integer id; must contain
                "<s>", "</s>", "<pad>", "<unk>" and "<mask>".
            model_max_length: default maximum length used by ``encode_plus``
                when no explicit ``max_length`` is given.

        Raises:
            KeyError: if any required special token is missing from the vocab.
        """
        self.token_to_id = token_to_id
        self.id_to_token = {v: k for k, v in token_to_id.items()}
        self.model_max_length = model_max_length

        # Precompute max token length for possible use & clarity
        self.max_token_len = max(len(t) for t in token_to_id.keys())

        # Build trie for fast longest-match lookup
        self.trie_root = self._build_trie(token_to_id)

        # Validate required special tokens
        required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
        for tok in required_special_tokens:
            if tok not in token_to_id:
                raise KeyError(f"Required special token '{tok}' not found in vocab.")

        # Special token IDs
        self.bos_token_id = token_to_id["<s>"]
        self.eos_token_id = token_to_id["</s>"]
        self.pad_token_id = token_to_id["<pad>"]
        self.unk_token_id = token_to_id["<unk>"]
        self.mask_token_id = token_to_id["<mask>"]

        # Special tokens for convenience
        self.bos_token = "<s>"
        self.eos_token = "</s>"
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.mask_token = "<mask>"

        # Per-instance encode cache. A class-level @lru_cache on a method
        # keys on `self` and keeps every tokenizer instance alive for the
        # lifetime of the cache (ruff B019); binding the cached callable
        # per instance avoids that leak while keeping the same call shape.
        self._cached_encode_str = lru_cache(maxsize=10000)(
            lambda s: tuple(self._encode_core(s))
        )

    def _build_trie(self, token_to_id):
        """Build a character trie; terminal nodes carry the token id."""
        root = TrieNode()
        for token, tid in token_to_id.items():
            node = root
            for char in token:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.token_id = tid
        return root

    def __len__(self):
        """Return vocab size — REQUIRED for HF compatibility."""
        return len(self.token_to_id)

    def __call__(self, text: Union[str, List[str]], text_pair: Optional[Union[str, List[str]]] = None, **kwargs) -> BatchEncoding:
        """Encode a single text/pair, or a batch when *text* is a list."""
        if isinstance(text, list):
            # Pair each text with its counterpart when text_pair is provided.
            batch = [(t, p) if p is not None else t for t, p in zip(text, text_pair)] if text_pair else text
            return self.batch_encode_plus(batch, **kwargs)
        else:
            return self.encode_plus(text=text, text_pair=text_pair, **kwargs)

    def _encode_core(self, text: str) -> List[int]:
        """Greedy longest-match tokenization via the trie — no caching.

        Whitespace is skipped before matching starts at each position, so it
        acts as a token separator. NOTE(review): this means a vocab token
        that *begins* with whitespace can never match — presumed not to occur
        in SELFIES vocabularies.
        """
        result_ids = []
        i = 0
        n = len(text)

        while i < n:
            if text[i].isspace():
                # Separator whitespace between SELFIES tokens — skip it.
                i += 1
                continue

            node = self.trie_root
            j = i
            last_match_id = None
            last_match_end = i

            # Walk the trie as far as the characters allow, remembering the
            # last position that completed a valid token (longest match).
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.token_id is not None:
                    last_match_id = node.token_id
                    last_match_end = j

            if last_match_id is not None:
                result_ids.append(last_match_id)
                i = last_match_end
            else:
                # Fallback: encode single char
                result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
                i += 1

        return result_ids

    def encode(self, text: str) -> List[int]:
        """Public encode method — strips input and uses the per-instance cache."""
        return list(self._cached_encode_str(text.strip()))

    def decode(self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = False) -> str:
        """Map ids back to tokens, joined with single spaces.

        The space join reconstructs the original space-separated SELFIES
        format. Note: <unk> is not in the skip set, so unknown ids remain
        visible even with skip_special_tokens=True.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        if skip_special_tokens:
            special_ids = {
                self.bos_token_id,
                self.eos_token_id,
                self.pad_token_id,
                self.mask_token_id,
            }
        else:
            special_ids = set()

        tokens = [
            self.id_to_token.get(tid, self.unk_token)
            for tid in token_ids
            if tid not in special_ids
        ]

        # Join with SPACE between tokens — this reconstructs original format
        return " ".join(tokens)

    def decode_with_trace(self, token_ids: List[int]) -> None:
        """Print a per-token decoding trace (debugging helper)."""
        print(f"\n🔍 Decoding {len(token_ids)} tokens:")
        for i, tid in enumerate(token_ids):
            token = self.id_to_token.get(tid, self.unk_token)
            print(f" [{i:03d}] ID={tid:5d} → '{token}'")

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Map ids to token strings; unknown ids become <unk>."""
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map token strings to ids; unknown tokens map to the <unk> id."""
        return [self.token_to_id.get(t, self.unk_token_id) for t in tokens]

    def encode_plus(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_attention_mask: bool = True,
        return_token_type_ids: bool = True,
    ) -> BatchEncoding:
        """Encode one text (optionally with a pair) into model inputs.

        Layout with special tokens: ``<s> A </s>`` for a single text and
        ``<s> A </s> B </s>`` for a pair (token_type_ids 0 for A, 1 for B).

        Notes:
            - ``padding`` (any truthy value) pads up to ``max_length``, not
              to the longest item in a batch.
            - the attention mask is always included regardless of
              ``return_attention_mask`` (parameter kept for interface
              compatibility).
        """
        if max_length is None:
            max_length = self.model_max_length

        ids_a = self.encode(text)
        ids_b = self.encode(text_pair) if text_pair is not None else None

        input_ids = []
        token_type_ids = []

        if add_special_tokens:
            input_ids.append(self.bos_token_id)
            token_type_ids.append(0)
            if ids_b is not None:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)

                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(1)
            else:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)
        else:
            input_ids = ids_a
            token_type_ids = [0] * len(input_ids)
            if ids_b is not None:
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))

        if truncation and len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            token_type_ids = token_type_ids[:max_length]

        if padding:
            pad_len = max_length - len(input_ids)
            if pad_len > 0:
                input_ids.extend([self.pad_token_id] * pad_len)
                token_type_ids.extend([0] * pad_len)

        attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]

        encoded_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        if return_token_type_ids:
            encoded_dict["token_type_ids"] = token_type_ids

        if return_tensors == "pt":
            # Single examples come back as (1, L) tensors.
            output = {}
            for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)
                if tensor.ndim == 1:
                    tensor = tensor.unsqueeze(0)
                output[k] = tensor
        else:
            output = encoded_dict

        return BatchEncoding(output, tensor_type=return_tensors)

    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
        **kwargs
    ) -> BatchEncoding:
        """Encode a batch of texts or (text, text_pair) tuples.

        Fix: tensor conversion now happens once at batch level. Previously
        ``return_tensors="pt"`` was forwarded to ``encode_plus``, producing
        (1, L) tensors that ``pad_sequence`` either rejects (unequal L) or
        mis-stacks into a wrong (B, 1, L) shape.
        """
        return_tensors = kwargs.pop("return_tensors", None)

        all_input_ids = []
        all_attention_masks = []
        all_token_type_ids = []

        for item in batch_text_or_text_pairs:
            if isinstance(item, tuple):
                text, text_pair = item
            else:
                text, text_pair = item, None

            # Always encode to plain lists here; tensorize below.
            encoded = self.encode_plus(
                text=text,
                text_pair=text_pair,
                return_tensors=None,
                **kwargs
            )
            all_input_ids.append(encoded["input_ids"])
            all_attention_masks.append(encoded["attention_mask"])
            if "token_type_ids" in encoded:
                all_token_type_ids.append(encoded["token_type_ids"])

        batched = {
            "input_ids": all_input_ids,
            "attention_mask": all_attention_masks,
        }
        if all_token_type_ids:
            batched["token_type_ids"] = all_token_type_ids

        if return_tensors == "pt":
            batched = {
                k: torch.nn.utils.rnn.pad_sequence(
                    [torch.tensor(seq, dtype=torch.long) for seq in v],
                    batch_first=True,
                    # Only input_ids pad with the pad token; masks/types pad with 0.
                    padding_value=self.pad_token_id if k == "input_ids" else 0
                )
                for k, v in batched.items()
            }
            # Values are already tensors; skip a second conversion pass.
            return BatchEncoding(batched)

        return BatchEncoding(batched, tensor_type=return_tensors)

    # Save vocab to directory
    def save_pretrained(self, save_directory: str):
        """
        Save tokenizer vocab as `vocab.json` in target directory.
        Mimics Hugging Face convention.
        """
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        vocab_file = os.path.join(save_directory, "vocab.json")

        # Keys are strings, values are ints — JSON-safe
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)

        print(f"✅ Tokenizer vocab saved to: {vocab_file}")

    # Load from pretrained directory
    @classmethod
    def from_pretrained(cls, pretrained_directory: str, model_max_length=512):
        """
        Load tokenizer from directory containing `vocab.json`.

        Raises:
            FileNotFoundError: if `vocab.json` is missing.
        """
        vocab_file = os.path.join(pretrained_directory, "vocab.json")

        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"Vocab file not found: {vocab_file}")

        with open(vocab_file, "r", encoding="utf-8") as f:
            token_to_id = json.load(f)

        # Convert keys to str (JSON loads as str anyway), values to int
        token_to_id = {str(k): int(v) for k, v in token_to_id.items()}

        return cls(token_to_id=token_to_id, model_max_length=model_max_length)
benchmark/benchmark_HF_efficient.py ADDED
@@ -0,0 +1,1119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Molecule Tokenizer Benchmark & VAE Training Pipeline
3
+ # PATCHED VERSION — Updated for FastChemTokenizerHF (HF compatible)
4
+ #
5
+
6
+ #
7
+ # Step 1.1 — Imports & Reproducibility
8
+ #
9
+
10
+ import os
11
+ import time
12
+ import random
13
+ import pandas as pd
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ import torch
17
+ import numpy as np
18
+ # Tokenizers
19
+ from transformers import AutoTokenizer
20
+ from FastChemTokenizerHF import FastChemTokenizer
21
+ # Optional: for progress bars
22
+ from tqdm import tqdm
23
+ from rdkit import Chem
24
+ from sklearn.model_selection import train_test_split
25
+ import torch.nn as nn
26
+ import torch.nn.functional as F
27
+ from ranger21 import Ranger21
28
+ from torch.utils.data import DataLoader, Dataset
29
+ from scipy.stats import entropy
30
+ import json
31
+ import math
32
+ from typing import Optional, Tuple, Union
33
+ from rdkit import RDLogger
34
+ RDLogger.DisableLog('rdApp.*')
35
+ # Set seeds for reproducibility
36
def set_seed(seed=42):
    """Seed every RNG used by the pipeline so runs are reproducible."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
44
+
45
# Seed everything before any stochastic work happens.
set_seed(42)

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
50
+
51
+ #
52
+ # Step 1.2 — Load & Preprocess SMILES Corpus
53
+ #
54
+
55
# Load the (pre-canonicalized) SMILES corpus from disk.
data_path = "../data/sample_1k_smi_42.csv"
df = pd.read_csv(data_path)

# Fail fast if the expected column is absent.
if 'SMILES' not in df.columns:
    raise ValueError("Expected column 'SMILES' in CSV")

smiles_list = df['SMILES'].dropna().tolist()
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")
63
+
64
+ # Validate with RDKit
65
+
66
def is_valid_smiles(smiles):
    """Return True iff RDKit can parse *smiles* into a molecule."""
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None
68
+
69
print("Validating SMILES with RDKit...")
valid_mask = [is_valid_smiles(s) for s in tqdm(smiles_list)]
# Keep only the molecules RDKit accepts.
smiles_list = [smi for smi, keep in zip(smiles_list, valid_mask) if keep]
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
73
+
74
+ #
75
+ # Step 1.3 — Train/Val/Test Split (80/10/10)
76
+ #
77
+
78
# 80/10/10 split: hold out 20%, then halve the holdout into val and test.
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)

print(f"Train: {len(train_smiles)}")
print(f"Val: {len(val_smiles)}")
print(f"Test: {len(test_smiles)}")

# Cache each split as a newline-separated text file.
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
for split_name, split_smiles in splits.items():
    with open(f"../data/{split_name}_smiles.txt", "w") as f:
        f.write("\n".join(split_smiles))
90
+
91
+ #
92
+ # Step 1.4 — Tokenizer Wrapper (Simplified for HF compatibility)
93
+ #
94
+
95
class TokenizerWrapper:
    """Uniform facade over heterogeneous tokenizers for benchmarking.

    Normalizes special-token registration where supported and exposes a
    common encode/decode/vocab interface plus the special-token id
    properties of the wrapped tokenizer.
    """

    def __init__(self, tokenizer, name,
                 bos_token="<s>", eos_token="</s>",
                 pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name

        # Register special tokens only on tokenizers that support it.
        add_specials = getattr(tokenizer, "add_special_tokens", None)
        if callable(add_specials):
            try:
                add_specials({
                    "bos_token": bos_token,
                    "eos_token": eos_token,
                    "pad_token": pad_token,
                    "unk_token": unk_token,
                })
            except NotImplementedError:
                # FastChemTokenizerHF already defines these tokens internally.
                pass

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize one SMILES string; returns list-based encodings."""
        return self.tokenizer(
            smiles,
            add_special_tokens=add_special_tokens,
            return_attention_mask=False,
            return_tensors=None,
        )

    def decode(self, token_ids, skip_special_tokens=True):
        """Inverse of encode; drops special tokens by default."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def __len__(self):
        return len(self.tokenizer)

    def get_vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
149
+
150
+ #
151
+ # Step 1.5 — Initialize Tokenizers
152
+ #
153
+
154
# Baseline HF tokenizer vs. the custom trie tokenizer under test.
tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tok2_fast = FastChemTokenizer.from_pretrained("../smitok_core")

tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizerHF", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")

TOKENIZERS = [tokenizer1, tokenizer2]
161
+
162
+
163
+ #
164
+ # Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
165
+ #
166
+
167
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
    """Benchmark one tokenizer on a SMILES sample.

    Measures vocab size, average tokens per molecule, compression ratio
    (characters per token), unknown-token rate and encode throughput;
    unless ``encode_only``, also decode throughput and round-trip accuracy.

    Fix: every ratio is now guarded, so an empty sample or timings too
    short for the timer to register no longer raise ZeroDivisionError or
    produce NaN (the original comment claimed an epsilon fix that was
    never actually applied).

    Args:
        tokenizer: TokenizerWrapper exposing .name, .encode, .decode, __len__.
        smiles_sample: list of SMILES strings; at most 10,000 are used.
        encode_only: skip the decode pass when True.

    Returns:
        dict mapping metric name to value.
    """
    V = len(tokenizer)
    # Cap the workload so a single benchmark run stays fast.
    sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample

    encode_times, token_counts, char_counts = [], [], []
    unk_counts, total_tokens = 0, 0

    # Hoisted out of the loop: the unk id never changes per molecule.
    unk_id = tokenizer.tokenizer.unk_token_id

    for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
        char_counts.append(len(smiles))
        start = time.perf_counter()
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        end = time.perf_counter()
        encode_times.append(end - start)

        input_ids = enc['input_ids']
        token_counts.append(len(input_ids))
        total_tokens += len(input_ids)
        unk_counts += input_ids.count(unk_id)

    # Guarded ratios (0.0 when the denominator is empty/zero).
    L_bar = float(np.mean(token_counts)) if token_counts else 0.0
    C = float(np.mean(char_counts)) / L_bar if L_bar > 0 else 0.0
    U = unk_counts / total_tokens if total_tokens > 0 else 0.0
    total_encode_time = sum(encode_times)
    Tenc = len(sample) / total_encode_time if total_encode_time > 0 else 0.0

    metrics = {
        'vocab_size': V,
        'avg_tokens_per_mol': L_bar,
        'compression_ratio': C,
        'percent_unknown': U * 100,
        'encode_throughput_smiles_per_sec': Tenc,
    }

    if encode_only:
        return metrics

    decode_times, reconstruction_ok = [], 0

    for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = enc['input_ids']
        start = time.perf_counter()
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        end = time.perf_counter()
        decode_times.append(end - start)
        # Exact string round-trip counts as a successful reconstruction.
        if decoded == smiles:
            reconstruction_ok += 1

    total_decode_time = sum(decode_times)
    Tdec = len(sample) / total_decode_time if total_decode_time > 0 else 0.0
    recon_acc = reconstruction_ok / len(sample) if sample else 0.0

    metrics.update({
        'decode_throughput_smiles_per_sec': Tdec,
        'decode_reconstruction_accuracy': recon_acc * 100,
    })

    return metrics
224
+
225
+
226
#
# Step 1.7 — Run Benchmark
#

# Benchmark every wrapped tokenizer on the training SMILES and persist a CSV.
benchmark_sample = train_smiles
results = []

for tokenizer in TOKENIZERS:
    print(f"\n=== Benchmarking {tokenizer.name} ===")
    metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
    metrics['tokenizer'] = tokenizer.name
    results.append(metrics)
    # Pretty-print floats with 4 decimals; everything else verbatim.
    for k, v in metrics.items():
        if k != 'tokenizer':
            print(f"{k:35s}: {v:.4f}" if isinstance(v, float) else f"{k:35s}: {v}")

df_results = pd.DataFrame(results)
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
245
+
246
+ #
247
+ # Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
248
+ #
249
+
250
+
251
+ import torch
252
+ import torch.nn as nn
253
+ import torch.nn.functional as F
254
+ from typing import Optional, Tuple, Union
255
+
256
+ import torch
257
+ import torch.nn as nn
258
+ import torch.nn.functional as F
259
+ from typing import Tuple, Optional
260
+
261
class MoleculeVAE(nn.Module):
    """
    Optimized MoleculeVAE with:
    - Bidirectional encoder (restored)
    - Proper latent2hidden + latent2cell (restored)
    - Adjustable dropout for small dataset
    - Attention pooling option
    - Quantization-ready hooks

    Encoder: embedding -> BiLSTM -> (attention or masked-mean) pooling ->
    LayerNorm -> (mu, logvar). Decoder: z -> per-layer (h0, c0) -> LSTM
    rollout over tokens -> vocab logits.
    """

    def __init__(self,
                 vocab_size: int,
                 embed_dim: int = 128,
                 hidden_dim: int = 256,
                 latent_dim: int = 128,
                 num_layers: int = 2,
                 pad_token_id: int = 0,
                 bos_token_id: int = 1,
                 eos_token_id: int = 2,
                 dropout: float = 0.2,
                 use_attention: bool = True,
                 quantize_ready: bool = False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.use_attention = use_attention

        # Shared embedding, used by both encoder and decoder.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)

        # Bidirectional encoder (hence the hidden_dim * 2 widths below).
        self.encoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )

        # Attention pooling (optional)
        if use_attention:
            self.attention = nn.MultiheadAttention(
                hidden_dim * 2, num_heads=4, dropout=dropout, batch_first=True
            )
            self.attention_linear = nn.Linear(hidden_dim * 2, 1)

        self.encoder_norm = nn.LayerNorm(hidden_dim * 2)

        # Latent bottleneck heads.
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)

        # Decoder init (restored): project z into per-layer h0 and c0.
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)

        # Unidirectional decoder.
        self.decoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0
        )
        self.decoder_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Weight tying — only shape-compatible when embed_dim == hidden_dim.
        if embed_dim == hidden_dim:
            self.fc_out.weight = self.embedding.weight

        self.dropout = nn.Dropout(dropout)

        # Quantization stubs; Identity no-ops when quantize_ready is False.
        if quantize_ready:
            self.quant = torch.quantization.QuantStub()
            self.dequant = torch.quantization.DeQuantStub()
        else:
            self.quant = self.dequant = nn.Identity()

        self._init_weights()

    def _init_weights(self):
        # Xavier for matrices, small normal for 1-D weights, zeros for biases.
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.ndim >= 2:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.normal_(param, 0, 0.01)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def _pool_sequence(self, packed_output, lengths):
        """Collapse the packed encoder outputs to one vector per sequence."""
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        if self.use_attention:
            # Self-attention followed by a learned softmax weighting over time.
            attn_out, _ = self.attention(output, output, output)
            weights = torch.softmax(self.attention_linear(attn_out), dim=1)
            pooled = (weights * output).sum(dim=1)
        else:
            # mean pooling with mask
            batch_size, max_len, _ = output.size()
            mask = torch.arange(max_len, device=output.device).expand(batch_size, max_len) < lengths.unsqueeze(1)
            masked_output = output * mask.unsqueeze(-1).float()
            pooled = masked_output.sum(dim=1) / lengths.unsqueeze(-1).float()
        return pooled

    def encode(self, x: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode token ids [B, L] into posterior parameters (mu, logvar)."""
        x = self.quant(x)
        embedded = self.dropout(self.embedding(x))
        # pack_padded_sequence requires lengths on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.encoder_lstm(packed)
        h = self._pool_sequence(packed_out, lengths)
        h = self.encoder_norm(h)
        mu, logvar = self.fc_mu(h), self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        # Sample z ~ N(mu, sigma^2) during training; use the mean at eval time.
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def _init_decoder_state(self, z: torch.Tensor):
        # Map z to the decoder LSTM's initial hidden and cell states.
        batch_size = z.size(0)
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        return h0, c0

    def decode(self, z: torch.Tensor, max_length: int = 64, mode: str = "greedy", temperature: float = 1.0):
        """Autoregressive rollout from z; stops early once every row hit EOS.

        Returns logits of shape [B, T, V] with T <= max_length.
        """
        batch_size = z.size(0)
        device = z.device
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)

        input_ids = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
        logits_list = []

        for _ in range(max_length):
            embedded = self.embedding(input_ids)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)

            if mode == "greedy":
                next_tokens = logit.argmax(dim=-1)
            elif mode == "sample":
                probs = F.softmax(logit.squeeze(1) / temperature, dim=-1)
                next_tokens = torch.multinomial(probs, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")

            # Once a row emits EOS it is frozen: it feeds PAD for the rest of
            # the rollout instead of continuing to generate.
            just_finished = (next_tokens.squeeze(-1) == self.eos_token_id)
            finished |= just_finished
            next_tokens = torch.where(
                finished.unsqueeze(-1),
                torch.tensor(self.pad_token_id, device=device),
                next_tokens
            )
            input_ids = next_tokens
            if finished.all():
                break

        return self.dequant(torch.cat(logits_list, dim=1))

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor,
                target_seq: Optional[torch.Tensor] = None,
                teacher_forcing_ratio: float = 0.0,
                temperature: float = 1.0):
        """Full VAE pass; returns (logits, mu, logvar).

        Uses teacher forcing only in training mode when a target sequence and a
        positive ratio are supplied; otherwise free-runs the decoder.
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)
        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            return self._forward_teacher_forcing(z, target_seq, teacher_forcing_ratio), mu, logvar
        else:
            max_len = target_seq.size(1) if target_seq is not None else 64
            return self.decode(z, max_length=max_len, temperature=temperature), mu, logvar

    def _forward_teacher_forcing(self, z: torch.Tensor, target_seq: torch.Tensor, teacher_forcing_ratio: float):
        # Scheduled sampling: at each step feed the gold token with probability
        # teacher_forcing_ratio, otherwise the model's own argmax.
        # Produces seq_len - 1 steps of logits (predicting tokens 1..seq_len-1).
        batch_size, seq_len = target_seq.size()
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        logits_list = []
        input_token = target_seq[:, 0:1]

        for t in range(1, seq_len):
            embedded = self.embedding(input_token)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)

            # Single coin flip per step applies to the whole batch.
            if torch.rand(1).item() < teacher_forcing_ratio:
                input_token = target_seq[:, t:t+1]
            else:
                input_token = logit.argmax(dim=-1)

        return torch.cat(logits_list, dim=1)
463
+
464
+ #
465
+ # Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
466
+ #
467
+
468
+ # PATCH 2: Fix VAE Loss Function - Ensure beta is properly applied
469
+ # Replace the existing vae_loss function:
470
+
471
def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
    """Masked token cross-entropy plus beta-weighted KL(q(z|x) || N(0, I)).

    Sequences of different lengths are right-padded to a common length first.
    Returns (total, ce, kl); beta scales the KL term in the total only.
    """
    # Right-pad whichever of (logits, targets) is shorter.
    seq_len = max(logits.size(1), targets.size(1))
    missing_logit_steps = seq_len - logits.size(1)
    if missing_logit_steps > 0:
        logits = F.pad(logits, (0, 0, 0, missing_logit_steps))
    missing_target_steps = seq_len - targets.size(1)
    if missing_target_steps > 0:
        targets = F.pad(targets, (0, missing_target_steps), value=pad_token_id)

    vocab = logits.size(-1)
    flat_logits = logits.view(-1, vocab)   # [B*L, V]
    flat_targets = targets.reshape(-1)     # [B*L]

    # Per-token CE averaged over non-pad positions only.
    per_token_ce = F.cross_entropy(flat_logits, flat_targets, reduction='none')
    keep = (flat_targets != pad_token_id).float()
    ce = (per_token_ce * keep).sum() / (keep.sum() + 1e-8)

    # Analytic KL per sample, then batch mean.
    kl = (-0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)).mean()

    # Beta is applied only when combining, so ce/kl stay raw for logging.
    return ce + beta * kl, ce, kl
496
+
497
+ #
498
+ # Step 2.3 — KLAnnealer (Fixed Bug #5: double increment)
499
+ #
500
+
501
+ import math
502
+
503
class KLAnnealer:
    """Schedules the KL weight (beta) from 0 toward 1 over training steps.

    Supports linear / sigmoid / cosine warmup shapes and an optional cyclical
    schedule (n_cycle > 1) where warmup occupies the first `ratio` of each cycle.
    """

    def __init__(self, total_steps, n_cycle=1, ratio=0.3, mode="linear", per_epoch=False, steps_per_epoch=None):
        self.total_steps = total_steps
        self.n_cycle = n_cycle
        self.ratio = ratio
        self.mode = mode
        self.per_epoch = per_epoch
        self.steps_per_epoch = steps_per_epoch
        self.current_step = 0   # advanced by get_beta(increment=True) or step()
        self.current_epoch = 0  # advanced by epoch_step()

    def get_beta(self, increment=True):
        """Return the current beta; advances the schedule unless increment=False
        (pass False during validation so eval batches don't move the schedule)."""
        if increment:
            self.current_step += 1

        overall = min(self.current_step / max(self.total_steps, 1.0), 1.0)

        if self.n_cycle > 1:
            # Cyclical: position within the current cycle, normalised by the
            # warmup portion of that cycle.
            cycle_len = self.total_steps / self.n_cycle
            within = self.current_step % cycle_len
            frac = min(within / max(cycle_len * self.ratio, 1.0), 1.0)
        else:
            # Single cycle: warmup spans the first `ratio` of all training.
            frac = min(overall / self.ratio, 1.0) if self.ratio > 0 else 1.0

        if self.mode == "linear":
            beta = min(frac, 1.0)
        elif self.mode == "sigmoid":
            # Smooth S-curve centred at the warmup midpoint (k = 6 slope).
            beta = 1 / (1 + math.exp(-6 * (frac - 0.5)))
        elif self.mode == "cosine":
            # Half-cosine rise from 0 to 1.
            beta = 0.5 * (1 + math.cos(math.pi * (1 - frac)))
        else:
            raise ValueError(f"Unknown mode: {self.mode}")

        return min(beta, 1.0)

    def step(self):
        """Advance the step counter without computing beta."""
        self.current_step += 1

    def epoch_step(self):
        """Advance the epoch counter."""
        self.current_epoch += 1
555
+
556
+ #
557
+ # Teacher forcing ratio
558
+ #
559
+
560
def get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3):
    """
    Linear decay of teacher forcing ratio (TFR).
    - Starts at 1.0
    - Decays to min_tfr by (warmup_fraction * num_epochs)
    - Then stays flat
    """
    decay_epochs = int(num_epochs * warmup_fraction)
    if epoch >= decay_epochs:
        return min_tfr
    # Linear interpolation between 1.0 (epoch 0) and min_tfr (end of warmup).
    return 1.0 - (1.0 - min_tfr) * (epoch / decay_epochs)
573
+
574
+
575
+ #
576
+ # Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
577
+ #
578
+
579
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize a batch of SMILES and pad to a common length.

    Args:
        batch: list of SMILES strings.
        tokenizer: wrapper exposing ``.encode`` and ``.tokenizer.pad_token_id``.
        max_length: hard cap on sequence length (longer encodings truncated).

    Returns:
        (input_ids, lengths): LongTensors of shape [B, L] and [B]. ``lengths``
        holds the UNPADDED token counts, as required by pack_padded_sequence.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # FIXED: dynamic (Bug #2)

    padded = []
    lengths = []
    for ids in input_ids:
        # FIXED: record the true (pre-padding) length, capped at max_length.
        # Previously the length was taken AFTER padding, so every sample
        # reported length == max_len and the encoder consumed pad tokens.
        true_len = min(len(ids), max_length)
        if len(ids) > max_length:
            ids = ids[:max_length]
        else:
            ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
        lengths.append(true_len)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
598
+
599
+ #
600
+ # Step 2.5 — Dataset & DataLoader
601
+ #
602
+
603
class SmilesDataset(Dataset):
    """Minimal Dataset view over a list of SMILES strings (strings returned
    raw; tokenization happens in the collate function)."""

    def __init__(self, smiles_list):
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
610
+
611
+ #
612
#
# Step 3.x — Training Loop (PATCHED: per-tokenizer annealer, exponential TFR, device-safe eval, KL beta logging clarity)
#

LEARNING_RATE = 1e-5       # base learning rate passed to the optimizer
BATCH_SIZE = 16            # per-step batch size (also used when sampling)
ACCUMULATION_STEPS = 4     # gradient accumulation -> effective batch of 64
NUM_EPOCHS = 5
MAX_SEQ_LEN = 128          # token-length cap enforced by collate_fn
KL_ANNEAL_RATIO = 0.3      # fraction of training devoted to KL warmup
621
+
622
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    kl_annealer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default"
):
    """Train the VAE with gradient accumulation and KL annealing.

    Writes per-epoch metrics to a CSV log, saves the best checkpoint by
    validation loss under ``save_dir``, and returns the best val loss.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")

    # Fresh CSV header each run (overwrites any previous log).
    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)

            # ← PATCHED: exponential decay per epoch (not per batch, but smoother than linear)
            tfr = get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3)

            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)
            beta = kl_annealer.get_beta(increment=True)
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale for accumulation; un-scaled value is what gets logged.
            loss = loss / accumulation_steps
            loss.backward()

            total_train_loss += loss.item() * accumulation_steps
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Flush any leftover accumulated gradients from a partial window.
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        # ✅ CAPTURE BETA AFTER TRAINING — BEFORE VALIDATION
        # This ensures we log the beta that was actually used during training
        current_beta = kl_annealer.get_beta(increment=False)

        # Validation — DO NOT query beta again here
        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # Use captured beta — DO NOT call kl_annealer again here
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)

                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # Log one CSV row per epoch; step is cumulative optimizer-agnostic count.
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")  # ← Now explicitly the training beta

        # Keep only the best checkpoint (by validation loss).
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss
722
+
723
#
# TRAINING LOOP OVER TOKENIZERS (PATCHED: KLAnnealer reset per tokenizer)
#

for tokenizer in TOKENIZERS:
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")

    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id

    # Validate token IDs
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"

    # Fresh model per tokenizer (vocab-size dependent layers).
    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    ########################################################################
    # 1. CREATE A FRESH annealer FOR EVERY TOKENIZER
    ########################################################################

    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        num_batches_per_epoch=len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS),
        warmdown_active=False,
    )

    train_dataset = SmilesDataset(train_smiles)
    val_dataset = SmilesDataset(val_smiles)

    # NOTE(review): the collate lambdas close over the loop variable
    # `tokenizer`; safe here because each loader is fully consumed within
    # this loop iteration before `tokenizer` rebinds.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    # One annealer step per batch (not per optimizer update).
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * NUM_EPOCHS
    # total_steps = (len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS)) * NUM_EPOCHS
    kl_annealer = KLAnnealer(
        total_steps=total_steps,
        n_cycle=1,  # REDUCED: 2 cycles instead of 4 for longer warmup per cycle
        ratio=0.6,  # INCREASED: 60% of each cycle is warmup (was 25%)
        mode="linear",  # CHANGED: Linear is more predictable than sigmoid
        per_epoch=False
    )

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        kl_annealer=kl_annealer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name
    )
808
+
809
+ #
810
+ # Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
811
+ #
812
+
813
def canonicalize_smiles(smiles):
    """Return the RDKit canonical (isomeric) SMILES, or None if unparsable."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=True)
818
+
819
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """Greedy-decode reconstructions for every batch and score token accuracy,
    exact string match, and chemical validity (via RDKit parse)."""
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []

    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the id list at the first special token (pad/eos).
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)

            # Reconstruct deterministically from the (possibly sampled) latent.
            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            logits = model.decode(z, max_length=128, mode="greedy")  # FIXED #7 for reconstruction
            preds = logits.argmax(dim=-1)

            # FIXED: Align logits and targets to same sequence length
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]  # trim predictions
            input_ids_eval = input_ids[:, :min_len]  # trim targets

            # Token accuracy counts only non-pad target positions.
            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()

            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()

                # FIXED BUG #6: Trim before decode
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)

                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)

                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)

                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1

    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    exact_match_rate = exact_matches / total_samples
    validity_rate = valid_count / total_samples

    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")

    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
889
+
890
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Report the distinct fraction and the fraction absent from training.

    Novelty counts every generated occurrence (duplicates included) that is
    not present in ``train_smiles_set``. Returns (uniqueness, novelty).
    """
    total = len(generated_smiles)
    unique = len(set(generated_smiles))
    novel = sum(1 for s in generated_smiles if s not in train_smiles_set)
    uniqueness = unique / total if total > 0 else 0.0
    novelty = novel / total if total > 0 else 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({unique}/{total})")
    print(f"Novelty: {novelty:.4f} ({novel}/not in train)")
    return uniqueness, novelty
899
+
900
def kl_divergence_from_samples(samples, bins=512):
    """Mean per-dimension KL(hist(samples[:, d]) || N(0, 1)).

    Each latent dimension is histogrammed (density-normalised) and compared
    against the standard-normal pdf evaluated at the bin centres.
    """
    per_dim = []
    for d in range(samples.shape[1]):
        col = samples[:, d]
        hist, edges = np.histogram(col, bins=bins, density=True)
        centers = (edges[:-1] + edges[1:]) / 2
        # Standard-normal density at each bin centre.
        ref = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * centers**2)
        # Clip both densities away from zero so entropy() stays finite.
        kl = entropy(np.clip(hist, 1e-10, None), np.clip(ref, 1e-10, None))
        per_dim.append(kl)
    return np.mean(per_dim)
912
+
913
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Encode every batch, sample latents, and measure their empirical KL
    against the standard normal prior via kl_divergence_from_samples."""
    model.eval()
    latents = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            latents.append(model.reparameterize(mu, logvar).cpu().numpy())
    stacked = np.concatenate(latents, axis=0)
    kl_div = kl_divergence_from_samples(stacked, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
926
+
927
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Decode latent interpolations between random test pairs; return the
    fraction of decoded SMILES that RDKit can parse."""
    model.eval()
    # Pair up even/odd-indexed test molecules and sample num_pairs of them.
    pairs = random.sample(list(zip(test_smiles[::2], test_smiles[1::2])), min(num_pairs, len(test_smiles)//2))
    valid_interps = total_interps = 0

    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b: continue

            enc_a = tokenizer.encode(smiles_a, add_special_tokens=True)
            enc_b = tokenizer.encode(smiles_b, add_special_tokens=True)

            ids_a = torch.tensor([enc_a['input_ids']], device=device)
            ids_b = torch.tensor([enc_b['input_ids']], device=device)
            len_a = torch.tensor([len(enc_a['input_ids'])], device=device)
            len_b = torch.tensor([len(enc_b['input_ids'])], device=device)

            # Interpolate between posterior MEANS (no sampling noise).
            mu_a, _ = model.encode(ids_a, len_a)
            mu_b, _ = model.encode(ids_b, len_b)

            alphas = torch.linspace(0, 1, steps, device=device)
            for alpha in alphas:
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Ensure z_interp maintains batch dimension [1, latent_dim]
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)

                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                preds = logits.argmax(dim=-1)
                # Handle batch dimension properly
                if preds.dim() > 1:
                    preds = preds[0]  # Take first (and only) batch item
                pred_smiles = tokenizer.decode(preds.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_interps += 1
                total_interps += 1

    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
967
+
968
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
    """Draw z ~ N(0, I) in batches of BATCH_SIZE and decode to SMILES strings.

    NOTE(review): the default ``device=device`` binds the module-level global
    at definition time — confirm ``device`` is defined before this function.
    """
    model.eval()
    generated_smiles = []
    with torch.no_grad():
        for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
            current_batch_size = min(BATCH_SIZE, num_samples - len(generated_smiles))
            if current_batch_size <= 0: break
            z = torch.randn(current_batch_size, latent_dim, device=device)
            logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
            # NOTE(review): argmax over logits from a "sample"-mode rollout —
            # randomness came from multinomial feeding, not from this readout.
            preds = logits.argmax(dim=-1)
            for i in range(current_batch_size):
                pred_ids = preds[i].cpu().tolist()
                smiles = tokenizer.decode(pred_ids, skip_special_tokens=True)
                generated_smiles.append(smiles)
                if len(generated_smiles) >= num_samples: break
    return generated_smiles
984
+
985
def measure_inference_throughput(model, tokenizer, test_smiles, device,
                                 max_length=128,
                                 batch_sizes=[1, 4, 8, 16]):
    """
    Benchmark inference speed & peak GPU memory across several batch sizes.
    Returns a JSON-serialisable dict:
        {batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}

    NOTE(review): ``batch_sizes`` is a mutable default argument; harmless here
    since it is never mutated, but worth replacing with a tuple.
    """
    model.eval()
    results = {}

    for bs in batch_sizes:
        # Build a small fixed subset so every BS processes the same #samples
        subset = SmilesDataset(test_smiles[:bs * 10])
        loader = DataLoader(
            subset,
            batch_size=bs,
            shuffle=False,
            num_workers=0,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
        )

        total_tokens = 0
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(device)

        start_time = time.perf_counter()
        with torch.no_grad():
            for input_ids, lengths in loader:
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                mu, logvar = model.encode(input_ids, lengths)
                z = model.reparameterize(mu, logvar)
                logits = model.decode(z, max_length=max_length)
                total_tokens += logits.numel()  # number of float elements
        duration = time.perf_counter() - start_time

        tokens_per_sec = total_tokens / duration
        peak_mem_mb = (
            torch.cuda.max_memory_allocated(device) / (1024 ** 2)
            if torch.cuda.is_available()
            else 0.0
        )

        # Store as plain Python floats
        results[bs] = {
            "tokens_per_sec": float(tokens_per_sec),
            "peak_mem_mb": float(peak_mem_mb),
        }
        print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")

    return results
1036
+
1037
+ #
1038
+ # FINAL EVALUATION PIPELINE
1039
+ #
1040
+
1041
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
    """Run every evaluation stage on a trained model and dump the results to
    ``<save_dir>/evaluation_results.json``."""
    print(f"\n FULL EVALUATION FOR: {tokenizer.name}")

    test_dataset = SmilesDataset(test_smiles)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
                             num_workers=0)

    # 1. Reconstruction
    recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)

    # 2. Uniqueness & Novelty
    train_set = set(train_smiles)
    uniqueness, novelty = compute_uniqueness_and_novelty(recon_metrics['generated_smiles'], train_set)

    # 3. KL Divergence
    kl_div = evaluate_latent_kl(model, test_loader, device)

    # 4. Interpolation Validity
    interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)

    # 5. Latent Sampling (for FCD — optional)
    # gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8)  # reduce for speed
    # fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None

    # 6. Throughput & Memory
    # throughput = measure_inference_throughput(model, tokenizer, test_loader, device)

    eval_results = {
        **recon_metrics,
        'uniqueness': uniqueness,
        'novelty': novelty,
        'kl_divergence': kl_div,
        'interpolation_validity': interp_validity,
        # 'fcd': fcd_score,
        # 'inference_throughput': throughput,
    }

    # default=str keeps numpy scalars / odd types serialisable.
    eval_path = os.path.join(save_dir, "evaluation_results.json")
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2, default=str)

    print(f" Evaluation saved to {eval_path}")
    return eval_results
1085
+
1086
#
# RUN EVALUATION FOR EACH TOKENIZER
#

# Reload the best checkpoint per tokenizer and run the full evaluation suite.
for tokenizer in TOKENIZERS:
    print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
    checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
    if not os.path.exists(checkpoint_path):
        print(f"⚠️ Checkpoint not found: {checkpoint_path}")
        continue

    # Rebuild the architecture with the same vocab/special ids before loading.
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    model = MoleculeVAE(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    full_evaluation_pipeline(
        model=model,
        tokenizer=tokenizer,
        train_smiles=train_smiles,
        test_smiles=test_smiles,
        device=device,
        save_dir=f"./checkpoints/{tokenizer.name}"
    )

print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
benchmark/benchmark_HF_simpler.py ADDED
@@ -0,0 +1,895 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Molecule Tokenizer Benchmark & VAE Training Pipeline
3
+ # PATCHED VERSION — Updated for FastChemTokenizerHF (HF compatible)
4
+ # PATCHED: Simplified KL annealing (linear warmup), updated TFR schedule, updated training loop
5
+ #
6
+ #
7
+ # Step 1.1 — Imports & Reproducibility
8
+ #
9
+ import os
10
+ import time
11
+ import random
12
+ import pandas as pd
13
+ from pathlib import Path
14
+ from datetime import datetime
15
+ import torch
16
+ import numpy as np
17
+ # Tokenizers
18
+ from transformers import AutoTokenizer
19
+ from FastChemTokenizerHF import FastChemTokenizer
20
+ # Optional: for progress bars
21
+ from tqdm import tqdm
22
+ from rdkit import Chem
23
+ from sklearn.model_selection import train_test_split
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+ from ranger21 import Ranger21
27
+ from torch.utils.data import DataLoader, Dataset
28
+ from scipy.stats import entropy
29
+ import json
30
+ import math
31
+ from typing import Optional, Tuple, Union
32
+ from rdkit import RDLogger
33
+ RDLogger.DisableLog('rdApp.*')
34
+ # Set seeds for reproducibility
35
def set_seed(seed=42):
    """Seed every RNG source used by the pipeline for reproducible runs.

    Covers Python's hash seed, the stdlib ``random`` module, NumPy, and
    torch (CPU and all CUDA devices), and pins cuDNN to its deterministic,
    non-benchmarking kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN: no autotuned (run-dependent) kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
43
set_seed(42)
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
#
# Step 1.2 — Load & Preprocess SMILES Corpus
#
data_path = "./data/chunk_1smi.csv"
df = pd.read_csv(data_path)
# Replace df with a 10% sample for prototyping
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)
print(f"Prototype size: {len(df)} rows")
if 'SMILES' not in df.columns:
    raise ValueError("Expected column 'SMILES' in CSV")
smiles_list = df['SMILES'].dropna().tolist()
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")
# Validate with RDKit


def is_valid_smiles(smiles):
    """Return True if RDKit can parse the SMILES string."""
    return Chem.MolFromSmiles(smiles) is not None


print("Validating SMILES with RDKit...")
# Keep only molecules RDKit accepts; invalid strings would break evaluation.
valid_mask = [is_valid_smiles(s) for s in tqdm(smiles_list)]
smiles_list = [s for s, valid in zip(smiles_list, valid_mask) if valid]
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
#
# Step 1.3 — Train/Val/Test Split (80/10/10)
#
# Two-stage split: 80% train, then the remaining 20% halved into val/test.
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)
print(f"Train: {len(train_smiles)}")
print(f"Val: {len(val_smiles)}")
print(f"Test: {len(test_smiles)}")
# Cache splits to disk (one SMILES per line) so runs can be reproduced.
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
for split_name, smiles in splits.items():
    with open(f"./data/{split_name}_smiles.txt", "w") as f:
        f.write("\n".join(smiles))
79
+ #
80
+ # Step 1.4 — Tokenizer Wrapper (Simplified for HF compatibility)
81
+ #
82
class TokenizerWrapper:
    """Thin adapter giving HF and FastChem tokenizers one common interface.

    On construction it tries to register the standard special tokens on the
    wrapped tokenizer (when supported); afterwards it forwards encoding,
    decoding, vocab queries and special-token-id lookups.
    """

    def __init__(self, tokenizer, name,
                 bos_token="<s>", eos_token="</s>",
                 pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name
        special_map = {
            "bos_token": bos_token,
            "eos_token": eos_token,
            "pad_token": pad_token,
            "unk_token": unk_token,
        }
        # Only register when the tokenizer actually exposes a callable hook.
        register = getattr(tokenizer, "add_special_tokens", None)
        if callable(register):
            try:
                register(special_map)
            except NotImplementedError:
                # FastChemTokenizerHF already defines these tokens internally.
                pass

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize one SMILES string; returns the tokenizer's plain encoding."""
        return self.tokenizer(
            smiles,
            add_special_tokens=add_special_tokens,
            return_attention_mask=False,
            return_tensors=None,
        )

    def decode(self, token_ids, skip_special_tokens=True):
        """Map a list of token ids back to a SMILES string."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def __len__(self):
        return len(self.tokenizer)

    def get_vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
125
+ #
126
+ # Step 1.5 — Initialize Tokenizers
127
+ #
128
# Instantiate the two competing tokenizers behind the common wrapper API.
# ChemBERTa comes from the HF hub; FastChemTokenizer loads a local vocab.
tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tok2_fast = FastChemTokenizer.from_pretrained("../smitok")
tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizerHF", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
# Everything downstream (benchmark, training, evaluation) iterates this list.
TOKENIZERS = [tokenizer1, tokenizer2]
133
+ #
134
+ # Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
135
+ #
136
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
    """Benchmark one tokenizer over (at most 10k of) ``smiles_sample``.

    Measures vocabulary size, average tokens per molecule, chars-per-token
    compression ratio, %UNK tokens, and encode throughput; unless
    ``encode_only`` also measures decode throughput and round-trip
    (decode(encode(s)) == s) accuracy.

    Args:
        tokenizer: TokenizerWrapper under test.
        smiles_sample: list of SMILES strings.
        encode_only: skip the decode half when True.

    Returns:
        dict of metric name -> value.
    """
    # FIXED: the old header comment claimed a divide-by-zero epsilon that was
    # never actually applied; guard the throughput/ratio divisions for real.
    eps = 1e-12
    V = len(tokenizer)
    sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample
    encode_times, token_counts, char_counts = [], [], []
    unk_counts, total_tokens = 0, 0
    # FIXED: hoist the UNK-id lookup out of the per-molecule loop.
    unk_id = tokenizer.tokenizer.unk_token_id
    for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
        char_counts.append(len(smiles))
        start = time.perf_counter()
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        end = time.perf_counter()
        encode_times.append(end - start)
        input_ids = enc['input_ids']
        token_counts.append(len(input_ids))
        total_tokens += len(input_ids)
        unk_counts += input_ids.count(unk_id)
    # FIXED: np.mean([]) would emit a warning and yield NaN on an empty sample.
    L_bar = float(np.mean(token_counts)) if token_counts else 0.0
    C = float(np.mean(char_counts)) / L_bar if L_bar > 0 else 0.0
    U = unk_counts / total_tokens if total_tokens > 0 else 0.0
    Tenc = len(sample) / max(sum(encode_times), eps)
    metrics = {
        'vocab_size': V,
        'avg_tokens_per_mol': L_bar,
        'compression_ratio': C,
        'percent_unknown': U * 100,
        'encode_throughput_smiles_per_sec': Tenc,
    }
    if encode_only:
        return metrics
    decode_times, reconstruction_ok = [], 0
    for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = enc['input_ids']
        start = time.perf_counter()
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        end = time.perf_counter()
        decode_times.append(end - start)
        if decoded == smiles:
            reconstruction_ok += 1
    Tdec = len(sample) / max(sum(decode_times), eps)
    recon_acc = reconstruction_ok / len(sample) if sample else 0.0
    metrics.update({
        'decode_throughput_smiles_per_sec': Tdec,
        'decode_reconstruction_accuracy': recon_acc * 100,
    })
    return metrics
182
+ #
183
+ # Step 1.7 — Run Benchmark
184
+ #
185
# Run the tokenizer benchmark over the training split and persist a CSV.
benchmark_sample = train_smiles
results = []
for tokenizer in TOKENIZERS:
    print(f"\n=== Benchmarking {tokenizer.name} ===")
    metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
    metrics['tokenizer'] = tokenizer.name
    results.append(metrics)
    # Pretty-print every metric; floats get 4 decimal places.
    for k, v in metrics.items():
        if k != 'tokenizer':
            print(f"{k:35s}: {v:.4f}" if isinstance(v, float) else f"{k:35s}: {v}")
df_results = pd.DataFrame(results)
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
198
+ #
199
+ # Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
200
+ #
201
+ import torch
202
+ import torch.nn as nn
203
+ import torch.nn.functional as F
204
+ from typing import Optional, Tuple, Union
205
+ import torch
206
+ import torch.nn as nn
207
+ import torch.nn.functional as F
208
+ from typing import Tuple, Optional
209
class MoleculeVAE(nn.Module):
    """
    Optimized MoleculeVAE with:
    - Bidirectional encoder (restored)
    - Proper latent2hidden + latent2cell (restored)
    - Adjustable dropout for small dataset
    - Attention pooling option
    - Quantization-ready hooks

    Pipeline: token embedding -> bi-LSTM encoder -> (attention or masked-mean)
    pooling -> LayerNorm -> mu/logvar heads -> reparameterized z -> two linear
    maps that seed the decoder LSTM's (h0, c0) -> per-step token logits.
    """
    def __init__(self,
                 vocab_size: int,
                 embed_dim: int = 64,
                 hidden_dim: int = 128,
                 latent_dim: int = 64,
                 num_layers: int = 2,
                 pad_token_id: int = 0,
                 bos_token_id: int = 1,
                 eos_token_id: int = 2,
                 dropout: float = 0.2,
                 use_attention: bool = True,
                 quantize_ready: bool = False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.use_attention = use_attention
        # Shared embedding (used by both encoder and decoder)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        # Bidirectional encoder
        self.encoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )
        # Attention pooling (optional); operates on the 2*hidden bi-LSTM output
        if use_attention:
            self.attention = nn.MultiheadAttention(
                hidden_dim * 2, num_heads=4, dropout=dropout, batch_first=True
            )
            self.attention_linear = nn.Linear(hidden_dim * 2, 1)
        self.encoder_norm = nn.LayerNorm(hidden_dim * 2)
        # Latent bottleneck
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)
        # Decoder init (restored): z seeds every decoder layer's h0 and c0
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)
        # Decoder (unidirectional)
        self.decoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0
        )
        self.decoder_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        # Weight tying — only possible when dims match; with the defaults
        # (embed_dim=64, hidden_dim=128) this branch is inactive.
        if embed_dim == hidden_dim:
            self.fc_out.weight = self.embedding.weight
        self.dropout = nn.Dropout(dropout)
        # Quantization stubs (Identity no-ops unless quantize_ready=True)
        if quantize_ready:
            self.quant = torch.quantization.QuantStub()
            self.dequant = torch.quantization.DeQuantStub()
        else:
            self.quant = self.dequant = nn.Identity()
        self._init_weights()

    def _init_weights(self):
        """Xavier-init all matrices, small-normal vectors, zero biases.

        NOTE(review): this also re-initializes the embedding matrix, which
        overwrites the zero row nn.Embedding creates for padding_idx —
        confirm this is intentional.
        """
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.ndim >= 2:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.normal_(param, 0, 0.01)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def _pool_sequence(self, packed_output, lengths):
        """Collapse the packed encoder output [B, T, 2H] to one vector per
        molecule, via learned attention weights or a length-masked mean."""
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        if self.use_attention:
            # Self-attention, then a learned scalar weight per position.
            # NOTE(review): no key_padding_mask is passed, so padded positions
            # participate in attention — confirm acceptable.
            attn_out, _ = self.attention(output, output, output)
            weights = torch.softmax(self.attention_linear(attn_out), dim=1)
            pooled = (weights * output).sum(dim=1)
        else:
            # mean pooling with mask
            batch_size, max_len, _ = output.size()
            mask = torch.arange(max_len, device=output.device).expand(batch_size, max_len) < lengths.unsqueeze(1)
            masked_output = output * mask.unsqueeze(-1).float()
            pooled = masked_output.sum(dim=1) / lengths.unsqueeze(-1).float()
        return pooled

    def encode(self, x: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode token ids [B, T] (true lengths [B]) to (mu, logvar), each
        [B, latent_dim]."""
        x = self.quant(x)
        embedded = self.dropout(self.embedding(x))
        # Packing skips padded timesteps inside the LSTM.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.encoder_lstm(packed)
        h = self._pool_sequence(packed_out, lengths)
        h = self.encoder_norm(h)
        mu, logvar = self.fc_mu(h), self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """Sample z = mu + eps*std during training; return mu at eval time."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def _init_decoder_state(self, z: torch.Tensor):
        """Project z into per-layer initial hidden and cell states
        ([num_layers, B, hidden_dim] each)."""
        batch_size = z.size(0)
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        return h0, c0

    def decode(self, z: torch.Tensor, max_length: int = 64, mode: str = "greedy", temperature: float = 1.0):
        """Autoregressively decode z into token logits.

        Starts from BOS, feeds back greedy/sampled tokens, forces PAD for
        sequences that already emitted EOS, and stops early when every
        sequence is finished. Returns logits [B, T, V]; note T may be less
        than max_length due to the early break. `temperature` only affects
        mode="sample".
        """
        batch_size = z.size(0)
        device = z.device
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        input_ids = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
        logits_list = []
        for _ in range(max_length):
            embedded = self.embedding(input_ids)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)
            if mode == "greedy":
                next_tokens = logit.argmax(dim=-1)
            elif mode == "sample":
                probs = F.softmax(logit.squeeze(1) / temperature, dim=-1)
                next_tokens = torch.multinomial(probs, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")
            just_finished = (next_tokens.squeeze(-1) == self.eos_token_id)
            finished |= just_finished
            # NOTE(review): because `finished` is updated first, the EOS token
            # itself is also replaced by PAD in the fed-back input — confirm
            # this matches the intended stop semantics.
            next_tokens = torch.where(
                finished.unsqueeze(-1),
                torch.tensor(self.pad_token_id, device=device),
                next_tokens
            )
            input_ids = next_tokens
            if finished.all():
                break
        return self.dequant(torch.cat(logits_list, dim=1))

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor,
                target_seq: Optional[torch.Tensor] = None,
                teacher_forcing_ratio: float = 0.0,
                temperature: float = 1.0):
        """Full VAE pass: returns (logits, mu, logvar).

        Training with targets and a positive teacher-forcing ratio uses
        scheduled teacher forcing; otherwise decodes freely (greedy) up to
        the target length (or 64 when no targets are given).
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)
        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            return self._forward_teacher_forcing(z, target_seq, teacher_forcing_ratio), mu, logvar
        else:
            max_len = target_seq.size(1) if target_seq is not None else 64
            return self.decode(z, max_length=max_len, temperature=temperature), mu, logvar

    def _forward_teacher_forcing(self, z: torch.Tensor, target_seq: torch.Tensor, teacher_forcing_ratio: float):
        """Decode with scheduled teacher forcing: per step, feed the ground
        truth with probability teacher_forcing_ratio, else the model's own
        argmax. Produces seq_len-1 steps (targets shifted by one)."""
        batch_size, seq_len = target_seq.size()
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        logits_list = []
        input_token = target_seq[:, 0:1]
        for t in range(1, seq_len):
            embedded = self.embedding(input_token)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)
            # One coin flip per step for the whole batch (not per sample).
            if torch.rand(1).item() < teacher_forcing_ratio:
                input_token = target_seq[:, t:t+1]
            else:
                input_token = logit.argmax(dim=-1)
        return torch.cat(logits_list, dim=1)
383
+
384
+ # ============================
385
+ # Utility: Simple Linear KL Warmup (PATCHED IN)
386
+ # ============================
387
def linear_kl_beta(global_step: int, warmup_steps: int, start: float = 0.0, end: float = 1.0):
    """Linearly ramp the KL weight from `start` to `end` over `warmup_steps`
    optimizer steps, then hold it at `end` forever after.

    A non-positive `warmup_steps` disables the ramp and returns `end`.
    """
    if warmup_steps <= 0:
        return float(end)
    progress = min(1.0, float(global_step) / float(max(1, warmup_steps)))
    return float(start) + (float(end) - float(start)) * progress
393
+
394
+ #
395
+ # Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
396
+ #
397
+ # PATCH 2: Fix VAE Loss Function - Ensure beta is properly applied
398
+ # Replace the existing vae_loss function:
399
+ def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
400
+ # 1. align lengths
401
+ max_len = max(logits.size(1), targets.size(1))
402
+ if logits.size(1) < max_len:
403
+ logits = F.pad(logits, (0, 0, 0, max_len - logits.size(1)))
404
+ if targets.size(1) < max_len:
405
+ targets = F.pad(targets, (0, max_len - targets.size(1)), value=pad_token_id)
406
+ logits_flat = logits.view(-1, logits.size(-1)) # [B*L, V]
407
+ targets_flat = targets.reshape(-1) # [B*L]
408
+ mask = (targets_flat != pad_token_id).float()
409
+ ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
410
+ mask_sum = mask.sum()
411
+ ce_loss = (ce_loss * mask).sum() / (mask_sum + 1e-8)
412
+ # FIXED: Raw KL loss computation
413
+ kl_loss_raw = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)
414
+ # Apply mask to KL loss if needed (but typically KL is per-sample)
415
+ kl_loss = kl_loss_raw.mean()
416
+ # CRITICAL FIX: Apply beta scaling correctly
417
+ total_loss = ce_loss + beta * kl_loss
418
+ return total_loss, ce_loss, kl_loss
419
+
420
+ # ============================
421
+ # Teacher Forcing Ratio Schedule (PATCHED IN)
422
+ # ============================
423
def get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3):
    """Teacher-forcing schedule: hold 1.0 for the first `warmup_fraction` of
    epochs, then decay linearly, bottoming out at `min_tfr`."""
    warmup_epochs = int(num_epochs * warmup_fraction)
    if epoch < warmup_epochs:
        return 1.0
    decay_span = max(1, num_epochs - warmup_epochs)
    progress = (epoch - warmup_epochs) / decay_span
    return max(min_tfr, 1.0 - (1.0 - min_tfr) * progress)
431
+
432
+ # REMOVED: KLAnnealer class (PATCHED OUT)
433
+
434
+ #
435
+ # Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
436
+ #
437
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize a batch of SMILES strings and pad to a common length.

    Args:
        batch: list of SMILES strings.
        tokenizer: TokenizerWrapper (uses .encode and .tokenizer.pad_token_id).
        max_length: hard cap; longer encodings are truncated to it.

    Returns:
        (input_ids, lengths): LongTensors of shapes [B, L] and [B], where
        `lengths` holds the number of REAL (non-pad) tokens per sequence —
        what pack_padded_sequence and mean-pool masking expect.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]
    # Pad to the longest sequence in the batch, capped at max_length.
    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic per tokenizer
    padded, lengths = [], []
    for ids in input_ids:
        # BUGFIX: record the true token count BEFORE padding. The previous
        # version measured len(ids) AFTER padding, so every sequence reported
        # length == max_len and padding leaked into the packed encoder input.
        lengths.append(min(len(ids), max_length))
        ids = ids[:max_length]
        ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
452
+ #
453
+ # Step 2.5 — Dataset & DataLoader
454
+ #
455
class SmilesDataset(Dataset):
    """Minimal Dataset over a list of SMILES strings.

    Items are returned as raw strings; tokenization and padding happen later
    in the DataLoader's collate function.
    """

    def __init__(self, smiles_list):
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
462
+
463
+ # ============================
464
+ # Training Loop (PATCHED: Uses linear_kl_beta)
465
+ # ============================
466
# Training hyperparameters (prototype-scale defaults).
LEARNING_RATE = 1e-5
BATCH_SIZE = 16
ACCUMULATION_STEPS = 4  # effective batch = BATCH_SIZE * ACCUMULATION_STEPS
NUM_EPOCHS = 1
MAX_SEQ_LEN = 128  # token cap enforced by collate_fn
KL_WARMUP_FRAC = 0.1  # PATCHED: New parameter for KL warmup fraction
472
+
473
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default",
    warmup_steps=100,  # PATCHED: New parameter for warmup steps
):
    """Train/validate the VAE with gradient accumulation and linear KL warmup.

    `global_step` advances per OPTIMIZER step (not per batch) and drives
    linear_kl_beta. Each epoch: scheduled-teacher-forcing training, then a
    full validation pass scored at the epoch-final beta. Appends one CSV row
    per epoch to training_log_<tokenizer_name>.csv and checkpoints whenever
    validation loss improves.

    Returns:
        Best (lowest) average validation loss observed.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")
    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')
    global_step = 0  # PATCHED: Initialize global step counter

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            tfr = get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3)

            # Input doubles as the reconstruction target.
            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)

            beta = linear_kl_beta(global_step, warmup_steps)  # PATCHED: Use linear_kl_beta
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale down so accumulated gradients match a full-batch step.
            loss = loss / accumulation_steps
            loss.backward()

            # Undo the scaling for logging purposes.
            total_train_loss += loss.item() * accumulation_steps
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1  # PATCHED: Increment global step

        # Flush any leftover gradients from a partial accumulation window.
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1  # PATCHED: Increment global step

        current_beta = linear_kl_beta(global_step, warmup_steps)  # PATCHED: Get current beta after training

        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)  # PATCHED: Use current_beta
                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # CSV "step" column logs batches seen, not optimizer steps.
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")

        # Checkpoint only on validation improvement.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss  # PATCHED: Return best_val_loss
569
+
570
+ #
571
+ # TRAINING LOOP OVER TOKENIZERS (PATCHED: Uses linear_kl_beta, calculates warmup_steps)
572
+ #
573
# Train one fresh VAE (and fresh optimizer) per tokenizer.
for tokenizer in TOKENIZERS:
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    # Validate token IDs: one-sample sanity check that ids fit the embedding.
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"
    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)
    ########################################################################
    # 1. CREATE A FRESH optimizer FOR EVERY TOKENIZER
    ########################################################################
    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,  # Keep Ranger21's LR warmup as-is
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        # Ranger21 schedules against optimizer steps, hence the effective batch.
        num_batches_per_epoch=len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS),
        warmdown_active=False,
    )
    train_dataset = SmilesDataset(train_smiles)
    val_dataset = SmilesDataset(val_smiles)
    # The lambdas capture the loop's `tokenizer`; safe here because each
    # loader is fully consumed within this same iteration.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * NUM_EPOCHS
    # Calculate warmup steps based on total steps and fraction
    warmup_steps = int(total_steps * KL_WARMUP_FRAC)  # PATCHED: Calculate warmup steps

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name,
        warmup_steps=warmup_steps,  # PATCHED: Pass warmup_steps
    )
637
+
638
+ #
639
+ # Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
640
+ #
641
def canonicalize_smiles(smiles):
    """Return RDKit's canonical isomeric SMILES, or None when unparsable."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=True)
646
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """Score round-trip reconstruction of the VAE on a dataloader.

    Encodes each batch, decodes greedily from z, and reports token-level
    accuracy (pad positions excluded), exact string match rate, and RDKit
    validity rate of the decoded SMILES. Also returns the raw generated and
    target strings for downstream uniqueness/novelty analysis.

    NOTE(review): the `max_length` parameter is currently unused — decode is
    called with a hard-coded 128; confirm whether it should be threaded
    through. Division by total_samples assumes a non-empty dataloader.
    """
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []
    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the id list at the first PAD/EOS so decoding stops at content.
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)
            mu, logvar = model.encode(input_ids, lengths)
            # Eval mode: reparameterize returns mu deterministically.
            z = model.reparameterize(mu, logvar)
            logits = model.decode(z, max_length=128, mode="greedy")  # FIXED #7 for reconstruction
            preds = logits.argmax(dim=-1)
            # FIXED: Align logits and targets to same sequence length
            # (decode may early-stop shorter than the targets).
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]  # trim predictions
            input_ids_eval = input_ids[:, :min_len]  # trim targets
            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()
            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()
                # FIXED BUG #6: Trim before decode
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)
                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)
                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)
                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1
    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    exact_match_rate = exact_matches / total_samples
    validity_rate = valid_count / total_samples
    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")
    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
702
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Fraction of distinct generated SMILES and fraction absent from the
    training set. Prints both and returns (uniqueness, novelty); both are
    0.0 for an empty input list."""
    n_generated = len(generated_smiles)
    n_unique = len(set(generated_smiles))
    n_novel = sum(1 for s in generated_smiles if s not in train_smiles_set)
    if n_generated > 0:
        uniqueness = n_unique / n_generated
        novelty = n_novel / n_generated
    else:
        uniqueness = novelty = 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({n_unique}/{n_generated})")
    print(f"Novelty: {novelty:.4f} ({n_novel}/not in train)")
    return uniqueness, novelty
711
+ def kl_divergence_from_samples(samples, bins=512):
712
+ dim_kls = []
713
+ for d in range(samples.shape[1]):
714
+ data = samples[:, d]
715
+ hist, bin_edges = np.histogram(data, bins=bins, density=True)
716
+ bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
717
+ norm_pdf = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * bin_centers**2)
718
+ hist = np.clip(hist, 1e-10, None)
719
+ norm_pdf = np.clip(norm_pdf, 1e-10, None)
720
+ kl = entropy(hist, norm_pdf)
721
+ dim_kls.append(kl)
722
+ return np.mean(dim_kls)
723
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Encode the whole dataloader and measure how far the aggregate latent
    distribution is from N(0, I) via the per-dimension histogram KL.

    NOTE(review): `latent_dim` is unused — the latent width is taken from the
    encoder output itself; confirm whether the parameter can be dropped.
    """
    model.eval()
    all_z = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            # Eval mode: reparameterize returns mu, so this collects means.
            z = model.reparameterize(mu, logvar)
            all_z.append(z.cpu().numpy())
    all_z = np.concatenate(all_z, axis=0)
    kl_div = kl_divergence_from_samples(all_z, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
736
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Linearly interpolate between the latent means of SMILES pairs and
    report the fraction of decoded intermediates RDKit parses as valid.

    Pairs are formed from consecutive even/odd entries of `test_smiles`, then
    subsampled down to `num_pairs`; each pair contributes `steps` interpolants.
    """
    model.eval()
    pairs = random.sample(list(zip(test_smiles[::2], test_smiles[1::2])), min(num_pairs, len(test_smiles)//2))
    valid_interps = total_interps = 0
    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b: continue
            enc_a = tokenizer.encode(smiles_a, add_special_tokens=True)
            enc_b = tokenizer.encode(smiles_b, add_special_tokens=True)
            ids_a = torch.tensor([enc_a['input_ids']], device=device)
            ids_b = torch.tensor([enc_b['input_ids']], device=device)
            len_a = torch.tensor([len(enc_a['input_ids'])], device=device)
            len_b = torch.tensor([len(enc_b['input_ids'])], device=device)
            # Interpolate between posterior means (logvar ignored).
            mu_a, _ = model.encode(ids_a, len_a)
            mu_b, _ = model.encode(ids_b, len_b)
            alphas = torch.linspace(0, 1, steps, device=device)
            for alpha in alphas:
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Ensure z_interp maintains batch dimension [1, latent_dim]
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)
                # NOTE(review): decode(mode="sample") samples tokens internally
                # but returns per-step logits; argmax below therefore yields the
                # greedy token at each step, which need not match the sampled
                # sequence that conditioned the decoder — confirm intent.
                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                preds = logits.argmax(dim=-1)
                # Handle batch dimension properly
                if preds.dim() > 1:
                    preds = preds[0]  # Take first (and only) batch item
                pred_smiles = tokenizer.decode(preds.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_interps += 1
                total_interps += 1
    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
769
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
    """Draw z ~ N(0, I) and decode batches of SMILES strings.

    Relies on the module-level BATCH_SIZE and (by default) the module-level
    `device`. Returns at most `num_samples` decoded strings; the final batch
    is truncated to hit the target exactly.
    """
    model.eval()
    generated_smiles = []
    with torch.no_grad():
        for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
            current_batch_size = min(BATCH_SIZE, num_samples - len(generated_smiles))
            if current_batch_size <= 0: break
            z = torch.randn(current_batch_size, latent_dim, device=device)
            logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
            # NOTE(review): decode() samples tokens internally but returns the
            # per-step logits; argmax here gives the greedy token at each step,
            # which may differ from the sampled sequence — confirm intent.
            preds = logits.argmax(dim=-1)
            for i in range(current_batch_size):
                pred_ids = preds[i].cpu().tolist()
                smiles = tokenizer.decode(pred_ids, skip_special_tokens=True)
                generated_smiles.append(smiles)
                if len(generated_smiles) >= num_samples: break
    return generated_smiles
785
def measure_inference_throughput(model, tokenizer, test_smiles, device,
                                 max_length=128,
                                 batch_sizes=[1, 4, 8, 16]):
    """
    Benchmark inference speed & peak GPU memory across several batch sizes.
    Returns a JSON-serialisable dict:
        {batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}

    NOTE(review): the mutable default `batch_sizes` is only read here, so it
    is harmless, but a tuple default would be safer.
    """
    model.eval()
    results = {}
    for bs in batch_sizes:
        # Fixed 10 batches per setting (bs * 10 samples), so every batch size
        # runs the same number of forward passes.
        subset = SmilesDataset(test_smiles[:bs * 10])
        loader = DataLoader(
            subset,
            batch_size=bs,
            shuffle=False,
            num_workers=0,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
        )
        total_tokens = 0
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(device)
        start_time = time.perf_counter()
        with torch.no_grad():
            for input_ids, lengths in loader:
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # Full encode → reparameterize → decode round trip.
                mu, logvar = model.encode(input_ids, lengths)
                z = model.reparameterize(mu, logvar)
                logits = model.decode(z, max_length=max_length)
                # NOTE(review): numel() counts every float in the logits,
                # including the vocab dimension, so 'tokens_per_sec' overstates
                # token throughput by a factor of vocab_size — confirm whether
                # the metric should divide by the vocab size.
                total_tokens += logits.numel()  # number of float elements
        duration = time.perf_counter() - start_time
        tokens_per_sec = total_tokens / duration
        peak_mem_mb = (
            torch.cuda.max_memory_allocated(device) / (1024 ** 2)
            if torch.cuda.is_available()
            else 0.0
        )
        # Store as plain Python floats
        results[bs] = {
            "tokens_per_sec": float(tokens_per_sec),
            "peak_mem_mb": float(peak_mem_mb),
        }
        print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")
    return results
830
#
# FINAL EVALUATION PIPELINE
#
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
    """Run the post-training metric suite for one tokenizer/model pair and
    write the results to <save_dir>/evaluation_results.json.

    Metrics: reconstruction (token accuracy, exact match, validity),
    uniqueness & novelty, latent KL vs N(0,1), and interpolation validity.
    """
    print(f"\n FULL EVALUATION FOR: {tokenizer.name}")
    test_dataset = SmilesDataset(test_smiles)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
                             num_workers=0)
    # 1. Reconstruction
    recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)
    # 2. Uniqueness & Novelty — computed on the reconstruction outputs.
    train_set = set(train_smiles)
    uniqueness, novelty = compute_uniqueness_and_novelty(recon_metrics['generated_smiles'], train_set)
    # 3. KL Divergence
    kl_div = evaluate_latent_kl(model, test_loader, device)
    # 4. Interpolation Validity
    interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)
    # 5. Latent Sampling (for FCD — optional)
    # gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8) # reduce for speed
    # fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None
    # 6. Throughput & Memory
    # throughput = measure_inference_throughput(model, tokenizer, test_loader, device)
    eval_results = {
        **recon_metrics,
        'uniqueness': uniqueness,
        'novelty': novelty,
        'kl_divergence': kl_div,
        'interpolation_validity': interp_validity,
        # 'fcd': fcd_score,
        # 'inference_throughput': throughput,
    }
    eval_path = os.path.join(save_dir, "evaluation_results.json")
    # default=str keeps the dump from failing on non-JSON types
    # (e.g. numpy scalars in the metric dict).
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2, default=str)
    print(f" Evaluation saved to {eval_path}")
    return eval_results
867
#
# RUN EVALUATION FOR EACH TOKENIZER
#
# Reload the best checkpoint produced by training for each tokenizer and run
# the full evaluation pipeline on it. Tokenizers without a saved checkpoint
# are skipped with a warning.
for tokenizer in TOKENIZERS:
    print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
    checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
    if not os.path.exists(checkpoint_path):
        print(f"⚠️ Checkpoint not found: {checkpoint_path}")
        continue
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    # Rebuild the architecture with the same token ids used at training time,
    # then load the best weights.
    model = MoleculeVAE(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    full_evaluation_pipeline(
        model=model,
        tokenizer=tokenizer,
        train_smiles=train_smiles,
        test_smiles=test_smiles,
        device=device,
        save_dir=f"./checkpoints/{tokenizer.name}"
    )
print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
benchmark/benchmark_legacy.py ADDED
@@ -0,0 +1,1039 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Molecule Tokenizer Benchmark & VAE Training Pipeline
3
+ # PATCHED VERSION — All 5 critical bugs fixed + KL Beta Logging Clarity
4
+ #
5
+
6
+ #
7
+ # Step 1.1 — Imports & Reproducibility
8
+ #
9
+
10
+ import os
11
+ import time
12
+ import random
13
+ import pandas as pd
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ import torch
17
+ import numpy as np
18
+
19
+ # Tokenizers
20
+ from transformers import AutoTokenizer
21
+ from FastChemTokenizer import FastChemTokenizer # assuming it's in PYTHONPATH
22
+ # Optional: for progress bars
23
+ from tqdm import tqdm
24
+ from rdkit import Chem
25
+ from sklearn.model_selection import train_test_split
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ from ranger21 import Ranger21
29
+ from torch.utils.data import DataLoader, Dataset
30
+ from scipy.stats import entropy
31
+ import json
32
+ import math
33
+
34
+ from rdkit import RDLogger
35
+ RDLogger.DisableLog('rdApp.*')
36
+ # Set seeds for reproducibility
37
def set_seed(seed=42):
    """Seed every RNG in play (torch CPU/CUDA, numpy, random, hashing) and
    force deterministic cuDNN kernels for reproducible runs."""
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all,
                   np.random.seed, random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
45
+
46
set_seed(42)

# Device setup — prefer CUDA when available; all models/tensors below are
# moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
51
+
52
+ #
53
+ # Step 1.2 — Load & Preprocess SMILES Corpus
54
+ #
55
+
56
# Load the SMILES corpus and keep only molecules RDKit can parse.
data_path = "../data/sample_1k_smi_42.csv"
df = pd.read_csv(data_path)

if 'SMILES' not in df.columns:
    raise ValueError("Expected column 'SMILES' in CSV")

smiles_list = df['SMILES'].dropna().tolist()
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")

# Validate with RDKit

def is_valid_smiles(smiles):
    """Return True if RDKit can parse `smiles` into a molecule."""
    return Chem.MolFromSmiles(smiles) is not None

print("Validating SMILES with RDKit...")
valid_mask = [is_valid_smiles(s) for s in tqdm(smiles_list)]
smiles_list = [s for s, valid in zip(smiles_list, valid_mask) if valid]
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
74
+
75
+ #
76
+ # Step 1.3 — Train/Val/Test Split (80/10/10)
77
+ #
78
+
79
# 80/10/10 split: first carve off 20%, then halve it into val and test.
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)

print(f"Train: {len(train_smiles)}")
print(f"Val: {len(val_smiles)}")
print(f"Test: {len(test_smiles)}")

# Cache splits to disk (one SMILES per line) so runs are comparable.
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
for split_name, smiles in splits.items():
    with open(f"../data/{split_name}_smiles.txt", "w") as f:
        f.write("\n".join(smiles))
91
+
92
+ #
93
+ # Step 1.4 — Tokenizer Wrapper (Fixed Bug #2, #3, #6)
94
+ #
95
+
96
class TokenizerWrapper:
    """Uniform facade over a Hugging Face tokenizer or a FastChemTokenizer.

    Normalizes encode/decode, vocab access, and special-token-id lookups so
    the rest of the pipeline never branches on the tokenizer type.
    """

    def __init__(self, tokenizer, name, bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token

        # Register the special tokens with HF-style tokenizers.
        # NOTE(review): this runs for any tokenizer exposing
        # add_special_tokens — confirm it is harmless for FastChemTokenizer.
        if hasattr(tokenizer, 'add_special_tokens'):
            tokenizer.add_special_tokens({
                'bos_token': bos_token,
                'eos_token': eos_token,
                'pad_token': pad_token,
                'unk_token': unk_token
            })

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Encode one SMILES string; returns a dict with 'input_ids'."""
        if isinstance(self.tokenizer, FastChemTokenizer):
            # 1. get ids directly
            ids = self.tokenizer.encode(smiles)  # ← no .tokenize() here
            # 2. add specials ourselves
            if add_special_tokens:
                ids = [self.tokenizer.bos_token_id] + ids + [self.tokenizer.eos_token_id]
            return {'input_ids': ids}
        else:
            # Hugging-Face style tokenizer
            return self.tokenizer(
                smiles,
                add_special_tokens=add_special_tokens,
                return_attention_mask=False,
                return_tensors=None
            )

    def decode(self, token_ids, skip_special_tokens=True):
        """Decode a list of token ids back into a SMILES string."""
        if isinstance(self.tokenizer, FastChemTokenizer):
            # 1. map single ids → tokens
            tokens = [self.tokenizer.id_to_token.get(tid, self.tokenizer.unk_token)
                      for tid in token_ids]
            # 2. drop specials if requested
            if skip_special_tokens:
                specials = {self.tokenizer.bos_token,
                            self.tokenizer.eos_token,
                            self.tokenizer.pad_token,
                            self.tokenizer.unk_token}  # add any others you use
                tokens = [t for t in tokens if t not in specials]
            # 3. detokenise
            if hasattr(self.tokenizer, 'detokenize'):
                return self.tokenizer.detokenize(tokens)
            else:
                # NOTE(review): assumes chemistry tokens concatenate into a
                # valid SMILES without separators — confirm for this vocab.
                return "".join(tokens)  # chemistry tokens are atomic
        else:
            return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def __len__(self):
        # Vocabulary size; used as the model's output dimension.
        if isinstance(self.tokenizer, FastChemTokenizer):
            # FastChemTokenizer uses ._vocab or .vocab depending on version
            return len(getattr(self.tokenizer, 'vocab',
                               getattr(self.tokenizer, '_vocab', self.tokenizer)))
        else:
            return len(self.tokenizer)

    def get_vocab(self):
        """Return the token → id mapping of the wrapped tokenizer."""
        if isinstance(self.tokenizer, FastChemTokenizer):
            return self.tokenizer.vocab
        else:
            return self.tokenizer.get_vocab()

    # Special-token ids are delegated straight to the wrapped tokenizer.

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
179
+
180
+ #
181
+ # Step 1.5 — Initialize Tokenizers
182
+ #
183
+
184
# Two candidates under comparison: a Hugging Face BPE tokenizer (ChemBERTa)
# and the project's FastChemTokenizer, each wrapped in the common facade.
tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tok2_fast = FastChemTokenizer.from_pretrained("../smitok")

tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizer", bos_token="[BOS]", eos_token="[EOS]", pad_token="[PAD]", unk_token="[UNK]")

TOKENIZERS = [tokenizer1, tokenizer2]
191
+
192
+ #
193
+ # Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
194
+ #
195
+
196
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
    """Benchmark a TokenizerWrapper on a sample of SMILES strings.

    Measures vocabulary size, average tokens per molecule, chars-per-token
    compression ratio, %UNK tokens, and encode (optionally decode/round-trip)
    throughput.

    Args:
        tokenizer: TokenizerWrapper under test.
        smiles_sample: list of SMILES strings (capped at 10k for speed).
        encode_only: skip the decode/round-trip pass when True.

    Returns:
        dict mapping metric name -> value.
    """
    V = len(tokenizer)
    sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample

    # FIX: resolve the UNK id once — it is loop-invariant, but was recomputed
    # per molecule (for FastChemTokenizer that meant an extra
    # convert_tokens_to_ids call on every iteration).
    if isinstance(tokenizer.tokenizer, FastChemTokenizer):
        unk_id = tokenizer.tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
    else:
        unk_id = tokenizer.tokenizer.unk_token_id

    encode_times = []
    token_counts = []
    char_counts = []
    unk_counts = 0
    total_tokens = 0

    for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
        char_counts.append(len(smiles))

        # Time only the encode call itself.
        start = time.perf_counter()
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        end = time.perf_counter()
        encode_times.append(end - start)

        input_ids = enc['input_ids']
        token_counts.append(len(input_ids))
        total_tokens += len(input_ids)
        unk_counts += input_ids.count(unk_id)

    avg_tokens = np.mean(token_counts)                  # mean tokens per molecule
    compression = np.mean(char_counts) / avg_tokens     # chars represented per token
    unk_fraction = unk_counts / total_tokens if total_tokens > 0 else 0.0
    encode_throughput = len(sample) / sum(encode_times)

    metrics = {
        'vocab_size': V,
        'avg_tokens_per_mol': avg_tokens,
        'compression_ratio': compression,
        'percent_unknown': unk_fraction * 100,
        'encode_throughput_smiles_per_sec': encode_throughput,
    }

    if encode_only:
        return metrics

    decode_times = []
    reconstruction_ok = 0

    for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = enc['input_ids']

        start = time.perf_counter()
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        end = time.perf_counter()
        decode_times.append(end - start)

        # Round-trip fidelity: exact string match against the input.
        if decoded == smiles:
            reconstruction_ok += 1

    decode_throughput = len(sample) / sum(decode_times)
    recon_acc = reconstruction_ok / len(sample)

    metrics.update({
        'decode_throughput_smiles_per_sec': decode_throughput,
        'decode_reconstruction_accuracy': recon_acc * 100,
    })

    return metrics
265
+
266
+ #
267
+ # Step 1.7 — Run Benchmark
268
+ #
269
+
270
# Benchmark every tokenizer on the training split and persist a CSV summary.
benchmark_sample = train_smiles
results = []

for tokenizer in TOKENIZERS:
    print(f"\n=== Benchmarking {tokenizer.name} ===")
    metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
    metrics['tokenizer'] = tokenizer.name
    results.append(metrics)
    for k, v in metrics.items():
        if k != 'tokenizer':
            print(f"{k:35s}: {v:.4f}" if isinstance(v, float) else f"{k:35s}: {v}")

df_results = pd.DataFrame(results)
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
285
+
286
+ #
287
+ # Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
288
+ #
289
+
290
class MoleculeVAE(nn.Module):
    """LSTM sequence VAE over tokenized SMILES.

    Bidirectional-LSTM encoder → diagonal Gaussian latent (mu, logvar) →
    unidirectional LSTM decoder whose initial hidden/cell states are linear
    projections of the latent vector z.
    """

    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, latent_dim=128, num_layers=2,
                 pad_token_id=0, bos_token_id=1, eos_token_id=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        self.encoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)

        self.decoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Project z to the decoder's initial hidden and cell states (one slab
        # per decoder layer).
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)

        self._init_weights()

    def _init_weights(self):
        # Xavier for linear layers, orthogonal for LSTM weight matrices,
        # zeros for all biases.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.zeros_(param)

    def encode(self, x, lengths):
        """Encode padded token ids (with true lengths) into (mu, logvar)."""
        embedded = self.embedding(x)
        # Pack so the LSTM ignores pad positions; lengths must live on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (hidden, _) = self.encoder_lstm(packed)
        # hidden is (num_layers * 2, B, H); the last two slots are the top
        # layer's forward and backward final states.
        h_forward = hidden[-2]
        h_backward = hidden[-1]
        h = torch.cat([h_forward, h_backward], dim=1)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        # Stochastic sampling only during training; eval returns the mean.
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z, max_length=128, mode="greedy", temperature=1.0):
        """
        Decode latent vector z into a sequence.
        Returns full logits at each step.
        PATCHED: stops generation when EOS is predicted.

        Args:
            z: (batch, latent_dim) latent vectors.
            max_length: maximum number of generation steps.
            mode: "greedy" (argmax) or "sample" (temperature-scaled multinomial).
            temperature: softmax temperature used in "sample" mode.

        Returns:
            (batch, seq_len, vocab) logits; seq_len ≤ max_length because the
            loop exits once every sequence has emitted EOS.
        """
        batch_size = z.size(0)
        device = z.device

        # Initialize hidden states from latent
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        hidden = (h0, c0)

        # Start with BOS token — shape: (batch_size, 1)
        input_token = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        logits = []
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)  # ← TRACK FINISHED SEQS

        for _ in range(max_length):
            embedded = self.embedding(input_token)  # (batch, 1, embed_dim)
            output, hidden = self.decoder_lstm(embedded, hidden)
            logit = self.fc_out(output)  # (batch, 1, vocab)
            logits.append(logit)

            if mode == "greedy":
                input_token = logit.argmax(dim=-1)  # (batch, 1)
            elif mode == "sample":
                probs = torch.softmax(logit.squeeze(1) / temperature, dim=-1)  # (batch, vocab)
                input_token = torch.multinomial(probs, 1)  # (batch, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")

            # ← EARLY STOPPING AT EOS: once a sequence emits EOS it is fed PAD
            # for the remaining steps until every sequence in the batch is done.
            just_finished = (input_token.squeeze(1) == self.eos_token_id)
            finished |= just_finished
            input_token[finished] = self.pad_token_id  # pad finished sequences
            if finished.all():
                break

        return torch.cat(logits, dim=1)  # (batch, seq_len, vocab)

    def forward(self, input_ids, lengths, target_seq=None, teacher_forcing_ratio=0.0, temperature=1.0):
        """Encode → sample z → decode.

        With teacher forcing (training only), each step either feeds the gold
        token or the model's own argmax prediction; otherwise free-running
        greedy decoding is used up to the target length (or 128).
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)

        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            # Training with teacher forcing
            batch_size, seq_len = target_seq.size()
            device = target_seq.device

            # Initialize hidden states
            h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
            c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
            hidden = (h0, c0)

            logits = []
            input_token = target_seq[:, 0].unsqueeze(1)  # BOS

            for t in range(1, seq_len):
                embedded = self.embedding(input_token)
                output, hidden = self.decoder_lstm(embedded, hidden)
                logit = self.fc_out(output)
                logits.append(logit)

                # One coin flip per step for the whole batch.
                use_teacher = torch.rand(1).item() < teacher_forcing_ratio
                if use_teacher:
                    input_token = target_seq[:, t].unsqueeze(1)
                else:
                    input_token = logit.argmax(dim=-1)

            logits = torch.cat(logits, dim=1)
        else:
            # Inference mode
            max_len = target_seq.size(1) if target_seq is not None else 128
            logits = self.decode(z, max_length=max_len, mode="greedy", temperature=temperature)

        return logits, mu, logvar
426
+
427
+ #
428
+ # Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
429
+ #
430
+
431
def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
    """Masked token cross-entropy plus beta-weighted KL(q(z|x) || N(0, I)).

    Pads logits/targets to a common length so free-running decodes (which may
    stop early at EOS) can be scored against full-length targets.

    Args:
        logits: (B, L1, V) decoder outputs.
        targets: (B, L2) gold token ids.
        mu, logvar: (B, latent_dim) posterior parameters.
        pad_token_id: positions equal to this id are excluded from the CE.
        beta: KL weight from the annealing schedule.

    Returns:
        (total, ce, kl) where total = ce + beta * kl; ce and kl are the raw,
        unweighted components for logging.
    """
    # 1. align lengths (extra logit positions get zero vectors; extra target
    # positions get pad, which the mask then excludes)
    max_len = max(logits.size(1), targets.size(1))
    if logits.size(1) < max_len:
        logits = F.pad(logits, (0, 0, 0, max_len - logits.size(1)))
    if targets.size(1) < max_len:
        targets = F.pad(targets, (0, max_len - targets.size(1)), value=pad_token_id)

    logits_flat = logits.view(-1, logits.size(-1))  # [B*L, V]
    targets_flat = targets.reshape(-1)              # [B*L]

    # Token-level CE averaged over non-pad positions only.
    mask = (targets_flat != pad_token_id).float()
    ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
    mask_sum = mask.sum()
    ce_loss = (ce_loss * mask).sum() / (mask_sum + 1e-8)

    # Analytic KL between the diagonal Gaussian posterior and N(0, I).
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1).mean()

    # BUG FIX: `beta` was accepted but never applied anywhere (neither here
    # nor by any caller), which made KL annealing a no-op. Weight the KL term
    # in the total; the raw components are still returned for logging.
    return ce_loss + beta * kl_loss, ce_loss, kl_loss
450
+
451
+ #
452
+ # Step 2.3 — KLAnnealer (Fixed Bug #5: double increment)
453
+ #
454
+
455
class KLAnnealer:
    """Cyclical KL-weight schedule for beta-VAE training.

    Yields a beta in [0, 1] that ramps up over the first `ratio` fraction of
    each cycle ("linear") or follows a sigmoid across the whole cycle
    ("sigmoid"). Once `total_steps` has been exceeded, beta is pinned at 1.0.
    """

    def __init__(self, total_steps, n_cycle=1, ratio=0.3, mode="linear", per_epoch=False, steps_per_epoch=None):
        self.total_steps = total_steps
        self.n_cycle = n_cycle
        self.ratio = ratio
        self.mode = mode
        self.per_epoch = per_epoch
        self.steps_per_epoch = steps_per_epoch
        self.current_step = 0

    def get_beta(self, increment=True):
        """Return the current KL weight.

        Args:
            increment: advance the schedule by one step before reading. Pass
                False when merely observing the value (e.g. during validation).
        """
        if increment:
            self.current_step += 1

        # Past the schedule: KL is fully weighted.
        if self.current_step > self.total_steps:
            return 1.0

        # Normalized position within the current cycle, in [0, 1].
        if self.per_epoch:
            assert self.steps_per_epoch is not None, "steps_per_epoch required if per_epoch=True"
            span = self.steps_per_epoch / self.n_cycle
            position = (self.current_step % self.steps_per_epoch) / span
        else:
            span = self.total_steps / self.n_cycle
            position = (self.current_step % span) / span
        position = min(position, 1.0)

        if self.mode == "linear":
            # Ramp linearly during the warmup fraction, then hold at 1.
            return min(position / self.ratio, 1.0) if position < self.ratio else 1.0
        if self.mode == "sigmoid":
            # Sigmoid centered mid-cycle; steepness k=6 spans roughly [0.05, 0.95].
            return 1 / (1 + math.exp(-6 * (position - 0.5)))
        raise ValueError(f"Unknown mode: {self.mode}")
499
+
500
+ #
501
+ # Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
502
+ #
503
+
504
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize a batch of SMILES, pad to the batch maximum, and return
    (input_ids, lengths) tensors.

    Args:
        batch: list of SMILES strings.
        tokenizer: TokenizerWrapper providing encode() and the pad id.
        max_length: hard cap on sequence length (longer encodings truncated).

    Returns:
        input_ids: (B, max_len) LongTensor, padded with the tokenizer's pad id.
        lengths: (B,) LongTensor of TRUE (pre-padding) token counts, capped at
            max_length — as required by pack_padded_sequence in the encoder.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic per tokenizer

    padded = []
    lengths = []
    for ids in input_ids:
        # BUG FIX: record the true length BEFORE padding. Previously the
        # length was taken after padding, so every entry equaled the padded
        # length and pack_padded_sequence treated PAD tokens as real input.
        true_len = min(len(ids), max_length)
        ids = ids[:max_length]
        ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
        lengths.append(true_len)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
523
+
524
+ #
525
+ # Step 2.5 — Dataset & DataLoader
526
+ #
527
+
528
class SmilesDataset(Dataset):
    """Thin Dataset over a list of SMILES strings.

    Items are the raw strings; tokenization and padding happen later in the
    collate function.
    """

    def __init__(self, smiles_list):
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
535
+
536
+ #
537
+ # Step 3.x — Training Loop (PATCHED: per-tokenizer annealer, exponential TFR, device-safe eval, KL beta logging clarity)
538
+ #
539
+
540
# Training hyperparameters.
LEARNING_RATE = 5e-6
BATCH_SIZE = 16          # per-step micro-batch size
ACCUMULATION_STEPS = 4   # effective batch = BATCH_SIZE * ACCUMULATION_STEPS
NUM_EPOCHS = 5
MAX_SEQ_LEN = 128        # token cap per molecule after truncation/padding
# NOTE(review): the training loop below builds its KLAnnealer with ratio=0.25,
# not this constant — confirm which value is intended.
KL_ANNEAL_RATIO = 0.3
546
+
547
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    kl_annealer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default"
):
    """Train the VAE with gradient accumulation, KL annealing, and per-epoch
    validation; checkpoint the best model by validation loss.

    Args:
        model: MoleculeVAE already placed on `device`.
        train_loader / val_loader: DataLoaders yielding (input_ids, lengths).
        optimizer: optimizer over model.parameters().
        kl_annealer: KLAnnealer; advanced exactly once per training batch.
        pad_token_id: id excluded from the reconstruction loss.
        device: torch device for the batches.
        num_epochs: passes over train_loader.
        accumulation_steps: micro-batches accumulated per optimizer step.
        save_dir: directory for the CSV log and best checkpoint.
        tokenizer_name: tag used in log/checkpoint filenames.

    Returns:
        Best validation loss observed across all epochs.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")

    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)

            # Teacher-forcing ratio decays exponentially per epoch:
            # 1.0 at the first epoch down to 0.5 at the last.
            tfr = 1.0 * (0.5 ** (epoch / max(1, num_epochs-1)))

            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)
            beta = kl_annealer.get_beta(increment=True)  # one schedule step per batch
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale down so accumulated gradients average across micro-batches.
            loss = loss / accumulation_steps
            loss.backward()

            total_train_loss += loss.item() * accumulation_steps  # undo scaling for logging
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Flush leftover gradients when the loader size isn't a multiple of
        # accumulation_steps.
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        # Capture the beta in effect at the end of this training epoch without
        # advancing the annealer, so the logged value matches what training used.
        current_beta = kl_annealer.get_beta(increment=False)

        # Validation — the annealer must NOT be advanced here.
        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # Free-running decode (teacher_forcing_ratio=0.0) scored with
                # the captured training beta.
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)

                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # Append one CSV row per epoch.
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")  # the beta actually used in training

        # Keep only the best checkpoint by validation loss.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss
647
+
648
+ #
649
+ # TRAINING LOOP OVER TOKENIZERS (PATCHED: KLAnnealer reset per tokenizer)
650
+ #
651
+
652
for tokenizer in TOKENIZERS:
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")

    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id

    # Sanity-check that encoded IDs fit inside the embedding table.
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"

    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    # A fresh KL annealer per tokenizer so each run restarts its schedule from zero.
    steps_per_epoch = len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS)
    kl_annealer = KLAnnealer(
        total_steps=steps_per_epoch * NUM_EPOCHS,
        n_cycle=4,      # 4 cycles across all epochs → real cyclical
        ratio=0.25,     # 25% of each cycle is warmup
        mode="sigmoid",
        per_epoch=False
    )

    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        num_batches_per_epoch=steps_per_epoch,
        warmdown_active=False,
    )

    # Both loaders share everything except shuffling.
    shared_loader_kwargs = dict(
        batch_size=BATCH_SIZE,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True,
    )
    train_loader = DataLoader(SmilesDataset(train_smiles), shuffle=True, **shared_loader_kwargs)
    val_loader = DataLoader(SmilesDataset(val_smiles), shuffle=False, **shared_loader_kwargs)

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        kl_annealer=kl_annealer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name
    )
728
+
729
+ #
730
+ # Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
731
+ #
732
+
733
def canonicalize_smiles(smiles):
    """Return the RDKit-canonical (isomeric) form of *smiles*, or None if it fails to parse."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, isomericSmiles=True) if mol is not None else None
738
+
739
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """
    Measure how well the VAE reconstructs held-out SMILES.

    Encodes each batch, greedily decodes from the reparameterized latent, and
    compares predictions to the inputs.

    Fixes vs. previous version:
      * ``exact_match_rate`` / ``validity_rate`` now guard against division by
        zero on an empty dataloader (consistent with the ``token_acc`` guard).
      * ``model.decode`` now honors the ``max_length`` parameter instead of a
        hard-coded 128 (default unchanged, so callers see the same behavior).

    Returns a dict with token accuracy, exact-match rate, RDKit validity rate,
    and the generated/target SMILES lists for downstream metrics.
    """
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []

    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the sequence at the first PAD/EOS so decode sees only content tokens.
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)

            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            # Greedy decoding for reconstruction (deterministic).
            logits = model.decode(z, max_length=max_length, mode="greedy")
            preds = logits.argmax(dim=-1)

            # Align predictions and targets to the same sequence length.
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]
            input_ids_eval = input_ids[:, :min_len]

            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()

            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()

                # Trim at the first special token before decoding to text.
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)

                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)

                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)

                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1

    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    exact_match_rate = exact_matches / total_samples if total_samples > 0 else 0.0
    validity_rate = valid_count / total_samples if total_samples > 0 else 0.0

    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")

    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
809
+
810
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Report the fraction of distinct molecules and of molecules absent from the training set."""
    total = len(generated_smiles)
    distinct_count = len(set(generated_smiles))
    novel_count = sum(1 for smi in generated_smiles if smi not in train_smiles_set)
    if total > 0:
        uniqueness = distinct_count / total
        novelty = novel_count / total
    else:
        uniqueness = novelty = 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({distinct_count}/{total})")
    print(f"Novelty: {novelty:.4f} ({novel_count}/not in train)")
    return uniqueness, novelty
819
+
820
def kl_divergence_from_samples(samples, bins=512):
    """Mean per-dimension KL(empirical || N(0,1)) estimated from histograms of *samples*."""

    def _dimension_kl(values):
        # Histogram density of one latent dimension.
        hist, edges = np.histogram(values, bins=bins, density=True)
        centers = (edges[:-1] + edges[1:]) / 2
        # Standard-normal pdf evaluated at the bin centers.
        ref_pdf = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * centers**2)
        # Floor both distributions so entropy() never sees a zero.
        return entropy(np.clip(hist, 1e-10, None), np.clip(ref_pdf, 1e-10, None))

    per_dim = [_dimension_kl(samples[:, d]) for d in range(samples.shape[1])]
    return np.mean(per_dim)
832
+
833
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Sample latents for the whole dataloader and score them against the N(0,1) prior."""
    model.eval()
    collected = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            collected.append(model.reparameterize(mu, logvar).cpu().numpy())
    stacked = np.concatenate(collected, axis=0)
    kl_div = kl_divergence_from_samples(stacked, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
846
+
847
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Fraction of RDKit-valid SMILES decoded along linear interpolations between latent means."""
    model.eval()
    candidate_pairs = list(zip(test_smiles[::2], test_smiles[1::2]))
    pairs = random.sample(candidate_pairs, min(num_pairs, len(test_smiles) // 2))
    valid_interps = total_interps = 0

    def _encode_mu(smiles):
        # Encode one SMILES string and return its posterior mean.
        ids = tokenizer.encode(smiles, add_special_tokens=True)['input_ids']
        id_tensor = torch.tensor([ids], device=device)
        length = torch.tensor([len(ids)], device=device)
        mu, _ = model.encode(id_tensor, length)
        return mu

    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b:
                continue

            mu_a = _encode_mu(smiles_a)
            mu_b = _encode_mu(smiles_b)

            for alpha in torch.linspace(0, 1, steps, device=device):
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Keep a batch dimension of [1, latent_dim] for decode().
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)

                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                token_ids = logits.argmax(dim=-1)
                # Drop the batch dimension before decoding to text.
                if token_ids.dim() > 1:
                    token_ids = token_ids[0]
                candidate = tokenizer.decode(token_ids.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(candidate) is not None:
                    valid_interps += 1
                total_interps += 1

    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
887
+
888
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
    """Decode up to *num_samples* SMILES strings from latent vectors drawn from N(0, I)."""
    model.eval()
    generated_smiles = []
    with torch.no_grad():
        for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
            remaining = num_samples - len(generated_smiles)
            batch = min(BATCH_SIZE, remaining)
            if batch <= 0:
                break
            z = torch.randn(batch, latent_dim, device=device)
            logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
            token_ids = logits.argmax(dim=-1)
            for row in range(batch):
                decoded = tokenizer.decode(token_ids[row].cpu().tolist(), skip_special_tokens=True)
                generated_smiles.append(decoded)
                if len(generated_smiles) >= num_samples:
                    break
    return generated_smiles
904
+
905
def measure_inference_throughput(model, tokenizer, test_smiles, device,
                                 max_length=128,
                                 batch_sizes=(1, 4, 8, 16)):
    """
    Benchmark inference speed & peak GPU memory across several batch sizes.

    Fix: the default for ``batch_sizes`` is now an immutable tuple instead of a
    mutable list (shared mutable default arguments are a classic Python pitfall);
    any iterable of ints still works, so callers are unaffected.

    Returns a JSON-serialisable dict:
        {batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}
    """
    model.eval()
    results = {}

    for bs in batch_sizes:
        # Build a small fixed subset so every BS processes the same #samples
        subset = SmilesDataset(test_smiles[:bs * 10])
        loader = DataLoader(
            subset,
            batch_size=bs,
            shuffle=False,
            num_workers=0,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
        )

        total_tokens = 0
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(device)

        start_time = time.perf_counter()
        with torch.no_grad():
            for input_ids, lengths in loader:
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                mu, logvar = model.encode(input_ids, lengths)
                z = model.reparameterize(mu, logvar)
                logits = model.decode(z, max_length=max_length)
                # NOTE: counts logits elements (seq_len * vocab), not decoded
                # tokens — consistent across batch sizes, so comparisons hold.
                total_tokens += logits.numel()
        duration = time.perf_counter() - start_time

        tokens_per_sec = total_tokens / duration
        peak_mem_mb = (
            torch.cuda.max_memory_allocated(device) / (1024 ** 2)
            if torch.cuda.is_available()
            else 0.0
        )

        # Store as plain Python floats
        results[bs] = {
            "tokens_per_sec": float(tokens_per_sec),
            "peak_mem_mb": float(peak_mem_mb),
        }
        print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")

    return results
956
+
957
+ #
958
+ # FINAL EVALUATION PIPELINE
959
+ #
960
+
961
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
    """Run every evaluation metric for one trained model and persist the results as JSON."""
    print(f"\n FULL EVALUATION FOR: {tokenizer.name}")

    test_loader = DataLoader(
        SmilesDataset(test_smiles),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
    )

    # Reconstruction quality on the held-out set.
    recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)

    # Distribution-level metrics over the reconstructed molecules.
    uniqueness, novelty = compute_uniqueness_and_novelty(
        recon_metrics['generated_smiles'], set(train_smiles))

    # How close the aggregate posterior is to the N(0,1) prior.
    kl_div = evaluate_latent_kl(model, test_loader, device)

    # Chemical validity along latent-space interpolations.
    interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)

    # Optional extras (disabled for speed):
    # gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8)
    # fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None
    # throughput = measure_inference_throughput(model, tokenizer, test_loader, device)

    eval_results = {
        **recon_metrics,
        'uniqueness': uniqueness,
        'novelty': novelty,
        'kl_divergence': kl_div,
        'interpolation_validity': interp_validity,
        # 'fcd': fcd_score,
        # 'inference_throughput': throughput,
    }

    eval_path = os.path.join(save_dir, "evaluation_results.json")
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2, default=str)

    print(f" Evaluation saved to {eval_path}")
    return eval_results
1005
+
1006
+ #
1007
+ # RUN EVALUATION FOR EACH TOKENIZER
1008
+ #
1009
+
1010
for tokenizer in TOKENIZERS:
    print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
    checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
    if not os.path.exists(checkpoint_path):
        print(f"⚠️ Checkpoint not found: {checkpoint_path}")
        continue

    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id

    # Rebuild the architecture, then restore the best weights for this tokenizer.
    model = MoleculeVAE(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    full_evaluation_pipeline(
        model=model,
        tokenizer=tokenizer,
        train_smiles=train_smiles,
        test_smiles=test_smiles,
        device=device,
        save_dir=f"./checkpoints/{tokenizer.name}"
    )

print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
benchmark/data/chunk_1smi.csv ADDED
The diff for this file is too large to render. See raw diff
 
benchmark/data/test_smiles.txt ADDED
@@ -0,0 +1,1628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CN(CCc1ccccc1)C(=O)C=Cc1ccccc1
2
+ OCc1cc(CC2(NCC3CCCCC3)COC2)no1
3
+ COc1ccc(C23CCC(=O)C=C2N(C)CC3)cc1OC
4
+ CC1(C)CC2C(=O)CCC3OC3(C)CCC21
5
+ CCOC(=O)CC(c1c(O)c2ccccc2[nH]c1=O)C(C)C
6
+ CC(OC(=O)Cn1cnc2ccccc2c1=O)C(N)=O
7
+ CC(C)NC(=O)NC1C2COC(O2)C(n2ccnc2)C1O
8
+ O=C(O)CNC(=O)c1cccc(Cl)c1
9
+ Cc1ccc(C(C)CC=CC(C)(C)O)cc1O
10
+ NC1CCN(Cc2ccc(OCc3ccccc3)cc2)CC1
11
+ O=C1N=CN=C2C1=NC(=S)N2C1OC(CO)C(O)C1O
12
+ CC1(C)NCCc2cc(O)c(O)cc21
13
+ CC(=O)c1c(C)cc2c(c1O)C(=O)C=CC2=O
14
+ COC1C(O)OC(C)C(N)C1O
15
+ Nc1cc2nncoc-2c1
16
+ Cc1ccc(Nc2nc(Cl)nc(NC(C)(C)C)n2)cc1
17
+ CC(C)CCC(=O)OCCCc1ccc(O)c(O)c1
18
+ C=C(C)C=CC12CC(C)C3CCC(C)([NH2+][CH2-])C(CCC1C)C32
19
+ C[C@]12Cc3ccccc3C[C@](C)(N1)c1ccccc12
20
+ CCOC(=O)c1ccc(NC(=O)CCCCC2SCC3NC(=O)NC32)cc1
21
+ CCCCCCCCCCCCCCCCCCCC(C)CC
22
+ CC1=C(CCC(C)(O)C(O)CO)C2(C)CCCC(C)(C)C2CC1
23
+ CN(C)CCOc1ccc2c(=O)cc(-c3ccccc3)oc2c1
24
+ COc1cc(O)c2c(c1)Cc1cc(C)cc(O)c1C2=O
25
+ CC1=CCC2CC3C(C)CCC13C2(C)C
26
+ Cc1nccn1C1C2OCC(O2)C(NCc2ccccn2)C1O
27
+ C=C1CCC2OC1C1C(C(C)C)CCC21C
28
+ CC1CC2CCCN2C(CC(=O)CC2N3CCCC3CC(C)C2(O)c2ccccc2)C1(O)c1ccccc1
29
+ CCC(C)Cc1ccc(C(C)O)oc1=O
30
+ Cc1cccc2c1sc1c(C)cccc12
31
+ CC1(C)CC2C=C(C=O)C34CC3(C(=O)OC4O)C2C1
32
+ CCC(C)C(NC(=O)C(N)CCSC)C(=O)NC(CCCN=C(N)N)C(=O)O
33
+ O=C(Oc1ccc2c(c1)OC(=Cc1ccco1)C2=O)c1ccccc1F
34
+ N#CCCCC1CS1
35
+ CC(C)CC(C(=O)NCC1CCCN2CCCCC12)n1cccc1
36
+ c1csc(-c2nnc3n2C2(CCCC2)Cc2ccccc2-3)c1
37
+ C#CC=CC1C(O)CCCC12CCCC(CC=C)N2
38
+ CC(C)C(CCNCc1ccc(N(C)C)cc1)c1ccco1
39
+ NC(=O)C(CCC(=O)O)NC(=O)C1=CC(NC(=O)NC2CCCCC2)C(O)C(O)C1
40
+ O=c1c(O)cccc2ccc(O)c(O)c12
41
+ COC=Cc1cc2ccc(=O)oc2cc1OC
42
+ C#CC=CC(Cl)C(O)C1CC2OC2CC(Br)C(CC)O1
43
+ C=C1CCC2C(C3CC(C4CCCCC4)CC13)C2(CN)CCC
44
+ C=C(C)CC1CCC=C2C(=O)CC(C)(C)C21
45
+ CCCCCCCCCCCCCCCC(C)(C)C
46
+ CN1CCN(CC2OCC(NCc3nccn3C)C2O)CC1
47
+ COc1cc(Cc2cnc(N)nc2N)cc(OCCC(=O)O)c1OC
48
+ CC(=NOCCSC(=N)N)c1ccc(Cl)c(Cl)c1
49
+ CC1=Cc2cc3c(c(O)c2C(C)O1)C(=O)C=C(O)C3=O
50
+ CN1C(=O)Nc2cccc(CN)c2S1(=O)=O
51
+ c1ccc2c(CNCCCNCc3cccc4ccccc34)cccc2c1
52
+ O=P(O)(O)c1ccccc1O
53
+ CC(C)(C)NCC(=O)Nc1c2c(nc3c1CCC3)CCC2
54
+ COC(=O)CC(C)CCC1C(C)=CCC2C(C)(C)C(=O)CCC12C
55
+ CC(C)CCNC1(Cc2cc(-c3cccc(O)c3)on2)COC1
56
+ CCc1c(O)cc(CCC(C)C)oc1=O
57
+ CC(C)C1Oc2cc3oc(=O)ccc3cc2C1=O
58
+ O=NN1CCCc2cc3c(cc21)N(NO)CCC3
59
+ CCNC(=O)Nc1ccc(C(O)C2COCC(=O)N2C)cc1
60
+ CC=CC=Cc1cc(O)cc(O)c1C=O
61
+ O=C(Nc1ccccc1[NH+]([O-])O)C(F)(F)F
62
+ CCOC(=O)C1=C(C)N=C(C)/C(=C(/O)OCC)C1c1cccc(I)c1
63
+ COC1COC2C(NS(=O)(=O)c3cccs3)COC12
64
+ CCCCCCCc1ccc(C#Cc2ccc(OCCCC)cc2)nc1
65
+ COC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(F)cc3)cc12
66
+ Clc1cc(Cl)c(Cl)c(-c2cccc(Cl)c2Cl)c1
67
+ COc1cc(OC)c(C(=O)C=Cc2ccccc2OC)c(OC)c1
68
+ Cc1c(O)cc2c(c1C)OC(C)(CCCC(C)CCCC(C)CCCC(C)C)CC2
69
+ COc1cc(C(O)C(O)c2ccccc2)oc(=O)c1
70
+ COc1ccccc1CC[C@H](O)CC[C@@H]1[C@@H](CCCCCCC(=O)O)[C@@H](O)C[C@H]1O
71
+ CC1=CC(c2ccccc2)CC(=O)O1
72
+ CC1C(O)CCC2(C)CC(=O)C(C(C)(C)O)=CC12
73
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCCCCCCCCCCCCC
74
+ CC(C)=CCc1ccc2[nH]c3c(CC(C)O)c(C)c(O)cc3c2c1
75
+ CCC(C)=CC(=O)OC1C(O)C(C2(C)CO2)CC2(C)C(C)CC(=O)CC12
76
+ CCCCCCCCCCCCCCCC=CCCC=CCCCC(=O)OC
77
+ COc1ccc2c3c([nH]c2c1)-c1cc(C)cc(=O)n1CC3
78
+ CCCCCCC(Br)(Br)C(=O)C(Br)Br
79
+ O=CCCCCCCCC1OC1CCCCCCCC(=O)O
80
+ C1=CC(=Nn2cccc2)C=CC1=Nn1cccc1
81
+ CCCCCC(=O)CCCC(=O)CCCCCC(=O)CCCC(=O)CCCCC
82
+ CC(C)CC1=C(O)N(O)C(CC(C)C)C=N1
83
+ c1ccc2ncncc2c1
84
+ CC#CC#CC#Cc1ccc(-c2ccccc2)s1
85
+ COc1ccc(C2Oc3cc(OC)cc(O)c3C(=O)C2O)cc1
86
+ CCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)C(O)CCCCCCCCCCCCCCCC
87
+ COc1ccc(-c2cn3nccnc3n2)c(OC)c1
88
+ OCC1OC(n2nnc3ccccc32)C(O)C(O)C1O
89
+ CC(C)(C)c1ccc(-c2nc(I)ccc2O)cc1
90
+ C(=NC(N=Cc1ccco1)c1ccco1)c1ccco1
91
+ CC(=O)OCC1(O)CC23CCC4C(C)(C)CCCC4(C)C2CCC1C3
92
+ CC1=CCC(C)(C)C(O)C2CC2(C)C(O)CC1
93
+ CC12CCC3c4ccc(O)cc4CCC3C1CCC2O
94
+ COc1c2c(cc3ccoc13)C=CC(O)O2
95
+ O=C1C2CC(CN3CCC(O)CC23)C2=CC(O)CCN12
96
+ C=C(C(=O)O)C1CC=C2CCCC(C)C2(C)C1
97
+ CC1(C)CCCC1(C)c1cc(C=O)cc(O)c1O
98
+ CCCCCCCCCCCCCCCC1CCNCCCN(C)CCCCNCCCN1
99
+ CN(Cc1ccccc1)Cc1cc(CC2CNCCC2CC(=O)N2CCc3ccccc3C2)no1
100
+ CCCCCCC(=O)/C=C\C=C\C(=O)c1ccc(C(=O)OC)cc1
101
+ CC1=C(C(=O)O)C(c2ccccc2)N(C)C(=O)N1C
102
+ CCC(C)C(O)(CC(=O)O)C(=O)O
103
+ CC(=NO)C(CC(C)C)=NO
104
+ C=C(C)C(CC=C(C)C)Cc1c(O)ccc(C(=O)C=Cc2ccccc2O)c1O
105
+ CCCCCCCCCCCCCC(=O)OC(CO)CO
106
+ CCCCCCCc1cc(=O)c2ccccc2n1C
107
+ O=C(O)C(CCCc1ccccc1)c1ccccc1
108
+ C=C=Cn1nc(C)c2c(C)nc(CCC)n2c1=O
109
+ c1ccc(CNCCCNCCCCCCCNCCCNCc2ccccc2)cc1
110
+ C=C1C(=O)OC2C1CCC(C)C1CCC(OC(C)=O)C12C
111
+ CC=CC#CC#CC=CC=CCCCC
112
+ CCC=CC=CC1(C)OC(CC(CO)OC)=CC1=O
113
+ CN1CCc2nc(N(C)C)cc(N)c2C1
114
+ C#CCOC(=O)C=C(C)C=CCC(C)CCCC(C)C
115
+ COc1ccc(O)c2oc3ccc(O)cc3c(=O)c12
116
+ COC(=O)Cc1c(C)c2ccc(OCc3ccc(C)cc3)cc2oc1=O
117
+ CCC1CN(C(C)=O)CCC1CC(=O)Nc1ccccc1
118
+ CC=CC#CC#CC=CCC(CCOC(C)=O)OC(C)=O
119
+ C=CCOC(=O)COc1ccc2c(=O)c(Oc3ccc(OC)cc3)coc2c1
120
+ O=C(CCc1c[nH]c2ccccc12)NCCn1ccc2ccccc21
121
+ CCCCCC(O)c1cccc(OCc2ccccn2)c1
122
+ CC1(C)CC(CCNC(=O)c2ccccc2C(=O)O)(Cc2ccccc2)CCO1
123
+ Cc1ccc(Br)cc1F
124
+ Cc1c(CC(=O)NC(C)C)c(=O)oc2cc(O)cc(O)c12
125
+ C=CC1(C)C=C2C(=O)OC34CCCC(C)(C)C3C(=O)OC24CC1
126
+ COc1cc(C(Br)=CC=CC=CC=CC=CC=CC=CC=CC(=O)O)ccc1Br
127
+ O=C1CCC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCN1
128
+ CCCCCCCCCCCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCCCC
129
+ COc1ccc(C(=O)CSC(=N)N)cc1[NH+]([O-])O
130
+ CNS(=O)(=O)Cc1ccc2[nH]cc(CCN(C)C)c2c1
131
+ ON=C1C2CCCC1C1(O)CCCCC1C2
132
+ C=C(C)C1CC=C(C)C(=O)C1O
133
+ c1ccc(CNC2CC2)cc1
134
+ C=C(C)C1CC(=O)C2CCC(O)C(C)C2(C)C1
135
+ CC1(C)CC=CC23CCC(C=C12)C3(C)C
136
+ C=C(C(=O)O)C(CCCCCCCCCCCCCCC(C)O)C(=O)O
137
+ CCOc1ccc2[nH]c([S+]([O-])Cc3ccccc3N)nc2c1
138
+ Cc1ccccc1-n1c(=O)[nH]c(O)c(C2NCCc3ccccc32)c1=O
139
+ COc1ccc(C(=O)Nc2ccccc2Cl)cc1OC(C)=O
140
+ C=C1C(=O)OC2CC3(COC(C)=O)C(CC12)C(=C)C1OC(O)C3O1
141
+ c1cnc2c(C3NCCc4c3[nH]c3ccccc43)cccc2c1
142
+ CC(=O)OC1C(C(C)C)C(O)CC(C)=C2CCC(C)(O)C21
143
+ O=C1NCCc2c1[nH]c1ccc([NH+]([O-])O)cc21
144
+ CC(C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
145
+ COc1ccc2c3ccnc4c3n(c2c1O)C(=O)CC4
146
+ C=C(CCC(C(=O)O)C1CCC2C3=CCC4CC(O)CCC4(C)C3CCC21C)C(C)C
147
+ COCCC(=O)Nc1ccc2n(c1=O)CC1CC2CN(C(C)=O)C1
148
+ CNCC(O)c1cc(O)c(O)cc1F
149
+ C=C(C(=O)OC)C(C)O
150
+ CC1=CC(=O)CC2(C)CCC(=O)C=C12
151
+ COc1ccc(-c2cc3ccc(OC(=O)c4ccccc4OC)cc3oc2=O)cc1
152
+ Clc1ccc(Sc2ccc(NC3=NCCN3)cc2)cc1
153
+ C=C(C)C(=O)OC1CC(C(=O)OC)=CCCC(C)=CC2OC(=O)C(=C)C21
154
+ CCCCCCCC1CC=CC(=O)O1
155
+ Cn1c(O)c(C(=O)C(Cl)Cl)c(=O)c2ccccc21
156
+ CCCCCCCCC=CCCCCCCCC(=O)C1=C(O)CCC(O)C1=O
157
+ O=C(NCCCn1ccnc1)c1cccs1
158
+ CCCCC(CC)CC1(CC)C=C(CC)C(CC(=O)O)OO1
159
+ C=c1[nH]c2onnc2c1=C
160
+ CCCCCCC=CCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCC=CCCCCCCCC)OC(=O)CCCCCCCC=CCCCCCCCC
161
+ CC(O)CC(=O)CCCCCCCCCCCCCCCCCCCCCCCC(O)CCOC1OC(CO)C(O)C(O)C1O
162
+ c1coc(Cc2ccc(Cc3ccco3)o2)c1
163
+ O=C(O)CCC(O)C=C(O)C(=O)O
164
+ CCCCCCCCCCCCCC=CC=CC=CC=CC(=O)O
165
+ CC(C)=CCOc1ccc(C=O)cc1
166
+ CC1CC(O)CC(C)(C)C1
167
+ CC=CC#CC#CC(=O)O
168
+ C=Cc1ccc(OC)c(OC)c1
169
+ C=CCCCCCCCCC1CC(CC(COC)OC)C(=O)O1
170
+ COc1cc(Br)cc2c(O)cc(C(=O)O)nc12
171
+ CC(C)(C)OC(=O)NCC1OCC(N)C1O
172
+ CCOC(=O)c1c(C)oc2ccc(OCC(=O)OC(C)(C)C)cc12
173
+ O=C(Cl)ON1C(=O)CCC1=O
174
+ COc1ccccc1CNC1C2COC(O2)C(N(C)CCc2ccccn2)C1O
175
+ S=c1c2ccccc2oc2ccccc12
176
+ CC(C)=CCCC(C=O)=CC=O
177
+ Oc1ccc(C=Cc2c(O)c(O)c3c(c2O)CCCC3)cc1O
178
+ C=CC(C)(CCC=C(C)Cc1cc(C)co1)CC(=O)c1ccc(O)cc1O
179
+ COc1cc(C2(COC(=O)C(C)C)CO2)c(OC(=O)C(C)C)cc1C
180
+ CCCCCC(=O)c1ccc(O)c(C(=O)Nc2ccc(Br)cc2)c1
181
+ COc1ccc(C(=O)C=Cc2cc3ccccc3o2)c(OCc2ccccc2)c1
182
+ CCCCCCC1CC1CCCC(=O)O
183
+ CC(CCCCCCCCCCCCCCCC(O)CC(=O)O)OC1OC(C)C(O)CC1O
184
+ CCC=CCC=CCC=CCCCCCCCC(=O)O[Si](C)(C)C
185
+ O=C(O)CCCCC1C2NC(=O)NC2CS1(=O)=O
186
+ O=C1CC(=Cc2ccc3[nH]ccc3c2)C(=O)N1
187
+ CC12CCCCC1CCC1C2CCC2(C)C(C3=CC(=O)OC3)CCC12
188
+ CCCCCCCCCCCCCCCC(=O)CC(=O)CCC
189
+ O=C(CN1CCCCC1CCO)c1c[nH]c2ccccc12
190
+ CS(=O)(=O)c1ccc(-c2ccccc2-c2ccc(F)c(Cl)c2)cc1
191
+ Cc1nn(-c2ccccc2)c(Cl)c1C=NO
192
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=O
193
+ COc1ccc2nc(CC3CN(C4CCOCC4)CCC3CC(=O)O)[nH]c2c1
194
+ O=P(Cc1ccccc1OCCCCCOc1ccccc1)(c1ccccc1)c1ccccc1
195
+ CC1N=CC23CCC4C(CCC5CC(N)CCC54C)C2CCC13
196
+ CCC(O)CCC=CC#Cc1cccs1
197
+ COC1C(O)COC(O)C1O
198
+ C=C1C(=O)OC(CCCCCCCCCCC)C1C(=O)O
199
+ CCCCCCCCCCCCCCCCCCCC(=O)OCC(O)CO
200
+ CC(=O)OCC(COC(C)=O)=C1C=CC2(C)CC=C(C)CCC3OC3(C)CCC12
201
+ COC(=O)C1C(NC(=O)C2CCCC2)CCN1C(C)=O
202
+ C[Si](C)(C)OC1CSSCC1O[Si](C)(C)C
203
+ Nc1ccn(C2OC(CO)C(O)C2O)c(=O)n1
204
+ C=C(C)C1CC=C(C=NO)CC1
205
+ Nc1ccc(O)c(F)c1
206
+ CN(C)Cc1c(O)ccc2c1OC(=Cc1ccccc1Cl)C2=O
207
+ SCOC(OCS)c1ccccc1
208
+ Nc1cc(C2CC3CCC2N3)cnc1Cl
209
+ O=C(O)c1cn2c(n1)COc1ccccc1-2
210
+ CCCCCCCCCCCCCCCCCCN(CC)c1ccc(C(=O)O)cc1
211
+ N=C(N)c1cccc(OCCNC(=O)c2ccc(C(=O)N3CCCC3)cc2)c1
212
+ CCCCOc1cc(OCCCC)c2c3c(c(=O)oc2c1)CCC3
213
+ COc1ccc2cc([NH+]([O-])O)ccc2c1C(=O)O
214
+ COC(=O)CCC(C)CCCCCCC(C)C
215
+ COC(=O)COCCOCC(=O)OC
216
+ C=C(C)C1(O)CCC(C)=CC1=O
217
+ COc1cc2c(cc1OC)C(=CC(=O)c1ccccn1)NCC2
218
+ CC1=CCCC(C)(O)C=CC(C(C)C)CCC(C)(CO)C(=O)C1
219
+ CCNC=C1C(=O)C=C(C2C(C)C=CC3CCCCC32)OC1=O
220
+ O=C(C(c1ccccc1)n1cnnn1)N1CCCCC1c1cccnc1
221
+ CC(=O)C=Cc1ccc(C)c(C)c1C
222
+ COc1cccc2c1C(O)(CC#N)C(O)=N2
223
+ CC(=O)c1ccc(C)cc1OC(=O)c1ccccc1OC(C)C
224
+ CCn1cc(Br)ccc1=O
225
+ C=C(C)c1oc2ccc(C(=O)COC(C)=O)cc2c1OC
226
+ COc1cc(C=Cc2ccccc2)cc(O)c1O
227
+ CC12CC(CCl)C3c4ccc(O)cc4CCC3C1CCC2O
228
+ C=C(C)C1CC=C2C=C(C(C)C)CCC2(O)C1(C)CCC(=O)O
229
+ OC1CCOC1Cc1ccccc1
230
+ CCNc1ccccc1C(=O)O
231
+ OB(O)c1cccc(-c2ccccc2)c1
232
+ CCCCCCC=CCCC=CCCCCCCCCCCCCCCCCCCCC(=O)O
233
+ CCCCCCCC(C)CCCCC
234
+ CCn1c(=O)[nH]c2cc(C(=O)O)ccc2c1=O
235
+ CC(CCO)CCC1C(C)(O)CCC2C(C)(C)CCCC21C
236
+ COc1ccc2c3ccnc(C)c3n(C)c2c1
237
+ CC1(C)C2CCC(C(=O)O)(C2)C1O
238
+ CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)Nc1ccc(O)cc1
239
+ COC(=O)C1(c2cc3ccccc3[nH]2)COCC=C2CNCCC21
240
+ Cc1cc(O)c(O)c2c(=O)c(O)c(-c3ccc(O)cc3)oc12
241
+ CCCC1CC(O)C(Cl)C(O)(C(Br)Br)O1
242
+ COc1cc2c(cc1O)C(Cc1ccccc1)NCC2
243
+ CCOC(=O)C=C1CCC2C(O)(CCC3C(C)(C)CCCC32C)C1
244
+ O=C(O)Cc1c(O)cccc1O
245
+ CC(C)CCCC(C)CCCC(C)CCCC(C)C(O)CO
246
+ C[C]1[CH][CH][C](N)[NH+](C[C]2[CH][CH][CH][CH][C]2C)[CH]1
247
+ COc1cc(-c2cc3ccccc3o2)c(C)c(O)c1C
248
+ c1ccc(N=Nc2ccccc2N=Nc2ccccc2)cc1
249
+ N=C(N)NCCCC(NC(CCC(=O)O)C(=O)O)C(=O)O
250
+ Cc1ccc2oc(-c3cc(NCCCC#N)ccc3Cl)nc2c1
251
+ COc1ccc(CC(C)N)c(OC)c1OC
252
+ CCc1cc(C(C)=O)c(O)cc1OCCCCCCC(=O)NC
253
+ NC(Cc1ccc(-c2ccc(CC(N)C(=O)O)cc2O)c(O)c1)C(=O)O
254
+ CCCCCc1ncc(C)s1
255
+ CCCCCCCCCCCCCCCC(=O)OCC(O)COP(=O)(O)OC1C(O)C(O)C(O)C(O)C1O
256
+ COC1=CC(=O)OC(CC(O)c2ccccc2)C1
257
+ CCOC(=O)C(NCC(O)COc1ccccc1C)C(=O)OCC
258
+ Cn1ccc2c(c1=O)C(=O)OC2(C)C
259
+ Cc1cc2cnnc-2no1
260
+ N=C(N)SCc1cccc(CSC(=N)N)c1
261
+ C=C1CCC2C(C)(C)CCCC2(C)C1CC=C1CC(OCC)OC1=O
262
+ CCCCCCC(O)C(=O)O
263
+ CC1=CC2OC(O)C3(C)OC23CCC(C)=CCCC2(C)OC2CC1
264
+ COCCNc1ncccc1-c1noc(C2CCCN2CC(C)C)n1
265
+ CNCc1cccc(OC)c1O
266
+ CCCC=C1OC(=O)c2ccccc21
267
+ Cc1c(CCC(=O)NCCC(=O)O)c(=O)oc2cc3occ(C(C)(C)C)c3cc12
268
+ CC(C)C(Nc1nc(N)nc2[nH]cnc12)C(=O)O
269
+ Cc1c[nH]cc1-c1ccccc1
270
+ O=C1CC2C=CC1CC2
271
+ Cc1ccc(C(=O)NC(=O)CSc2nccc(O)n2)cc1
272
+ COc1cc(O)ccc1-c1oc2cc(O)cc(O)c2c(=O)c1CC=C(C)C
273
+ COc1ccc2nc(C)c(C(N)=O)cc2c1
274
+ CC1C(=O)Oc2cc3c(c(O)c21)CCC1C(C)(C)CCCC31C
275
+ CC(O)C(O)(C(=O)OCC1CCN2CCCC12)C(C)(C)O
276
+ CCCCc1cccc(CCC)c1O
277
+ O=Cc1ccc2ccoc2c1
278
+ CC(C)NC(C)C(O)COc1ccc(CC(N)=O)cc1
279
+ CC(=O)OC1C=C(CCO)C(C)CC2OC(=O)C(C)C12
280
+ CC(=O)OC1CC(C)C2(CCCC(=O)O2)C2(C)CCCC(C)(C)C12
281
+ CC(=CC(=O)O)C1CC2C(C)(CCC3C(C)(C)CCCC32C)O1
282
+ COc1cc2c3c(c1O)C1(CCC(O)CC1)CCC3N(C)CC2
283
+ Oc1nnc2c[nH]ccc1-2
284
+ CCCCCC=CCC=CCCCC1CC(=O)C2CCCCC2N1C
285
+ O=C1C(=CC=Cc2ccccc2)CCC1=C1SCCCS1
286
+ CCN(CC)C(=O)C1CCCN(CCCCCCCCCCCCN2CCCC(C(=O)N(CC)CC)C2)C1
287
+ CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)NC(C)C(=O)O
288
+ O=C(O)CCCCCCCCC(=O)O
289
+ COc1cccc(CCc2ccccc2)c1
290
+ COc1cc2oc(=O)c(CC(=O)NCCCCCC(=O)O)c(C)c2cc1Cl
291
+ COCCCN1C(=O)C(O)=C(C(=O)c2ccc(OCC(C)C)cc2C)C1c1ccncc1
292
+ c1cn(-c2ccon2)cn1
293
+ CC(=O)Nc1nc2ncc(C(=O)O)nc2c(=O)[nH]1
294
+ COc1ccc(O)c2c1C(=O)OC(CCCC(C)=O)C2
295
+ C=C(C)C#Cc1cc(O)c(C#CC(=C)C)c(CO)c1O
296
+ NCc1ccc2c(Br)cccc2c1
297
+ COc1ccc(C=C2Cc3cc(OC)c(OC)cc3C2=O)c(OC)c1
298
+ OC1CNC(c2nc(-c3cnccn3)no2)C1
299
+ CCC=CC=C1COC2(CCC(CO)O2)C1O
300
+ C#CCOC(=O)C(F)(F)F
301
+ CC1CCC(C(C)C)C(OC2OC(CO)C(O)C(O)C2O)C1
302
+ Cc1cc(=O)c2c(o1)-c1cc(O)c(O)cc1OC2
303
+ CC(N)C(=O)NC(CCCCN)C(=O)NC(CCCCN)C(=O)O
304
+ NC(CC[Se][Se]CCC(N)C(=O)O)C(=O)O
305
+ CC(C)=CCCC=C(C)COc1ccc2ccc(=O)oc2c1
306
+ NC(=O)c1cc(Br)ccc1F
307
+ C=CC=C(C)COCC(=O)C#CC
308
+ CC(NC(=O)c1c(O)c2cccc3c2n(c1=O)CC3)c1ccccc1
309
+ O=C(O)Cc1ccc(C(F)(F)F)cc1Br
310
+ CCCCCC1(O)C(C)=C(C)C(=O)C1CC(=O)O
311
+ Cc1cc(=O)c(O)c(C(CC(N)=O)c2cccnc2)o1
312
+ CCOC(=O)Cc1nc(-c2ccc(OC)cc2)oc1-c1ccco1
313
+ CCCCNC(=O)[C@H](C)C[C@H](O)[C@@H](N)C[C@@H](C)Cc1ccccc1
314
+ CCCN=C(C)c1ccccc1
315
+ OCC1OC(n2nnc3c(O)ncnc32)C(O)C1O
316
+ CC(C)=CCCC1(C)C=Cc2c(c(C=O)cc3c2[nH]c2ccccc23)O1
317
+ CCCCCCC=CCCCCCC(OC(C)=O)C(C)NC(C)=O
318
+ C=C1C(=O)CC2CC3(C)CCCC(=C)C3CC12
319
+ COc1cccc(C=CC(=O)O)c1C(C)C
320
+ CCCCCCCCCCCCC(=O)C1(O)C(O)C=CC1OC(C)=O
321
+ CC(=O)C(C)=CC1C(C)=CCCC1(C)C
322
+ CCCCSCC(NC(=O)CC[C@H](N)C(=O)O)C(=O)NCC(=O)O
323
+ O=C1NC2c3ccccc3C3CCC2C3NC(=O)N1c1ccccc1
324
+ CC(=O)OCC1=CCN2CCC(OC(C)=O)C12
325
+ COc1ccccc1C(CCN=Cc1ccc(N(C)C)cc1)C1CCOC(C)(C)C1
326
+ CC(=O)OCC(C)(O)c1ccc(C)cc1
327
+ O=C(O)c1ccc(CN2CCN3C(=O)N(c4ccccc4)CC3C2)cc1
328
+ CCCCCCCCCCCCCCCCCc1cc(O)cc(O)c1
329
+ Cc1ccccc1-c1nc(-c2ccccc2)nc(N)c1CN
330
+ COC(=O)C(C)COC(=O)c1ccccc1
331
+ O=C1C=CCCCc2ccc(O)c(c2)-c2cc(ccc2O)C1
332
+ CCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)C(O)CCCCCCCCCCCCCCCCCCCCCC
333
+ N=c1ncoc2[nH]ccc12
334
+ COc1ccccc1CCNCC1C(=O)OC2CC3=CCCC(C)C3(C)C(O)C21
335
+ COc1cc(OC(C)=O)c2c(=O)cc(C)oc2c1OC
336
+ CCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)O
337
+ CNCCN1CCN([NH+](O)NOc2ccnc(Cl)n2)CC1
338
+ C[C]1[CH][CH][CH][NH+](C[C]2[CH][CH][CH][CH][C]2F)[CH]1
339
+ C=C1C(=O)OC2CC(C)(C3CO3)C(C(=C)C=O)CC12
340
+ CC=CC(=O)OC(=O)C=CC
341
+ C=C1CCC2C(C3C(=C)C(OC)OC(OC(C)=O)C13)C2(C)C
342
+ C=C1CCC2C(=C)C(=O)OC2C=C(C)CCC1=O
343
+ O=S(=O)(O)c1ccc2c3c1-c1cc(ccc1O)CCC=CC3CC2
344
+ CC(=O)OC1CCC2(C)C(CCC(C)=CCO)C(C)=CCC2C1(C)C
345
+ CC(=O)OC1CC(C)C(C=O)=C2C(C)CC(C)(C)C21
346
+ O=C(O)CCCC/C=C(\c1ccccc1)c1cccnc1
347
+ COC(=O)CNC(=O)N1CCc2nc[nH]c2C1c1ccncc1
348
+ COc1cc(CCc2ccc(O)c(O)c2)cc(O)c1OC
349
+ CCCCC=CCCCCCCCc1cc(OC)cc(OC)c1
350
+ C=CCCCCCCCCCCCC=CCCCCCCCC
351
+ N=C(N)c1ccc(CN2CCN(c3cccc(OCC(=O)O)c3)CC2)cc1
352
+ CCCCCCCCCCCCC(C)CCCCCCCCCCCC
353
+ CNC1=CC(=O)CCC1
354
+ Cc1cccc2c1ccn2CCNC(=O)CCC1NC(=O)c2ccccc2NC1=O
355
+ CSC=CC(=O)NCCCCNC(=O)C=Cc1ccccc1
356
+ O=C(O)c1cc2ccc(O)cc2oc1=O
357
+ CC(C(O)c1ccccc1)N(C)Cc1ccccc1
358
+ CCCCCC1C=CC(=O)CCCCCCCCC(=O)O1
359
+ C=C(C)C=Cc1cccc2c1NC1ON=C(C(=O)OC)CC21O
360
+ CCCCC=CCCCCCCCCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O
361
+ CC(=O)C(C=NCC(C)C)C(C)=O
362
+ COCCc1nccc(CC2C(NCc3ccc(F)c(F)c3)CC(O)C2CO)n1
363
+ Nc1ccc2ccccc2n1
364
+ CCCCc1ccccc1-c1n[nH]c(-c2cccc(OC)c2)n1
365
+ CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCC)OC(=O)CCCCCCCC=CCC=CCCCCC
366
+ CC=C(C)C=CCC(C)CCC=C(C)CCC=C(C)C
367
+ COc1ccc2c(c1)cc(C(=O)NC(C(=O)NC(C(=O)O)C(C)C)C(C)C)n2C
368
+ NS(=O)(=O)Oc1cccc(Br)c1
369
+ COc1ccc(C2=CC=C3C=CC=CC3[OH+]2)cc1OC
370
+ CNCCCCCCCCCCCCCc1cccnc1
371
+ c1ccc(C2Nc3ccccc3C3OCCC23)cc1
372
+ O=C1CCCC2=C1C1(CC3c4ccccc4CCN23)SCCS1
373
+ CC(=O)OCCC#Cc1ccc(-c2cccs2)s1
374
+ O=C(c1ccccc1)c1ccc(O)cc1O
375
+ CCCCN(C)CC1OCC(NC(C)=O)C1O
376
+ CN(C=Cc1ccccc1)C(=O)C1OC1c1ccccc1
377
+ CC(=CC(=O)O)CCC1(CO)C(C)CCC2(C)C(C)=CCCC21
378
+ C=C(C)C1CC2(C)C(=CC1=O)CCC(OC(=O)C=CC(C)COC(C)=O)C2C
379
+ Cc1c(OCC(=O)Nc2ccc(C(N)=O)cc2)c(=O)ccn1CCC(C)C
380
+ CC=C1CN2CCc3c([nH]c4ccccc34)C2CC1C(C)=O
381
+ CSC(C)CC(=O)C1=C(C)CCCC1(C)C
382
+ CCCc1nc(OC(C)=O)n(-c2ccccc2)n1
383
+ CCCCn1c(=O)c2cc(C(=O)O)cn2c2ccccc21
384
+ CNC(=O)C(C)(C)N1CCCC1C(=O)NCCN1CCOCC1
385
+ COC1(C)C=CC(C(C)C)CCC2=CC(CC(C)=CCC1)OC2=O
386
+ COc1ccc(CC(C(=O)O)C(=O)O)cc1OC
387
+ CCOC1(C)CCC2C1C1C(CCC2(C)O)C1(C)C
388
+ CCCCCC1NCCS1
389
+ CC(C)n1c(=O)nc(-c2ccccc2)c2cc3c(cc21)OCO3
390
+ CC(=O)OCC1=CCCC2C1(C)CCC(C)C2(C)CCC(C)=CC(=O)O
391
+ Fc1ccc(C2CC3CCC2N3)cn1
392
+ CC(C)C(=O)C1CCC2(O)CCCC(C)C12C
393
+ CC1Cc2ccccc2C1(O)c1ccccc1
394
+ CC1(O)CCC(C2=CCCC3CCCCC23)CC2C1CCC2(C)O
395
+ COc1ccc2oc3c(OC)c(O)cc(O)c3c(=O)c2c1
396
+ OCc1cc(CC2(NCc3ccccc3)COC2)no1
397
+ CC(=O)OCN1C(=O)c2ccccc2S1(=O)=O
398
+ COC(C(O)C=O)C(O)C(O)CO
399
+ COc1ccc(C2COc3cc(O)ccc3C2)cc1
400
+ CC(=O)OCC=C(C)C(=O)C=CC(C)(C)OO
401
+ COCc1cn(C2COC3C(NC(=O)C4CCCCC4)COC32)nn1
402
+ COc1nc(N)nc2c1ncn2C1OC(CO)C(O)C1O
403
+ CC(=O)OC1OCC=C2OC(=O)C=C21
404
+ COC(=O)C=CCCCCC1OC2(CCCCCCCCCCCCCCCOCC(N)CO)CCC(O)C1O2
405
+ CCCCc1ncc(C)nc1C
406
+ NC(CCC(=O)NCCC(=O)O)C(=O)O
407
+ COC1C=C(CCC2(C)C(C)=CC(=O)CC2C)C(=O)O1
408
+ C=C1CC(O)C(O)C2(C)CCC(C(C)(O)CO)CC12
409
+ CC(C)(C)c1nnc(C2CCN(Cc3ccncc3)C2)o1
410
+ CCCC1(CCC)C(=N)NC(=S)N=C1O
411
+ Brc1ccc2n(-c3ccccc3)nc3ccccc3c1-2
412
+ COC(=O)C1Cc2c([nH]c3ccccc23)C2CCC(=O)N12
413
+ O=C(C=Cc1cccc(O)c1O)OC1CC(O)(C(=O)O)CC(O)C1O
414
+ COc1cccc(NC(=O)N2CCOCC(OC)C2)c1
415
+ O=CCCCCC(=O)O
416
+ COc1ccccc1OC
417
+ CCCCCCCCCCCCCCCCCCCCCCCCCC(C)(O)CCO
418
+ CC(C)(C)NCC(O)COc1nsnc1N(CCO)CC(=O)O
419
+ CCCCCCCCCCCC(=O)C1C(=O)CCC(C(=O)OC)=C1O
420
+ O=C1C(=Cc2ccccc2F)Oc2c1ccc(O)c2CN1CCOCC1
421
+ CC(=O)NCC1OC(CO)C(O)C1N(C)CCc1ccccc1
422
+ C#CC=CCC(Br)=C1CC2CC(CC)C(Br)CC2O1
423
+ COc1ccccc1COc1ccc2c3c(c(=O)oc2c1C)CCC3
424
+ CC1(C)CCCC2CC3CC21CC=C3C(=O)O
425
+ C=C(C)C1CCC(C)CC1=O
426
+ CC1CCCc2c(O)c(O)c3c(c21)COC(=O)C3(C)O
427
+ Oc1ccncc1-c1ncccc1O
428
+ [O-][N+]12CCCCC1C(CO)CCC2
429
+ COC(=O)CCC(C)C1CCC2C3CCC4CCCCC4(C)C3CC(O)C12C
430
+ COCCCCCCN/N=C(\C)C(=O)O
431
+ CCCCCCCCC1OCCCC1CCCCCCC
432
+ COC(=O)C1CN(C(=O)c2ccccc2)CCN1C(C)=O
433
+ Cc1ccc(S(=O)(=O)NC(=O)Nc2ccc(Cl)cc2)cc1
434
+ COc1ccc(O)c2c(=O)c3c(O)cc(O)cc3oc12
435
+ CC(C)CCCCCCCCCCCCCCCCCCC(O)C(=O)O
436
+ C=C(C)CCCC(C)=C1CC=C(C)CC1
437
+ c1ccc2oc(C3CCN(C4CCC4)C3)nc2c1
438
+ COc1c(O)cccc1C(=O)OCc1ccccc1
439
+ C=C(CC(=O)C(C)=CCCC(=CCCC(C)=CCO)CO)C(C)C
440
+ CC1=CCCC2C1(C)CCC(C)C2(C)CCC1CC(=O)OC1O
441
+ COC=C(C(=O)OC)C(=CC=Cc1ccccc1)CO
442
+ NC(=O)CC[C@H](NC(=O)OCc1ccccc1)c1nc2ccsc2c(=O)o1
443
+ CC(=O)OC1C=C2C(C)(C)OOC2(O)CC1C
444
+ Cc1cccc(C)c1C(=O)OC1OC(CO)C(O)C(O)C1O
445
+ CN(O)C(=O)Cc1ccccc1
446
+ COc1ccc(C(CC(=O)NCCCNc2ccccc2)c2cc3c(cc2O)OCO3)cc1
447
+ COc1cc(C2c3cc(OC)c(O)cc3CC(C)C2C)ccc1O
448
+ CC(C)=CCc1cc(C(=O)O)ccc1OC(=O)C=C(C)C
449
+ CC(C)CC(N)C(=O)NC(CO)C(=O)O
450
+ OC1C(NCC2CCCCC2)C2COC(O2)C1n1ccnc1
451
+ Cc1cc2c(c(=O)o1)C(c1ccsc1)CC(=O)O2
452
+ CC(=O)OCCI
453
+ CCCCCCCCC(C)CCCCCCCC=CCCCCCCC1OCC(N)C1O
454
+ CC(C)=CCCOc1c2ccoc2cc2oc(=O)ccc12
455
+ O=C1c2[nH]cnc2N(Cc2ccc(F)cc2)C2=NCCN12
456
+ CN1C(=O)c2ccccc2NC(=O)C12OC2c1cccc(O)c1
457
+ CC(=O)OCC1(O)CCC2C1CC(C)(C)CC1CC12C
458
+ Cc1cccc(CC(=O)O)c1
459
+ COc1cc(CC2COCC2C(O)c2ccc(O)c(OC)c2)ccc1O
460
+ CCC=CCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCC=CCC=CCCCCC)OC(=O)CCCCCCCCCCCCC
461
+ CC1(C)C=Cc2cc(C=Cc3cc(O)cc(O)c3)ccc2O1
462
+ NC(CC(O)C(O)C(=O)O)C(=O)O
463
+ CCOC1c2c(ccc3ccc(=O)oc23)OC1C(C)C
464
+ Nc1nc(Cl)nc2c1ncn2C1CCC(CO)O1
465
+ COC(=O)CN1C(=O)C2CC(O)CN2C2(CN(CC(C)C)C2)C1=O
466
+ COC1CC(NC(C)=O)C(O)C(C)O1
467
+ On1cc2nccc-2cn1
468
+ C=CCCCC=CC=C(C)CCCCC=CCCC=CC(=O)NC(CO)CO
469
+ COc1ccccc1C=C1Oc2c(ccc(O)c2CN2CCCC2)C1=O
470
+ COc1ccc2ccc(=O)oc2c1C(O)C(O)C(C)C
471
+ C=CC1CN2CCC1CC2CNC(=O)c1ccc2c(c1)OCO2
472
+ CN1CCCN2CCN(CCCN(CCC#N)CC1)CC2
473
+ Cc1c([N+](=O)[O-])oc2ccccc12
474
+ O=C(NCC1CCCCC1)c1cccc2nc(CCl)cn12
475
+ COC(=O)c1cncc(C(C)OC)c1
476
+ O=C(C=Cc1ccccc1)NCCc1ccc(O)cc1
477
+ CN(C)c1ccc(C=C(C#N)c2nc(O)c3ccccc3n2)cc1
478
+ O=C(CCCN1Cc2ccccc2C1=O)NCC1CCCN2CCCCC12
479
+ CCCCCCCCCCCCCCCCCCCCCCC(=O)OCC=C(C)CCCC(C)CCCC(C)CCCC(C)C
480
+ C=C1CCC(=O)C(C)CCC2C1CC2(C)C(=O)CCC(C)O
481
+ COc1ccc(C2OC(=O)C(C)(C)C(=O)C2C)cc1OC
482
+ Cc1ccccc1NC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)C1N2
483
+ C=C1CCC2C(C)(C(=O)O)CCCC2(C)C1CCC1COC(OC)C1
484
+ CC12CCC3c4ccc(O)cc4C(=O)CC3C1CCC2O
485
+ NC(CCNC(CNC(Cc1c[nH]cn1)C(=O)O)C(=O)O)C(=O)O
486
+ COc1cc2[nH]c(C(=O)O)c(C=O)c2cc1OC
487
+ COc1cccc(C2CC(CO)C3CC(O)CCN3C2)n1
488
+ CC(C)N1CCN2C(=O)N(C3CCCCC3)CC2C1
489
+ CC(NC(=O)C(N)CC(=O)O)NC(=O)N(C)C(C)(C)C
490
+ CCCCCC=CCC=CCCCCCCCC(=O)OC(COC(=O)CCCCCCCCCCCCCC)COC(=O)CCCCCCCCCCCCCCC
491
+ CC(C)COC(=O)c1ccccc1C(=O)OCC1CCCCC1
492
+ COc1cc(NCCCN)c2nccc(C)c2c1Oc1ccccc1
493
+ Clc1ccc(-c2nnc(-c3ccc(Cl)cc3)s2)cc1
494
+ O=C1Nc2ccccc2-c2cccn2[C@H]1Cc1ccc(O)cc1
495
+ CC(C=O)=CCCC(C)(O)C=Cc1cc(O)ccc1O
496
+ CC(=O)c1ccc2[nH]c3c(c2c1)CCCC3=NCCO
497
+ CC1=CCC(C(C)=CC(O)C(O)C(C)(C)O)CC1
498
+ CC1=CCC2(O)C(C)=CCC3C(C)C(=O)OC3C12
499
+ C=C(C(=O)OC)C1CCC2(C)C(O)C(O)CC(=C)C2C1O
500
+ CCCCCNC(=O)COc1ccc2nc3n(c(=O)c2c1)CCCCC3
501
+ CCOc1cccnc1
502
+ CC(=O)NCCCCC(=O)O
503
+ CN(C)c1ccc(-c2nc3ccc(I)cc3s2)cc1
504
+ CCCCCC=CCC=CCCC1OC1CCCC(=O)O
505
+ c1ccc2c(c1)NCC(C1=NCCN1)O2
506
+ COc1cc2c(c3c1C=COC=C3)OC(C(C)(C)O)C2
507
+ O=C1NC(=O)C2=C1CCC2O
508
+ CCC(CC=CCCC(=O)O)CCCCCCC1C=CCC1
509
+ CCCCCC(O)CCC(=O)Cc1ccc(O)c(OC)c1
510
+ CC(=O)OC(C)(C)C1CCC(C)=CCCC(C)=CCCC2(C)OC2C1
511
+ CC(C)CNC1C2COC(O2)C(n2cncn2)C1O
512
+ O=CCS(=O)(=O)O
513
+ CC(C)C(NC(=O)C(CS)NC(=O)CCCC(N)C(=O)O)C(=O)O
514
+ Clc1ccc(C2N(c3ccccc3)CCN2c2ccccc2)cc1
515
+ CC1(C)SC2C(N=C(O)CCCCO)C(=O)N2C1C(=O)O
516
+ CC(=O)C=CCC(C)C1CCC(C)c2c(O)cc(C(=O)O)cc21
517
+ CNC1CCC2(C)C(=CCC3C2CCC24C(=O)OC(C)C2CCC34)C1
518
+ C=CCn1ncc2c(CC)ncn2c1=S
519
+ CCC(=O)C=CC=CC(C)C(O)CC
520
+ N=c1cn[nH]c2cocc12
521
+ COP(=O)(O)N(C)N=C(O)C(N)CC(C)C
522
+ COc1cccc2ccn(CCC(=O)N3CCC(C(=O)O)CC3)c12
523
+ C=CC1C2C=C(CO)CC1(NC)c1ccc(=O)[nH]c1C2
524
+ COc1cccc2c1C(=O)c1ccccc1C2=O
525
+ CCCCCCC(=O)C(Br)=C(Br)Br
526
+ O=C(O)c1cc(C=CC(=O)c2ccccc2O)c2c(c1)COCO2
527
+ CC=C1CN(C)CC2CCc3c([nH]c4ccccc34)C(=O)C12
528
+ N#CC(C#N)=Cc1ccc2c(c1)OCO2
529
+ Nc1ccc(-c2nc3cc(F)ccc3s2)cc1Cl
530
+ CCn1c(SCC(=O)Nc2cccc(C)c2C)nc2c(=O)[nH]cnc21
531
+ CCCC=CCC1CC=CC=C(C)C=CC=CC(O)CC=C(C)C=CC=CC(O)=N1
532
+ Oc1ccccc1CNC1C2COC(O2)C(n2cnc3ccccc32)C1O
533
+ CC(C)[C@H](N)C(=O)O
534
+ C=C(CC=CC(C)(C)OO)C1CC=C(C)CC1
535
+ O=C(NCc1ccc2c(c1)OCO2)NC(Cc1ccccc1)C(=O)O
536
+ COC(=O)/C=C/c1ccccc1OCC(O)CNCCNC(=O)C(C)C
537
+ O=C(O)c1ccccc1C(=O)NCCC(c1ccccc1)c1ccco1
538
+ COc1cccc(C=CC2=NCCCC2)c1
539
+ CCCCCCCCCCCCC(SCCC(=O)O)SCCC(=O)O
540
+ CC(C)(C)CCNC1CC(O)C(CO)C1Cc1ccnc(-c2ccccc2)n1
541
+ NC(N)=NCCCC(N)C(=O)NC(Cc1ccccc1)C(=O)O
542
+ CC(=O)c1ccccc1OCC(=O)O
543
+ Cc1c(O)cc2c3c1C(=O)OCC3(C)CC2(C)C
544
+ O=C(Cc1ccccc1)NC1COC(CN2CCC(F)(F)CC2)C1O
545
+ N#Cn1c(N)nc2ccccc21
546
+ CCCCCCCCCC(O)CNC(C)=O
547
+ COc1cc(CCC(=O)CC(O)CC(C)CCCO)ccc1O
548
+ CC1CCC2(O)C(C)(C)C3CC(O)C2(C)C1C3
549
+ O=C(O)c1ccc(C(=O)CBr)cc1
550
+ CCCC=CC=CC1CC(O)C(O)C(O)C1
551
+ C=C1CC(=O)OC1=O
552
+ CCCc1ncc(C)nc1C
553
+ CCCCCCCCCCCCCC(=O)CC(O)CCCCC
554
+ C=CCC=CCC=CCCCCCCC=CCCCC(=O)O
555
+ O=Cc1ccc(COCCc2ccc(O)cc2)[nH]1
556
+ CCCCCCCCCCC(O)CCCCCCCCCCCCCC(=O)O
557
+ COc1ccc(C(=O)COC(C)=O)cc1OC
558
+ CN[C@H](CS)CCC(=O)O
559
+ CC1OC(O)CC(O)C1O
560
+ C=CCOC(=O)CCCCC
561
+ O=C(O)CCSCCSCCC(=O)O
562
+ CC12CCC3C4CCC(=O)C=C4CCC3C1CCC2OC(=O)CCC1CCCC1
563
+ OCc1ccc(COCCc2ccccc2)o1
564
+ COC(=O)c1ccccc1NC(=O)N1CCc2nc[nH]c2C1c1cccnc1
565
+ CCCCCCCCCCCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCCCCCC
566
+ C=C1CCC2C(C)(C)CCCC2(C)C1COC(C)=O
567
+ O=C1NCCCC1=CC(=O)c1c[nH]c2ccccc12
568
+ C=C(C)C1CC=C(C)CCC=C(C)CC1
569
+ O=C1C(O)=CC(=O)c2c(O)cccc21
570
+ O=C(CC1Sc2ccccc2N(CCC2OCCO2)C1=O)NO
571
+ O=c1ccc2c(OC3OC(CO)C(O)C(O)C3O)cccc2o1
572
+ C=C1CCC2C(C3CC(C)CC13)C2(C(N)C=O)C1CCCCC1
573
+ CCCCOc1ccc([C@@H]2CC[C@H](NC)c3ccccc32)cc1
574
+ O=C(O)[C@H]1C[C@@H](CO)N1
575
+ N=S(=O)(O)c1ccccc1
576
+ CC(C)CC(NCC=Cc1ccccc1)C(=O)O
577
+ O=CNCCCCN(CCCNC(=O)c1ccccc1)C(=O)C=Cc1ccccc1
578
+ CC1C(c2ccccc2)OC(C#Cc2ccccc2)N1C
579
+ CCC(C)CC(C)C
580
+ N#CCCC1CS1
581
+ COc1c2c(cc3c1[nH]c1ccccc13)C(=O)CC2O
582
+ CCCCCCC=CCCCC1=NC(=Cc2ccc(O)cc2)C(=O)O1
583
+ OCCN(CCO)CCc1ccc(CSc2ccccc2)cc1
584
+ CC12CCCC(O)(C1)C1CCC1C(O)C2
585
+ OCC(O)CC#CC#Cc1ccccc1
586
+ COc1cc(O)cc2c1C(=O)C(O)C(c1ccccc1)O2
587
+ CCc1cc2c(=O)c(-c3nc(C)cs3)c(C(=O)O)oc2cc1O
588
+ CCCCCC(=O)NCC(=O)c1ccc(O)cc1
589
+ CC(C)(C)N=C1C(=O)N=C2C=CC=CN21
590
+ C=CCc1cc(OC)c(OC)c(OC)c1OC
591
+ COc1ccc2c(c1)C=CC(c1ccccc1)O2
592
+ OC(c1ccccc1)C1CCCCN1CCc1ccccc1
593
+ CCCCCCCCCCCCOc1ccc(C(=N)N)cc1
594
+ CCOc1ccc2c(=O)c(Oc3ccccc3)c(C)oc2c1
595
+ CC#CC=C1C=CC2(CCCO2)O1
596
+ Cc1cc2ncccc2c2nc(N)n(C)c12
597
+ O=C(C1=C(O)CCC1)C1CC1
598
+ COC=C(C(=O)OC)C(C)C(C=Cc1ccccc1)OC
599
+ COC(=O)C(CC=Cc1ccccc1)NC(C)=O
600
+ Nc1ccc(-c2nc3ccc(F)cc3s2)cc1I
601
+ CCCCCCCCCCCCCc1cc(=O)c2c(O)cc(O)cc2o1
602
+ CC(=O)NCCc1c(Br)[nH]c2ccccc12
603
+ C#CCN(C)Cc1nc(C2(O)CCN(C(=O)CCc3ccccc3)CC2)cs1
604
+ Cc1c(C)c(CCl)c2ccccc2c1CCl
605
+ CNCC1(c2ccc(OC)c(OC)c2)CCCC1
606
+ CC(c1cc2ncccc2s1)N(O)C(N)=O
607
+ CCC(=O)c1ccc2c(c1)N(CCCN(C)C)c1ccccc1S2
608
+ CCCc1scnc1CC
609
+ O=C(N[C@H]1CN2CCC1CC2)c1ccccc1
610
+ CC(C)COC(=O)C1C(=O)CC(C)(O)C(C(=O)OCC(C)C)C1c1ccc(O)cc1
611
+ COC1=CC23CCN(C)C(Cc4ccc(O)c(OC)c42)C3=CC1=O
612
+ COc1cc2[nH]c3cccc(OC)c3c2cc1C
613
+ CC(=O)Nc1ccc(F)c(Cl)c1
614
+ CCCCCCCC(C)CCCCCCCC(O)CC(=O)OCCCc1cc(O)c(O)c(OC)c1
615
+ Nc1cccc(OC2OC(C(=O)O)C(O)C(O)C2O)c1
616
+ CC(=CCOc1ccc2ccc(=O)oc2c1)CCC(=O)C(C)C
617
+ CC/C(=C(/c1ccccc1)c1ccc(O)cc1)c1ccccc1
618
+ C=CCC/C=C(\NC(=O)C1CC1(C)C)C(=O)O
619
+ CCc1ccc2cc(-c3ccc(Cl)cc3)cn2c1
620
+ CCC(CC)C(C)C
621
+ O=C1OC(=O)C2C3OC(C=C3COC3CCCCC3)C12
622
+ C=CC1(C)Cc2c(O)ccc(OC)c2CC1C(=C)C
623
+ C=C1CC(OC(=O)C(O)=CCO)C2C(=C)C(=O)OC2C2C(C)=CCC12
624
+ O=NN(CCF)C(=O)NCCF
625
+ Cc1ccc2[nH]c(C3CCN(C(=O)c4ccccc4)C3)nc2c1
626
+ CC1=CCCC2C1(C)CCC1(C)C3=C(CC21C)C(=O)C=C(NCC(C)C)C3=O
627
+ Cc1ccc(C(=O)Oc2ccccc2)cc1
628
+ c1ccc(CCc2ccccc2OCCCCN2CCNCC2)cc1
629
+ Cc1ncc2n1-c1ccc(Cl)cc1C(c1ccccc1F)=NC2
630
+ O=C(NC(=O)c1ccccc1Cl)Nc1ccc(OC(F)(F)F)cc1
631
+ CNCCc1cc(Br)c(OCCCN)c(Br)c1
632
+ CCCCC(C)CC(C)CC(C)C(=O)OC
633
+ COc1ccc(C(=O)C=Cc2cccs2)c(OC(=O)c2ccccc2)c1
634
+ COc1ccccc1CCNC(=O)Cn1cc(OC)c(=O)cc1C
635
+ CC=C(CC(C)C(O)(CO)C(=O)O)C(=O)O
636
+ CCC=CCC(O)CCO
637
+ CCC=CCC=CCC(O)C(O)C(O)C=CCC=CCC=CCCC(=O)O
638
+ CCOC(=O)C1(c2ccccc2)CCN(C)CC1
639
+ C=C1C=Cc2c(c(O)c3occc3c2CC=C(C)C)O1
640
+ N=C(N)NCCCCNCCCNCCCNCCCNC(=O)c1ccc(O)cc1
641
+ CC(=O)N(C)CCc1c[nH]c2ccccc12
642
+ CCCCCCCCCCCCC1(O)C(O)C=CC(=O)C1O
643
+ CCC(C)CC(C)CCCCCCCCCCC(O)C(C)N
644
+ O=C1CCCCCCCCCCC(=O)OCCC1
645
+ C=C1C=CC(C(C)C)C12CC=C(C(=O)O)CC2
646
+ COc1cc(C)cc(OC)c1
647
+ C=CC1(C)CCC(C(=C)C)C2C3(C(=O)Nc4ccccc43)C21NC
648
+ COC(=O)C(C)COc1coc2c1c(O)cn2Cc1ccccc1
649
+ CCN(CC)CCCNc1nccc2c(C)c3[nH]c4ccc(O)cc4c3cc12
650
+ O=C(NCC1CCCN2CCCCC12)c1n[nH]c2ccccc12
651
+ CCC(CC(C)=O)N1CNc2nc[nH]c(=O)c21
652
+ O=C1CCOC(c2ccccc2)C1
653
+ O=C(O)C(c1ccccc1)c1ccccn1
654
+ O=C1Nc2ccc(Br)cc2C(=O)N2CCN(C(=O)NC3CCCCC3)CC12
655
+ CC(C)(C)OC(=O)NNC(=NCC(=O)O)NNC(=O)OC(C)(C)C
656
+ O=C1c2scc(-c3ccc(F)cc3)c2-n2cccc21
657
+ Cc1cc(C)c2nc(C)cc(O)c2c1
658
+ CC(=O)Nc1nc(C)c(O)c(C)c1C
659
+ ON(Cc1c(F)cccc1Cl)Cc1c(F)cccc1Cl
660
+ CCCCCCCCCCCCC(C)=O
661
+ O=C(O)C1CCCN1C(=O)OCc1ccccc1
662
+ COc1ccc(CN2Cc3nc[nH]c3CC2C(=O)NC2CCCCC2)c2ccccc12
663
+ CC(=O)NC(Cc1cn(C)cn1)C(=O)O
664
+ COC(=O)c1c(C)c(C)c(O)c(C)c1O
665
+ c1ccc(Cc2nnc(C3CCN(C4CCCCC4)C3)o2)cc1
666
+ CN(C)CCN(Cc1cccs1)c1ccccc1
667
+ COc1ccc(-c2coc3cc(O)c(O)cc3c2=O)cc1O
668
+ C=CCC(=NOS(=O)(=O)O)SC1OC(O)C(O)C(O)C1CO
669
+ CCN1c2ncccc2-c2nccn2-c2cccnc21
670
+ CCCC(O)C(O)C1CC(OC)=CC(=O)O1
671
+ CC1=CC(=O)C2C(C)CCC(=C(C)C)C2C1
672
+ COc1ccc(C2NC(CO)C(O)C2O)cc1O
673
+ CCOCCOCCO
674
+ CCCCCCCCCC=CCCC=CC(O)C(CO)NC(=O)C(O)CCCCCCCCC
675
+ CCNC(=O)CNC(=O)COc1ccccc1C(=O)OC
676
+ CCOC1CC(CCC2(C)C(C)CCC3(C)C(CO)=CCCC32)CO1
677
+ O=C(CC(=O)OCc1ccccc1)OCc1ccccc1
678
+ Cc1cc2nnncc2o1
679
+ CCCC=CC(CC)CC1C=C(CC)C(=CC(=O)OC)O1
680
+ CCn1c2ccccc2c2ccc(N)nc21
681
+ [CH2-][NH2+]C1C(C(C)C)CCC(C)C12C=C(C)CC2
682
+ CC(C)(C)CC(=O)NCC1COCc2nc3cccnc3n21
683
+ COCCN(CC1CCCN2CCCCC12)C(=O)c1cc(CCC(C)C)n[nH]1
684
+ COc1c2occc2cc2c(=O)cc(-c3ccccc3)oc12
685
+ CC(=NNC(N)=S)C(=O)Nc1ccc(Br)cc1
686
+ Oc1cc(-c2ccccc2Cl)nc2cc3c(cc12)OCO3
687
+ N#Cc1cncc(/C=C/c2ccccc2)c1Oc1ccc2[nH]ccc2c1
688
+ CCc1ccc(C(C)NC(=S)NC2CCCCC2)cc1
689
+ COC(=O)C=Cc1ccc(OC2OC(CO)C(O)C(O)C2O)c(OC)c1
690
+ C=C1CCC=C(C)CCC2C1C(=O)OC2(C=CC=C(C)C)CO
691
+ Cc1cc(Nc2cccnc2)c2c(ccc3c[nH]nc32)n1
692
+ C#CC(Br)C1CC(O)C(CC(OC(C)=O)C(Br)CC=CCC)O1
693
+ CC(C)C1=CC(O)C(C)(O)CC1O
694
+ CC1(C)CCCC2(C)C1CCC1(C)C3COC(=O)C3=CC(O)C12
695
+ COc1ccc(C(=O)Nc2cc(C)ccc2C)cc1OC
696
+ CC1=CCCC(=O)C=CC(C)(C)CCC1
697
+ CCCCCCSc1cc(C(N)=S)cc(Cl)n1
698
+ C=CC(C)(O)CCC1(C)C2=CC(=O)CC(C)(C(=O)O)C2CCC1C
699
+ COc1cc(OC)c2ccc(=O)oc2c1OCC=C(C)C
700
+ C#CCN(C)CC1CN2CCC1CC2CNC(=S)NCCN1CCOCC1
701
+ CC(C)(C)CC(=O)NC1C(c2cccs2)N(C(=O)c2ccccc2)CCC1(C)O
702
+ COc1ccc2c(C)c(CCC(=O)NC(C)C(=O)O)c(=O)oc2c1
703
+ CC(C)Nc1ncccn1
704
+ C=CCC1CC2(CC=C(C)C)OCOC2=CC1=O
705
+ COc1ccc2c(c1)OC=C(c1ccc(OC)c(OC)c1)C2O
706
+ O=C(NCCCN1CCC(Cc2ccc(F)cc2)CC1)NC1CCCCC1
707
+ C=C(C)C1CC2C(C)(C)C(Br)=CC(O)C2(C)OCO1
708
+ O=C(CC(CN1CCN(CC=Cc2ccccc2)CC1)C(=O)O)Nc1cccc(O)c1
709
+ Cc1[nH]c(O)nc1C(=O)c1ccc(Cl)cc1
710
+ CC(=O)OCC(=CCCC(C)=CCO)CCCC(C)C(=O)CC=C(C)C
711
+ Oc1c2ccccc2cc2ccccc12
712
+ CC(O)C12CCCN3CCc4c(n(c5ccccc45)CC1)C32
713
+ CCCCCCCCCCCCCCCCCCC(O)CC(=O)c1ccccc1
714
+ Cc1ccc2c(c1C)CCc1cc(C(C)C)c(O)cc1-2
715
+ Cc1ncc(COP(=O)(O)O)c(CO)c1O
716
+ COc1c(C=O)c(CC(C)=O)cc(O)c1CO
717
+ CC(C)C=CC=C(CO)C1CCC2(C)CC(O)(CCC2O)C1CO
718
+ CNC1=CC2=NCCc3c[nH]c(c32)C1=O
719
+ CC1=CC2CC(C)C1(C)CCC1=CC(=O)N(CC(=O)O)C12
720
+ C=CCc1cc(C(C)=O)c(O)cc1OCCCCCC(=O)OC
721
+ CCCCCCCCCC=CC=C(C)C(=O)O
722
+ COc1ccc(CNc2nnc(-c3ccccc3)c3ccccc23)cc1
723
+ CC1=NN(c2cc(C)ccn2)CC1
724
+ COc1cc2c(c3oc(CO)cc(=O)c13)C=CC(C)(C)O2
725
+ CC1CC2C=CC3COC(=O)C3(C)C2CC1O
726
+ CCOC(=O)c1ccc2[nH]c(O)c(Cc3ccccc3)c(=O)c2c1
727
+ CCC(C(=O)c1ccc(OC)cc1)C1CCC(OC)CC1
728
+ CC1(C)CCn2nc(COc3ccccc3)cc2C1=O
729
+ CCCN(CCC)C(=O)c1ccccc1CN(CC)Cc1ccccc1
730
+ CCCCCCCCCCCCCCCCC(C)=O
731
+ CCC=CCC=CCC=CC=CC(O)CC=CCCCCCC(=O)OCC
732
+ CC=C1CN2CCC3(C(=O)Nc4ccccc43)C2CC1CCO
733
+ CCC1(C)COc2ccc(C(=O)CCCC3CC3)cc21
734
+ COc1cccc2c1ccn2CCNC(C)=O
735
+ Cc1ccc(C(C)(C)O)cc1
736
+ CC1(C)S[C@@H]2[C@H](S)C(=O)N2[C@H]1C(=O)O
737
+ COc1cccc(CN=C(O)CCCCCc2ccccc2)c1
738
+ COc1c(C)c(CCCCCCCCCCSC(C)=O)oc(=O)c1OC
739
+ CCCCC1=CC(=O)C=C(OC)C1=O
740
+ O=C(CCC1NC(=O)c2ccccc2NC1=O)Nc1ccc2[nH]ccc2c1
741
+ CC(C)C=C(NC(=O)c1ccccc1)C(=O)O
742
+ C=CCCCC=CCC=CCC=CCC=CCCCCC
743
+ CC(C)Oc1ccc(C(=O)NC(Cc2c[nH]c3ccccc23)C(=O)O)cc1
744
+ C#CC1(O)CCC2C3CCC4=CC(=NO)CCC4C3CCC21CC
745
+ COc1c(C(C)C)cc2c3c1OC(=O)C31CCCC(C)(C)C1CC2
746
+ CNC1=CC(=CCC(=O)O)CC1
747
+ CC1=CCSS1
748
+ COC(=Cc1ccc(O)cc1)C(=O)NC=Cc1ccc(O)cc1
749
+ Nc1ccccc1C(=O)OC1CCCCC1
750
+ O=C(CCn1ccc2c(Br)cccc21)NC(Cc1ccccc1)C(=O)O
751
+ Oc1cccc2[nH]ccc12
752
+ COc1cccc2nc3c(O)cccc3nc12
753
+ COc1cc(CC(C)C(C)C(OC(C)=O)c2ccc3c(c2)OCO3)ccc1O
754
+ COCC(=O)NCC1C=C(C)C(CC(=O)N2CCN(C)CC2)CC1C(C)C
755
+ CCC(=O)OC1c2c(C)coc2C(=O)C2C(O)CCC(C)C12C
756
+ Oc1cc(-c2ccccc2)c(O)c2c1-c1ccccc1CO2
757
+ C=C(C=CC=CC=CCCC)CC
758
+ Oc1cccc(CCC=CCC=CCCCCCCCc2cccc(O)c2O)c1O
759
+ COc1ccccc1C(CCNC(C)c1ccccc1)C1CCOC(C)(C)C1
760
+ CC1CCC(C)C12CCC1C2=CC2(C)CCC(C)(C)C12
761
+ O=C(O)CCCCCCCCC(=O)Nc1ccc(Cl)c(Cl)c1
762
+ COc1ccc(C(=O)CCN2CCCCCC2)c(OC)c1
763
+ CCCCCCCCCCC=CCC=CCCCCCC1=CC(C)(O)OC1=O
764
+ C=C1CCCC2(C)CC3OC(=O)C(CN(C)CC(O)c4ccc(O)cc4)C3CC12
765
+ CC(=O)NC1C(SCC(O)CN)OC(CO)C(O)C1O
766
+ Cc1coc2cc3oc(=O)c(CC(=O)N4CCOCC4)c(C)c3cc12
767
+ CC12CCC(C(OC(=O)C=Cc3ccc(O)cc3)C1)C2(C)C
768
+ CC(C)=CCCC(C)=CCCC1(C)OC1CCC(=CCO)CO
769
+ CCCCCCCCCCCCCCCCCCCC(=O)OCCCCCCCCCCCCCCCCC
770
+ CC(C)c1cc(C=O)c(C2CCCC(C)(C)C2C=O)cc1O
771
+ CCCCCCCC(C)=CC(CC=CCCC(=O)OC)OC
772
+ CCCCCC(=O)NC1COC(CN2CCN(C)CC2)C1O
773
+ CCCCCC(O)C(O)C(O)C=CCCCCCCCC(=O)O
774
+ CC=C(COC(=O)C(C)=CC)OC(=O)CC=Cc1cc(OC)c2c(c1)OCO2
775
+ CC#CC#Cc1ccc(C#CC(O)COC(C)=O)s1
776
+ CCNC1=NC(=O)C2(CC(C)(C)Oc3ccc(F)cc32)N1
777
+ COC(=O)C1(C)CCCC2(C)C(Cc3ccoc3)CCCC12
778
+ Clc1ccc(C2OCC3(CO2)CC2C=CC3C2)cc1
779
+ CC(C)C(=O)OCC1(c2ccc(CO)cc2OC(=O)C(C)C)CO1
780
+ Cc1cc(C)c(C=CC(=O)c2ccccc2)c(C)c1
781
+ CC(=CCCC12C(=O)OC3(O)CC1CC2C3C)C(N)=O
782
+ c1ccc2c3c([nH]c2c1)CNCC3
783
+ OC1C(NC2CCCC2)C2COC(O2)C1n1cnc2ccccc21
784
+ CCCCNC(=O)NNC(=O)OCC
785
+ CC(=O)OCC(C)=CC(=O)OC1C=CC(C)(C)C(C)=C1C=O
786
+ O=C(O)C1CCCN2CCCCC12
787
+ CC(C)N1CC2OCC(=O)N(C(C)C)C2C1
788
+ c1ccc(-c2c[nH]c(C3COCCN3C3CCC3)n2)cc1
789
+ Cc1cncc(SCc2ccco2)n1
790
+ C=CC(C)(C)OC1OC(CO)C(O)C(O)C1O
791
+ C=CC(C)(CCC=C(C)CCC=C(C)C(O)C(O)C=C(C)C)Oc1ccc(O)cc1CC(=O)OC
792
+ CCOC(=O)c1ccc(NC(=O)CSc2nc3c(=O)[nH]cnc3n2C)cc1
793
+ COc1c(O)ccc2c1OC1c3cc(O)cc(O)c3COC1C2=O
794
+ C=C1C(=O)OC2CC(C)C3CCC(O)C3(C)C(O)C12
795
+ CC1(C)CCCC2(C)C(O)C(C=O)=CC(OC(=O)C=Cc3ccccc3)C12
796
+ Oc1cc(O)cc(CCc2ccc(O)cc2O)c1
797
+ CC(C)=CCCC(C)=CC(O)CC(C)=CCCC(C)(O)C=Cc1cc(O)c(C)cc1O
798
+ CC1CCC2(O)C(CCCC2(C)C)C1(C)CCc1ccoc1
799
+ O=C(O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21
800
+ C=C(Cl)C(Cl)(CBr)CCC(Cl)C(C)(C)Br
801
+ CCCCCCC=CCCCCCC(CC(=O)O)CC1(c2ccccc2)CCC2(CCCC2)C1
802
+ CC1=CCCC2C(=O)OC3C2C(C1=O)C1CC13C
803
+ CCC12CN3CC(C)(CN(C1)C3c1cccnc1)C2O
804
+ CCCCNc1ccc(C(=O)OCCN(C)C)cc1
805
+ Cn1c(-c2cc3ccc(O)c(CN4CCCC4)c3oc2=O)nc2ccccc21
806
+ COC(=O)C(C)Oc1ccc2c3c(c(=O)oc2c1C)CCC3
807
+ Cc1cc(C)c2c(c1)Oc1ccccc1C(=O)N2
808
+ COc1cc2c(cc1O)C1C=c3cc(O)c(OC)cc3=C[NH+]1CC2
809
+ O=c1[nH]c(O)c(C2NCCc3ccccc32)c(=O)n1CCc1ccccc1
810
+ OC1CSC(O)CS1
811
+ COc1ccc(C=C2COc3cc(O)c(OC)c(O)c3C2=O)cc1
812
+ CCC12CCc3[nH]c4ccccc4c3CCN(CC(O)C1)C2
813
+ COCCC(NC(=O)OC(C)(C)C)C(=O)O
814
+ CC=CC(=O)C1=C(C)C=CCC1(C)C
815
+ COc1ccc2cc(C(C)=O)ccc2c1
816
+ O=S1(=O)CSCSSC1
817
+ CC1CC(=O)O1
818
+ C=CCCCCCC(OC(C)=O)C(OC(C)=O)C(O)C#CC#CCCC
819
+ CC(O)(C=CC1C(C)(O)C(O)C(O)C2C(C)(C)CCCC21C)C(O)CO
820
+ COC(=O)/C=C/C(Cc1ccccc1)NC(=O)CN
821
+ O=C(CCCCC1SCC2NC(=O)NC21)Nc1nccs1
822
+ C=C(C)C1CC2OC(=O)C3(CCC(O)C3(C)C1)C2O
823
+ CC1NC(=O)c2ccccc2N(CC(=O)NCCSc2ccccc2)C1=O
824
+ c1c2c(c3c4c1C15CCNC1CCC(O3)C5OC4)OCO2
825
+ O=C(CSc1ccc2nncn2n1)NCC1CCCN2CCCCC12
826
+ CCc1ccc(C(=O)c2cncc(Br)c2)cc1
827
+ O=c1c2ccccc2nc2n1CCC2=Cc1ccc(Br)cc1
828
+ CCCCCC=CCCC(O)CCCCCCCC(=O)O
829
+ O=P(O)(O)OCC1OC(O)CC1O
830
+ CC1=CC(=O)C2CC1C2(C)COC1OC(CO)C(O)C(O)C1O
831
+ COc1ccc(Br)c(CCN(C)CC(C)O)c1Br
832
+ CCCC1OC1c1cc(OC)cc(=O)o1
833
+ C=C(C)C1CCC2(C)C(CC=C3C4CC(C)(C)CCC4(C)CCC32C)C1(C)CCC(=O)O
834
+ CN(CC#CCN1CCCC1=O)CCCCl
835
+ CC(O)C(O)CO
836
+ CCCCCC=CCC=CCC=CCC=CCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCCCCCCCCCCCC
837
+ Cc1nnc(C2CCN(Cc3ccc4c(c3)OCO4)C2)o1
838
+ CC(=Nc1ccccn1)c1cc2ccccc2oc1=O
839
+ CC1(C)C2CCC3=CCCC(C)(O)C3(C)C21
840
+ C=C(C)CN1CCN(Cc2oc(C)cc(=O)c2O)CC1
841
+ CC(c1sc2ccccc2c1Cl)N(O)C(N)=O
842
+ CCCCNC(=O)Oc1ccc2c(c1)[C@@H]1CCN(CC)C1C2
843
+ S=C=Nc1cc(-c2ncon2)ccc1Cl
844
+ CC1CCCCC1
845
+ CC(C)CCCCCCOC(=O)C1CCCCC1C(=O)OCCCCCCC(C)C
846
+ CN(C)c1ccc(CNCCCCNCc2ccc(N(C)C)cc2)cc1
847
+ CC1(C)CC2C=C(C(=O)O)C3CC(O)C(C)(O)C23C1
848
+ COC(=O)c1[nH]c(=O)c2ccccc2c1-c1ccc[nH]1
849
+ COc1cc(C2Oc3ccc(CCCOC(C)=O)cc3C2COC(C)=O)ccc1OC(C)=O
850
+ COc1ccc(C2CCc3ccc(O)cc3O2)cc1O
851
+ Cc1coc2c1C(O)C1(C)C(CCC(O)C1C)C2
852
+ CC1(C)CCc2cc(CC3OC(=O)C(O)=C3c3ccc(O)cc3)ccc2O1
853
+ COc1cc(N2C(=O)NC(CC(=O)O)C2=O)cc(OC)c1OC
854
+ COc1ccc(O)c(C=Cc2cc(O)cc(OC)c2Cc2ccc(O)cc2)c1
855
+ CC(=O)C(C)=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CCC(O)CC(=O)O
856
+ C=C(C)C1C=C(C(C)C)CCC2(C)OC2CCC(C)CC1=O
857
+ C=C(C)C1CCC(C)C2CCC(C(=O)O)=CC12
858
+ CC1(C(=O)O)CSC(c2ccccc2O)=N1
859
+ O=C1c2ccccc2NS(=O)(=O)N1COCc1ccccc1
860
+ CN1CC(O)=C(C(=O)/C=C/C=C/c2ccccc2)C1=O
861
+ CCCCCC(=O)CCCCC=CCCCCCC(=O)OC
862
+ Cc1oc2c(C)c(O)ccc2c(=O)c1-c1ccc2c(c1)OCCO2
863
+ CC(=O)OC(C)(C)C1CC=C(C)CC1
864
+ COc1ccc(-c2coc3c(OC)c(O)c(OC)c(O)c3c2=O)cc1
865
+ CCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCC
866
+ CC(O)C1OC(O)C(O)C1O
867
+ CC=CC=CC(=O)c1cc(C(=O)O)c(OC)cc1O
868
+ COc1cccc2c1CCC[C@H]2CN(C)CCc1ccc2ocnc2c1
869
+ Oc1c(C2OCCc3ccccc32)ccc2cccnc12
870
+ O=C(CCCCCNC(=O)N1CCn2c1nc1ccccc12)NC(C(=O)O)c1ccccc1
871
+ CCCCCCCCCC=CC(O)C(C)N
872
+ Cn1cnc(CC(N)C(=O)O)c1SSc1c(CC(N)C(=O)O)ncn1C
873
+ Clc1nssc1=NC1=NCCS1
874
+ OC(c1cccnc1)c1cccs1
875
+ Oc1nc2ccccc2nc1N1CCNCC1
876
+ NS(=O)(=O)CC/N=C(\S)Nc1c(Cl)cccc1Cl
877
+ N=c1c2c(n(Cc3c(Cl)cccc3Cl)c3c1CCC3)CCCC2
878
+ COc1c2c(c(COC(=O)CC(C)C)c3c(C)coc13)C(C)CC=C2
879
+ OCC(CO)(CO)NCCCNC(CO)(CO)CO
880
+ CCCCCCCCOC(=O)c1cccc(N)c1
881
+ COc1c(-c2ccc(O)c(O)c2)cc2oc3cc(O)c(O)cc3c2c1O
882
+ CC(O)=Nc1nc(O)c2nc[nH]c2n1
883
+ Oc1ccc2cncn2n1
884
+ Cc1cc2cc(O)cc(O)c2c2oc(=O)cc(O)c12
885
+ O=C(CCC1NC(=O)c2ccccc2NC1=O)Nc1cccc(O)c1
886
+ CCCCCCCCCCCCCCCCCCN=C(S)NN=Cc1ccccc1[NH+]([O-])O
887
+ CC(C)=CCCC(C)C1CCC(C)=C2CC=C(C)C2C1
888
+ Cc1cc(C(=O)O)cc2c1OC(C(C)(O)CO)C2
889
+ CN1CCC(NCC2OC(CO)C(O)C2N2CCCCC2)CC1
890
+ COc1cc(C(=O)n2ccc(C)n2)cc(OC)c1OC
891
+ O=C(O)c1cc2cc3ccccc3cc2ccc1=O
892
+ COc1ccc(C2CC(=O)Oc3cc(OC)cc(O)c32)cc1
893
+ COc1ccc(C=CC(=O)N2CCC3(O)CCCCC3C2)cc1OC
894
+ CC(=O)Nc1ccccc1C(=O)C(=O)NCCc1c[nH]c2ccccc12
895
+ Cc1ccc2c(COC(=O)C(C)C)coc2c1
896
+ CCCCCC=CCC=CCCCCCCCCCCCC(=O)OC(CO)COP(=O)(O)OCCN
897
+ COC(=O)Oc1ccccc1C(=O)O
898
+ CN1C=CCC=C1C=NO
899
+ Cc1ccc(O)c(C(=O)CCC(=O)c2cc(C)ccc2O)c1
900
+ CN1C(=O)Nc2cc(CN)ccc2S1(=O)=O
901
+ CCCC1=C(C)C(=O)C(O)O1
902
+ c1cc(-c2conn2)c[nH]1
903
+ COC1C=CC(O)CC(O)CC=CC=CC(O)CC=CC=CC(=O)OC(C)CCCC1O
904
+ Cc1cc(=O)n(C)c2ccccc12
905
+ C/C=C/COc1noc2c1CNCC2.Cl
906
+ COc1cccc2[nH]cc(CC3(O)C(=O)OC4C(O)COC43O)c12
907
+ CC1(O)CCC(C(C)(C)O)CC1
908
+ CCCCCCCCCCCCCCc1cc(=O)c2ccccc2n1C
909
+ COc1ccc2[nH]c3c(c2c1)CN(C(=O)C1CCCCC1)CC3
910
+ CC1CCC23C1CCC2(C)CC(C)(C)C3O
911
+ CCCCC1OCC2(COC(=O)NC(C)C)C(C)C=C(C)C1C2C
912
+ CCCCS(=O)(=O)Nc1ccc2[nH]c(=O)c3ccccc3c2c1
913
+ O=C(CNC(=O)c1ccccc1)Oc1ccc([N+](=O)[O-])cc1
914
+ COc1ccc2[nH]c3c(C)c4cc[n+](C)c(N(C)C)c4cc3c2c1
915
+ CC1CC2OC2C=CC=CC(=O)CC2C(Cl)=C(O)C=C(O)C2C(=O)O1
916
+ CC1OC2OC(=O)C1(O)C2O
917
+ CC=CC=CC1CC2=C(C(O)O1)C(O)C(C)(O)C(O)C2=O
918
+ COc1cc(OC)c(C(C)O)c2c1C=CC(C)(C)O2
919
+ COC1=CC(=O)C23CCN(Cc4c2cc2c(c4OC)OCO2)C3C1
920
+ O=C1NC(=O)c2ccccc2C1=Cc1ccco1
921
+ Oc1ccccc1-c1cc[nH]n1
922
+ Cc1cc2c(cc1O)OC(c1ccc(O)cc1)C(O)C2=O
923
+ CC(=O)C(C)CCC=C(C)C1OC(=O)C(C)CC1C
924
+ CCCCOCc1ccc(O)c(OC)c1
925
+ CCC1(C)C=C2C(=O)C(C)(C)C(=O)C(C)(C)C2(O)OO1
926
+ Nc1nc(=O)c2c(ncn2O)[nH]1
927
+ O=C1CC(Oc2ccc(C(=O)C=Cc3ccc(Cl)cc3)cc2)N1
928
+ C=CC1(C)CC2OC(=O)C(=C)C2CC1C(=C)CO
929
+ C=CCC1=CC(=O)C2C(=O)C1(O)CC(O)C2(C)C
930
+ COc1ccc(C2Cc3cccc(O)c3C(=O)O2)cc1OC
931
+ O=S(=O)(O)c1cc(N=Nc2cccc3ccccc23)c(O)c2ncccc12
932
+ CCCCCC=CC(=O)CCc1ccc(O)c(CO)c1
933
+ CCC=CCC=CCC1OC1C(O)C=CCC=CCCCCCC(=O)O
934
+ OC=c1ncc2ccnn12
935
+ CC1CC2CC(C)C(C)(C#N)C3CCC4C(C1CCC4(C)C#N)C23
936
+ COc1ccc(CN2Cc3ccccc3N3CCCC23)cc1
937
+ CC(C)C1CC(O)C(C)(O)C(O)C1O
938
+ C=C(C)C1C(=C)C(OC(=O)C(C)C)C=CC1OC(C)=O
939
+ CCCCCCCCCC(=O)Nc1cc(Cl)ccc1O
940
+ COc1ccc2nc(O)c(CN(C)C(=O)CC(C)C)cc2c1
941
+ CC1=C(CO)C2(C)CCCC(C)(C)C2C(O)C1O
942
+ Oc1[nH]cc2nccnc12
943
+ COc1cc(OC)c2[nH]c3cc(O)c(C=O)cc3c2c1
944
+ CCCN(CCC)CCCCOc1ccc(/C=C/c2nc3ccccc3s2)cc1
945
+ [O-][NH+](O)c1ccc(O)c(Cl)c1
946
+ C=CCC1OC(=O)C(C)(C2CC(CCc3ccccc3)OC(C)(C)O2)C1O
947
+ Brc1ccc(Oc2ccc(Br)c(Br)c2)c(Br)c1
948
+ CC1(C)OC(S)=Nc2ccc(-c3ccc(F)c(F)c3)cc21
949
+ Cn1c(O)c(C(=O)C=Cc2ccccn2)c(=O)c2ccccc21
950
+ CN(C)Cc1cn(O)c2ccccc12
951
+ O=C(O)C1=C(O)C2=COC(CCCCCCCO)CC2=CC1=O
952
+ COc1c(C)c(O)cc2c1CCC(c1ccccc1)O2
953
+ CC(=O)OC1C2=C(C)C(=O)OC2=CC2(O)C(O)CCC(C)C12C
954
+ C=C1CC23CC(C)(C)CC(O)C2(C)CCC1O3
955
+ O=C(C=Cc1cc2ccccc2o1)c1ccccc1OCc1ccccc1
956
+ CCN(CC)C(=O)Oc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
957
+ CN(C)c1ncnc2c1ncn2Cc1cccc(C#N)c1
958
+ N=c1cnn2occcc1-2
959
+ O=C(Nc1ccccc1)Nc1ccc2cnccc2c1
960
+ O=CSCc1ccco1
961
+ COc1ccc(C(=O)CNC(=O)CC2NC(=O)c3ccccc3NC2=O)cc1
962
+ CCNC(=O)OC1COC2C(NC(=S)Nc3ccc(N(C)C)cc3)COC12
963
+ COc1nc(N)nc2[nH]cc(C#N)c12
964
+ CCCC(CCC)C(N)=O
965
+ CC(C)=CC1CC(C)C2=C(O1)C(=O)C(C)=CC2=O
966
+ CCC(C)NC(=O)COc1ccc(OCCNCC(O)COc2ccccc2)cc1
967
+ CCCC1CC2=C(C(OC)O1)C(O)C(O)C(O)C2O
968
+ COc1ccc(-c2c(C)noc2-c2ccc(OC)cc2O)cc1
969
+ CC=CC(OC1OC(C)C(O)C(O)C1O)=C1C(=O)OCC1CO
970
+ CCOC1OCC(CO)C1OCC
971
+ O=C(O)C1OC(OC2C(O)C(O)OC(CO)C2O)C(O)C(O)C1O
972
+ CC(C)=CCC1Oc2cc(C)c(O)cc2C1C
973
+ C=C1C(=O)OC2C1C(O)CC1(C)CCC=C(C)C21
974
+ O=C(CCc1c[nH]c2ccccc12)NCCNC(=O)c1ccc2cc[nH]c2c1
975
+ O=C1C=Cc2cc3ccccc3cc2C1=O
976
+ C=CCn1cc2c3c(cccc31)C(CC(C)C)NC(C(=O)NCC1CC1)C2
977
+ C=C(C)C1CCC(C)=C1CC(C)(C)C=NCc1ccco1
978
+ Cc1coc2c1C(OC(=O)C(C)C)C1(C)C(C)CCC(Cl)C1C2=O
979
+ COc1cc(O)cc(OC)c1C=O
980
+ Cc1cc2c(cc1Br)C1(C)CCC(C)C1(C=O)O2
981
+ N=C(N)Nc1ccc(Cl)cc1
982
+ OCC1=CCCCC1
983
+ CC1=CCCC2(C)OC2CCC(C)=CC(O)C(C(C)(C)O)CC1
984
+ O=C(CC1CC2OC(CNC3CCC3)C(O)C2O1)N1CCOCC1
985
+ CN(C)CCSC1Cc2ccccc2Sc2ccccc21
986
+ O=C(O)C1C2C=CC3(CN(CC4CCCO4)C(=O)C13)O2
987
+ NCCCNCCCCNCCCNCCCN
988
+ CC(C)Cn1ccc2c(NC(=O)c3cccnc3)cccc21
989
+ CCN(CC)CCCOC(=O)C[C@@H](C)CC[C@H]1C(CO)=CC[C@H]2C(C)(C)CCC[C@]12C
990
+ O=c1oc2cc(O)cc(O)c2c2c1CCC2
991
+ CC(C)(O)C1CCC2(C)C(O)CCC(C)(O)C2C1O
992
+ CC(C)CCCCCOc1ccc(C2NC(=O)NC2=O)cc1
993
+ CN(C)C(=O)Oc1cccc([N+](C)(C)C)c1.[Br-]
994
+ O=C(O)c1cc(-c2ccccc2)sc1-n1cccc1
995
+ N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=S
996
+ CCOC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(F)cc3)cc12
997
+ CC1=CCc2c(cc(O)c3c(=O)cc(C)oc23)OC1
998
+ C=CCC1(CC(O)C(C)(C)O)C=C2OCOC2=CC1=O
999
+ COc1cc2c(cc1OC)CN(C(=O)CC#N)CC2
1000
+ CCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC
1001
+ Nc1ncnc2c1ncn2C1OC(CF)C(O)C1O
1002
+ CC(C)(C)OC(=O)NCC(CO)NC(=O)OCc1ccccc1
1003
+ CC(=O)N1CC23C=CC(CC2CC12CCCC2)O3
1004
+ C=C(C)c1cc2c(o1)C(=O)c1c(O)cccc1C2=O
1005
+ CC12CCC(O)C(C)(C)C1CC(=O)c1c2ccc(O)c1O
1006
+ CC(=NNC(N)=O)c1c(O)n(C)c2ccccc2c1=O
1007
+ COC1C=C2CC(C)CCC2C(CCC2CC(O)CC(=O)O2)C1C
1008
+ CC(=O)Nc1ccccc1O
1009
+ COc1ccc(NC(=O)Cc2ccccc2)c(OC)c1
1010
+ COc1ccc(-c2coc3c(C)c(OCCN4CCOCC4)ccc3c2=O)cc1
1011
+ CC1(C)CC2CC(C)(CN2Cc2c(O)occ(CN3CCCCCC3)c2=O)C1
1012
+ CCCCCCCCCCCCc1ccc(C(C)=O)o1
1013
+ COc1ccc(C(=O)c2ccc(Cl)cc2)cc1OC
1014
+ CCC=CCC=CC(OC(C)=O)C(C=CC=CCC=CCCCC(=O)OC)OC(C)=O
1015
+ CCCCCCCCCCCCC(N)=O
1016
+ Cc1cc(=O)c2c(=O)oc3ccccc3c2o1
1017
+ O=C(CCCn1cccn1)Nc1ccc2c(c1)C(=O)N1CCCC1C(=O)N2
1018
+ CCC1CN(S(=O)(=O)c2ccccc2)CCC1CC(=O)O
1019
+ O=C(NC1CCCCC1)c1ccccc1
1020
+ O=C(CCCCCCCCCCCCC1C=CCC1)OCC(O)CO
1021
+ CC1CN(CCCc2ccc(COc3ccccc3)cc2)CCO1
1022
+ CC(C)C1=CC2C(C)(OC(=O)C=Cc3ccccc3)CCC(O)C2(C)CC1
1023
+ O=C(O)CC1CCN(C(=O)C2CCCC2)CC1CCN1CCN(c2ccccn2)CC1
1024
+ CC=C1CN2CCC34C(=C(C=O)C1CC23)Nc1c(O)cccc14
1025
+ CC(=NNS(=O)(=O)c1ccc(C)cc1)c1ccccc1
1026
+ O=C(O)c1ccc(CC2CCC2)nc1O
1027
+ COC1CC(=O)C2C34CCC5CCCCC5C3(C)C(C)(CC4)C12O
1028
+ O=c1c2ccccc2oc2ccc(OCCOC3CCCCO3)cc12
1029
+ C#CC#CCCCCCCCC=C1C(=O)OC(C)C1O
1030
+ COc1ccc(C(=O)C=Cc2ccc(OC)cc2OC)cc1
1031
+ N#Cc1cc(N)ccc1F
1032
+ CCCCCCCCCCCC1=C(O)C(=O)C=C(NCCc2ccccc2)C1=O
1033
+ COC(=O)CCCC#CCCCCCCCCCCCCCc1ccco1
1034
+ CC1CC23NC4CCN2CCCC32C(CC(=O)C42)C1O
1035
+ CC(=O)OCCCCCCCC=CC(=O)O
1036
+ Oc1nccc2c1[nH]c1ccccc12
1037
+ CCCCCCCCCCCCCCCCCC(=O)NC(COC1OC(CO)C(O)C(O)C1O)C(O)CCCCCCCCCCCCCCC
1038
+ CCNC(=O)N1CC2OCC(=O)N(CC3CC3)C2C1
1039
+ CCOC(=O)CC(=O)C(=O)OCC
1040
+ c1ccc(SCc2ccc(CN3CCOCC3)cc2)cc1
1041
+ C=C(C)C1CCC(C)=C1COC(=O)c1ccncc1
1042
+ Cc1cc(CNC(=O)C2CCCC(NC3CCOCC3)CN(C)C(=O)C2)no1
1043
+ COc1ccc(OCC(O)CNC(C)C)c(/C=C/CO)c1
1044
+ CCOC(=O)N=C(C)c1c(O)n(C)c2ccccc2c1=O
1045
+ COc1cc(C(N)=O)cc(OC)c1O
1046
+ O=C(O)CCC1NC(=O)N(Cc2ccc3c(c2)OCO3)C1=O
1047
+ CC1=CC2C(CC=O)C1(C)CCCC2(C)C
1048
+ CC1(C)CCCC2(C)C1CCC13CC(CCC12)C1(CO1)C3=O
1049
+ COC(=O)C(O)=Cc1ccc(O)c(O)c1
1050
+ C#CC(O)C=CCCCCCCCCCCCCCCC=CC(O)C#C
1051
+ NC(=O)NN1c2ccccc2CCc2ccccc21
1052
+ Cc1cc(C)c(C)c(C)c1
1053
+ O=C(Cc1ccon1)c1ccc(O)cc1O
1054
+ CC(=CCC(O)C(C)(Cl)CBr)C(O)CBr
1055
+ CC(=O)Nc1ccc(CC(=O)NCCNCC(O)c2ccccc2)cc1
1056
+ CCCCP(=O)(O)O
1057
+ CCc1cc(C(=O)Cn2cnc3ccccc32)c(O)cc1O
1058
+ COc1ccc(-c2cc(=O)c3ccc4occc4c3o2)cc1
1059
+ CNc1nc2c(c(=O)[nH]c(=O)n2C)n1CCC(C)C
1060
+ CCCCCCCCCCCCCOP1(=O)OCC2COC(=O)C2=C(CCCC)O1
1061
+ COC(=O)C1c2ccoc2CC2(C)C(C)CCC12C
1062
+ COC(=O)Cc1ccc(OCCCOc2cc3c(cc2O)CCO3)cc1
1063
+ CCOP(=O)(c1ccccc1)c1ccccc1O
1064
+ COc1cc(C=CCO)cc(O)c1O
1065
+ CC12CCC(C(CO)C1)C2(C)C
1066
+ COc1cc2c3c(c1OC)C(=O)NC3Cc1ccccc1-2
1067
+ COc1c(O)c(OC)c2occ(-c3ccc(O)cc3)c(=O)c2c1O
1068
+ CC(C)c1ccc(C2c3ccccc3C(=O)c3ccccc32)cc1
1069
+ Cc1c(C)c2ccc(OC(=O)CCCNC(=O)OC(C)(C)C)cc2oc1=O
1070
+ C[C@@H]1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12
1071
+ OCC1OC(OCCc2c[nH]c3ccccc23)C(O)C(O)C1O
1072
+ COc1ccc2c(ccn2CCC(=O)N2CCc3[nH]c4ccc(Cl)cc4c3C2)c1
1073
+ COc1cc(C(C)=CC(=O)O)oc(=O)c1C
1074
+ CN(CCC1CN(C(=O)Nc2ccccc2)CCC1CC(=O)O)c1ccccc1
1075
+ O=C(O)C1=CC(OP(=O)(O)O)C(O)C(O)C1
1076
+ CC(C=CC=C(C)C=CC1(O)C(C)(C)CCCC1(C)O)=CC=CC=C(C)C(=O)O
1077
+ CC(=O)c1ccc(NC(=O)NC2COC3C(OC(=O)Nc4ccccc4)COC23)cc1
1078
+ C#CC#CC=C=CC=CC=CCC(=O)O
1079
+ NC(N)=NCCCC(N)[PH](=O)O
1080
+ CC(C)CCCCCCCCCCCCOC(=O)CC(C)C
1081
+ C=C(C(=O)OC)C(O)CO
1082
+ CC(CCC(=O)O)C1CCC2C3CCC4CC(O)CCC4(C)C3C=CC12C
1083
+ Cc1cc(CCCCCOc2c(Cl)cc(C3=NCCO3)cc2Cl)on1
1084
+ COc1cccc(-c2cc(=O)c3cc(OC)ccc3o2)c1
1085
+ Oc1ccc2c(c1)Cc1ccccc1-2
1086
+ CC1=CC(C)C2COC3(CCCC3)C1C2C
1087
+ COc1c(O)cc2ccc(=O)oc2c1OC
1088
+ Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2
1089
+ CC(NC1=NC(C)(C)Cc2ccccc21)C(=O)O
1090
+ CCCCCCCCCNS(=O)(=O)O
1091
+ C=C1CCCC(C)CCC2=C(C)C3C1CCC3(C)C(O)C2O
1092
+ CC(C)C=Cc1c(O)cc(-c2cc3ccc(O)cc3o2)cc1O
1093
+ CCCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)CCCCCCCCCCCCCCCCCCCCCCC
1094
+ Oc1ccc2c(c1)CCNC2c1ccc(F)cc1
1095
+ O=C(CCl)Nc1ccccc1C(F)(F)F
1096
+ COC(=O)C(C)C1CCC(C)(CCC(=O)C(C)(C)CCCC(C)=O)OO1
1097
+ CC(CC(=O)O)c1ccc(C(=O)O)cc1
1098
+ Cc1ccc2c(c1)C1CN(C)CCC1N2S(=O)(=O)c1cccnc1
1099
+ CCC(O)C=CC=CCCc1cc(=O)c2ccccc2[nH]1
1100
+ CCCCCCCCCCCCCCCCC=CCCC=CCCCCC1OCC(N)C1O
1101
+ N=c1occ2cc[nH]nc1-2
1102
+ CCC(=O)C(C)C(C)=O
1103
+ CCCCC(=O)OCCCC(C)C
1104
+ COC(=O)C=C(C(C)=O)C1CCC(C)(Cl)C(Br)C1
1105
+ COc1cc(C=Cc2cc(OC)c3c(c2)OC(C)(C)C=C3)ccc1O
1106
+ CC(C)=CCCc1cc2c(c(O)c1O)C(=O)c1ccccc1C2=O
1107
+ CC(C)NCC(O)COc1ccc2[nH]cc(CCN)c2c1
1108
+ COC1C=CC(CO)(OC)O1
1109
+ CC1(Cl)CCCCC1O
1110
+ CC(C)C1CC(O)C2(C)C3CC(O)C2(C)C(=O)C13
1111
+ O=C1OC2C(CO)OC(O)C(O)C2(O)C1=Cc1ccc(O)cc1
1112
+ CSC1OC(Cn2cnc3c(=O)[nH]cnc32)C(O)C1O
1113
+ COC1C(C)OC(Oc2cccc3c2NC(=O)CC3)C(O)C1O
1114
+ CC(C)C12CCC(C)(O)C3CCC(C)(O)C3C1O2
1115
+ Cc1cccc(O)c1C(=O)OC1C(=O)C(CO)=CC(O)C1O
1116
+ Cc1cc2c(cc1O)[nH]c1c3c(ccc12)OC(C)(C)C=C3
1117
+ O=[N+]([O-])c1nccn1C1SC(CO)C(O)C(O)C1O
1118
+ CC1C(=O)c2c(O)cc(O)cc2OC1c1ccc(O)c(O)c1
1119
+ Cc1ccc(NC(=O)NCCCCNC(=O)N(O)c2ccccc2)cc1
1120
+ CC1=CC(=O)C(O)C(C)(C)C1CCC(C)O
1121
+ CCN(CC)c1ccccc1
1122
+ O=C1C2CCCCN2C(=O)N1CCCN1CCN(c2ccc(F)cc2)CC1
1123
+ CC1(C)CC23CC(=O)CC2C1CCC3C(=O)O
1124
+ CN1C(=O)CNC(=O)c2c1ncn2C
1125
+ O=C1CCc2cccc3c2N1CC3
1126
+ COc1ccc(C(=N)S)cc1
1127
+ COc1cc2ccc(=O)oc2c(OC)c1OC
1128
+ CC1C(=O)OC(CCc2ccccc2)CC1O
1129
+ COc1ccc2c(c1OC)C(=O)c1ccccc1C2=O
1130
+ CCc1ccc(C2CC3CCC(C2C(C)=O)N3C)cc1
1131
+ COc1cccc2c3c([nH]c12)C(=O)NC(=O)C3=O
1132
+ CC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C
1133
+ CN1CCC(N2CCC(c3nnc(C(C)(C)C)o3)C2)CC1
1134
+ Cc1oc2cc(O)ccc2c(=O)c1-c1nc2ccccc2n1C
1135
+ O=C(O)CCCCCO
1136
+ CC1CN(c2ccncc2)Cc2ccccc21
1137
+ CCC1Cc2cc(OC)c(O)cc2C(c2ccc(O)c(OC)c2)C1CC
1138
+ COc1ccc2[nH]c3c(c2c1)CN(C(=O)C=Cc1cccs1)CC3
1139
+ COc1c(O)cc(-c2ccccc2)cc1CC=C(C)C
1140
+ CC(=O)c1ccc2c(c1)C=CC(C)(CO)O2
1141
+ CCCCC=CCC=O
1142
+ COc1ccccc1-c1coc2c(CN(C)C)c(O)ccc2c1=O
1143
+ CC1(C)CCc2cc(CC(=O)NCC3CCCN4CCCCC34)ccc2O1
1144
+ CCC=CC=CCCC=CCCCCCCCC(=O)O
1145
+ CCN(CC)c1ccc2c(C)cc(=O)oc2c1
1146
+ O=C1OC(O)C(Cc2ccccc2)=C1c1ccccc1
1147
+ O=c1oc2ccc(O)c3c(=O)oc4c(O)ccc1c4c23
1148
+ CCCCCCCC/C=C\CCCCCCCCOC(=O)N1CCOC(CCCCCCCCCCCC)C1
1149
+ CC1=CC23CCC1C(C)(C)C2CCC3C
1150
+ CCCCCCCCCCCCCCCCCCCCCCCCC(=O)CCCCC
1151
+ CC1OCC(=O)C(O)C1=O
1152
+ CC(=O)Nc1cccc2c1ccn2CC(=O)NCC(C)C
1153
+ CCCCC=CC=CC#CC#CC=CCO
1154
+ CC12CCCC3C1C(C)(CCC2)N(C=O)C3(C)CO
1155
+ CC=CC1OC2C(C)OC(=O)C2C(O)C1O
1156
+ Cc1nn(C)c2nc(C(C)C)cc(C(=O)NCC3CCCN4CCCCC34)c12
1157
+ COc1cc(OC)c(C(C)=O)c2c1C=CC(C)(C)O2
1158
+ C(=NNc1ccccc1)c1ccc[nH]1
1159
+ CC(O)C(C)(O)C(C)O
1160
+ CC1=C2CC3(C)C(=CC2OC1=O)C(O)CCC3C
1161
+ CC(C)C(C)CCC(C)C1CCC2C3CCC4CCCCC4(C)C3CCC12C
1162
+ C=CC1CC(C)(C)C(O)C1=C(CO)C1COC(=O)C1
1163
+ CCC(C)CNC(N)=S
1164
+ COc1ccc2c(ccn2CCC(=O)NC(CC(C)C)C(=O)O)c1
1165
+ CCc1ccccc1NCN1C(=O)c2ccccc2C1=O
1166
+ c1cncc(O[C@H]2CCNC2)c1
1167
+ CC1=C(C=O)NC(=C2C=CC=CC2=O)S1
1168
+ CN1C(=O)c2cccnc2OC2CN(Cc3ccccc3)CC21
1169
+ CC(C(=O)O)c1cccc(Oc2ccccc2)c1
1170
+ CC(=O)NC1C(O)OC(C)C(O)C1O
1171
+ O=C(O)C1CSC(c2nc3ccc(O)cc3s2)=N1
1172
+ CC(C)=CC1C(C(=O)NCc2ccccn2)C1(C)C
1173
+ CC1=CCC(C)(CCC(=O)C(C)C)C=CCC(C)=C(C)C(=O)C1
1174
+ CC(C)=CC1OC(=O)C(=CCC(O)C(C)=CCCC(C)=CCO)C1O
1175
+ OCC=CC#CC#CC(O)C=CCCCCO
1176
+ CC1(C)C2CCC(C(=O)O)C1C2
1177
+ C1=C(c2ccccc2)CCN(CCCCc2c[nH]c3ccccc23)C1
1178
+ SC1=NCCCN1
1179
+ Cc1cccc2c1C(=O)NC2=O
1180
+ CC1=CC(CO)C(C)(C)C12CCCC2(C)C
1181
+ COC(=O)c1ccc(OC(=O)c2ccccc2Cl)cc1
1182
+ COC(=O)C(=CCCC(=CCCc1ccoc1)C(=O)OC)CCC=C(C)C
1183
+ CCCCCCCC(O)CCCC1Cc2cc(O)cc(O)c2CO1
1184
+ COc1cc(C)c(Br)c(C)c1
1185
+ CC1CCC(C(C)C)C2CC(C)(N)CC=C12
1186
+ CC(C)NC(=O)NC1C2COC(O2)C(n2cccn2)C1O
1187
+ COCC1CCCN1CC1CN2CCC1CC2CNC(=O)Nc1ccccc1
1188
+ OC12CC3CC(CC(C3)C1)C2
1189
+ CC(=O)c1cc2c(CO)cc(O)cc2oc1=O
1190
+ CCN(CC)CCOC(=O)C(O)(c1ccccc1)c1ccccc1
1191
+ NC1CCCN(P(N)(=O)NS(=O)(=O)O)C1=O
1192
+ N=Cc1coc2cncn12
1193
+ C=C1CCC2C(C)(C)CCC(OC(C)=O)C2(C)C1COC(C)=O
1194
+ CCCCCCCC=CCC=CCC1OC1CC
1195
+ CCNC(=O)c1ccc(COC(COCc2ccc(OC)cc2)Cn2ccnc2)cc1
1196
+ CCCCCCCCC=CCCCCCCCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCCCCCCCCCCC
1197
+ CN1CCN(C(c2ccccc2)c2ccccc2)CC1
1198
+ NC(=O)c1ccccc1[NH+]([O-])O
1199
+ CC(C)=CCCC(C)=CCCC1(C)C=Cc2cc(O)ccc2O1
1200
+ Nc1ncnc2c1ncn2C1OC(CSCC(N)C(=O)O)C(O)C1O
1201
+ Cc1nccc2nonc12
1202
+ S=C=NC=CCCCCCCCCCCCCCCCC=CN=C=S
1203
+ O=C(O)C=CC=CCCCCC(=O)O
1204
+ CCOC(O)=Nc1c([NH+]([O-])O)cc(Cl)cc1[NH+]([O-])O
1205
+ CCC(O)c1cccc(O)c1CN
1206
+ CC1=C2C(=CC1)C(C)(O)CCC1C(C)C(=O)OC21
1207
+ COc1c(O)c(O)cc2c1-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2
1208
+ C=CCCCCCCCCCCC(O)CC(O)CCOC(C)=O
1209
+ C=C(C)CC1CCC(C)C1
1210
+ CCCCCCCCCCCCOc1ccc(N2C(N)=NC(N)=NC2(C)C)cc1
1211
+ C=CC(=C)CCC1C(=C)CCC2C(C)(COC(=O)C=Cc3ccc(O)cc3)CCCC12C
1212
+ NC(CC(O)(Cc1c[nH]c2ccccc12)C(=O)O)C(=O)O
1213
+ C=CCn1ncc2c(C)nc(Cc3ccccc3)n2c1=O
1214
+ CCOCCn1c(N2CCN(CC)CC2)nc2ccccc21
1215
+ COC(=O)c1cccc2nc(C3CCN(C4CCCCC4)C3)oc12
1216
+ O=C(C=Cc1ccccc1Cl)c1ccc(OC(=O)c2ccccc2Cl)cc1
1217
+ NNc1ccc([NH+]([O-])O)cn1
1218
+ COC(=O)c1ccccc1N(C)C
1219
+ O=C1C=CC2CCCN12
1220
+ CC1(C)C=Cc2cc(OC3OC(CO)C(O)C3O)ccc2O1
1221
+ CCCCCC(C)CCCC(C)CCCC(C)=CCOP(=O)(O)O
1222
+ Cc1ccc(-c2cc(=O)c3cc(NC(=O)c4ccccc4Br)ccc3o2)cc1
1223
+ CCOC(=O)Cc1nc(-c2ccc(N=C=S)cc2)no1
1224
+ C=CC(C)=CC=CC(C)=CC=C1C(C)=CCCC1(C)C
1225
+ CCCCCCCCC(O)C(C)C(=O)O
1226
+ Cn1nc(C(C)(C)C)cc1C(=O)NCC1CCCN2CCCCC12
1227
+ CN(C)CCOc1ccc2c(c1)CCCC(c1ccccc1)=C2c1ccc(O)cc1
1228
+ COc1ccc2c(c1OC)C(=O)OC2CC(=O)Nc1ccc(O)c(C(=O)O)c1
1229
+ COC(=O)CCC(=O)CC=CCc1ccccc1
1230
+ C=C(CCC=C(C)CO)C1CCC(C)(O)C1C
1231
+ CCCCCCCC=CC#CC#CCCCC(=O)OC(C)C(=O)O
1232
+ C=C(C)C1CCC(C)(O)C2CCC(C)(O)C2C1
1233
+ CC(=O)NCCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(C)=O
1234
+ CC1(CO)CC2C=C(C(=O)O)C3CCC3(C)C2C1
1235
+ N#CCCn1nc(-c2ccc(OCc3ccccc3)cc2)oc1=S
1236
+ CCC(C)CCCCCCCCCCC(=O)OC
1237
+ CC1=C2CC(C)(CO)CC2CC2(C)CCC12O
1238
+ c1ccc(CCc2nn3c(-c4ccco4)nnc3s2)cc1
1239
+ C=CC1(C)C=C2CCC3C(C)(C)CCCC3(C)C2CC1
1240
+ OC1CCCc2nc3ccccc3c(NCc3cccs3)c21
1241
+ Cn1nc(CC(=O)NCC2CCCN3CCCCC23)c2ccccc2c1=O
1242
+ CC1NC2(CCCCC2)CC(C)(C)c2ccccc21
1243
+ CC(C)=CCc1ccc(O)c(CO)c1-c1ccc(C)o1
1244
+ CC=CC=CC(O)=C1C(=O)C2(C)C(=O)C(C)(O)C1N1CCCC12
1245
+ CC1CCC2(C)CC(O)C3(C)CCC4C3=C2C1CCC4C
1246
+ COc1ccccc1C=CC(=O)Nc1cccc2ncccc12
1247
+ COc1ccc(CN2CC3CN(C(=O)C(C)(C)C)CCN3C(C)(C)C2)cc1
1248
+ CCC(C)C(=O)OCC1OC(Oc2ccc(C(=O)O)cc2)C(O)C(O)C1O
1249
+ O=c1c(-c2ccccc2)nccn1C(CN1CCCC1)c1ccccc1
1250
+ CC1NC(CCCCCCCCCCCC(=O)O)CCC1O
1251
+ COc1cc2c(cc1OC)CCN(C)C(C(=O)c1ccccc1)=C2
1252
+ CCCC=CCOC(=O)CCC
1253
+ C=C1CCCC1(C)C1(C)CC=C(CO)CC1
1254
+ NC(CS(=O)Cc1ccccc1)C(=O)O
1255
+ O=C(O)C1=CN(CCOC(=O)c2cc(Br)c[nH]2)CC=C1
1256
+ Oc1ccc2c(c1)C[C@@H]1c3ccc(O)cc3CC[C@H]21
1257
+ CC(CCCCCCCCCCCCCCCCCCCCC(=O)C(C)C(=O)NCCO)OC1OC(C)C(O)CC1O
1258
+ C=C(C)CC=Cc1ccoc1
1259
+ COCC(=O)NC1C(c2ccccc2)N(Cc2nccs2)CCC1(C)O
1260
+ CC(=O)N1CCN(CC2OCC(NC(=O)NC3CCC3)C2O)CC1
1261
+ CCC(C)C(NC1=NC(C)(C)Cc2ccccc21)C(=O)O
1262
+ O=C(Nc1nccs1)C1CN2CCC1CC2Cn1cc(CO)nn1
1263
+ CCCCCCCCCCCCCCCCCCCC=O
1264
+ CC=CCCCCCCCCCCc1oc(=O)cc2c1C(=O)OC(C)C2
1265
+ CC(C)C(=O)NCCNCC(O)COc1ccccc1
1266
+ O=C(NC1CCC(C(=O)O)CC1)OCc1ccccc1
1267
+ CSCSc1nc(O)c(C)c(Cc2ccccc2)n1
1268
+ CCOC(=O)CC(CCc1ccc(O)cc1)OC(C)=O
1269
+ CC(=O)NC(C)C(=O)CCC(=O)O
1270
+ Cc1cc(O)cc2c1OC(C)(CCCC(C)CCCC(C)CCCC(C)C)CC2
1271
+ CC(C)N1CCC(c2nnc(-c3cncn3C)o2)C1
1272
+ c1coc(-c2nnc(C3CCN(Cc4ncc[nH]4)C3)o2)c1
1273
+ CCCCc1cc(OC)c(CC(C)N)cc1OC
1274
+ C=CC1C(OC2OC(CO)C(O)C(O)C2O)OC=C2C(=O)OCCC21
1275
+ CNCCc1nc(-c2cn(C)c3ccccc23)no1
1276
+ Cc1ncc(-c2ccccc2F)cn1
1277
+ CC(=O)NCC1OC(CC(=O)NCc2ccc(F)cc2)C(O)C1O
1278
+ CC(C)=CCCC(C)=CCc1cc(C(=O)O)ccc1O
1279
+ CC(=O)N(O)CCCCCNC(=O)C(O)(CC(=O)O)CC(=O)NCCCN(O)C(C)=O
1280
+ C=C(C)C1=CC2=C(C=CC2=C)C(C)=CC1
1281
+ CCS(=O)(=O)CCN1CC2CC(C1)c1cccc(=O)n1C2
1282
+ COc1ccc(-c2noc(C(C)NC(=O)c3ccccn3)c2C(=O)O)cc1
1283
+ CCCCCCCCCCCCCCCC(O)CC(O)CO
1284
+ O=C(C=Cc1ccc(O)cc1)Oc1ccc(O)cc1
1285
+ O=C(CNCCc1cc2ccccc2[nH]1)Nc1c2c(nc3ccccc13)CCC2
1286
+ CSCC(=O)NC1CCN2C(=O)c3cc(-c4ccsc4)ccc3NC(=O)C12
1287
+ CC1(C)c2cc(O)c(O)cc2CCN1C(CO)CO
1288
+ COc1ccc(C(=O)CN2CCc3cc(OC)c(OC)cc3C2)cc1
1289
+ Cc1ccc(S(=O)(=O)OCC2C3CCC(C3)C2(C)C)cc1
1290
+ COc1ccc(C2C3CCCCC3(O)CCN2C(=O)C=Cc2ccccc2)c(OC)c1
1291
+ C=Cc1nccc2c1[nH]c1c(OC)cccc12
1292
+ CC(C)=CCCC(C)=CCCC(=CCc1cc(O)ccc1O)C(=O)O
1293
+ Cc1c(C)c2ccc(OC(C)C(=O)NCC(=O)O)cc2oc1=O
1294
+ CC1(C)CCCC2(C)C1CCC1CC3CC12CCC3(O)CO
1295
+ C=C1C=CC(OC)C(C)CC(=O)c2c(C)coc2C1
1296
+ CC1(CN)c2ccccc2Cc2ccccc21
1297
+ CCC=C1CC2C(O)Nc3cc(O)c(OC)cc3C(=O)N2C1
1298
+ COc1c2occc2c(OC)c2c(=O)ccoc12
1299
+ CC1CC2=C(CC1C1OCC(CN3CCCCC3)O1)C(C)(C)CCC2
1300
+ Cc1cc(=O)oc2cc(OCC(=O)NCCCO)ccc12
1301
+ CC=C(C)CSC
1302
+ Nc1ncc(Cc2ccc3c(c2)CCCN3)c(N)n1
1303
+ CC12CC3OCCC3(CCO1)O2
1304
+ CCCCCCCOc1ccc(C(=O)O)cc1CC(=O)C(F)(F)F
1305
+ COc1cccc(Sc2ccc(NC3=NCCN3)cc2)c1
1306
+ O=P(O)(O)c1ccccc1OCCOc1ccccc1P(=O)(O)O
1307
+ CCOC(=O)CCN1CCN(c2ccccn2)CC1
1308
+ CC(=O)OCC(O)C(O)C1OC(O)(C(=O)O)CC(O)C1N
1309
+ [O-][NH+](O)c1ccc(N=Cc2ccccc2O)cc1
1310
+ O=c1cc(-c2cc3ccccc3o2)c2cc3c(cc2o1)CCCC3
1311
+ COC12C3=CC(O)CC1c1cc4c(cc1C[NH+]2CC3)OCO4
1312
+ O=C(CCc1ccc(O)cc1)n1cccc1
1313
+ CN(C)Cc1c(O)ccc2cc(-c3ccc(Cl)cc3)c(=O)oc12
1314
+ O=c1occ(CN2CCc3ccccc3C2)c(O)c1CN1CCOCC1
1315
+ CC1CC2C3=CCCN4CCCC(C(=O)C2O)C34C1
1316
+ C=CC1(C)CC(O)C2(C)C(C1)C(=O)CC1C3(C)CCCC12OC3=O
1317
+ Cc1ccc2c(c1)c1c3n2CCN=C3CCC1
1318
+ Cc1cc(O)cc2c1C(=O)C=C(O)C2=O
1319
+ OCC1NC(O)C(O)C(O)C1O
1320
+ CCCCOC(=O)CCC1OC(=O)C(O)=C1c1ccccc1
1321
+ COc1cc2oc(=O)ccc2cc1C(O)C(O)C(C)(C)O
1322
+ NC(CCCCNCC(=O)c1ccco1)C(=O)O
1323
+ Brc1cccc2cc(C=NNc3ccccc3)ccc12
1324
+ O=C(O)C1Cc2c([nH]c3ccccc23)C(c2cccc(O)c2)N1
1325
+ OCC=CC#CC#CCCO
1326
+ CCC(C)=CC=CC1CC2OC(C=C=CBr)CC2O1
1327
+ Nc1nc(N)c2ccn(COCCO)c2n1
1328
+ COc1ccc(C=O)cc1CN1CCCCC1c1cccnc1
1329
+ Cc1ccc(C(=O)c2ccc(Cl)cc2)c(O)c1
1330
+ CC(C)C(N)C(=O)NC(C(=O)O)C(C)O
1331
+ O=C(CCCCC1CCSS1)Nc1ccc(N2CCCS2(=O)=O)cc1
1332
+ CC1(C)CC(=O)CC(C)(C)N1Cc1ccccc1
1333
+ CCC(C)C(=O)OCC(C)(OC)c1ccc(C)cc1O
1334
+ CC(C)=C1C=CC(C)C2CCC(C)C2C1
1335
+ CC(C)(C)c1cc(-c2nnc(O)s2)cc(C(C)(C)C)c1O
1336
+ COc1ccc(N=O)c(C=CN(C)C)n1
1337
+ O=C(O)CCC(NC(=O)c1cncc(O)c1)C(=O)O
1338
+ CCOC(=O)C1C(=O)C=C(C=Cc2ccco2)CC1c1ccccc1
1339
+ COC(=O)C1C2C=CC3(CN(Cc4ccco4)C(=O)C13)O2
1340
+ CNC(=N)NC(CCCC(=O)CC(=O)CCc1ccc(O)c2c1CCCO2)CCn1ccnc1
1341
+ Clc1ccc(-c2cc(Cl)c(Cl)c(Cl)c2)cc1Cl
1342
+ C=CC1CC=C(CC)C(C(=O)OC)C2CC(=O)C=CCC12
1343
+ CCN1CCC[C@@H](c2cccc(O)c2)C1
1344
+ COCCN(CC1CCCN2CCCCC12)C(=O)c1ccccc1-n1nnnc1C
1345
+ Cc1cccc(CN2CCN3C(=O)NCC3C2)n1
1346
+ CC12CCC3c4ccc(O)cc4CCC3C1CCC2=O
1347
+ O=c1c(-c2ccccc2)coc2c(CN3CCOCC3)c(O)ccc12
1348
+ CC(C)(C)CNC1CC(Cc2cc(CN3CCOCC3)on2)C1(C)C
1349
+ COc1cc(CC=CC#Cc2ccc(O)cc2)ccc1O
1350
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
1351
+ CCCCCC1C=CC(CCCCCC(CO)C(=O)O)C(O)C1
1352
+ CC(CNc1ccc(C(=O)O)cc1O)C(=O)O
1353
+ COc1cc(OC)c2cc(C(=O)O)n(C)c2c1
1354
+ CC=CC1C=CC(=O)C(O)C1C(=O)C(C)O
1355
+ C=C1C(O)CC2C3(CO3)CC3OC(=O)C(C)C3CC12O
1356
+ CCCCCCC(C)CC(C)(C)C
1357
+ CC(=O)N=C(N)Nc1nc(C)c2cc(C)ccc2n1
1358
+ CCOC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1ccccn1
1359
+ N#CC(C(=O)O)C1C(=O)Nc2ccccc21
1360
+ CC(=O)C=Cc1ccc(O)c(O)c1
1361
+ N#Cc1ccccc1NC(=O)Nc1ccccc1O
1362
+ CCCCCCC=CC#CCCCCCCC1CC(CO)OC1=O
1363
+ CC(O)CCCC(O)C=CC1C(O)CC(=O)C1CC=CCCCC(=O)O
1364
+ Cn1c(O)c(C(=O)O)c(=O)c2ccccc21
1365
+ COc1c2c(cc3c1C(CC(=O)C=CC1=C(C)CCCC1(C)C)N(C)CC3)OCO2
1366
+ Cn1ccc2c(NC(=O)CC3NC(=O)N(CCc4ccccc4)C3=O)cccc21
1367
+ CC(O)c1c(-c2ccc(Cl)cc2)noc1C(=O)N1CCCC1
1368
+ C=C1CCC2COC(C3CCC=C(C)C3)C1C2
1369
+ CC(=O)C1CC(=O)C(C)C(Cc2ccccc2)C1
1370
+ O=C(Nc1ccc2c(c1)OCCO2)NC(Cc1c[nH]c2ccccc12)C(=O)O
1371
+ C=CC1(C)C=C2C(O)C3OC(=O)C4(CCCC(C)(C)C34)C2(O)CC1O
1372
+ CCCCSSCC
1373
+ NC(=O)[C@H]1CCCN1C(=O)[C@H](CCC(=O)O)NC(=O)[C@H]1CCC(=O)N1
1374
+ C=CCCCCC=CC=CC#CC=CC(=O)C=C
1375
+ CCCCCCCCCCCCCCCCCCCC(=O)CCCCCCCCCC(=O)O
1376
+ CC(=O)OC1CC(C)=C2C(C=C(C)C(=O)O)CCC(C)C21
1377
+ OC1C(NCc2ccncc2)C2COC(O2)C1N1CCOCC1
1378
+ CC(C)=CC(=O)C(C)c1ccc(C)c(O)c1
1379
+ CCCCCCN(CCCCCC)C(=O)NC(=O)C(F)(F)F
1380
+ COc1cc(N)c(Cl)cc1NC(=O)C1CCN(CC2CCC2)CC1
1381
+ CC1c2ccoc2CC2C1CCC1C(C)(C(=O)O)CCCC21C
1382
+ CCc1cnc(C)s1
1383
+ O=C1CN=C(c2ccccc2Cl)c2cc([NH+]([O-])O)ccc2N1
1384
+ CC(=O)N1CC2COCCN2C2(CN(CC(C)C)C2)C1
1385
+ CCCCCC1CCCC1
1386
+ Cc1ccc(C(=O)c2ccc3n2CCC3C(=O)O)cc1
1387
+ CN(C)CCN(C)CCC1CN(C(=O)Cc2cccs2)CCC1CC(=O)O
1388
+ Cc1ccc(O)c(C(O)Cc2ccccc2)c1
1389
+ COc1ccc(C(Cc2ccc(N)cc2)n2ccnc2)cc1
1390
+ O=C1CCc2ccc(O)c(OC3OC(CO)C(O)C(O)C3O)c2O1
1391
+ O=S(=O)([O-])Nc1ccc(-c2nc3ccccc3s2)cc1I.[Na+]
1392
+ NC(Cc1cnc[nH]1)C(=O)NC(CO)C(=O)O
1393
+ Cc1c(O)cc(O)c2c1C(C)(O)C(C)OC2=O
1394
+ CC1C(=O)OC2C1CCC(C)(O)C2O
1395
+ O=C1NC(Cc2c[nH]c3ccccc23)C(=O)N1CCc1ccccc1
1396
+ CCC=CC=CC=CC=O
1397
+ O=c1cc(-c2ccccc2)c2ccc(O)c(CN3CCCCC3)c2o1
1398
+ O=C(CCNC(=O)N1CCc2c([nH]c3ccccc23)C1)NC1CC1
1399
+ CCC(C)C1SCSS1
1400
+ Cc1cccc(Nc2c3ccccc3nc3ccccc23)c1
1401
+ CC1CC(O)C2C(C=O)=COC(OC3OC(CO)C(O)C(O)C3O)C12
1402
+ C[C@@H](NC(=O)c1ccccc1)C(=O)O
1403
+ CCCCOC(=O)CC(O)(CC(=O)OCCCC)C(=O)OCCCC
1404
+ O=C(O)C(=O)/C=C(\O)c1cccc(Br)c1
1405
+ COc1cc2c(cc1OC)CN(C(=O)NCCC(=O)O)CC2
1406
+ S=C=Nc1ccc(-c2noc(C3CCCCC3)n2)cc1
1407
+ CC(=O)NC[C@H]1CN(c2ccn(-c3ccc(F)cc3)c2)C(=O)O1
1408
+ CCCCCCCCCCCCC(O)C(O)CCC=CCCC=CCCC(O)CCCCCC(O)CC1=CC(C)OC1=O
1409
+ CCCCC1C(=O)C(C)=C(C)C1(O)CCCCCCC(=O)OC
1410
+ CC(=O)OC1CC(C)=CCCC(C)=CC2OC(=O)C(O)(CO)C21
1411
+ O=c1oc2cc(O)ccc2c2c(O)cc(O)cc12
1412
+ C=C(OC1C=CC(C(=O)CCC(=O)O)C(C(=O)O)C1O)C(=O)O
1413
+ CCc1cc(O)c(O)c(-c2cc(O)c(O)cc2CC)c1
1414
+ C=C1CC2C=CC3(CC(C)CC13)C1C2C1(C)C(N)C=O
1415
+ CCCC1NCCc2c1[nH]c1ccc(O)cc21
1416
+ COc1cc(C=CC=O)cc(CC2OC2(C)C)c1O
1417
+ CC1=CCC2C(C1)CN1CCc3c([nH]c4ccccc34)C21
1418
+ N=Cc1c[nH]c2ocnc12
1419
+ Oc1c(Cl)cc(I)cc1Cl
1420
+ CCCCCC=CCC(O)C(O)C=CC1CC1C1CCCC(=O)O1
1421
+ CCCCCCC=CCCC=CCCCCCCCCCCCCC=CCCC=CCCCC(=O)O
1422
+ CCCN(C)C(=O)c1c(-c2ccc(F)cc2)noc1C(C)O
1423
+ COC(=O)CCCCCCCCC(C)=O
1424
+ CC(CCC(=O)NC(CC(=O)O)C(=O)O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C
1425
+ COc1ccc(-c2nnc(C3CCN(Cc4ccccc4)C3)o2)cc1
1426
+ CCCCCCCCC=CCCCC1CCCC(C)N1
1427
+ COC(=O)CN1C(=O)N2CCc3c([nH]c4ccccc34)C2(C)C1=O
1428
+ CCCC1C(c2ccc(F)cc2)CC2C[C@@H](F)C1N2C
1429
+ CC12CCC3c4ccc(O)cc4CCC3C1CC(=NO)C2=O
1430
+ C1CCCC(C2CCCCCC2)CC1
1431
+ CCCCCCCCCCCCCCCCCCCCCC(O)CCO
1432
+ COC(=O)C(Cc1cccc(I)c1)NC(=O)OC(C)(C)C
1433
+ N/C(=N/CCCC(N)[PH](=O)O)N[N+](=O)[O-]
1434
+ O=C(NCC1(COc2cccnc2)CC(O)C(O)C1)c1cnccn1
1435
+ CC1=CC2OC3CC(OC(=O)C=CC=CC(O)C(C)O)C(C)(C2(C)CC1)C31CO1
1436
+ CCCCC1(CCCC)C(=O)N=C(Nc2cccc(OC)c2)N=C1O
1437
+ CCCCCC1SSC(C)S1
1438
+ COc1ccc2[nH]c3c(c2c1)CN(C(=O)C1CCCO1)CC3
1439
+ COc1ccc(C=CCOC2OC(CO)C(O)C(O)C2O)c(OC)c1OC
1440
+ O=C(O)c1cccc(C(=O)CO[NH+]([O-])O)c1
1441
+ O=S1(=O)CCC(Br)C1
1442
+ Cc1ccc(NC(=O)c2oc3ccccc3c2C)cc1
1443
+ C#CC=CCCCCCCCCCCCCCC=CCCCCC=CCCCCC#CC(O)C#CCCCCCCC=CC(O)C#C
1444
+ CCOC(=O)Cc1cc(O)cc(O)c1C(=O)CCCCCC(C)=O
1445
+ COC1(C)Oc2cc(O)cc3cc(C)nc(c23)C1=O
1446
+ OC1CN=C2C=CC=CN2C1
1447
+ CC1CCC2C3CCC4CC(O)CCC4(C)C3CCC2(C)C1OS(=O)(=O)O
1448
+ CCCCCCC=CCCCC=CCCC(=O)O
1449
+ CC1=C(CCC2=CC=CC(O)=COC2)C2(C)CCCC(C)(C)C2CC1
1450
+ Cc1cc(C)c(CC(O)c2cc3ccccc3o2)c(C)c1
1451
+ CCCn1cnc2c1c(=O)n(CCCCC(C)=O)c(=O)n2C
1452
+ C=C(C)c1ccc(O)c(OC)c1
1453
+ CCCCCCC=CCC=CCCCCCCCCC(=O)O
1454
+ Cn1c(NC(=O)C(F)(F)F)cc(=O)n(C)c1=O
1455
+ CCCCCCCCSCC(O)CN1CC2CC(C1)c1cccc(=O)n1C2
1456
+ COc1cc(O)c2c(c1C)C(C)(O)C(CO)OC2=O
1457
+ CC1(C)CCCC2(C)C1C(OC(=O)C=CC(=O)O)C=C1COC(=O)C12O
1458
+ NC(Cc1ccc(O)c(Br)c1)C(=O)O
1459
+ O=C(COC(=O)c1ccccc1)c1ccccc1
1460
+ CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1
1461
+ CC12CCC3c4ccc(OC#N)cc4CCC3C1CCC2=O
1462
+ CC1(C)CC2C1CC1OC1(C)C(=O)CCC2(O)CCl
1463
+ CCCCCCCCCCCCCCCCCCCC1Oc2c(C)c(C)c(O)c(C)c2S1
1464
+ CCCCCCCCC=CCCCc1cc(=O)c2ccccc2n1C
1465
+ CNC1CCC23CC24CCC2(C)C(C(C)NC(C)=O)CCC2(C)C4CCC3C1
1466
+ O=C(Nc1ccccc1C(=O)NC(Cc1ccc(O)cc1)C(=O)O)c1ccccc1
1467
+ OCC1OC(c2ccc(O)cc2)C=C1c1ccc(O)cc1
1468
+ Fc1ccc(-c2cc(NCCCCN3CCCC3)c3ccccc3n2)cc1
1469
+ CCOC(=O)C(NCC(O)COc1ccc(CCC(=O)OC)cc1)C(=O)OCC
1470
+ CCOC(=O)C1C2C=CC3(CN(CC4CCCO4)C(=O)C13)O2
1471
+ COc1ccc(C=C(C#N)C(=O)OC(C)C)c(OC)c1OC
1472
+ NC(CCCNO)C(=O)O
1473
+ COc1ccc(C=CCc2ccc(O)c(OC)c2)c(O)c1
1474
+ O=C1OCC(Cc2ccc3c(c2)OCO3)C1=Cc1cccc2c1OCO2
1475
+ CC(CCCC(=O)O)c1ccccc1
1476
+ CC1(C)CCCC2(C)C3CCC4(C)C(CC=C5COC(O)C54)C3=CCC12
1477
+ CCC(C)C(=O)OCC1CC(=O)OC(C)C1C
1478
+ CC(=O)OCCN(CCOC(C)=O)CC(c1ccccc1)c1ccccc1
1479
+ CCOC(=O)COc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
1480
+ S=C=NCCCCc1ccccc1
1481
+ COC(=O)CC(=O)OCC1=CCCC2C1(C)CC(O)C(C)C2(C)CCC1COC(OC)C1
1482
+ COc1cc(O)cc2oc(C)c(C)c(=O)c12
1483
+ COC(=O)C(NC(=O)c1ccccn1)C(C)O
1484
+ COc1cc(OC)cc(-c2cc(=O)c3c(OC)cccc3o2)c1
1485
+ NC(=O)CCNC(=O)C1=CC(N)C(O)C(O)C1
1486
+ Cc1ccccc1COc1ccc2oc(C)c(C(=O)O)c2c1
1487
+ CCCNC(=O)Nc1cccc2c1CN(C)CC2c1ccccc1
1488
+ C[C@H](N)Cn1ccc2cc(F)c(F)cc21
1489
+ CC(=O)C1C(O)CCC23OC2C(O)CC(C)C13C
1490
+ O=c1ccc2cc3c(-c4ccc5c(c4)OCCCO5)coc3cc2o1
1491
+ O=C1Cc2cc([NH+]([O-])O)ccc21
1492
+ COc1c2occc2cc2c(OCC=C(C)C)cc(=O)oc12
1493
+ CNC(=O)C(C1CC1)N1CCCC1C(=O)NC(C)C
1494
+ CN(C)C(=O)Oc1ccc2c(c1)OC(=Cc1cccc(Cl)c1)C2=O
1495
+ COc1cc(-n2cc(C(C)=O)c3ccccc32)ccc1C(N)=O
1496
+ CC1(C)CCc2cc(CC(=O)Nc3ccc4[nH]ccc4c3)ccc2O1
1497
+ CC1(C)CCc2c(c(O)cc3oc4cc(O)cc(O)c4c(=O)c23)O1
1498
+ N#Cc1cccc(CN2CC(O)CC2c2nc(C3CC3)no2)c1
1499
+ CC1CC=CC2(C)c3occc3CCC12C
1500
+ CCCc1cc(O)cc(OC2OC(CO)C(O)C(O)C2O)c1C(=O)O
1501
+ CCOc1ccccc1OC(c1ccccc1)C1CNCCO1
1502
+ N=Cc1cc2oncn2c1
1503
+ O=S1(=O)NC2CN(Cc3ccncc3Cl)CC2Oc2ncccc21
1504
+ CC(C)(CC(=O)O)Cc1nc2ccccc2n1Cc1ccc(O)cc1
1505
+ O=C(NCc1cccc(F)c1)n1ccnc1
1506
+ O=c1cc(-c2ccccc2)oc2cc(O)cc(OC3OCC(O)C(O)C3O)c12
1507
+ Cl.NC(N)=NC(=O)c1ccc2c(c1)C(O)c1c(Cl)cccc1-2
1508
+ CCC=CC(CC)CC(C)=CC1(CC)CC(C=CC(=O)O)(CC)OO1
1509
+ CC1(CO)CN(Cc2ccccc2)CC2CN(C(=O)C3CCCCC3)CCN21
1510
+ O=C(CCCc1nc(-c2cccnc2)no1)NCCc1c[nH]c2ccccc12
1511
+ CC(=O)OC1CCC2(C)C(CCC3C4CCC(=O)C4(C)CC(O)C32)C1
1512
+ COc1cc2oc(-c3ccccc3)cc(N)c-2c(=O)c1OC
1513
+ COC(=O)C1Cc2c([nH]c3ccccc23)C(c2ccccc2)N1
1514
+ CCC1=CC(O)CCC1=O
1515
+ Cc1c(CC(=O)N(C)CCc2ccccn2)c(=O)oc2cc(O)ccc12
1516
+ O=C(O)C=CC(=O)Nc1c(Cl)cc(Cl)cc1Cl
1517
+ C=Nc1noc2[nH]ccc12
1518
+ CC1CCCC=CCCC(=O)CCCC=CC=CC(O)CC=CC=CC(=O)O1
1519
+ CC1=CCC2CC1C2(C)C
1520
+ CCOC(=O)c1[nH]c2ccc3nc[nH]c3c2c1CCN1C(=O)c2ccccc2C1=O
1521
+ O=C(NC1COC2C(O)COC12)c1ccco1
1522
+ COc1oc(CCCCCC(C)O)c(C)c(=O)c1C
1523
+ COc1c(OC)c(OC)c(C(C)=O)c(OC)c1OC
1524
+ CC1OC(=O)Cc2c3c(c(O)c(O)c21)OC(C)(C)CC3
1525
+ CC(NC(=O)C1CCCCC1)c1c(-c2ccc(F)cc2)noc1C(=O)O
1526
+ CCCCCC=CCCCCCCCCc1cccc(O)c1O
1527
+ CC(=O)C1CC2C(C)(N=C=S)CCC(C(C)C)C2(O)C1O
1528
+ CCCc1nc(C)c(C)o1
1529
+ CC(C)=CCc1c(O)ccc2c1C(=O)C(O)=C(C)C2=O
1530
+ CC(=O)CC1CC1C1OC2=C(Br)CC(C=C=CBr)OC21
1531
+ CCCCC(=O)c1cc(O)cc(OC)c1
1532
+ O=C(COc1ccc(C(Cc2ccccc2)=NO)c(O)c1)OCc1ccccc1
1533
+ CC(O)C1NC(=O)CNC1=O
1534
+ O=C1CCc2c1c(CCCO)cn(CCO)c2=O
1535
+ O=C(Nc1cc(F)cc(F)c1)[C@H]1CCCC[C@H]1C(=O)O
1536
+ CCCCCCCCCCCCc1ccccc1C(SCCC(=O)O)[S+]([O-])CCC(=O)O
1537
+ CC#CC#CC#CC=CC(=O)CCCCO
1538
+ C=C1C(=O)OC2CC(C)C3C(O)CC(O)C3(C)C(OC(=O)C=C(C)C)C12
1539
+ CC1c2ccccc2CCN1c1ccncc1
1540
+ C=CC(C)(O)CCC1=C(C)C(O)CC(Br)C1(C)C
1541
+ C=CC(C)(O)CCC=C(C)C(O)C(=O)C=C(C)C
1542
+ COc1ccccc1C(=O)NCC1Cn2cc(-c3ccccc3)nc2CO1
1543
+ COc1cc(OC)c(C2COc3cc(O)cc(O)c3C2=O)cc1OC
1544
+ CC(=O)C=CC1CCc2c(cc(CC=C(C)C)c(O)c2C=O)O1
1545
+ Cc1nccc2onnc12
1546
+ O=C1C(O)=C(O)OC1C(O)CO
1547
+ CC1C(=O)Nc2cc(O)cc(c2O)C(Cc2ccccc2C(=O)O)CC=CC=CC1O
1548
+ O=C(NCc1ccc2c(c1)OCO2)c1c(O)c2cccc3c2n(c1=O)CCC3
1549
+ CNC1=CC(=O)C2(C)CCC1C2(C)C
1550
+ CC(O)C1(C)NC(=O)c2ccccc2N1
1551
+ COCOc1ccccc1[PH](=O)O
1552
+ CC1CCC2=C(COC2=O)C2(O)CC(C)(C)CC12
1553
+ CCCCCCCCCCCCC(C)CCC(=O)O
1554
+ O=C(c1cc(Cl)c(Cl)[nH]1)c1ccccc1O
1555
+ CC12COc3cc(CCc4ccccc4)cc(O)c3C1C2
1556
+ O=C(O)CC(=O)CBr
1557
+ COc1cc2c(c(O)c1O)C(=O)C(=O)C=C2
1558
+ Cc1ocnc2cnnc1-2
1559
+ CC1CCN(C2C(CNC(=O)N3CCOCC3)OC(CO)C2O)CC1
1560
+ O=P(O)(O)c1ccccc1OCCO
1561
+ CCC1CCCCN1Cc1c(O)ccc2c(=O)c(-c3nc4ccccc4s3)coc12
1562
+ OC1C=CC23c4cc5c(cc4CN(CC2O)C3C1)OCO5
1563
+ F[C]1[CH][CH][CH][C](C[NH+]2[CH][CH][CH][CH][CH]2)[CH]1
1564
+ CCCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC
1565
+ CC(=O)OCC(C)=CCC(O)C(C)=CCCC(C)=CCO
1566
+ COc1ccccc1OCCCN1CCN(C(=O)c2ccc(=O)[nH]n2)CC1
1567
+ C=C1CCC2(N=C=S)C(C)CCC3C(C)CC(CC(C)(C)NC=O)C1C32
1568
+ CC(C(=O)O)c1ccc(OC2OC(CO)C(O)C(O)C2O)cc1
1569
+ Cn1cnc2c1CCNC2
1570
+ COc1c(O)cc(C(=O)O)cc1CC=C(C)C
1571
+ CC(C)C(N)C(=O)OCCOCn1cnc2c(=O)nc(N)[nH]c21
1572
+ C=CC(C)(O)C=CC=C(C)C(O)CC=C(C)C
1573
+ COc1ccc(C2=CC=C(N)[NH+](CCCC(=O)O)N2)cc1
1574
+ CCCCCc1cc(=O)c2cccc(OC)c2n1C
1575
+ O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C1CC1
1576
+ CCN(CC)CC1OCC(NS(C)(=O)=O)C1O
1577
+ Cc1ccc(Cl)c(C)c1
1578
+ NS(=O)(=O)c1nnc(NC(=O)CNCC(=O)O)s1
1579
+ O=C1Nc2ccc(-c3ccsc3)cc2C(=O)N2CCN(C(=O)C3CCCN3)CC12
1580
+ CC(=O)OCCC(C)=C(Cl)C=CC(C)(Cl)CBr
1581
+ N=Cc1cc2ncon2c1
1582
+ CCC(=O)N(CCC(Cc1ccccc1)c1ccco1)Cc1ccco1
1583
+ CC#Cc1ccc(-c2ccc(C(CCl)OC(C)=O)s2)s1
1584
+ COC(=O)C(C)C1CCC(C)(CCC=C(C)CCC=C(C)CCC=C(C)C)OO1
1585
+ CC(O)C(O)C12C(=O)OC(C=CC1C)C2O
1586
+ COC1CCC(=CC#N)C(OC2OC(CO)C(O)C(O)C2O)C1
1587
+ Nc1ccc2nc(NCCCN3CCOCC3)oc2c1
1588
+ CC#Cc1ccc(C=O)s1
1589
+ O=c1cc(-c2cccc(O)c2)oc2c1ccc1ccccc12
1590
+ CC1(c2ccncc2)CCC(=O)NC1=O
1591
+ O=P(NCc1ccccc1)(c1ccccc1)c1ccccc1
1592
+ O=C(NC1CCN2C(=O)c3ccccc3NC(=O)C2C1)c1cccs1
1593
+ O=C(O)Cc1cccc2nc3ccccc3nc12
1594
+ CC(C)=CCc1c(C=O)cc(O)c2[nH]c3ccccc3c12
1595
+ [O-][NH+]1CC=CC=C1SSC1=CC=CC[NH+]1[O-]
1596
+ COc1ccc(N2CN(C=O)c3ccc(OC)cc3C2=O)c(CO)c1
1597
+ CCP(=O)(OC)C(=O)C(C)(C)C
1598
+ CCCC(=O)c1ccc(O)cc1O
1599
+ CCC(CC)C(=O)N1CCC(CC(=O)O)C(CC2=NCCc3ccc(C)cc32)C1
1600
+ CC(C)=CCCC(C)(O)C1CC(=O)C(C)=CC1O
1601
+ COc1cccc2oc3ccc(O)cc3c(=O)c12
1602
+ Cc1nc(C(=O)O)sc1CCOP(=O)(O)O
1603
+ O=C1CC2C(O)C=C(CO)C2(CO)O1
1604
+ COc1ccc2c(ccn2CCC(=O)NC(Cc2ccccc2)C(=O)O)c1
1605
+ CCCCCCCCCCCCCCCCCCCC1CC(O)CC(=O)O1
1606
+ C=Cc1ccc(O)c(C(=O)C=C(C)C)c1
1607
+ COCC(=O)N1CCN2C(=O)N(CC(N)=O)C(=O)C2C1
1608
+ CCOC(=O)c1ccc([NH+]([O-])O)cc1NC(=O)c1ccccc1
1609
+ CC(C)(C)CC(=O)NC(Cn1cc(F)c(=O)[nH]c1=O)C(=O)O
1610
+ O=P(N1CC1)(N1CC1)N1CC1
1611
+ Cc1[nH]cnc1CSCC/N=C(\N)NCCSCc1nccs1
1612
+ CC1CC(=O)c2c(O)cc(O)cc2O1
1613
+ CC1=CC(O)CC(C)=CC(O)CC(C)(O)C=CC(C(C)C)CC1
1614
+ CC(=O)CCc1oc2ccc(C)cc2c1-c1ccccc1
1615
+ Cc1cccc(OC2COCCN(C(=O)c3cnccn3)C2)n1
1616
+ O=C(c1ccccc1)N1CCc2[nH]c3c(Br)cccc3c2C1
1617
+ CC(=O)Nc1ccc2c(c1)Cc1cc(NC(C)=O)ccc1-2
1618
+ NC(=O)CCNC(=O)C1=CC(NC(=O)c2cccc(F)c2)C(O)C(O)C1
1619
+ CC1=C2CC3C(C)(C=CC(O)C34CO4)CC2OC1=O
1620
+ COc1ccc(CO)c2c1Nc1c(C(=O)O)cccc1N2C(=O)CO
1621
+ COc1ccc(C=CC(=O)c2c(O)cc(OC)c(O)c2OC)cc1
1622
+ O=C(O)COc1ccc2c(c1)OC(=Cc1cccc(Br)c1)C2=O
1623
+ CN1C(=O)COCC1C(O)c1ccc(NC(=O)c2ccco2)cc1
1624
+ CCOP(=O)(CCN)OCC
1625
+ Cc1cc2oc(=O)cc(CN3CCCCC3C)c2cc1O
1626
+ CC1(C)CCCC2(C)C1CCC(C)(O)C2CC(=O)c1ccoc1
1627
+ CCN1C(=O)C(Cc2c[nH]c3ccccc23)NC1=S
1628
+ CCCCCCC1CCCCCCCCCCC(=O)OC2C(O1)OC(CO)C(O)C2O
benchmark/data/train_smiles.txt ADDED
The diff for this file is too large to render. See raw diff
 
benchmark/data/val_smiles.txt ADDED
@@ -0,0 +1,1627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ C=CCON=C(CCC)C1C(=O)CC(C)(C)C(C(=O)OC)C1=O
2
+ CC(=O)OC1c2c(C)coc2CC2C(=O)CCC(C)C21C
3
+ COc1ccccc1-c1nnc(N)[nH]1
4
+ CCN(CC)CCOC(=O)C(C)Oc1ccc(Cl)cc1
5
+ COc1ccc(CCNC(=O)c2ccc3ccn(C)c3c2)cc1OC
6
+ Cc1ccc(C2CC(O)C(O)C2NCC(C)(C)O)cc1
7
+ COc1ccc(OC)c(-c2oc3ccccc3c(=O)c2O)c1
8
+ CC1CC(=O)OCC1O
9
+ NC(=O)C1(O)CC(O)C2OC21
10
+ C#CCCCCCCCCCC=C1C(=O)OC(=C)C1OC
11
+ C#CC1CN2CCC1CC2CNC(=O)N1CCOCC1
12
+ COc1ccc(CNCC(CO)C2(c3ccccc3)CCOC(C)(C)C2)cc1
13
+ OCC1OC(n2cnc3c(NC4CCCC4O)ncnc32)C(O)C1O
14
+ Ic1ccc(NCn2nnc3ccccc32)cc1
15
+ CCC(C)CCc1oc(=O)c(C)c(O)c1C
16
+ C=C(C)C1CCC(C)(O)C1C
17
+ c1ccc(C2=NN(c3ccccc3)C2)cc1
18
+ Cc1ccc(OP(=O)(Oc2ccc(C)cc2)Oc2ccc(C)cc2)cc1
19
+ Cc1c(O)cc(O)c(C=O)c1C
20
+ O=C(NCC1CCCO1)c1c(O)c2ccccc2[nH]c1=O
21
+ COC1CC(=O)OC(C)CCCCCC(O)C1=O
22
+ COc1cc2c(c(O)c1OC)C(C)N(C)CC2O
23
+ COc1ccc2[nH]c3c(NN)nncc3c2c1
24
+ O=C(O)CCc1ccc(O)cc1O
25
+ COc1ccccc1C=CC(=O)c1ccc(NC(C)=O)cc1O
26
+ C#CCN(C)Cc1nc(C2(O)CCN(C(=O)C3CCCCC3)CC2)cs1
27
+ CCC1CN2CCC3(C(=O)Nc4cc(OC)ccc43)C2CC1CCO
28
+ COC(=O)c1c(O)cc(O)c(CC=C(C)CCC=C(C)CCC=C(C)C)c1C
29
+ CCCCCC[N+](C)(C)CCO
30
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCCCCCCCCC
31
+ CCNC(=O)/C=C(C)/C=C/CC(C)CCCC(C)(C)OC
32
+ O=C(NCCc1ccccc1)c1cc2ccc(O)c(O)c2cn1
33
+ CC1CCOC1=O
34
+ O=C(OCCO)c1cc(O)c(CO)cc1C(=O)c1ccccc1O
35
+ CCCC(CC(CCc1ccc(OC)c(O)c1)OC(C)=O)OC(C)=O
36
+ COc1c(C)c(O)cc2cc(CC(C)O)oc(=O)c12
37
+ C=C1CCCC(C)(C)C1CCC(C)=CCO
38
+ CNCC(c1ccccc1)c1ccccc1
39
+ CCCCCCCCc1cc2cn(C3CCC(CO)O3)c(=O)nc2o1
40
+ CC(C)CCn1c(N)nc2c1c(=O)n(C)c(=O)n2C
41
+ O=C(Cc1ccc(Cl)cc1)NC1C(c2cncnc2)CC(O)C1O
42
+ CC(C)=CCCC(C)=CCOc1cc(O)c(C(=O)c2ccccc2)c(O)c1
43
+ COc1cc(C=CC(=O)NC2=C(O)CCC2=O)ccc1O
44
+ CCCCCCCCCCCCCCCC(=O)OCCC
45
+ Oc1ccc(-c2ccc(-c3ccc(O)nn3)cc2)nn1
46
+ CC(=O)C1CCC(C)C1c1occc1C(C)C
47
+ CCCCCC=CCC1OC(C=C=CBr)CC1OC(C)=O
48
+ C=C1C(=O)OC2CC(=C)C3C(OC(C)=O)CC(C)(O)C3CC12
49
+ CNC1C(O)CC(N)C(OC2OC(CN)=CCC2N)C1O
50
+ O=C(CC(CO)C(=O)c1ccc2c(c1)OCO2)c1ccc2c(c1)OCO2
51
+ CC(C)C(N)C(=O)NC(C(=O)O)C1CC(O)C(O)CN1
52
+ COc1cc(C=CC(=O)OCC2CCCN3CCCCC23)ccc1O
53
+ CC1CCC(C)(COC(=O)CCc2ccccc2)C1(C)C
54
+ CCCOC(=O)C(Cl)(C(F)(F)F)C(F)(F)F
55
+ COc1c(C)c(O)c(C=O)c2c1C(=O)OC2
56
+ Nc1ccc(F)c(CO)c1
57
+ O=C(/C=C/c1ccc(OCc2ccccc2)cc1)N(O)CCc1ccccc1
58
+ CC=Cc1ccc(OC)cc1
59
+ CCC=CCCCCO
60
+ CC(C)C1CCC2(CO)CCC3(C)CC(O)C4(C)CC4C3C12
61
+ COc1ccc(-c2coc3cc(O)c(OC)cc3c2=O)c(O)c1
62
+ C=C(COCC(O)COC(=O)CCCC=CCC=CCC=CCC=CCCCCC)C(=O)O
63
+ C=CC1(C)CC2(O)OC(=O)C(C)=C2CC1C(=C)C(=O)OC
64
+ CCCCCn1cnc2c(S)nc(N)nc21
65
+ COc1c(C)c(O)c2c(c1C(=O)O)C(O)OC2
66
+ CNC(Cc1c[nH]c2cccc([NH+]([O-])O)c12)C(=O)O
67
+ O=c1ccn(C2OC(CO)C(O)C2O)c(=O)[nH]1
68
+ COc1ccc(-n2cc(-c3ccccn3)nn2)cc1N
69
+ O=Cc1ccc(O)c(Br)c1
70
+ CC(=CCO)CCCC(C)(C)O
71
+ O=C1CC2CCN(Cc3ccccc3)CC2CCN1C1CCCCC1
72
+ CC(C)=CCCC(=CCCC(=CCCC(=CCO)CO)CO)CO
73
+ S=CC=NCc1ccccc1
74
+ CCC(C)CCC1=C(C)C(=O)C(=O)c2c1[nH]c1ccccc21
75
+ CC=CC=CC=CC(=O)CC=C1CC=C(O)C(OC)C1
76
+ COc1ccc2c(c1)OC1C3C=CC(OC)C=C3OCC21O
77
+ CC(O)C(=O)OC1C=CC(CC(=O)C(=O)O)(C(=O)O)C=C1
78
+ CCCC(C)CCCCC(C)CC
79
+ COc1ccc2c3c1OC1C(=O)CCC4C(C2)N(C)CCC314
80
+ CCCCN(C)Cc1cc(=O)oc2ccc3ccccc3c12
81
+ COC1CC2C(=O)N3CCN(C(=O)NC(C)C)CC3C(=O)N2C1
82
+ CCCCCCCC(=O)c1c(O)cccc1O
83
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCC1OC(=O)COC1=O
84
+ CC(C)[C@H](NC(=O)[C@@H](S)Cc1ccccc1)C(=O)N1CCCC1C(=O)O
85
+ CCCCOCC(C)OCC(C)OCC(C)OCC(C)OCC(C)O
86
+ CCc1ccc(C)c(O)c1
87
+ CC(=O)Nc1ccc(-c2ccc(N=Nc3ccccc3)cc2)cc1
88
+ CCCCCC=CCC=CC=CC(O)C(O)C=CCCCC(=O)O
89
+ Cc1ccccc1CN1CCN2C(=O)N(Cc3ccccn3)CC2C1
90
+ CC(C)CC(NC(=O)C(CC(=N)O)NC(=O)CN)C(=O)O
91
+ CC=Cc1cc(OC)c2oc(-c3ccc(O)cc3O)cc2c1
92
+ O=C1CCCCCCCC=CCCCCCCC1
93
+ CC12CCC3C(C)(CO)C(O)CCC3(C)C1CC(C1=CCOC1=O)O2
94
+ CCCCCC(O)C=CC(=O)CCCCCCCCC(=O)O
95
+ COC1C=CC2(C(=O)OCc3cc4c(cc32)OCO4)C(N(C)C=O)C1
96
+ O=C(O)C=Cc1ccc2ccccc2n1
97
+ CC(=O)c1ncco1
98
+ CC(C)N(C(=O)CS(=O)(=O)O)c1ccccc1
99
+ C=CC(C)(O)CCC1(C)C(C)CCC2(C)C(C)=CCC(C)C21
100
+ CO[C]1[CH][CH][C]([C]2[NH2+][CH][CH][CH][C]2[O-])[CH][C]1N
101
+ Fc1ccc2[nH]c(C3CCN(Cc4ccccn4)C3)nc2c1
102
+ C=C1CCC2C3(C)COC3CCC2(C)C1CC=C1C(=O)OCC1O
103
+ Cc1cc(C)nc(NC(Cc2ccccc2)C(=O)O)n1
104
+ CCOc1ccc(Cc2ccc(NC3=NCCN3)cc2)cc1
105
+ COC1CCC(OC2CCC(O)C(C)O2)C(C)O1
106
+ COc1ccc2c(O)c(C(=O)O)cnc2c1
107
+ CC[C]1[CH][CH][CH][NH+](C[C]2[CH][CH][C](Cl)[CH][CH]2)[CH]1
108
+ CCSC(SCC)C(CCC(O)CNC(C)=O)NC(C)=O
109
+ CC1Oc2c(O)cc3ccc(=O)oc3c2C1(C)C
110
+ C=CCC1(Cl)C(O)=C(Cl)C(=NCCCC(=O)OC)C1(OC)OC
111
+ Clc1cn2ccsc2n1
112
+ CN(C)Cc1ccccc1Sc1ccc(Br)cc1N
113
+ CC(C)CCCC(=O)CCCCCCC(=O)CCN(O)C(=O)C(N)CO
114
+ C=C1C(=O)OC2C=C(CO)CCC=C(C)CC(OCC(C)C)C12
115
+ CC(=O)NC(CCC(O)=CNN)C(=O)O
116
+ CCCCCCCCC=CCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCC=CCCCCCCCC
117
+ C=CC(C)(CCC1C2(C)CCC(O2)C1(C)CCC=C(C)C)OC(C)=O
118
+ COC(=O)C1Cc2c([nH]c3ccccc23)C(C)N1
119
+ c1coc(-c2ccc(C3=Nc4cccc5cccc(c45)N3)cc2)c1
120
+ COc1cc2c(cc1O)C1CCc3cc(OC)c(O)cc3N1CC2
121
+ C#CC#CCCCC=CCCCC(=O)NCC(C)C
122
+ CCCCCCCCC(C)CCCCCCCC(=O)OC1C(O)C(O)C(O)C(O)C1OC1OCC(O)C(O)C1O
123
+ Nc1nc(O)c2ncn(CCC(CO)CO)c2n1
124
+ CCOC(=O)C1(CCCc2ccc(Cl)cc2)OC12CCCCC2
125
+ COc1ccccc1N1CCN(CCN2C(=O)CC3(CCCC3)CC2=O)CC1
126
+ CC(=O)OCC(C)CCCC(C)C1CCC(C)C12CC=C(C)C(O)C2
127
+ CC1=C(C(=O)O)C2(C)CCCC(C)(C(=O)O)C2CC1O
128
+ CC1=CCC(C)(C)C2CCC(C)C2(O)CC1
129
+ CC1(C)CCCC2(C)CC=C(C=O)C3CC312
130
+ CCOc1cc(OC)c(CC(C)N)cc1OC
131
+ CC1CC2OC2(C)CCC(=O)C2CC(C)(C)C12
132
+ O=Cc1cc(O)c2[nH]c3ccccc3c2c1
133
+ CC(C)Oc1ccc(CNCC2CCOC(C)(C)C2)cc1
134
+ O=C(O)c1ccc(Nc2ncc(F)c(Nc3ccccc3F)n2)cc1
135
+ CCc1[nH]c(O)nc1C(=O)NCCCn1ccnc1
136
+ Cc1cc(-c2ccc(N)c(C)c2)ccc1N
137
+ CCOC(=O)c1c(O)cc(O)cc1CCCCCCCC(C)O
138
+ CCCCCCCCCCC(=O)CC(=O)N[C@H]1CCOC1=O
139
+ CC12CCC3C(C(=O)CC4CC(=O)CCC43C)C1CCC2=O
140
+ Nc1ccc([N+](=O)[O-])cc1C(=O)O
141
+ C#Cc1cc(O)nc(O)n1
142
+ NC(Cc1ccc(S(=O)(=O)O)cc1)C(=O)O
143
+ COC(=O)C=CC(O)=C1C(=O)Oc2cc(O)c(O)cc21
144
+ COC(=O)C(C)=CCCC(C)=CCC1(CC(=O)O)CC(=O)CCC1=O
145
+ O=C(O)CNC(=O)c1ccc(C(O)c2ccccc2)cn1
146
+ CCCC(=O)C(CC)Sc1ccoc1C
147
+ O=c1cc2c3c(ccn2CCCO)cnc3c1O
148
+ COc1ccc(CCNC(=O)CCCc2c[nH]c3ccccc23)cc1
149
+ COc1ccc2oc(C)c(C(=O)Nc3ccccc3C)c2c1
150
+ CC=C(C)C(=O)OC1c2occ(C)c2CC2(C)C(C)CCCC12
151
+ CCCCN(C(C)=O)C(CC)C(=O)NCc1ccccc1
152
+ Cc1cc(O)c2c(c1)C1C(C(C)C)CCC1(C)C(O)C(O)C2=O
153
+ CCCCCC(CC)OC1OC(COC(=O)CC(C)(O)CC(=O)O)C(O)C(O)C1O
154
+ CC1C(=O)OC2C=C(CO)CCC=C(C=O)CC(O)C21
155
+ CCC1(c2ccc(N)cc2)CCC(=O)NC1=O
156
+ C[C]1[CH][CH][NH+](C[C]2[CH][CH][CH][CH][C]2F)[C](C)[CH]1
157
+ NC(N)=NCCCCC(N)C(=O)O
158
+ CC(C)=C1CCC2=CC(=O)CC(C)C2(C)C1
159
+ C=CC1(C)CC(O)C2C(O)(CCC3C(C)(C)CCCC32C)C1
160
+ CCCCCCC=CCCCCCCCc1cc(O)cc(OC(C)=O)c1
161
+ C=C1C=CC(C(=C)C)CC1
162
+ CN(C)/N=N/c1ccccc1C(N)=O
163
+ CC(C)(C)NC(=O)CC1CCNCC1Cc1cc(C(C)(C)C)on1
164
+ CC(=O)NC1C(O)C=C(CO)C(O)C1O
165
+ CC(C)=CCc1ccc2[nH]ccc2c1
166
+ CC1CCCC1C
167
+ CC1=CCC2C(C)(C)CC(O)CC2(C)C1CCC(C)CCO
168
+ CC1CC2OC(=O)C3=CCCC(C1(C)CC(O)C1=CC(=O)OC1O)C32C
169
+ Cc1cccc(Nc2cc(Cl)nc(SCC(=O)O)n2)c1C
170
+ CCCCCCCCCCCCCCCCc1ccc(CC(=O)O)o1
171
+ CC=C(C)C(=O)OCC1=CCN2CCC(OC(=O)C=C(C)CO)C12
172
+ COc1cc2c(c(OC)c1OC)C(=O)C(Cc1ccc(O)cc1)CO2
173
+ Oc1ccc(C=Nc2ccccc2)cc1
174
+ O=C(O)CCCCCNc1ccc(C(=O)O)cc1
175
+ CCCCCCCCCCc1ccc2c(c1)N(C)[C@@H](C(C)C)C(=O)N[C@H](CO)C2
176
+ C=CCCCCC=CC#CCCCCCCCCC1CC(CO)OC1=O
177
+ C=C1OC(=O)C(C(C=CC)C=CC(=O)C=O)C1=O
178
+ CC(=O)c1c(O)cc(O)cc1CC1Cc2cc(O)cc(O)c2C(=O)O1
179
+ CCC(C)C(NC(=O)NCC(C)C)C(=O)O
180
+ COc1cc(O)c2c(c1)CCCCCCCCCCCCCC(C)OC2=O
181
+ NC(=O)c1ccccc1NC(=O)c1ncn2c(=O)n(CCCl)nnc12
182
+ CC1Cc2cc(O)cc(O)c2C(=O)O1
183
+ COc1c(Br)cc(C=CC(=O)NCCCNCCCCNCCCN)cc1Br
184
+ CC1Cc2c(O)c(O)cc(O)c2C(=O)O1
185
+ CCCCCCCCC=C=CCCCCCCCCCCCCCCCCCCCC
186
+ CC(C)=CCC1C(=O)C(C(=O)CCC(C)C)=C(O)C1O
187
+ COc1c(C)c(O)c2c(=O)c(O)c(-c3ccccc3)oc2c1C
188
+ CC#CC#CC#CC=CC1OCCCC1O
189
+ CN(C)Cc1c[nH]c2ccccc12
190
+ Oc1ccc(O)nc1
191
+ CCCCOCCn1c(N2CCCN(C)CC2)nc2ccccc21
192
+ CCCCCCCCCCCCCCCC(=O)OCC1(O)OCC(O)C(O)C1O
193
+ CC(C)=CCCC(C)=CCc1cc2c(cc1O)oc(=O)c1c3ccc(O)cc3oc21
194
+ CCCCCC1=C(C(=O)OCC)C(c2cccc(C)c2)NC(=O)N1
195
+ CC(C)CC(=O)OC1CC2CC(O)C(C1)N2C
196
+ CC(O)C(O)C(O)CO
197
+ CCC(C)C=Cc1cc2cc(O)c(C)c(O)c2c(=O)o1
198
+ CCC(C)=CCC(O)CCCCCCCCCCCCCCCOC(C)=O
199
+ CCCCCCCCCCCCCCCC(=O)OCC(C=COC(C)=O)=CCC1C(C)=CCCC1(C)C
200
+ NCCCCCCCCCCC(=O)O
201
+ C=C1CCC2OC2(C)C(O)CC2C(C=O)=CC(C=C(C)C)C(O)C12
202
+ Cc1cc(C)c(C#N)c(SCc2cc(=O)oc3cc(O)c(O)cc23)n1
203
+ O=Cc1ccc(CO)n1CCc1ccc(O)cc1
204
+ CC(CC1=C(CO)COC1=O)C1=CC(C)(C)CC1
205
+ COc1ccc2c(=O)c(-c3ccccc3)c(C)oc2c1
206
+ C[n+]1ccn(COCCCS(C)(=O)=O)c1/C=N/O.[Cl-]
207
+ COc1ccc(-c2oc3cc(OC)ccc3c(=O)c2O)cc1
208
+ C=C1CCC2C(C)(C)C(O)CCC2(C)C1CCC(C)CCO
209
+ C=CC=CC1OC(C)CC1O
210
+ Cc1cccc(O)c1-c1nc2c(C(=O)O)cccc2o1
211
+ COc1cccc(CO)c1O
212
+ OCC1CCC(O)CN1
213
+ O=C1c2ccccc2CC2C=CC=CC12
214
+ CC(C)NCC(O)COC(=O)c1ccccc1Cl
215
+ O=C1c2ccccc2-c2c1c1ccccc1c(=O)n2CCCn1ccnc1
216
+ CCCCN1CCC[C@H]1CNC(=O)c1cc(SC)cc2c1OCCN2C
217
+ COc1cc(O)cc2c1CCN(C)C2
218
+ O=C(Cc1ccccc1)NNc1ccc([NH+]([O-])O)cc1
219
+ COC(=O)c1ccc2c(c1)-c1cc(O)ccc1OC2(C)C
220
+ CC(=O)CC(C[NH+]([O-])O)c1ccccc1
221
+ COc1ccc2ccc3oc([N+](=O)[O-])cc3c2c1OC
222
+ NCCN(C(=O)c1ccc(Cl)cc1Cl)c1ccc(F)cc1
223
+ CC1=CCCC2(C)OC2CC(=C(C)CO)C(=O)C1
224
+ CC=C(C)C(=O)Oc1ccc(OC(=O)C(C)C)cc1C1OC1C
225
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(C)=O
226
+ C=CCc1ccc(O)c(-c2cc(OC(O)C=C)ccc2O)c1
227
+ CCc1c(C)c(O)cc2c1C=CC1C(C)(C)C(=O)CCC21C
228
+ CCCCCCCCCCCCCCCC1OC(COP(=O)([O-])O)CS1.[Na+]
229
+ CN(C)Cc1ccccc1-c1ccc2n(c1=O)CC1CC2CN(C(=O)CSCC(=O)O)C1
230
+ CC(C)=CCCC(C)=CCCC(C)CC(=O)O
231
+ CCN1CCCC1CNC(=O)COc1cc(O)c2c(c1)OC(C)(C)CC2=O
232
+ CC1(C)CCC2(CCC(C)(C)O2)O1
233
+ C=CC(C1=CC(O)C(OC)=CC1OC)c1ccccc1
234
+ C#CCn1ccc2cc(C(=O)OC)ccc21
235
+ CC(=O)Oc1cc(O)c2c(c1)OC(c1ccccc1)CC2=O
236
+ Nc1ncc(-c2ccc(O)cc2)nc1Cc1ccccc1
237
+ COc1ccccc1CCN1CC(C(=O)NCC2CCCN3CCCCC23)CC1=O
238
+ CCCCc1cc(OC)c(OC)cc1OC
239
+ COc1cc(OC)c2c(C)c(CC(=O)NCCC(=O)O)c(=O)oc2c1
240
+ Cc1ccc2c(c1)C(=O)CCC2C
241
+ Cc1cc(N)ccc1C(=O)OCC(O)CNC(C)(C)C
242
+ CCN(CC)C(=O)C=C(C)C(F)(F)F
243
+ CC(=O)OC1OC2OC=C(C(=O)CC=C(C)C)C3CCC1C23
244
+ CCCCC(=O)N1CSC[C@H]1C(=O)N1CCCC1
245
+ COc1c(C=CC=O)cc2c(c1OC)OCO2
246
+ O=C(NCC1CC2CCN1CC2CN1CCOCC1)c1ccco1
247
+ COC(=O)c1ccc(CCCCCCCCCCC(C)=O)cn1
248
+ CNc1ccccc1C(=O)CC(NC(C)=O)C(=O)OC
249
+ O=C(O)CSCC(=O)N1CC2CC(C1)c1ccc(-c3cccnc3)c(=O)n1C2
250
+ C=C1C(=O)OC2CC(=C)C3CCC4(C)OC34CC12
251
+ CC(=O)Nc1ccccc1-c1onc(-c2ccccc2)c1-c1ccccc1
252
+ CCC(=O)C(C)C
253
+ CCn1nnnc1Cc1ccc2[nH]cc(CCN(C)C)c2c1
254
+ COC1COCCN(C(=O)c2cccc(F)c2)C1
255
+ COc1ccc2c(C)c(CC(=O)NC(C)C)c(=O)oc2c1OC
256
+ COc1cc(OC)c(C(=O)CC(C)=O)c(OC)c1
257
+ CC(=O)OCC=C(C)CCC1C(C)=CCC2C(C)(C)C(OC(C)=O)CCC12C
258
+ CC1=C(C)C(=O)C(CCC(C)(O)CCC(=O)O)=C(C)C1=O
259
+ Cc1cc2c(c(C)c1CCO)CC(C)(CO)C2
260
+ C=C(C(=O)OC1C2C(CC(C)C3C=CC(=O)C31C)OC(=O)C2C)C(C)O
261
+ CC=CC#CC#CCCCOC(=O)CC(C)C
262
+ COc1cc(C=Cc2ccc3c(c2)OCO3)oc(=O)c1
263
+ CC(=CC(=O)c1ccccc1)NCCCC(=O)O
264
+ COc1cccc(C2=CCN(C)CC2)c1
265
+ COc1ccc(CN2CC3CN(Cc4ccccn4)CCN3C2=O)cc1
266
+ CC1CC23C4=CCCN2CCCC3C(=O)CC4C1O
267
+ Cc1cc2c(cc1Br)C1(C)CCC(C)(O2)C1C
268
+ CN(CCc1ccccc1)C1C(CNC(=O)c2cccc(F)c2)OC(CO)C1O
269
+ CC[C]1[CH][CH][CH][NH+](CC(=O)[C]2[CH][CH][CH][CH][CH]2)[CH]1
270
+ CCCC=CCOC(=O)CCCCCCC
271
+ CNC(=S)N(O)c1ccccc1
272
+ CC1=CCc2c(cc(CCc3ccc(O)cc3)c(C(=O)O)c2O)OC1
273
+ CC(=CCC(Br)C(C)=CC(Cl)Cl)CBr
274
+ O=C(O)c1ccccc1C1c2ccc(O)cc2Oc2cc(O)ccc21
275
+ CC(=O)CCC(=O)c1ccoc1
276
+ CN1CCC(c2c[nH]c3ccc(-n4cnnc4)cc23)CC1
277
+ C=CC=CC=CCC
278
+ O=C(O)c1cc(C(=O)O)c([NH+]([O-])O)cc1[NH+]([O-])O
279
+ Cc1nnn(C2CCN(Cc3ccccc3)CC2)c1-c1ccccc1
280
+ O=c1[nH]c2ccccc2o1
281
+ CC(=O)OC1CC2(C)C3CC1C(C)(C)C2(O)CCC3C
282
+ Cc1ccc(-c2cc(=O)c3cc(Cl)cc(Cl)c3o2)cc1
283
+ NC(=O)Nc1ccc(C2=NNC(=O)CC2)cc1
284
+ CC(C)(C)C1(C)CCNC1=O
285
+ C=C1CCC2OC2(C)CCC2C(C)CC12
286
+ CN(Cc1ccccc1)C(=O)CC1C(O)CCC2C(C)(CO)C(O)CCC12C
287
+ COc1cc(C=CC(=O)O)c(O)cc1O
288
+ CC(C)=CCCC(C)C1CC=C(C)C2CCC(C)=CC21
289
+ CC1(C)C2C=CCC1C2
290
+ O=C(O)C(=Cc1ccccc1[NH+]([O-])O)c1ccccc1
291
+ [O-][NH+](O)c1cccc(NC2OC(CO)C(O)C2O)c1
292
+ O=C(NCC1OC(CO)C(O)C1N1CCc2ccccc2C1)c1ccccc1
293
+ CCCCCCC=CCCCCC=CCCCCCC(=O)O
294
+ CCCCC12CN3CC(C)(CN(C1)C3C(O)C(O)C(O)CO)C2=O
295
+ NC(=O)OCC(N)C(=O)O
296
+ Cc1ccn(C(N)=O)n1
297
+ CC1(C)C=C(n2ccccc2=O)c2cc(C#N)ccc2O1
298
+ CC(=CC(=O)O)C=C(C)CC(C)CCCCC(O)CCO
299
+ COC(=O)C1CCC=C(C)C(=O)C2CC(C)(C)CC12
300
+ COC(=O)c1cocc2c(C)ccc1-2
301
+ CC(=O)CCC1=C(C)CCC(C)C1(C)C
302
+ CC(O)c1c(-c2ccc(F)cc2)noc1C(=O)NC1CCCCC1
303
+ CCn1c(O)c(C(=O)NCCN(C)C)c(=O)c2ccccc21
304
+ C=C(C)C1CCC2(C)OC3=C(CC12)C(=O)C1(O)COC3C1
305
+ CCCCCCCCCCCCCCCCCCCCCC(=O)NC(CO)C(O)CCCCCCCCCCCCCCC
306
+ COc1ccc(CCc2cc(=O)c3cc(OC)c(O)cc3o2)cc1O
307
+ CCCCCCC(=O)CC(=O)NC1CCOC1=O
308
+ COc1ccc(CCN(C)C)c2cc(-c3ccc(O)cc3)oc12
309
+ CC[C@]1(CCCc2ccccc2)CN(c2ccc(OC)cc2)C1=O
310
+ CC=C(C)CC1C(=O)Nc2ccccc21
311
+ C=CC1(C)CCC2(C)C(=C)C1CCC2C
312
+ COc1cc(O)ccc1C=CC(=O)c1ccc(O)cc1
313
+ CC1=C2CC3C(C)(O)C4OC4C(O)C3(C)C=C2OC1=O
314
+ OC(c1ccccc1)c1ccc(Br)cc1
315
+ COc1ccc(OCC(=O)NCc2c3c(c(OC)c4c2OCO4)CN(C)CC3)cc1
316
+ CC1=C(c2ccc(O)cc2)C(O)OC1=O
317
+ CCCCCC=CCC=CCCCCCCCC(C)=O
318
+ OCc1cc(Br)c(O)c(O)c1Br
319
+ Fc1ccc(-c2c[nH]c(C3COCCN3Cc3ccccc3)n2)cc1
320
+ CC=CC=CCCC(=O)CC(O)CCO
321
+ COc1ccccc1C(=O)Oc1ccc(Cl)cc1C(=O)C=Cc1cccs1
322
+ COC1CC(NC(C)=O)C(OC(C)=O)C(C)O1
323
+ CC(C)(C)C1CC(O)C(C(O)C=Cc2ccccc2)C(=O)O1
324
+ COc1cc2c(cc1O)CCN(C)C2Cc1ccc(O)cc1
325
+ CN1C(=O)CCC1(O)c1cccnc1
326
+ C=c1oc(=O)cc(OC)c1=CC(O)CC
327
+ CC(C)=CC1CC(C)C2(CC=C(C)CC2)O1
328
+ O=CC=Cc1ccccc1
329
+ CCCCCCCC(CCc1ccc(O)c(OC)c1)OC(C)=O
330
+ O=C(NCCc1ccccc1)c1c(O)c2cccc3c2n(c1=O)CC3
331
+ CC=C(CO)C(=O)OCC1=CCN2CCC(OC(=O)C=C(C)CO)C12
332
+ Nc1ccc(C(=O)c2ccc(N)c(N)c2)cc1N
333
+ CCCCCC(O)CC(=O)CCc1ccc(OC)c(OC)c1
334
+ COc1ccc(C2=C(c3ccc(OC)cc3)OCCC2)cc1
335
+ CCCCC1(CCCC)C(=O)NC(=Nc2ccc(OC)cc2)NC1=O
336
+ O=C(NCCOC(=O)N(O)c1ccc(Br)cc1)Oc1ccc(Cl)cc1Cl
337
+ [O-][NH+]1C=CC=CC1c1ccccn1
338
+ CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCC
339
+ COc1ccc2c(C)c(CC(=O)O)c(=O)oc2c1C
340
+ Cc1cc2ocnc2nn1
341
+ CNCCCCCNCCCCCNC
342
+ CC(C)CCCCCCCCCC(C)C(=O)OCC(O)CO
343
+ CC(=O)C1C(c2ccc3c(c2)OCO3)C=CC2CCCCC21
344
+ Fc1ccccc1-c1nnc(C2CCN(Cc3cccs3)C2)o1
345
+ O=C(NCCCNC(=O)c1ccc2c(O)c(O)ccc2c1)c1ccc2c(O)c(O)ccc2c1
346
+ O=C(CSc1nc2ccccc2s1)NCC1CCCN(CCc2ccccc2)C1
347
+ CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
348
+ CC(C)CCCCCCCCCCC1(O)C=CC2N1CCC[NH+]2CCCCCNC(=N)N
349
+ CC(NC(=O)Cc1ccccc1)C(=O)NC(C(=O)O)C(C)C
350
+ O=C(O)C=CC(=O)Nc1ccc(C=Cc2ccccc2)cc1
351
+ COc1cc(C2COc3cc(O)ccc3C2)ccc1O
352
+ CN[C@@H](C)[C@H](O)c1ccc(O)c(O)c1
353
+ CCC(c1ccc(O)cc1)C(CC)c1cc(I)c(O)c(I)c1
354
+ C=NN(C)C=C(C)N
355
+ CN1CCC(c2nc3ccccc3s2)C1
356
+ Cc1ccc(-c2c[nH]c(C3COCCN3C)n2)cc1
357
+ C=C(CCC1CCCC(=O)C1(C)C)CC(O)C=C(C)CC(=O)NCCc1ccccc1
358
+ COc1ccc(S(=O)(=O)C(CCC2CCCCC2)CC(=O)NO)cc1
359
+ COc1ccc(CCNC(=O)Cc2ccc3c(c2)OCC3)cc1OC
360
+ CC1(C)C[C]2[NH2+][C](N)[C](C(=O)[O-])[CH][C]2CO1
361
+ Cc1cc(=O)c2c(O)cc3c(c2o1)C=CC(C)(CO)O3
362
+ CC(O)c1cnccc1C(=O)c1nccc2c1[nH]c1ccccc12
363
+ CCC(C)(CC)C(C)C
364
+ COc1cc2c(c(O)c1C(C)O)C(=O)CC(O)C2O
365
+ Cc1cc(Br)c2c(C)ccc(C(C)C)cc1-2
366
+ C=C(C)C(O)COc1ccc2c(OC)c3ccoc3nc2c1OC
367
+ C#CC(C)N(C)C(=O)Nc1ccc(Cl)cc1
368
+ CCCCCCCCCCCC(=O)c1cc(C(=O)CCCCCCCCCCC)cc(C(=O)CCCCCCCCCCC)c1
369
+ C=C1CCC=C(CO)C(O)CC2C1CC2(C)COC(C)=O
370
+ O=C1CN2Cc3cc(N4CCCC4)ccc3N=C2N1
371
+ COc1cc2c(cc1OC)CCN(C)C(C(=O)c1cccc(N)c1)=C2
372
+ O=C(COC(=O)c1ccc(Br)cc1)c1cccc([NH+]([O-])O)c1
373
+ CN(C)C(=O)Oc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
374
+ CC(C)c1cc(C=O)cc(C(C)C)c1O
375
+ COC1=CC(=O)c2cc[nH]c2C1=O
376
+ CC(=O)OC(C)C(C=C1C=C(C)C2=NCCC3OC123)OC(=O)C(C)C
377
+ CC(C)C1=CC2CC1C1CCCC2C1O
378
+ CC(C)COP(C)(=S)OCC1CCCN2CCCCC12
379
+ CCOC(C)OO
380
+ CC(C)C(N)CC(=O)O
381
+ C=CC(=C)CCCC(C)CCCC(C)CCCC(C)C
382
+ CC1C(O)c2cocc2C(O)C2CC(C)(C)CC12
383
+ Nc1c(CC(=O)[O-])cccc1C(=O)c1ccc(Cl)cc1Cl.[Na+]
384
+ CC12CCC(N)CC1=CCC1C2CCC2(C)C1CCC2C(O)[SH](=O)=O
385
+ CC(C)=CCCC(C)(N)C1(C)CC=C(C)CC1
386
+ CC(=O)CCCCCCCCCCCCCCC=Cc1ccc2c(c1)OCO2
387
+ CCCC=CCC1(O)C2=C3C(CCC3N=C(N)N2)CC1C
388
+ CC(NC(=O)C(N)CC(O)C(=O)O)C(O)C12CC1CC=CC2=O
389
+ CC(=CC(=O)OC1C(C)=CC=C(C=O)C1(C)C)CO
390
+ CCCCCCCCCC(C)OC(C)=O
391
+ NC(=O)CCC(=O)O
392
+ CC1CCC2(C)C(CCC(O)C2(C)O)C12COC(=O)C2
393
+ O=C(NCCCN1CCC(Oc2ccccc2)CC1)C(c1ccccc1)c1ccccc1
394
+ CC1Oc2c(c(=O)[nH]c3ccccc23)C1(C)C
395
+ COc1ccc2oc3cc(O)c(O)c(O)c3c(=O)c2c1OC
396
+ OC(CNc1ccccc1I)CON=C(C1CC1)C1CC1
397
+ O=C1c2ccccc2C(=O)c2cc(CO)c(O)cc21
398
+ CC1=CCCC(C)=CC(C)(C)C=CCC1
399
+ CCCCCC=CCC=CCC=CCCCCC(=O)OC(CO)COC(=O)CCCCCCCC=CCC=CCCCCC
400
+ COc1ccc2cc(C(C)C(=O)OCC(=O)O)ccc2c1
401
+ Cc1c(CCC(=O)O)c(=O)oc2cc(OCC(=O)O)ccc12
402
+ C=C(C)C1CCC2=CC(OC2=O)C2=C(C)CC(O)(C2=O)C(C)C(=O)C1O
403
+ CCCCC(COCCOCCOCCO)CC(=O)O
404
+ C=CC1(C)CC(OC(C)=O)C2C(=C)C(=O)OC2C1C(=C)C
405
+ CCCCCCCCCCCCCC(=O)NC(COC1OC(CO)C(O)C(O)C1O)C(O)C=CCCC=C(C)CCCCCCCCC
406
+ Cn1c(N)nc2c3nccnc3ccc21
407
+ Cc1ccc2c(c1O)C(=O)c1c(O)cc(O)cc1C2=O
408
+ COC(=O)CC(c1ccsc1)c1oc2ccccc2c(=O)c1O
409
+ CC1(C)CCCC2(C)C(CO)C(CO)=CCC12
410
+ COC1=CC(=O)c2c(O)cc(CC(C)=O)c(O)c2C1=O
411
+ CC(O)C(OC1OC(C(=O)O)C(O)C(O)C1O)C(O)C(O)C=O
412
+ CC(=O)OCC12CCC1(OC(C)=O)C1C=C(C)C2CC(C)(C)C1
413
+ CCN(CC)c1ccc2c(COC(C)=O)cc(=O)oc2c1
414
+ CC(=O)C12OC1C1(C)C(=CC2=O)CCC(O)C1C
415
+ Cc1ccc2c(n1)CC(C(=O)CO)CCC2C
416
+ CC(C)(C)OC(=O)NCCNC(=O)C(N)CCCCNCCC(=O)NCCS
417
+ Brc1c(OCCCN2CCCCC2c2cccnc2)ccc2ccccc12
418
+ COc1ccccc1CNCCCCCCCCCCNCCSSCCNCCCCCCCCCCNCc1ccccc1OC
419
+ Oc1ccc(Oc2ccc(Br)cc2Br)cc1Br
420
+ CCCN(CCC)[C@@H]1CCc2cccc(C(=O)c3ccccc3)c2[C@@H]1C
421
+ CCC=CCC=CCC=CCC=CCC=CCCCC(=O)OC(CO)CO
422
+ CC(C)C(=O)OCC(=O)c1cc2c(cc1O)OC(C)(C)C=C2
423
+ CC1COC2=C1C(=O)C(=O)c1c2ccc2c1C(O)CCC2(C)C
424
+ CC(O)c1cc(OCCCC(C)(C)C(=O)O)ccc1OCCCC(C)(C)C(=O)O
425
+ Cc1ccc(C(C)C)c2c1CCC(C)(O)C2O
426
+ CN(C)c1ccc2nc(N)oc2c1
427
+ COC(=O)c1ccc(NC(=O)N2CCc3nc[nH]c3C2c2ccccn2)cc1
428
+ O=C(OCC1OC(Oc2ccc(CO)cc2O)C(O)C(O)C1O)c1ccccc1
429
+ C=C1C(=O)OC2C1C(OC(=O)C=C(C)C)C(O)C1(C)C(O)CCC(=C)C21
430
+ CC(CCc1ccccc1)NC(=O)CC1NC(=O)c2ccccc2NC1=O
431
+ CCCCC(N)CCCCCCC(C=CC(=O)O)Nc1ccc[nH]1
432
+ COc1ccccc1C(=O)NCC1COCc2nc3cc(C)ccc3n21
433
+ Cc1cc2ncoc2nn1
434
+ C[C@@H]1CCC[C@@H]2C[C@@H](NC(=O)c3cc(Cl)ccc3O)CCN21
435
+ C=C1CCC2C(C=C(C=O)CCC1O)C2(C)CO
436
+ CC(C)CCCCCCCCCCCC=CC(=O)NC=Cc1ccc(OC2OC(C)C(O)C(O)C2C)cc1
437
+ COc1ccc2c(c1)C1CNC2Cc2ccc(Cl)cc21
438
+ C=CC(C)(C)C1C(=O)C=C(c2ccccc2)OC1=O
439
+ Nc1ncnc2c1ncn2NC(=O)COCP(=O)(O)O
440
+ O=c1c(-c2ccc(O)c(O)c2)coc2cc(O)cc(O)c12
441
+ CCCC(=CCCC=CCCCCOC(C)=O)CCC
442
+ COC(=O)c1cc(C)c[nH]1
443
+ COc1c(C(C)=O)ccc2c1C=CC(C)(C)O2
444
+ CCCCCCCCCCCCCCCCCC(=O)OC(C)C(=O)OC(C)C(=O)O
445
+ CC#CC#CC#CC=CC=CC(CCO)OC(C)=O
446
+ C=CCC(CC=C)(CC=C)c1ccc(CCCCCCCCC)cc1O
447
+ O=C(NCC1OCC(NCc2cccs2)C1O)c1ccccc1
448
+ COc1cc(NC(C)CCCN)c2ncccc2c1Oc1ccc(Cl)cc1
449
+ C[C]1[CH][CH][C](N)[NH+](C[C]2[CH][CH][C](F)[CH][CH]2)[CH]1
450
+ C#CC(C)(C#C)c1ccccc1
451
+ NC(Cc1cc(O)c(O)cc1O)C(=O)O
452
+ COc1ccc(NC(=O)OC2COC3C(NC(=S)NC4CCCCC4)COC23)cc1
453
+ Cc1ccc(O)c(-c2cc3cc(C(=N)N)ccc3[nH]2)c1
454
+ CC(C)CC(N)C(=O)NC(CCC(N)=O)C(=O)N1CCCC1C(=O)O
455
+ CCCCCCCCCCCCCCCCNc1ccc(C(=O)NC(=O)c2ccccc2)cc1
456
+ COc1ccc(C=CC(=O)NC(=S)N2CCCCC2c2cccnc2)cc1
457
+ CC(=O)OC1CCC2(NC(=O)c3ccccc3)CSC1C2OC(C)=O
458
+ CNc1ccc2ncnc(Nc3cccc(Br)c3)c2n1
459
+ O=C(O)COc1cc(OCC(=O)O)c2c3c(c(=O)oc2c1)CCC3
460
+ ClCC1Cc2ccccc2CN1
461
+ COC(=O)CCC1C(=C(C)C)CCC(C)C1(C)Cc1c[nH]c2ccccc12
462
+ CCc1c(C)nc2ccc(OC)cc2c1O
463
+ C#CC=CC(O)C(C)O
464
+ CCCCCCCCCCCCCCCCCCCCCOCC(O)CO
465
+ COC(=O)CC(c1ccc(O)cc1)c1cc2c(cc1O)OCO2
466
+ O=C(CCn1cnc2ccccc2c1=O)N1CC2CC(C1)C1CCCC(=O)N1C2
467
+ O=S(=O)(c1ccc(F)cc1)N1CCCCC1c1cccnc1
468
+ CCOc1ccc(C=CC(=O)c2cc(NC(C)=O)ccc2O)cc1
469
+ CC(=O)OC1CCN2CC(N)CCC12
470
+ Cn1c(C(C#N)=NNc2cccc(Cl)c2)nc2ccccc21
471
+ O=C(O)C1CCCNC1
472
+ COc1c(O)cc2c3c1-c1cc(O)ccc1CC3NC2=O
473
+ O=C(CBr)Nc1ccc(Oc2ccc(C(=O)O)cc2)cc1
474
+ COc1cc2c(cc1O)CNCC2c1ccc(O)c(C)c1
475
+ CC(=O)Nc1ccc2c(c1)nc1n2C(CNCc2ccc(C(=O)O)cc2)COC1
476
+ C=C1OC(=O)c2c(O)cc(OC)c(C)c2C1(C)O
477
+ CC(NCc1c(O)ccc2c3c(c(=O)oc12)CCCC3)C(=O)O
478
+ COCc1cn(C2COC3C(NC(=O)NC4CCCCC4)COC32)nn1
479
+ CCCCCCCCCCCCCCCCCCCCCCCC(C)=CCCOC(=O)CCCC(C)O
480
+ C=CC1(C)CCC2(C)C3=CC(=O)CC(C)(CO)C3CCC2C1
481
+ NC(CSCCS(=O)(=O)O)C(=O)O
482
+ COC(=O)c1ncn2c(=O)n(CCCl)nnc12
483
+ CC(=O)OCC(C)C#CC(CO)=C1C=CC(=O)O1
484
+ N#Cc1nc(C(=O)O)c(O)cc1-c1cccs1
485
+ Cc1nc2c(=O)n(C)c(=O)n(C)c2[nH]c1=O
486
+ Fc1ccc(CN2CCOCC(Oc3cccnc3)C2)cc1
487
+ C1CNCCSCCSCCN1
488
+ CNC(=O)C(Cc1ccccc1)N1CCCC1C(=O)NCCOC
489
+ Nc1ccc(NC(=O)OCCCc2c[nH]cn2)cc1
490
+ O=c1cccc2n1CC1C=NCC2C1
491
+ Cc1ccc2c(CCl)cc(=O)oc2c1
492
+ c1ccc(CCN2CCc3ccccc3C2)cc1
493
+ CCCCCC=CC1C=CCCC1C(C)=O
494
+ CC(C)(C)C(=O)Oc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
495
+ O=C(O)C(CCCCn1cnc2c1NC=NCC2O)Cc1cccc(Br)c1
496
+ COc1ccc(C=CC(=O)C2(C)CO2)cc1OC
497
+ CCCCCC=CCC=CCC=CCC=CCC=CCCC(=O)O
498
+ CCCSSCc1ccco1
499
+ N#Cc1ccc(OS(N)(=O)=O)cc1
500
+ COc1cccc(CN2CCC(c3nc4ccccc4[nH]3)C2)c1
501
+ CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)C(O)CCC3(C)C
502
+ CC(=O)OC1(C(C)=O)CCC2C3C=CC4=CC(=O)CCC4(C)C3CCC21C
503
+ CC(=O)OC1C=CC(O)C2(CC(=O)OC2C=C(C)CCC=C(C)C)C1
504
+ O=C(Cc1c[nH]c2ccccc12)OC1C(O)C(O)C(O)C(O)C1O
505
+ COC(=O)c1c[nH]c(=O)c(C(CC(N)=O)c2cccnc2)c1O
506
+ CCCCCCCCCC(C)CCCC
507
+ O=C(C=Cc1ccccc1)c1cc2occc2cc1O
508
+ CC(C)Oc1ccc(CNCCC(c2ccco2)C(C)C)cc1
509
+ CC(C)=CCc1c(OCC(O)C(C)(C)O)ccc2ccc(=O)oc12
510
+ COC(=O)c1cnn2c1NC1=C(C(=O)CCC1)C2c1ccc(OC)cc1
511
+ C=CCC=CCC=CCCCCCCCc1cccc(O)c1
512
+ CC(=O)N1CC2CN(CC(C)C)CCN2C(C)(CO)C1
513
+ CC1OC(C)OC(C)O1
514
+ COC(=O)C1CC(NC(=O)c2ccc(OC)cc2)CN1C(C)=O
515
+ O=C(O)c1cc2ccccc2c(O)n1
516
+ CCC(C)OC(=O)NC(CC)(C(F)(F)F)C(F)(F)F
517
+ COC1=CC(=C(c2ccccc2)c2ccccc2)C=CC1=O
518
+ O=C(NC=CNC(=O)c1ccccc1)c1ccccc1
519
+ C=CC(C=Cc1ccc(O)cc1)c1ccc(O)c(OC)c1
520
+ O=c1cc(CCc2ccccc2)oc2c1C(O)C(O)C(O)C2O
521
+ O=C1/C(=C/c2ccccc2)Cc2ccccc21
522
+ C=CC1CN(C(C)=O)CCC1CCCc1ccnc2ccccc12
523
+ C=CC(C)(C)c1ccc2c(c1)CCC(C)(C)O2
524
+ O=c1nccc2[nH][nH]cc1-2
525
+ Cn1cc(C2CC(=O)Oc3c2c(=O)oc2ccccc32)cn1
526
+ Cc1oc2c(C)c3oc(=O)c(CCC(=O)O)c(C)c3cc2c1C
527
+ CN1CC2CN(C(=O)N3CCOCC3)CCN2C(C)(C)C1
528
+ CCCCCC=CC=CC(=O)NCC(C)CC
529
+ O=C(O)CCCCCCCCCCCCC1CCCC1
530
+ CC12OC(=O)C1(C(O)C1C=CCCC1)NC(=O)C2CCCl
531
+ CCCCCCCCCCCCCCCCCCNC(=O)OCC1(COC(=O)N(Cc2cccc[n+]2CC)C(C)=O)CCCCCC1.[I-]
532
+ CC(C)=CCCC(C)(O)C1C(O)CC(C)(O)C1C
533
+ COc1cccc(NC(=O)NC2COC(CN3CCOCC3)C2O)c1
534
+ CCCNC(=O)NC1CC(COC)C(O)C1O
535
+ O=C(O)C1CC(O)C=N1
536
+ CCCCCC(=O)N1CCC(CC(=O)O)C(Cc2nc3ccc(C)cc3[nH]2)C1
537
+ O=C(C1CCCCC1)N1CCC(c2nnc(-c3cccnc3)o2)C1
538
+ COc1cc(O)c(C(C)=O)c2c1CC(C(C)(C)O)O2
539
+ CC(=O)c1c(C)cccc1O
540
+ C=CC(C)(C)C(=O)C(=O)CC(=O)c1ccccc1
541
+ CCOC(=O)C1=CCCN(C)C1
542
+ COc1ccc(-c2n[nH]cc2C(=O)NCC2CCCN3CCCCC23)cc1
543
+ CC(=O)OC1CC(O)C23C(=O)C(C)C1C2(C)C(C)CCC3O
544
+ C=C(C)C1CCC1C(=C)C
545
+ CCOC(=O)CN1C(=O)C2CCCCN2C(=O)c2ccccc21
546
+ CC(C)(C)OC(=O)N(CCCCCOCc1ccccc1)OCc1ccccc1
547
+ C=C1OC(=O)C(=CCCCCCCCCCCCCCCCC)C1O
548
+ CCCc1cc(C(=O)O)n(C)n1
549
+ C=CC=CCCCCCCCC#CC#CCO
550
+ CCc1cc(C)cc(C)n1
551
+ COc1ccc(C=C2C(=O)OC(=O)c3ccccc32)cc1
552
+ CC1OC(n2ccc(NC(=O)Cc3ccccc3)nc2=O)CCC1O
553
+ CC1=CCc2oc3ccc(C)cc3c2-c2oc(C)cc21
554
+ COC(=O)C(C)Oc1ccc2cc(-c3ccc(OC)cc3OC)c(=O)oc2c1
555
+ COC1(OC)CCC2(C)C(CCC3C4CCC(O)C4(C)CCC32)C1
556
+ O=C1C(=CCCO)C2OCC=CC2=C1O
557
+ CCCCCCCCC/C=C/CC/C=C/[C@@H](O)[C@H](CO)NC(=O)CCCCCCCCCCCCCCC
558
+ C=C1C=CC(C(C)C)CC1O
559
+ O=C(C1CCCC1)N1CCOCC(Oc2cnccn2)C1
560
+ CC1=CC2CC3(C=CC(=O)O3)C(C)(C)C2CC1
561
+ CCOC(=O)C=C(Br)Br
562
+ CCc1c(OC)cc2c(c1O)C(=O)C=CC2=O
563
+ COC(=O)C1(O)CC(O)C(OC(=O)C=Cc2ccc(O)cc2)C(O)C1
564
+ O=C(c1ccco1)c1coc2ccc(O)c(CN3CCCCC3)c12
565
+ CCCN(CCC)[C@@H]1Cc2cccc(O)c2C[C@H]1C
566
+ Nc1c(C=O)cc(C=O)c(N)c1N=O
567
+ CC1CC2CC(=O)C3CCCN4CCC(O)C2C34C1
568
+ Cc1cc2c(c3c1OCO3)C(C)CCC2C(C)CC(=O)CC(C)C
569
+ CC(C)(C)c1cc(C(=O)c2cccs2)cc(C(C)(C)C)c1O
570
+ CN(Cc1cccc(O)c1)C(=O)c1cc[nH]n1
571
+ O=c1c(-c2cc(O)cc(O)c2)coc2cc(O)cc(O)c12
572
+ Oc1cc2cnnc-2c[nH]1
573
+ CC=CC=CC=CCCC=CC=CC(=O)NCC(C)(C)O
574
+ C=C1C(=O)OC2CC3C(=CC12O)CCC1C(C)(C)CCCC31C
575
+ CO[C]1[CH][CH][C](C(O)[C]2[NH2+][CH][CH]N2C)[CH][CH]1
576
+ COc1ccc2c(=O)cc(-c3cc(OC)c(OC)cc3OC)oc2c1
577
+ COCCNC(=O)C1CN2CCC1CC2Cn1cc(CN(C)C)nn1
578
+ CC(C(=O)OC1C(O)C2CC(O)CC1N2C)C(O)c1ccccc1
579
+ COC1=Nc2ccc(NC(=O)CCCCCCC(=O)NO)cc2C(C)(C)C1
580
+ Cc1cnoc2ncnc1-2
581
+ CCOC(=O)c1cc(C)n(-c2ccccn2)n1
582
+ CC(C)[C@@H](NC(=O)[C@H](CO)NC(=O)CCC[C@H](N)C(=O)O)C(=O)O
583
+ Clc1ccc(CNCCC(c2ccccc2)c2ccc3c(c2)OCO3)cc1
584
+ CC1=CC(=O)C(C)(C2(C)CC(CO)=CC2=O)CC1
585
+ CCC(C)CN=C(O)C=CCCCCC=Cc1ccc2c(c1)OCO2
586
+ CC(=O)C1CC2(O)C3(C)COC2(C)CC1(O)C3
587
+ c1ccc(OC(CC2CNC2)c2ccccc2)cc1
588
+ Clc1ccc(-c2cn3cc(I)ccc3n2)cc1
589
+ CC1(C)CN(S(C)(=O)=O)CC2CN(C3COC3)CCN21
590
+ O=C1OC(O)C2C(CO)=CCC2C1CO
591
+ C=C1OC(=O)C2(C(C=CC=CCCC)C=CC(O)C2O)C1O
592
+ COc1cccc(-n2c(C)nc3ccc(OCC(C)=O)cc3c2=O)c1
593
+ NC(CCC(=O)O)C(=O)O
594
+ CCCCCCCCCCCCCCCCCCCCCCCC(O)CCCCO
595
+ C=C(C)C1=CCC2(C)C(OC(=O)C(C)=CC)CCC(C)(O)C2C1
596
+ CCCc1c(C)c2cc3c4c(c2oc1=O)CCCN4CCC3
597
+ CCCCc1oc(CCc2ccc(O)c(OC3CCCC3)c2)cc1CO
598
+ NS(=O)(=O)c1cccnc1[N+](=O)[O-]
599
+ CCC=CC#CCCCCCCCCCCOC(C)=O
600
+ CCCCCCCCC(C)CCC
601
+ COc1cc(O)cc2c1-c1ccc(O)cc1CC2
602
+ O=S1(=O)Cc2ncc3ccccc3c2C1
603
+ Cn1sc(=O)n(-c2ccc(F)cc2)c1=O
604
+ CC(=O)NC1=C2CCCN3CCCC4C(C1)CC(C)CC243
605
+ COC(=O)C1(C(C)OC)CC(C)C(C)(OC(C)=O)C(=O)O1
606
+ N=Nn1cc2cccc-2o1
607
+ Cc1nc(C)c(-c2ccnc(N)n2)s1
608
+ CC(=CC=CC=O)CO
609
+ CCCCCCCCOC(=O)c1ccccc1C(=O)OC
610
+ CC(=O)OC1CCC(C)(CCC=C(C)C)C2CC=C3COC(O)C3C12C
611
+ CCCOCCN(C(=O)CCl)c1c(CC)cccc1CC
612
+ O=C(C=Cc1ccc(O)cc1)CCCCc1ccc(O)cc1
613
+ Nc1nc(NCC(O)CO)c(Cl)nc1[N+](=O)[O-]
614
+ CC1CC(C)C2c3c(ccn(O)c3=O)OC(C)(C1)C2C
615
+ c1cc(-c2c[nH]c(C3COCCN3C3CCC3)n2)ccn1
616
+ O=C(CC1NC(=O)c2ccccc2NC1=O)NCc1ccco1
617
+ COc1ccc(CCNC(=O)NC(C(=O)O)C(C)C)cc1
618
+ C=CCCC(=O)C=CC1C(C)=CCCC1(C)C
619
+ CC1=C2C(=O)C=C(C(=O)O)C2C2OC(=O)C(C)C2CC1
620
+ CC(O)CCc1ccc(C(=O)O)nc1
621
+ CC(=CCCC(C)=CCC=C(C)C1CC=C(C)CC1)CO
622
+ COc1cc(CC(C)=O)c2c(=O)cc(C(=O)CCO)oc2c1
623
+ C=CCNC(=O)CC1CCN(C(=O)c2ccccc2)CC1CC
624
+ CC(=O)OCc1cc(O)c2c(c1)C(=O)c1cc(O)cc(O)c1C2=O
625
+ COc1cc(CCC(O)CCCCc2ccccc2)ccc1O
626
+ CC1=C(CC2C(C=O)=C(C)C3CC32)C(=O)OC1=O
627
+ C[S+](C)(=O)CCO
628
+ CC(C#N)C(C)OC1OC(CO)C(O)C(O)C1O
629
+ CCCCCC1(O)C(=O)C(C)(C)C(=O)C(C)(C)C1O
630
+ CC(NC(=O)C1CC1)c1onc(-c2ccc(F)cc2)c1C(=O)O
631
+ CC1(C)SC2C(NC(=O)C3(N)CCCCC3)C(=O)N2C1C(=O)O
632
+ Cn1c(=O)c2c(O)cc(=O)oc2c2ccccc21
633
+ COc1ccc(CNC2(Cc3cc(CC(C)C)on3)COC2)cc1
634
+ N=c1cccc2oncn12
635
+ C=C(C)C1Cc2nc(N)nc(C)c2C1
636
+ C=CCN1C(=O)C(C(=O)Nc2ccccc2)C2CC1(C)Oc1ccccc12
637
+ COc1c(OC)c(OC(C)=O)c2cc(C)ccc2c1OC(C)=O
638
+ Cn1c(=O)c2nc(O)[nH]c2n(C)c1=O
639
+ Cc1cccc2c1CCc1cc(C(C)C)ccc1-2
640
+ CCCCCCc1c(C)c2cc3c(C(C)(C)C)coc3c(C)c2oc1=O
641
+ Nc1cccc(NC(=S)Nc2cccc(N)c2)c1
642
+ C#CC[C@@H](N)C(=O)O
643
+ O=C(NC1CCCCC1)OC1COC2C(NC(=S)Nc3ccc(F)cc3)COC12
644
+ c1ccc2[nH]nnc2c1
645
+ COc1c2ccccc2nc2oc(C(C)(O)COC(=O)c3ccccc3)cc12
646
+ Cc1cc(C)c(CC(=O)c2cc3ccccc3o2)c(C)c1
647
+ CN1CCc2cn(C)c3c2C1CC(=O)C3=O
648
+ CC(=CCCc1ccoc1)CCC=C(C)CCCc1ccoc1
649
+ CN1CCC23c4c5ccc(O)c4OC2C(=O)CCC3C1C5
650
+ O=C1c2ccccc2C(=O)N1N1C(=O)c2ccccc2C1=O
651
+ CC(=O)c1cnc(C)cn1
652
+ CC(=O)OC1C=C2COC(O)C2(O)C2(C)CCC(O)C(C)(C)C12
653
+ CCOc1ccc2c(C)c(-c3cccc(Cl)c3)c(=O)oc2c1
654
+ CC1=C(Cn2c3ccc(Br)c(=O)c-3nc3ccccc32)C(O)CC(C)(C)C1
655
+ Cc1ccccc1CC(C)C=O
656
+ COc1cccc(CNC2CC(COc3cccc(C)n3)C(O)C2O)c1
657
+ CC(C)COC(=O)Cc1cc(O)cc2c1C(=O)CC(CC(C)O)O2
658
+ CCCCCCCCCCCCS(=O)(=O)N(C)[C@@H]1CCN2CCc3ccccc3[C@@H]2C1
659
+ COc1cccc(CN2CC(F)C(OCc3nc4ccncc4[nH]3)C2)c1
660
+ OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO
661
+ O=C(Nc1ccc(Cl)cc1Cl)C1CC2CC3CCC1C2C3
662
+ CCCC(C)OC(=O)C(C)CCC
663
+ COc1ccc(CCN2C(=O)NC(CC(=O)NCCO)C2=O)cc1
664
+ CCOC(=O)c1c(-c2ccc(OC)cc2)oc2ccc(OC)cc12
665
+ ON=Cc1ccc(O)c(O)c1
666
+ CC1=CCCC(C)(O)C=CC(C(C)C)CC1
667
+ COC(C)(CO)C1CC=C(/C=N/O)CC1
668
+ CC(C)CCC(CCNCc1ccc(OC(C)C)cc1)C1CCOC(C)(C)C1
669
+ CCCC=Cc1cc2c(O)ccc(O)c2c(=O)o1
670
+ CC1CC(=O)Oc2cc(O)ccc21
671
+ CC(=O)NCC1OC(CC(=O)NCCN2CCOCC2)C(O)C1O
672
+ CCOC(=O)Cc1c(C)c2ccc(OC)c(OC)c2oc1=O
673
+ CC(=O)c1cc(CC2CNCCC2CC(=O)N(C)Cc2ccccc2)no1
674
+ CC12CCC3(CC1Cl)C(=CC(=O)CC3(C)C)C(=O)O2
675
+ O=C(O)C1OC(OCC(Cl)(Cl)Cl)C(O)C(O)C1O
676
+ CC1C=CCCC=CC=CCC=CCCC(=O)O1
677
+ COC(=O)c1ccccc1NC(=O)Cc1ccccc1C(=O)O
678
+ C=CCCCCC=CC#CC#CC=CC#CC=O
679
+ Nc1cnn(-c2ccccc2)c(=O)c1Cl
680
+ CC=C(C)C(O)C(C)C=C(C)C=CCC(C)=CCC(=O)NC(C)CC(=O)O
681
+ CCCN(CCC)C(=O)C1CC(=O)OC12CCOC(C)(C)C2
682
+ Cc1ccc(C(=O)c2cc(N)ccc2N2CCC(C)CC2)c(C)c1
683
+ O=C(NC(=O)C(F)(F)F)Nc1ccc(I)cc1
684
+ CCCCCC=CCC=CCCCCCCCC(=O)OC(COCCCCCCCCCCCCCCCCCCCC)COP(=O)(O)OCCN
685
+ C=CC1(O)CC(NC=O)=CC1=O
686
+ COc1ccc(-c2nnc(-c3nsc4ccccc34)o2)cc1
687
+ CCc1cccc(C=O)c1
688
+ c1ccc(Cc2nnc(C3CCN(C4CCC4)C3)o2)cc1
689
+ Cn1c(=O)ccc2c(NC/C=C/C#CC(C)(C)C)cccc21
690
+ CCCCCCCCCC(=O)CC(=O)NC1CCOC1=O
691
+ Cc1nc(N)ncc1CNC(=O)Cn1c(C)cnc(NCCc2ccccc2)c1=O
692
+ CNC1=NC(=Cc2c[nH]c3ccccc23)C(=O)N1C
693
+ COCCOc1ncccc1-c1noc(C2CCCN2C)n1
694
+ NCCCCOc1ccccc1CCc1ccccc1
695
+ CN(CCc1ccccc1)c1cnc2nc(N)nc(N)c2c1
696
+ C=CC(C)=CCC1(C)C(C(=O)O)=CCCC1C
697
+ CN(C)c1nc(N)nc2c1ncn2CC(=O)O
698
+ CCCCCC(O)CC(=O)CCCCCC(O)CCO
699
+ COc1cccc2c(=O)c(=O)c12
700
+ O=C(O)Cc1ccccc1Sc1c(Cl)c(Cl)cc(Cl)c1Cl
701
+ CCCCCCCCCCCCOc1cc(C(N)=O)cc(C(N)=O)c1
702
+ CC(C)CC(=O)OCC1(CO)CC(=C(C(C)C)C(C)C)C(=O)O1
703
+ O=C1CCC2=C1C1=CCCOC1OC2
704
+ CCNC(=O)NC1CC(Cc2cc(C(C)(C)C)on2)C1(C)C
705
+ Brc1ccccc1Nc1nc2ccccc2n2ccnc12
706
+ C=CC1(CO)CCC(C(C)(C)O)CC1C(=C)C
707
+ COc1ccccc1S
708
+ CC1[C@H]2Cc3ccc(NC=O)cc3[C@]1(C)CCN2CC1CC1
709
+ CN1CCSCCN(C)CCSCC1
710
+ Cc1c2c(cc3c1C(=O)CC3)C(=O)OCC2
711
+ COc1ccc(C2(C(=O)NC(CO)C(C)C)CCOCC2)cc1
712
+ CC1CCC23CC(=CCCC2C1(C)CCc1ccoc1)C(=O)O3
713
+ CC(=NNC(N)=S)C(=S)Nc1ccccc1
714
+ C=CCCCCCCCO
715
+ CC1CC(=O)C(CC(=O)O)C1C[NH+]([O-])O
716
+ CC1Oc2ccccc2C=C1C=O
717
+ COC(=O)C(Cc1ccc(O)cc1)NC(=O)c1ccc(OCC(C)C)cc1
718
+ O=C1N=C(c2c[nH]c3ccccc23)C(=S)N1
719
+ Cc1cccc2cc[nH]c12
720
+ CC1(O)CCC2C(C=CC3(C)OCCC(O)C23C)C1
721
+ CCN(CCCCCCO)C1CCc2cc(OC)ccc2C1
722
+ CC1(C)CC(C(=O)N2CCc3[nH]c4ccc(Cl)cc4c3C2)CCO1
723
+ CCCC(NC(=O)c1ccccc1)c1nc2ccccc2[nH]1
724
+ CC(NCc1c(O)ccc2c3c(c(=O)oc12)CCC3)C(=O)O
725
+ CCCCCCCCCCc1ccccc1S(=O)(=O)O
726
+ CCCCCC(O)C=CC1C(O)CC(=O)C1CC(=O)CCCCC(=O)O
727
+ CCCC(=O)C1C(=O)OCC1CO
728
+ NC(=O)C(c1ccccc1)(c1ccc(F)cc1)c1ccccc1F
729
+ CSC=CC(=O)N(C)CCc1ccccc1
730
+ CC(C)C(C)C(C)(C)C
731
+ CN(C/C=C/c1ccccc1)Cc1ccc2c(c1)OCCO2
732
+ CCCCC=C(C)C=CC1=C(C)C(=O)CCC1(C)C(=O)O
733
+ O=C(NCC1OCC(NCc2ncc[nH]2)C1O)C1CCCC1
734
+ CCCCCC=CCCCCCCCCCCCCCCCC(O)C(O)C(CO)NC(=O)C(O)CCCCCCCCCCCCCC
735
+ COCCNC1C2OCC(O2)C(NCc2ccccn2)C1O
736
+ CC(N)C(O)C=Cc1ccccc1
737
+ CC(=O)NC(CCCC#CC=CCl)CCCC=CC=C(Cl)Cl
738
+ O=C(CCC(=O)OCC(F)(F)F)NC(=S)Nc1cc(Cl)ccc1Cl
739
+ COc1cc2c(cc1O)C1Cc3cccc(O)c3CN1CC2
740
+ CCCCCCCCCCCCCCCCC(C)CO
741
+ COc1ccc2c(c1)CCc1ccc(OC)c(OC)c1-2
742
+ CC(C)CC1NC(C)SC(C)S1
743
+ C=C1CCCC2(C)CCC3(O)OC12C(O)C3(C)CCC(=O)C(C)C
744
+ COc1cc2c(c(O)c1CC=C(C)CCC=C(C)C)C(=O)N(CCc1ccccc1)C2
745
+ CC1C=CCC2(C)C=CC(C(C)(C)O)CC12
746
+ COc1cccc2c1C(=O)C(O)=CC2=O
747
+ COC(=O)C(=CNCCc1c[nH]c2ccccc12)[NH+]([O-])O
748
+ N#CC(=NNc1ccc(Cl)cc1)c1nc2ccccc2s1
749
+ O=C(O)CCC(=O)NC1OCC=C1CO
750
+ CC1CC(O)CC(C)(C)C1CCC(O)CO
751
+ C=C1CCC2C(C=C(C)C(O)CC1O)OC(=O)C2C
752
+ CC1(C)Cc2cc(C(=O)c3ccccc3)ccc2OC1=O
753
+ CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCCO
754
+ O=C(NCCCc1ccccc1)[C@@H]1CCCN1S(=O)(=O)Cc1ccccc1
755
+ C=CC[C@@H](CC/C(C)=C/C=C/CCCO)OC
756
+ COc1ccc(NC(C)=O)cc1
757
+ CC1=CC(O)C(C(C)C)C(OC(=O)C=Cc2ccc(O)cc2)CC(C)=CCC1
758
+ COc1cc(OCC(O)CO)ccc1O
759
+ CC(=CC1CCC(C)C2C1=C(C)CC2O)C(=O)O
760
+ CC1=CCC(C(C)C)c2cc(C)ccc21
761
+ CC1=CC(=O)C(=C(O)C=Cc2ccccc2)C1=O
762
+ CC1CCC2C(C)(C)CCCC23Oc2ccccc2CC13C
763
+ CC(=O)NCCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CN
764
+ CCC=CCC=CCC=CCC=CCC=CCC=CC(=O)O
765
+ O=C(O)Cn1c(=O)c2ccc(F)cc2n(Cc2ccc(Br)cc2F)c1=O
766
+ CC(=O)OCc1ccc(C(C)(O)COC(=O)C(C)C)c(O)c1
767
+ CC(C)CC=CC1(C)OCC23CCC4C(CCC5CC(=O)C=CC54C)C2CCC13
768
+ COCCC(=O)N1Cc2c(ncn2Cc2ccccc2)CC1C(=O)OC
769
+ CNC(=O)C(C)(C)N1CCCC1C(=O)N1CCCCCC1
770
+ CC(C=CC1=C(C)CCCC1(C)C)=COC1C=C(C)C(=O)O1
771
+ CNCC(C)(O)c1ccccc1
772
+ CCCCCCCC=CC(=O)NCCc1ccccc1
773
+ COC(=O)CCCCCCc1ccc(OCCOCCO)cc1
774
+ COc1ccc(-c2cc(=O)c3c(O)c(C)c(O)c(C)c3o2)cc1
775
+ COc1cccc2c1CCCC2CCCCN1CCN(C2=NCCCC2)CC1
776
+ CC(C)NCC(O)c1cc(O)cc(O)c1
777
+ CC(=O)CC1OC(C)C(C)c2c(C)c(O)cc(O)c21
778
+ CC(CCc1ccccc1)NCC(O)CNC(C)C1COc2ccccc2O1
779
+ CCCCOC(=O)CC1(O)C(=O)OC1C(=O)OCCCC
780
+ CC(C)CC12NC(=O)C3(O)C(=O)CC(Cc4ccccc4)(OC13)O2
781
+ COc1c(-c2ccccc2)oc2cc3occc3cc2c1=O
782
+ C#CC=CCC(OC(C)=O)C1CC=CCC(Br)C(CC)C1
783
+ COC(=O)C(CCSC)NC(=O)c1cc(C(C)C)nc2ccccc12
784
+ C=C(OC1C=C(C(=O)OC)C(O)C(O)C1O)C(=O)OC
785
+ O=C(O)C1CCN2CC(O)CC12
786
+ CN(CC#CCCC1SCCCS1)Cc1cccc2ccccc12
787
+ CCCCCCCCCCC1CCCCC1
788
+ Cc1ccc(N)c(S(=O)(=O)O)c1
789
+ O=C(Oc1ccc2c(=O)c(-c3ccccc3)coc2c1)c1ccco1
790
+ COc1ccc(NC(=O)N2CCc3c([nH]c4ccccc34)C2)cc1
791
+ COc1ccc(C(=O)NC2COC3C(OC(=O)Nc4ccccc4)COC23)cc1
792
+ Nc1ccc(C(=O)OCCCOC(=O)c2ccc(N)cc2)cc1
793
+ CC(=O)C12OC1(C)CC1C3CC=C4CC(O)CCC4(C)C3CCC12C
794
+ CC(=O)NC(CCCC#CC=CCl)CCCC#CC=C(Cl)Cl
795
+ CC(C(O)c1ccccc1)N(C)CCO
796
+ CCCCCC(O)CCCCCC(=O)O
797
+ CC=C(C)C(=O)OC1CCC2CC3OC(=O)C(C)=C3C3OCC1C23C
798
+ C=C1CCC2CCC3C(O)=NC(=CC(C)CC(O)C1)C3(O)C2(C)C
799
+ COC1OC(OC2C(O)C(O)C(O)C(O)C2O)C(O)C(O)C1O
800
+ CC(CCC(C=O)(OO)C(C)C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
801
+ Cc1ccc(C)c(NC2=CC(=O)CC(C)(C)C2)c1C#N
802
+ CC1(CO)CCCC23COC4OCC(=CCC12)C43
803
+ CC1=CCCC2(C)OC2C(O)C(C(C)C)C(=O)C1
804
+ CC(C)(Oc1cccc(CCCCCCCCO)c1)C(=O)O
805
+ CC#Cc1ccc(-c2ccc(C(=O)C(O)C(C)=O)s2)s1
806
+ CC(N)C(=O)N1CCCC1C(=O)OCc1ccccc1
807
+ COC(=O)c1ccccc1N1C(=O)c2ccccc2C1=O
808
+ CC1(C)CCc2cc(CC(=O)Cc3ccc(O)cc3)ccc2O1
809
+ O=C(O)CC(O)(Cc1ccccc1)Cc1ccccc1
810
+ CC(=O)C=CC1C(C)=CC(O)CC1(C)C
811
+ C=C(C)C(O)Cc1c(O)ccc(C(=O)C=Cc2ccc(O)c(CC=C(C)C)c2)c1O
812
+ CCCCC1C(C)CCC2C1(C)CCC1C(C)(C)CCCC12C
813
+ O=C(C=Cc1ccccc1)c1cc2cc(Br)ccc2oc1=O
814
+ O=C(O)CCCNC(=O)OCC1c2ccccc2-c2ccccc21
815
+ CCC(C)CCC1C(COC2OC(C(O)CO)C(O)C2O)CCC2C(C)(C)CCCC12C
816
+ NC1=CC(=O)c2ncccc2C1=O
817
+ C=CC(C)(O)CCC1(C)C2=CCCC(C)(C)C2CCC1C
818
+ O=C(O)CN1C(=O)[C@@H](NC(=O)[C@@H](CS)Cc2ccccc2)CCc2ccccc21
819
+ CCC(C)C(O)C(=O)O
820
+ CN(C)Cc1cn(CC2CC3CCN2CC3C(=O)N(C)C)nn1
821
+ C=CC(C)=CCC1C(=C)CCC2C(C)(C(=O)O)CC(O)CC12C
822
+ O=C(O)c1ccc(COc2ccc3ccc(=O)oc3c2)o1
823
+ CNC(=S)N(C)CCc1cc2c(c(OC)c1C=NO)OCO2
824
+ O=C1CC(O)Cc2cc(O)cc(O)c21
825
+ OCC(NCC1NCC(O)C1O)c1ccccc1
826
+ CC(C)(O)C=CC1=CC(O)C(O)C(O)C1O
827
+ CC(C)(C)NC(=O)CCl
828
+ COc1ccc(Cn2cccc2/C=C/C(=O)CC(=O)C(=O)O)cc1
829
+ CC(=O)Oc1cc(C)c(O)cc1CC=C(C)CCCC(C)=CC(=O)CC(C)C
830
+ CC(=O)c1cc(O)c2c(c1)OC(c1ccc(O)cc1)CC2=O
831
+ CCCCCCCCCCCCCCCNC(=O)C1CCCCC1
832
+ COc1ccc(CCNC(=O)Nc2cccc3[nH]ccc23)cc1OC
833
+ CCN(CC)C1C2OCC(O2)C(NCc2ccc(CO)o2)C1O
834
+ OC(CCC(c1ccc(F)cn1)N1CCNCC1)c1ccc(F)cc1
835
+ C=C(CCC=C(C)CO)C1CCC(C)(O)CC1
836
+ CC1OC(=O)C(O)C1OC(=O)C=Cc1ccc(O)c(O)c1
837
+ CCCCC1Cc2ccccc21
838
+ Cn1ccnc1CN1CC(F)C(OCc2nc3ccncc3[nH]2)C1
839
+ CC1=CCCC(C(=O)O)=CCc2cc(ccc2O)OC(C)(C)C(O)CC1
840
+ Cc1cc2c(c(CO)c1CCCl)CC(C)(C)C2O
841
+ OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCO
842
+ C=Cc1cc2c(cn1)C(=O)OCC2
843
+ C=CCNCc1ccccc1
844
+ CCC(C)C(=O)OC1C(O)C2CC(O)CC1N2C
845
+ CC(=O)OC1CCC2(C)C3CCc4ccoc4C3(C)CCC2C1(C)C
846
+ CC(C)(O)C1Cc2c(ccc(C(=O)CCc3ccccc3)c2O)O1
847
+ O=CCc1ccc(O)c(O)c1
848
+ CCCCCCCCCCCCC(Br)CBr
849
+ COC(=O)COc1cccc(OCCNCC(O)COc2ccccc2F)c1
850
+ O=c1cc2cc[nH]n2cn1
851
+ NCCC[Se](=O)O
852
+ O=C(Nc1cccnc1)N1CCc2ccccc21
853
+ O=C(O)CCNC(=O)CCn1ccc2c(Br)cccc21
854
+ CCCCCC(=O)CC(O)CCc1ccc(OC)c(OC)c1
855
+ CC(C)=CCCC(C)=C1C=C2C(C)CCCC2(C)CC1
856
+ COc1ccc2c(c1)C(=O)c1c-2ncc(OC)c1C
857
+ CC(C)(C)CN1CCC(c2nnc(-c3ccccc3)o2)C1
858
+ O=C1c2c(O)cc(O)cc2OC(c2cc(O)ccc2O)C1O
859
+ OCc1cc(O)c(O)c(Br)c1
860
+ CC=C1C(=O)CC2C3CC=C4C(O)C(O)CCC4C3CCC12C
861
+ COC(=O)Cc1nc2cc3ccccc3cc2[nH]c1=O
862
+ CC(=O)Oc1ccc2oc(=O)c(-c3ccccc3)c(C)c2c1
863
+ CCCCCC=CCC(O)C=CC(O)C1CC1C1CCCCCC(=O)O1
864
+ Cc1cc2c(C(C)C)cc(O)c3c2c(c1O)OC3=O
865
+ CCCCCC(O)CCCC(=O)OCC(O)CO
866
+ CCCCCCCC/C=C\C/C=C\C=C\SCCCC(=O)O
867
+ CC1=CCC(C(C)C)C2C=C(CO)CCC12
868
+ CC(C)(O)CCc1c(-c2ccc(O)cc2O)oc2cc(O)cc(O)c2c1=O
869
+ O=c1c2cccc(O)c2nc2n1-c1ccccc1C(O)=NC2
870
+ Cc1cccc(-c2nn3c(-c4ccco4)nnc3s2)c1
871
+ COc1ccc2[nH]c3c(c2c1)CN(C(=O)CCC(=O)NCc1ccccc1)CC3
872
+ CCCNC(=O)N1CC2NS(=O)(=O)c3ccccc3OC2C1
873
+ COc1ccc(-c2ccc3cccc4c3c2C=C(O)C4=O)cc1
874
+ O=C(O)CCNC(=O)C(=Cc1ccc(O)cc1)NC(=O)c1ccccc1
875
+ COc1cc(C=CC(=O)NCCCCN=C(N)N)cc(OC)c1O
876
+ C=C1CCC(=O)C2(C)CCC(O)(C(C)C)CC12O
877
+ CCCCCCCCC(=O)CC(=O)NC1CCOC1=O
878
+ CC1c2[nH]c3ccccc3c2C(=O)C(O)C1C
879
+ CSCC(C=O)=Cc1ccccc1
880
+ C=C1CC(O)C=C(C)CCC(C(C)C)C(O)C1OO
881
+ COc1cc(CCC(=O)CCCCc2cccc(O)c2)ccc1O
882
+ CC1=C2CC34OC3(C)CC3OC34C(C)CC2OC1=O
883
+ CCCCCCSc1cc(C(N)=O)cc(SCCCCC)n1
884
+ CC(NC(=O)Cn1ccc2c(Br)cccc21)C(=O)O
885
+ COC(=O)c1ccc(OC)cc1O
886
+ COC(C#N)C(Oc1ccccc1)c1ccccc1
887
+ CC(C)NC(=O)NC1CCN(C(=O)CC(C)(C)C)C1C(=O)N(C)CC(N)=O
888
+ CC1=CC2C(=C(C)C)C(=O)CC(C)(O)C2CC1
889
+ c1ccc(N(Cn2nnc3ccccc32)c2ccccc2)cc1
890
+ CC(C)=CCc1c(CCCc2ccc(O)cc2O)cc(O)c2c1C=CC(C)(C)O2
891
+ COc1ccc2c3c1OC1C(=O)C[C@H](C)C4C(C2)N(C)CCC314
892
+ Cc1c(-c2cccc(Br)c2)c(=O)oc2ccc(Br)cc12
893
+ Cc1cc(C)c(CCP(=O)(O)O)c(C[C@H](N)C(=O)O)c1
894
+ CNCC(O)c1ccc(O)c2c1CCC(C)(C)C2
895
+ C=CC(C)(C)c1ccc(OC)c(C=CC(=O)c2ccc(O)cc2)c1
896
+ CC(=O)OCC=C(C)CCC1C(C)=CCC2C(C)(C)C(O)C(=O)CC12C
897
+ COC(=O)CC1CCC(=O)Oc2ccc(C(=O)O)cc2N1
898
+ COc1cc2c(c(O)c1OC)COC(C)C2
899
+ CCCCCC=CC1=C(CO)C2OC(=O)OC2C(O)C1O
900
+ COC(=O)CC(O)C1OC(c2ccccc2)C(O)C1O
901
+ CC(C)(C)c1ccc(-c2cc(CC3CNCCC3CC(=O)NC3CCCCC3)no2)cc1
902
+ CC1=CC=C2C(C)=CCC(C(C)C(=O)O)C=C12
903
+ COc1ccc2c(=O)c3ccoc3n(CCC(C)(C)O)c2c1
904
+ NCC(O)c1cccc2c1Cc1ccccc1-2
905
+ C=CCCCCCCCCCC#CCCCCC(=O)OC
906
+ Cc1ccc2nc(C3CCN(C)C3)[nH]c2c1
907
+ CC(C)=CCCC(C)C1CCC(C)(O)C2CC=C(C)C2C1O
908
+ COc1ccc(-c2coc3cc(O)c(O)c(O)c3c2=O)cc1O
909
+ CCc1nc(C)c(C)s1
910
+ Cc1cc(C)nc(NC(=O)c2cc(Cl)ccc2O)n1
911
+ CSc1cccc(N=C=S)c1
912
+ CCCCCC=CCC=CCC=CCCCCC(=O)OC(CO)COC(=O)CCCCCCCC=CCCCCCCCC
913
+ C=C1CCC2C(CN3CCOCC3)C(=O)OC2C2C1CC1OC12C
914
+ O=C(O)CCC(=O)C1COc2ccccc2O1
915
+ CC1C=C2CCC3C(C)(C(=O)O)CCCC3(C)C2CC1
916
+ C=C1CCN2CCCC12
917
+ COc1cc(C)c(OC)c2c1OCO2
918
+ C=CCCCC(C)C1CCC(C)=CC1=O
919
+ Cc1nc(NC(=O)c2ccc(C(C)(C)C)cc2)c(C)c(C)c1O
920
+ CC(C)c1ccc(NC(=O)OC2COC3C(NCC4CCCCC4)COC23)cc1
921
+ COc1ccc(CCNC(=O)NC(CC(C)C)C(=O)O)cc1OC
922
+ COc1ccccc1-c1nnc2n1NC(c1ccccc1)S2
923
+ COc1cc(C(CC(C)C)NC(C)=O)oc(=O)c1
924
+ C=CC1=CC(O)([NH2+][CH2-])C(=O)C1
925
+ CN(CC/C=C/c1ccccc1)Cc1cccc2ccccc12
926
+ CC(=O)C=C1CC(=O)Nc2ccccc2N1
927
+ C#CC1Cc2sccc2C(N)=N1
928
+ CC(C)C1CCC2(C)CC3OOC12C(O)C=C3C(=O)O
929
+ CC(=CCOc1cc2oc(=O)ccc2cc1O)CO
930
+ COc1ccc2[nH]cc(CCNC(=O)c3cccc4c3ccn4C)c2c1
931
+ CN(C)C=CC(=O)C1C(=O)CCC1=O
932
+ OCC#CCSc1nc2ccccc2o1
933
+ O=C(O)c1cc(O)c(Cc2c(O)cc(C(=O)O)cc2O)c(O)c1
934
+ CC(C)(O)C1CC=C(CO)CC1
935
+ CCC1CN(C(C)=O)CCC1CC(=O)NCc1cccc2ccccc12
936
+ CCC(NC(=O)OC)(C(F)(F)F)C(F)(F)F
937
+ Cc1nnc(NC(=O)c2cc(Br)cc(Br)c2O)s1
938
+ Cc1cc(=O)c2c(O)cc3oc(=O)c4cnccc4c3c2o1
939
+ COc1ccccc1N1CCN(CCCCNC(=O)C(C)c2ccccc2)CC1
940
+ Cc1cc(C)c2oc(=O)cc(O)c2c1
941
+ C=C1CCCC2C1(C)CCC(C)C2(C)CC1=C(O)C(=O)C=C(O)C1=O
942
+ CC(O)C1=Nc2c(nc(N)[nH]c2=O)NC1C
943
+ CNC(=O)C(C)(C)N1CCCC1C(=O)N(C)Cc1ccccc1
944
+ C=C1CC(O)CC2(C)CC3OC(=O)C(C)(O)C3CC12
945
+ CC1=C(C=O)CC2C1(C)CCC1C(C(=O)O)C(C)CCC12C
946
+ CC(CCc1ccc(O)cc1)NCCc1ccc(O)c(O)c1
947
+ O=C(O)CCCC(O)C=CC=CCCC(O)CC=CCCCCC(O)O
948
+ CCN(CC)CCNC(=O)c1cc(Cl)c(N)cc1OCC(OC)OC
949
+ CC(C)=CCOc1ccc2ccc(=O)oc2c1CC=C(C)C
950
+ CC(=O)C(O)Cc1ccccc1
951
+ O=c1cc(-c2ccccc2)oc2ccc(OCCCCCCN3CCCCC3)cc12
952
+ CN1C(C(=O)O)CSC1C1CSC(c2ccccc2O)=N1
953
+ CN1CC2(C)CN(CC(N)=O)CC(C)(C1)C2
954
+ COc1ccc(-c2c(O)cc3c(c2OC)OCO3)cc1
955
+ CC1=C(Cn2c3ccc(Br)c(=O)c-3nc3ccccc32)CCC(C)(C)C1
956
+ c1cc(-c2ccno2)[nH]n1
957
+ C=C(C)CC(C)=O
958
+ COC(=O)C1=CCC(C)CC1
959
+ CC1CC=CC2(C)C1C=C(CCCO)C1CCCCC12
960
+ COc1cccc(CN[C@@H](Cc2ccccc2)C(=O)OC(C)(C)C)c1
961
+ COc1ccc2c(=O)c(-c3ccc(Cl)cc3)coc2c1C
962
+ C=C1CCCC(C)(C)C1CCC(C)=CC(=O)OCC(O)CO
963
+ Cc1ccc(NS(=O)(=O)c2ccc(N)cc2)nn1
964
+ C=C1CCCC2(C)C1CCC13CC(CCC12)C(C(=O)O)C3
965
+ CC(N)C(O)=NC(CC(Cl)C1(O)CN=C1O)C(=O)O
966
+ CC1CC2OC(=O)C(C)C2C(O)C2(C)C(=O)C=CC12
967
+ CC(C)c1ccccc1C(C)(C)C
968
+ O=C(CC1OC(CNCc2ccccn2)C(O)C1O)N1CCN(c2ccccc2)CC1
969
+ COBN1CCC(Cl)C1COCc1ccc(C(=O)OC)s1
970
+ CC(=O)OCC1CC(O)C2(C)C(CO)=CCCC2C1(C)CCC(C)CCO
971
+ CC(O)c1c(-c2ccc(Cl)cc2)noc1C(=O)NC1CCCC1
972
+ O=C(O)CCCNc1ccccc1C(=O)O
973
+ CCC(=O)c1ccc(OC)c(OC)c1
974
+ CCCCCCCCC1(c2ccncc2)CCC(=O)NC1=O
975
+ Oc1[nH]cc2ncncc12
976
+ CNC(=O)[C@@H](NC(=O)[C@H](CCCc1ccccc1)CC(=O)NO)C(C)C
977
+ CC(=O)CC(C)=O
978
+ CC=CCC=CCCCCCCCCOC(C)=O
979
+ C[C@H]1C(=O)CCC2C1CCC1(C)C(O)CCC21
980
+ c1ccc2c(c1)c1[nH]ccc3cc[nH]c2c31
981
+ O=Cc1ccc(NCc2ccc(F)cc2)cn1
982
+ CCOC(=O)C1C(=O)C(=O)Nc2ccccc21
983
+ COCCOC(=O)c1c(C)oc2ccc(OCC=C(C)C)cc12
984
+ CCC=C(C)c1ccc(C(C)O)c(=O)o1
985
+ CC(C)C(C)CCC(C)C1CCC2C3CCC4C(C)CCCC4(C)C3CCC12C
986
+ O=C(O)COc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
987
+ O=C(O)C(Cc1c[nH]c2ccccc12)NS(=O)(=O)c1ccccc1
988
+ CC(=O)N1c2ccccc2C2C1CC1CCC3C(C)(C)C(O)CCC3(C)C12C
989
+ CC(C)CCN1CC2Oc3ccccc3C(=O)N(C)C2C1
990
+ CC1(C)CCCC2(C)C1CC(O)C13C=CC(CC21)C(O)(CO)C3
991
+ Cc1ccc(CCNc2ccccc2)cn1
992
+ CC(=O)C(C)Cc1ccc2c(c1)OCO2
993
+ COc1ccc(OC)c(-c2cc3ccc(OC)cc3oc2=O)c1
994
+ COc1ccc(-c2oc3c(=O)cc(C)oc3c2CC(=O)NCCn2ccnc2)cc1
995
+ CCOC(=O)C(=NNc1cccc([NH+]([O-])O)c1)C(C)=O
996
+ N#Cc1cccc(CNC2C(c3cccnc3)CC(O)C2O)c1
997
+ COCCOC(=O)c1c(C)oc2ccc(OCC(N)=O)cc12
998
+ Nc1ncnc2[nH]cc(Br)c12
999
+ CC(C)(O)C(O)Cc1ccc2c(c1O)COC2
1000
+ Clc1ccc(-c2c(Cl)cc(Cl)c(Cl)c2Cl)cc1Cl
1001
+ c1ccc(Sc2ccccc2)cc1
1002
+ COC(=O)C(O)C(N)C(=O)O
1003
+ O=c1[nH]c2sccc2c(=O)n1CCN1CCN(c2ccccc2)CC1
1004
+ CCCCNc1nc(NC(C)(C)C)nc(NC(C)(C)C)n1
1005
+ CC1=C2CC3C(CC2(C)CCC1)OC(=O)C3CN1CCSCC1
1006
+ C=C(CCC(OO)C(=C)C)C1CC=C(C)CC1
1007
+ O=C(CCc1nc2cccnc2[nH]1)Nc1ccc(O)c(C(=O)O)c1
1008
+ Cc1cc(O)c2c(c1)Cc1cc(O)cc(O)c1OC2=O
1009
+ COc1ccc2c(c1)c(=O)oc1c(C)c3occ(C)c3cc12
1010
+ CC1=CCC2OC1CC1=CCC(C(C)C)C1(C)CC(=O)C2(C)O
1011
+ Nc1nc(O)c2c(n1)C(=O)C=CC2=O
1012
+ CCCCCCCC/C=C\C/C=C\C=C\Sc1cccc(C(=O)OC)c1
1013
+ O=C1CCC(=O)NCCCCCCN(O)C(=O)CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCCN1
1014
+ CSCCC(NC(=O)Cn1ccc2c(Br)cccc21)C(=O)O
1015
+ CC(C)(C)c1cc(CC2CC(NC(=O)C3CC3)C2(C)C)no1
1016
+ CCCOc1ccc(NCCC(=O)c2ccc3c(c2)OCO3)cc1
1017
+ COc1ccc2c(c1CCCC(=O)O)OC(c1ccc(O)cc1O)CC2
1018
+ NCCC(O)(P(=O)(O)O)P(=O)(O)O
1019
+ CC(C)CCCCCCCCCCC(=O)O
1020
+ CC=CC1=C(C=CC)C(=O)C2(C1)CC(O)C(=O)O2
1021
+ CC12CCC3C(C(=O)CC4C(O)CCCC43C)C1CCC2O
1022
+ O=C(OCC1CC2c3ccccc3C1c1ccccc12)C(Cl)Cl
1023
+ C=C(C)COc1ccc2c(C)c(CC(=O)NCCc3ccncc3)c(=O)oc2c1
1024
+ C#CC=CCCCCCCCCCCC=CCCCCCCCCCCCCC#CC#CCO
1025
+ CCCSSCCC
1026
+ COC(=O)c1ccc2c(=O)n(CC(N)=O)cnc2c1
1027
+ O=C(O)COc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
1028
+ COC(=O)/C=C/CNC(=O)[C@@H](CCSC)NC(C)=O
1029
+ CCCc1cc(-c2onc(C)c2-c2ccc(C(=O)OCC)o2)c(O)cc1OC
1030
+ C1=CC=CC=CC=CC=CC=CC=CC=C1
1031
+ C=c1ccn2c(=N)onc12
1032
+ Cc1cc(C)c(C)c(N)c1
1033
+ CC(=O)NC(Cc1ccc(F)cc1)C(=O)O
1034
+ CN1C(=O)NC(=Cc2ccccc2)C1=O
1035
+ CC(=O)c1cc2c(c(O)c1C)C(=O)C(=O)c1c(O)cccc1-2
1036
+ CN1CCN(C(=O)CC2CCNCC2Cc2cc(CN3CCCCC3)on2)CC1
1037
+ COCCOCC1(CNC(=O)c2nc3ccccc3[nH]2)CC(O)C(O)C1
1038
+ CC1CCC(C(C)C)C(OC(=O)c2ccccc2)C1
1039
+ C=C1CCC=C(C)CC2OC(CC2C(=C)C)C(C)(O)CCC1O
1040
+ CNC1CC(c2ccc(Cl)cc2)c2ccccc21
1041
+ CC(=O)OC(C)(C)C1CC=C(C)CCC=C(C)CC(O)C=C(C)CC1
1042
+ COC(=O)c1c[nH]c2cc(Br)ccc12
1043
+ C=CCCC(CC=C(C)CCOC(C)=O)C(=C)C
1044
+ OCC1=CCN2CC(O)C(O)C12
1045
+ CCCCSSC(CC)SC
1046
+ CC1=CC2C(C(C)O)CCC(C)(C)C2CC1
1047
+ Cc1c(Cl)cnc(N=C(N)N)c1Cl
1048
+ O=C1Nc2cc(Cl)ccc2C1=Cc1c[nH]c2ncccc12
1049
+ CCCCCCCCCC[S+]([O-])CCC(=O)NC(CO)(CO)CO
1050
+ COc1cc2c3c(cc4ccccc4c3c1O)NC2=O
1051
+ Cc1cc2oc(=O)cc(C)c2c(O)c1CN1CCCC1
1052
+ CC(=O)N[C@@H](CS)C(=O)N[C@H](C(N)=O)C(C)C
1053
+ CC(C)=CCCC1COC2OC(O)C3=CCC=C(C)CCC1C32
1054
+ C=C1CC23CCC4C(C)(C(=O)OC)CCCC4(C)C2(O)CCC1C3
1055
+ CC1(CO)CCCC2(C)c3ccccc3C(CO)C12
1056
+ CC(=O)NCCOc1cccc2c1N(C(=O)c1ccc3c(c1)OCO3)CCC2
1057
+ CCCCCCCCCCCCCCCCCc1ccc(O)c(O)c1
1058
+ CC(C)CCn1c(N(C)C)nc2c1c(=O)n(C)c(=O)n2C
1059
+ CCCCCC(CC(=O)CCc1ccc(O)c(OC)c1)OC
1060
+ CC(=O)OC1CCC2(C)C(CCC3C4CCCOC4(C)CCC32)C1
1061
+ OCC1CCN2CCC(O)C12
1062
+ CC1CCC2(OC2(C)C)C(=O)C1
1063
+ CC(C#N)c1ccccc1[NH+]([O-])O
1064
+ Cc1c(O)cc(C=Cc2ccccc2)cc1O
1065
+ COc1cc(=O)oc(C)c1C=CC(C)O
1066
+ CC(C)CCCCCCCCCCC=COCC(COP(=O)(O)OCCN)OC(=O)CCCCCCCCCCCC(C)C
1067
+ CCCCC(O)CCCCCCCCCCC(=O)O
1068
+ CCCCCCCC=CCCCCCC(=O)O
1069
+ COc1cc2c(c(O)c1C)C(=O)CCC2O
1070
+ C=C1OC2CC(C(=O)O)C=CC2NC1=O
1071
+ O=C(NNC(=O)c1cc([N+](=O)[O-])c[nH]1)Nc1ccccc1
1072
+ COC(=O)c1cccc(Nc2cc(C)nc3ccc(OC)cc23)c1
1073
+ COc1ccc(CCN2C(=O)NC(CC(=O)N3CCCC3)C2=O)cc1
1074
+ CC(C)c1cc(C(=O)N2CC[C@@H](N)C2)nn1C(C)(C)C
1075
+ CN(C)C(=O)Oc1ccc2cc(-c3ccc(Cl)cc3)c(=O)oc2c1
1076
+ CC(=O)Oc1ccc(C2Oc3cc(OC(C)=O)ccc3CC2OC(C)=O)cc1
1077
+ CCC(=O)OCC(C)=CCCC1(C)OC2(C)C=CC1CC2
1078
+ CC(=O)OC1c2c(C)coc2C(=O)C2CCCC(C)C21C
1079
+ CCC1CN(C)CC2C(C(=O)OC)c3c([nH]c4ccccc34)C(=O)CC12
1080
+ CC1(CO)CCCC2(C)C1CC(O)C13C=C(CC=O)C(CCC21)C3
1081
+ O=C(NC1CC2C(=O)NCC(CCC(=O)N3CCOCC3)N2C1)C1CC1
1082
+ Cc1cccc(OCC2(CNC(=O)NC(C)C)CC(O)C(O)C2)n1
1083
+ COP(=O)(O)ON1C(N)=NCC1CN(C)C
1084
+ CC(O)=NC(CCC(=O)O)C(=O)OP(=O)(O)O
1085
+ NCCCCC(=O)O
1086
+ CC(=O)OCC(=CCCC(=CCO)CO)CCC=C(C)C(O)CC=C(C)C
1087
+ Cc1ccc(C(C)(C)O)c(O)c1
1088
+ CC12CCC3C(CCC4CC(O)CCC43C)C1CCC2CCO
1089
+ c1ccc(-c2cc3ccncc3cn2)cc1
1090
+ CCCCCCCCCCCCCCC(O)CO
1091
+ NC(=O)NCCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
1092
+ CC(CO)CCC(=O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
1093
+ COc1ccc(Cn2cnc3c([nH]c4cc(OC)ccc43)c2=O)cc1
1094
+ C=C1CCCC2(C)CC3OC(=O)C(CNC4CCCCC4Cc4ccco4)C3CC12
1095
+ NC(Cc1c[nH]c2ccc(O)cc12)C(=O)O
1096
+ CC1=CC(=O)C2=C(CO)CCC3C(C)C(=O)OC3C12
1097
+ CNCCCCCN
1098
+ NCCCC(N)C(=O)NCC(=O)O
1099
+ COc1ccc(SCc2cnc3nc(N)nc(N)c3n2)cc1OC
1100
+ COc1cc(O)c2c(=O)c3c(C)cc(O)cc3oc2c1
1101
+ CC(=O)NC(CSNN=C1N=CN=C1C(N)=O)C(=O)O
1102
+ N=c1cc2[nH]nccc-2o1
1103
+ CSc1cc(=O)n2c3ccccc3c3ccnc1c32
1104
+ C=CC(C)=CC(C)C(=O)C=CCC1CC(=O)NC(=O)C1
1105
+ COc1ccc(C(=O)CCc2ccccc2)c(OC)c1CC=C(C)C
1106
+ CC1CCCC(C)(C)C1=NO
1107
+ COC1OC(=O)C2=C1C1(C)CCCC(C)(C)C1CC2O
1108
+ COc1ccc(C2NCCc3[nH]cnc32)cc1
1109
+ CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCC=CCCCCCCCC
1110
+ Cc1[nH]c2ccccc2c1CCNC(=O)c1c(F)cccc1F
1111
+ C=C(C(=O)OC)C1CCC2(C)C(=O)C=CC(C)(O)C2C1O
1112
+ CCOC(=O)CC(c1ccc2nccnc2c1)c1oc(CO)cc(=O)c1O
1113
+ CCOC(=O)CC1CC(C(C)C)C(CNC(=O)C(C)(C)C)C=C1C
1114
+ CCN(CC)C(=O)c1ccccc1
1115
+ CC1CCCCCCCC(=O)O1
1116
+ COc1ccc2nc(CC3CN(Cc4cnc[nH]4)CCC3CC(=O)O)[nH]c2c1
1117
+ CC(C=C1CN2CCCC2C(C)(O)C1)CO
1118
+ COc1ccc(C(=O)Nc2c(Cl)cncc2Cl)cc1OC
1119
+ CCC=CSC
1120
+ C=CC=CCC1=C(C)C(O)CC1=O
1121
+ CCOc1ccc(NC2=CC(=O)c3ncsc3C2=O)cc1
1122
+ CC1=CCCC2(C)OC2C(O)C(C(C)C)CC1
1123
+ CCCCCC(O)CC(=O)CCCC(=O)CC(O)CCCCCCCC(O)CC(=O)CCCC(=O)CC(O)CCCCC
1124
+ CCCOc1ccc(C(=O)Oc2cc(=O)oc3ccccc23)cc1
1125
+ CC(=O)OC1CC2C(C)(C)CCC(O)C2(C)C2CCC3CC12C1OC31C
1126
+ CCOC(=O)C(C#N)=Cc1ccc(N(C)C)cc1
1127
+ Cc1c[nH]c(=O)[nH]ccc(=O)n(Br)c1=O
1128
+ COCC(COC(C)=O)OC(C)=O
1129
+ CC(=O)Nc1cccc(Cl)c1Cl
1130
+ CC(NC(=O)c1ccccc1)c1c(-c2ccc(F)cc2)noc1C(=O)O
1131
+ CCCCCC=CCCCCCCC=CC=CC(=O)NCC(C)C
1132
+ CCCCC=CC=CC=CCCCCCCCC(=O)OCC
1133
+ CC(C)CC(N)C(=O)NC(CCC(=O)O)C(=O)NC(CCCCN)C(=O)O
1134
+ CCCCCCCCCC(=O)CC1C2=COC(CC(C)O)=CC2=CC(=O)C1(C)O
1135
+ c1ccc(C2=NOC(c3ccccc3)C2)cc1
1136
+ COc1ccccc1CCn1cnc2c([nH]c3ccccc32)c1=O
1137
+ Cc1coc2c1c(C)cc1oc(=O)c(CC(=O)NCCN(C)C)c(C)c12
1138
+ CC(=O)c1c[nH]c2ccccc12
1139
+ N[C@H](C(=O)O)c1ccc(C(=O)O)c(O)c1
1140
+ CS(=O)(=O)NCC1OCC(NC2COC2)C1O
1141
+ COc1ccc(C=CCC(CO)C(O)c2ccc(OC)c(OC)c2)cc1
1142
+ COc1ccc(CNC(=S)NCc2ccccc2)cc1
1143
+ COc1ccccc1-c1c(C)oc2cc(OCC(N)=O)ccc2c1=O
1144
+ O=C(O)Cc1ccc(O)c(O)c1
1145
+ CCCc1oc2c(c(=O)c1CC)C(=O)CC(C)(C)C2
1146
+ CC1=CCOC1=O
1147
+ CC(=NCCc1ccccn1)c1c(O)n(C)c2ccccc2c1=O
1148
+ C1CSCCOCCSCCO1
1149
+ Cc1cc2c(c(=O)o1)C(c1ccc3c(c1)OCCO3)CC(=O)O2
1150
+ COc1cc(O)c(-c2cc(=O)c3c(O)cc(O)c(OC)c3o2)cc1O
1151
+ COC(=O)C1C2CCC(CC1OC(=O)c1ccccc1)N2
1152
+ CC(C)=CCC=C(C)C(C)c1c(O)c2c(C)cccc2oc1=O
1153
+ OCC=CC#CC#Cc1ccccc1
1154
+ CC1(C)C=Cc2c(ccc(-c3cc4ccc(O)cc4o3)c2O)O1
1155
+ COC(=O)c1csnn1
1156
+ CCCCCC=CCC=CCC=CCCCCCCC(=O)NCCO
1157
+ Nc1c(OS(=O)(=O)O)cccc1C(=O)CC(N)C(=O)O
1158
+ CC(=O)OC1c2c(C)coc2C(=O)C2C(O)CCC(C)C12C
1159
+ COC(=O)CC(c1oc(CSc2ccc(OC)cc2)cc(=O)c1O)C(C)C
1160
+ O=C(Cn1ccc2cc(OCc3ccccc3)ccc21)NCC1CCCN2CCCCC12
1161
+ CC(C)=CCCC(C)=CCc1c(O)oc2cccc(C=O)c2c1=O
1162
+ C=C(CC)C(=O)O
1163
+ N#CCCCCC=CCC=CCCCCCCCCCCCCCCc1ccc(C=O)[nH]1
1164
+ CC1OC(=O)C=CC(O)C=CC(Cl)C1O
1165
+ O=C(O)c1c(O)c(Cl)cc(Cl)c1Cl
1166
+ CC(=O)C=CC12OC1(C)CC(O)C(O)C2(C)C
1167
+ CC1(C)Cc2ccccc2-c2nnc(-c3cccc(O)c3)n21
1168
+ CC(=O)SC(C)SC(C)=O
1169
+ COc1ccc2c(c1)nc(CC(C)(C)CC(=O)O)n2Cc1ccc(Cl)cc1
1170
+ CC1(C)C2CCC3(C)C(C2)C(C(=O)O)CCC13O
1171
+ CC(C)=CC1C(C)CCC2C(C)CCC12O
1172
+ Cc1ccc2c(c1)C(=O)c1c(O)cccc1C2=O
1173
+ CCCCCCCCCCCC(O)C(Cc1ccccc1)NC(C)=O
1174
+ CCOC(=O)C(=O)c1ccc(O)cc1
1175
+ CCCCCc1ccc(C(=O)OCC2CCCN3CCCCC23)c(=O)o1
1176
+ Cc1ccc(N)c(C)c1
1177
+ CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OC(CO)COC(=O)CCCCCC=CCC=CCC=CCC=CCCCCC
1178
+ CCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCC
1179
+ COC1(C)CCC2(CC1Br)C(C)=CC(=O)CC2(C)C
1180
+ O=C1C=CC2C3c4ccccc4C(c4ccccc43)C2C=C1
1181
+ CC=Cc1ccc(OC(=O)Cc2ccccc2)c(OC)c1
1182
+ O=C(Cc1ccccc1)NNc1ccccc1
1183
+ CN(c1ccccc1)c1nc(Cl)nc(Cl)n1
1184
+ O=C(NCCCOc1ccc2nc(O)ccc2c1)N(C1CCCCCCC1)[C@H]1CCCC[C@@H]1O
1185
+ CCCCCCC[C@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)NC
1186
+ CC(=O)NC(COC(C)=O)Cc1ccccc1
1187
+ CC(=O)Nc1ccc(OC(=O)c2ccc(Cl)cc2)c(C(C)=O)c1
1188
+ COc1ccc(OC)c2c1cc(C(=O)NCc1ccc3c(c1)OCO3)n2C
1189
+ CC1=NN(c2ccccc2)C(=O)C1=Cc1ccccc1O
1190
+ COC(=O)CC1C(C)=CC(=O)CC1(C)C
1191
+ CC1=CCCC(C)CC(=O)C2(C)CC(=O)C(C(C)C)C2CC1
1192
+ CCCC[NH+]1CCc2cc(OC)c(OC)cc2C1C
1193
+ Cc1ccc(NC=C2C(=O)OC(C)(C)OC2=O)cc1
1194
+ Cc1ccc2c(c1)C(=O)c1ccc(O)c(O)c1C2=O
1195
+ O=C(O)c1cccc(-c2ccc(F)cc2)c1
1196
+ O=C(NC(=O)C(F)(F)F)Nc1cc(Cl)ccc1Cl
1197
+ Cc1cc2c3ccccc3[nH]c2c2c[n+](C)ccc12.[I-]
1198
+ COc1c(C)c(O)c2c(c1C)COC2=O
1199
+ OC1COCCN(c2nc3ccccc3o2)C1
1200
+ O=C(C=Cc1ccc(Cl)cc1)c1cc(F)ccc1O
1201
+ CCOc1ccc(-c2cc(=O)c3cc(O)ccc3o2)cc1
1202
+ CN1C(=O)NC(=Cc2ccc([NH+]([O-])O)cc2)C1=O
1203
+ CCCCCCCC(O)C(O)CC#CC#CC(=O)CC
1204
+ CC(C)=C1C(=O)C=C2CCC(O)C(C)C2(C)C1O
1205
+ CCC(=NNC(N)=O)C1CC2(C)CCC1C2(C)C
1206
+ O=C1CCCCC1(O)CCO
1207
+ O=C(Cn1cnc2ccccc2c1=O)NC(Cc1ccc(O)cc1)C(=O)O
1208
+ C=C(C)C(O)COc1ccc2ccc(=O)oc2c1
1209
+ [O-][NH+](O)c1cc(-c2cn3ccccc3n2)ccc1I
1210
+ CCCCCC=CCC=CCCCCCCCC(=O)OCCCCCCCCCCCCCCCCCCCCO
1211
+ CCC=CCC=CCCCOS(=O)(=O)O
1212
+ CCCCC(C)C=C(C)C(=O)CC
1213
+ CCCCCCCCCCCCCCCCCCCC(=O)OCC(O)COP(=O)(O)OCCN
1214
+ CC(=O)C1(CCC(C)CCO)C(O)CC2C(C)(C)C(O)CCC21C
1215
+ CC(C)=CCCC(C)=CCOc1ccc(C=C2C(=O)N(O)C(C)C(=O)N2O)cc1
1216
+ C=C1C(=O)OC2C1CC=C(C)C1CC=C(C)C12
1217
+ CC(NC(=O)Cc1ccccc1)c1onc(-c2ccc(F)cc2)c1C(=O)O
1218
+ O=C(c1ccc(O)cc1)c1oc2cc(O)ccc2c1-c1ccc(O)cc1
1219
+ O=C1N[C@@H](Cc2ccccc2)C(O)[C@H](Cc2ccccc2)N1
1220
+ NCCC1=CC(C(C=O)CCCCCC(CC=CCCC(=O)O)Cc2ccc(O)cc2)CC1
1221
+ O=C(NNC(=O)c1ccc([NH+]([O-])O)cc1)Nc1ccccc1
1222
+ CCC1CN(C(=O)c2ccccc2)CCC1CC(=O)N1CCC(O)CC1
1223
+ c1ccc(-c2ccnc(NC3COC4C(NCC5CCCCC5)COC34)n2)cc1
1224
+ O=c1[nH]c(=O)n(C2CC(O)C(CO)O2)cc1F
1225
+ CC(O)C1CCC2C3CC=C4CC(N)CCC4(C)C3CCC12C
1226
+ CC1(C)COC(c2ccccc2Br)=N1
1227
+ Nc1c2c3c(cccc3[nH]c1=O)C(=O)c1ccccc1-2
1228
+ CCCCCCCCCCCCCCCCCCC(C)CCCCCCCC(C)C(C)=O
1229
+ CC(O)CCOC(=O)c1ccccc1C(=O)O
1230
+ CCC(C(=O)O)C(C)C
1231
+ CCCCC12CN3CC(C)(CN(C1)C3c1ccco1)C2=O
1232
+ O=C1C(CO)=CC(=O)c2c1cc1ccccc1c2O
1233
+ O=C(O)c1cc(Cc2c(O)ccc3ccccc23)ccc1O
1234
+ CCC(C)C1OC1(C)C1(O)C(C)=CC2CC(CO)=CCC2C1C=CC=CC=CC(=O)O
1235
+ O=C1c2ccc(O)cc2OCC1(O)c1ccc(O)cc1
1236
+ COC(=O)c1cc2ccccc2c(O)c1C(=O)OC
1237
+ CC(C)NC(=O)NCC1OC(CO)C(O)C1N(C)CCN(C)C
1238
+ CCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)CCCCCCCCCCCCCCCCCCCC
1239
+ COc1ccc(CNC(=O)CC2CCNCC2Cc2cc(CN(C)C)on2)cc1
1240
+ COc1ccc2c([nH]c3cc(O)c(C=O)cc32)c1CC=C(C)CCC=C(C)C
1241
+ Cc1nc2ncccc2c(=O)n1CCNC(=O)c1ccc2c(c1)OCO2
1242
+ C=CC(C)(O)CC(O)C=C(C)CC(O)C=C(C)C
1243
+ CC(=CC(O)C(=O)O)C1CC(=O)OC1(C)C
1244
+ C=C(C(=O)OC)C1CCC(C)(OCC)C2CC(O)C(C)=CC12
1245
+ CCCCCC(O)C=CC1CCC(=O)C1CCCCCCC(=O)NCCCN(C)C
1246
+ CS(=O)(=O)N1CC(F)C(OCc2nc3ccccc3o2)C1
1247
+ O=S(=O)(c1cccc([NH+]([O-])O)c1)N1CCCCC1
1248
+ COc1ccc2c(c1O)-c1ccc(O)cc1CC2
1249
+ CNCCCC(=O)c1ccc(O)nc1
1250
+ c1ccc2c(c1)-c1nc3ccccc3nc1-2
1251
+ CC(=O)OCC(=CCCC(C)=CCO)CCC=C(CCC=C(C)C)C(=O)O
1252
+ CC(=CC(=O)O)CCC1(C)C(C)CCC2(C)C(C)=C(C=O)CC21
1253
+ CCOC(=O)C1COC(=O)C2C1CCC2(C)O
1254
+ Cc1ccc(Br)c(N)c1
1255
+ CC(=O)OC1CC(C)(O)C2=CCC(C)=C2C2OC(=O)C(C)C12
1256
+ CCC1(C)CC(CCNCc2ccccc2)(C(C)C)CCO1
1257
+ COc1cccc2c3c(c(=O)n(C)c12)CC(C(C)C)O3
1258
+ O=C(NC1COC2C(Nc3nccc(C4CCCC4)n3)COC12)N1CCOCC1
1259
+ COc1cc(C2Cc3cc(O)cc(O)c3C(=O)O2)ccc1O
1260
+ CC=C(C)C=C(C)C(O)C(C)C(=O)CCC
1261
+ CCCCC=CC=CC=CCCCCCCCC(=O)OCC(O)CO
1262
+ CC1=CC(=O)CC(C)(C)C1CO
1263
+ COc1ccc(-c2[nH]nc(C)c2-c2ccc(OC)c(O)c2)c(O)c1
1264
+ COC(=O)C1=CC2C(O)C(C)(C)CC23C1COC(=O)C31CO1
1265
+ CC(CO)Cc1cc(O)c(O)c(Br)c1Br
1266
+ Cc1ccc(C(C)C)o1
1267
+ CCCCCCCC1CCCC(=O)NCCCN(O)CCCCNCCCN1
1268
+ CCCCC=CCCCC=CCCCCC=O
1269
+ O=C(CCC(O)Cc1ccccc1)OC1OC(C(=O)O)C(O)C(O)C1O
1270
+ O=C(O)C1C2C=CC3(CN(c4ccccc4)C(=O)C13)O2
1271
+ CCCCCCCCCCCCCC=CC(O)=C(O)C(=O)O
1272
+ CC(=CC(=O)O)CCC=C(C)CCC(=O)O
1273
+ CC(=O)N(CCC(Cc1ccccc1)c1ccco1)C(C)c1ccccc1
1274
+ O=c1ncncn1C1OC(CO)C(O)C1O
1275
+ COC1=CC(=O)C2=C(CCc3cccc(O)c32)C1=O
1276
+ Clc1ccc(-c2cn3cc(I)ccc3n2)cc1Cl
1277
+ CCCCCCCCCCCCCCCC(OC(C)=O)C(CO)NC(C)=O
1278
+ NC1COC2C(OC(=O)Nc3ccccc3)COC12
1279
+ CCCCCC(O)CC(CC(=O)O)OC1OC(CO)C(O)C(O)C1O
1280
+ CCCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)NCCc1c[nH]c2ccccc12
1281
+ COc1ccc(-c2noc(C3CC(NCC4CC4)CN3C)n2)cc1
1282
+ CC=CC1=CC(O)C(O)C1=O
1283
+ CCCCCC(O)CC(CCc1ccc(O)c2c1CCC1CCCC1O2)OC(C)=O
1284
+ S=C(Nc1ccccc1)NN1c2ccccc2CCc2ccccc21
1285
+ CC1=CCCC2(C)OC2CCC(C)=CC2OC(=O)C(C)=C2CC1
1286
+ CNCCc1cc(O)c(O)c(O)c1
1287
+ CC1(C)CC2CC(C)(O)c3cocc3C(O)C2C1
1288
+ O=C(CC1(n2cccc2)CCOCC1)NCC1CCCN2CCCCC12
1289
+ CCCCCCC(=O)C=CC=CC(O)=NCC(C)C
1290
+ CCCCCCCCCCCCCCCC(=O)NCC
1291
+ CCCCC#CC#CC#CCCCCCCCCCC(=O)O
1292
+ O=C(Cl)CCc1ccccc1
1293
+ O=C1CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCN1
1294
+ O=C(O)C1CC(O)CN1C(=O)C1CCCN1
1295
+ CC(=O)OCCCC=CC=CC#Cc1cccs1
1296
+ COc1ccc(NCCNC(C)=O)c2c1C(=O)c1ccccc1C2=O
1297
+ COc1ccc(C=CCOC(=O)CCCCCCCC=CCC=CCCCCCO)cc1OC
1298
+ COc1cc(O)c(Cc2ccc(O)cc2)c(CCc2ccccc2O)c1
1299
+ COc1c2c(cc3c1C1C(OC3OC)C(O)C=C3CCN(C)C31)OCO2
1300
+ CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)OCc1ccco1
1301
+ COC(COC(C)=O)C(OC)C(O)C(CO)OC(C)=O
1302
+ CCCCCCCCCCC=CCCC(=O)O
1303
+ CC(C)=CCCC(C)=CCCC1(C)OC1Cc1c[nH]c([NH+]([O-])O)c1
1304
+ CCCCCC(=O)CC(=O)C=Cc1cc(CCO)c(O)c(OC)c1
1305
+ CC(C)=CCO
1306
+ COc1ccccc1CCO
1307
+ COc1ccc(CC(C)C(C)Cc2cc(O)c(OC)c(O)c2)cc1OC
1308
+ CC(=O)N1C(=O)C2CCCN2C(=O)c2ccccc21
1309
+ CN1Cc2sc(Br)cc2C(c2ccccc2)C1
1310
+ CCCCC1(CCCC)C(=O)NC(=O)N(C)C1=S
1311
+ COc1ccc(NCC(=O)CC(c2ccccc2)C2CCOC(C)(C)C2)cc1
1312
+ CC(C=CC1(O)C(C)=CC(O)CC1(C)CO)=CC(=O)O
1313
+ COC1COCCN(Cc2cn(C)c3ccccc23)C1
1314
+ CSCCC(NC(=O)CCn1ccc2cc(OCc3ccccc3)ccc21)C(=O)O
1315
+ CCCNC1CCc2nc(N)sc2C1=O
1316
+ C=C(C)C1CCC2(C)CC(=O)C=C(C)C2C1
1317
+ COC1C=C(C)CC(=O)C2(O)C(CC2(C)C)C(CO)=C1
1318
+ NC1C2COC(O2)C(Sc2ccccc2)C1O
1319
+ CCCOC(NC(C)c1ccc(OC)c(OC)c1)OCCC
1320
+ COc1cc(C(C)=CC(C)=CC(=O)O)oc(=O)c1CO
1321
+ CCCCc1oc2ccccc2c1C(=O)c1ccc(OCCC[N+](C)(C)C)cc1
1322
+ C=C(C)C12C=C3CCC4C(C)(C(=O)O)CCCC4(C)C3(CC1)OO2
1323
+ C#CC(O)C=CCCCCCCCCCCCCC=CCCCCCCCCCCCCCCC(O)C#CC#CC#CCO
1324
+ CCCCCCCCCOc1ccc(C=Nc2ccc(CCCC)cc2)cc1
1325
+ COc1cc(O)c2c(-c3cc(O)ccc3O)cc(=O)oc2c1
1326
+ O=C(NCCNC(=O)c1c[nH]c2ccccc12)c1c[nH]c2ccccc12
1327
+ CC(C)=CCc1c(O)c(O)c2c(c1CC=C(C)C)C(=O)C1C(O)=CC=C(O)C1O2
1328
+ CC=CC=CC(=O)C=CC1OC(=O)C(C)C1O
1329
+ C=CCNc1nnc(SCC(=O)N2CCN(c3ccccc3Cl)CC2)s1
1330
+ CC(C)NC(=O)CC1CC2OC(CNC(=O)N3CCOCC3)C(O)C2O1
1331
+ CC(C)C(CC(=N)O)c1ccco1
1332
+ CCCS(=O)(=O)N1CCC(C(=O)N(C)C)CC1
1333
+ CC1=CCC(C(C)(N)CC=CC(C)(C)O)CC1
1334
+ Cc1c(Br)cc2c(c1Br)OC1(CO)CCC2(C)C1C
1335
+ O=C(CC1OC(CNC(=O)c2ccccc2)C(O)C1O)NCCN1CCOCC1
1336
+ C=C1C(OC(C)=O)CC(C(C)CCC=C(C)C)C2OC12
1337
+ CC(C)=CCOc1cc(O)c2c(c1)OC(c1ccc(O)cc1)C(O)C2=O
1338
+ CCCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)NCCCCCCCCCCCN
1339
+ C=C1CCC2(C)CCC(O)C(C)(C)C23CC13
1340
+ Cc1ccc2c(c1)OC(c1ccc3c(c1)OCCO3)CC2=O
1341
+ CCCC(C)c1cccc(CC)c1O
1342
+ CC1(C)CC2C(O)C(O)C3=CCC3(C)C2C1
1343
+ CCC12C=CCN(CCc3c([nH]c4ccccc34)C(C(=O)OC)C1)C2
1344
+ COc1cccc2c1[NH+]([O-])c1cccc(O)c1[NH+]2[O-]
1345
+ CC(=O)OC1(C(C)=O)CCCCC1
1346
+ O=C(NCCS(=O)(=O)O)C(=O)c1c[nH]c2ccccc12
1347
+ COC(=O)C1=C(C)NC(=O)NC1c1ccc(Cl)cc1
1348
+ CC1=CC=CC(C)(C)C=C1
1349
+ CCCCCc1cc(O)cc(O)c1C(=O)OCC
1350
+ CC(=O)c1ccc(OCC(=O)NC(c2ccccc2)c2cccs2)cc1
1351
+ COc1cc(O)c(C)c2cc(C)ncc12
1352
+ CC=CCCC1Cc2cc(O)c(CC=CC)c(O)c2C(=O)O1
1353
+ Cc1ccc(NC(=O)C2CCC(=O)N2C2OC(=O)c3ccccc32)c(C)c1
1354
+ CC1(C)CCCC2(C)C3=C(CCC12)COC3=O
1355
+ COc1c(C)cc2c(C(C)C)cc(=O)oc2c1O
1356
+ COc1c2c(c(Br)c3c1C(=O)N(C)CC3)OCO2
1357
+ CC1CC2CC(=O)C3(O)CCCN4CCCC2(O)C43C1
1358
+ COc1ccc(-c2cc(C=CC=O)ccc2O)cc1C=CC=O
1359
+ CC=CC=CC#CC=CCOC(C)=O
1360
+ CCCCCCCCn1sccc1=O
1361
+ C#CC=CC(O)CCCCCCCCCCCCCC=CCCC=CC#CC(O)C#CCCCCC=CCCCCCCC=CC(O)C#C
1362
+ O=C(O)CCCCCCCCCCCCCCCCCCCCCCCO
1363
+ COC(=O)c1occ2c1C(C)(O)C1CC(C)(C)CC1C2O
1364
+ CC(C)n1cc(CCC(=O)Nc2ccc(C(N)=O)cc2)c2ccccc21
1365
+ CC12CN3CC(C)(CN(C1)C3c1cccc3[nH]ccc13)C2=O
1366
+ CCCCCCCCCCC(=O)C1(O)C(OC(C)=O)C=CC1OC(C)=O
1367
+ CNCC(C)Cc1ccc2c(c1)OCO2
1368
+ CC(C)(C)NC(=O)NC1CC(CO)C(O)C1O
1369
+ Nc1c(Cl)cccc1-c1c[nH]c(C(=O)O)c1
1370
+ COc1ccc2c(c1)CN(C(=O)NC(C)C)CCC1COC(=O)N21
1371
+ CC(C(O)CC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C
1372
+ CCC1c2ccc(C)n2CCN1C(=O)Nc1cc(OC)ccc1OC
1373
+ O=C(O)CC(NC(=O)CP(=O)(O)O)C(=O)O
1374
+ CC(C)C(NC(=O)Nc1ccc(OC(F)(F)F)cc1)C(=O)O
1375
+ CCCCCCC(=O)CCCCCCCC1CCC(=O)O1
1376
+ Cc1coc2c1C(OC(=O)CC(C)C)C1(C)C(C)CCCC1C2=O
1377
+ NC(=O)c1ccccc1NC(=O)CC(CN1CCCC1)C(=O)O
1378
+ CC(C)CCCCCCC#CC=COCC(O)CO
1379
+ O=C1C(=Cc2ccccc2)Oc2c1ccc(O)c2CN1CCCCC1
1380
+ CCCCC(Br)=CC(=O)O
1381
+ O=CC(O)CS(=O)(=O)O
1382
+ C=C1CCC2C(C)(C(=O)O)CCCC2(C)C1CCC(C)(O)CC(=O)O
1383
+ C=CC1(C)CCC2(C)C(CCC3(O)C2CCCC3(C)COC(C)=O)C1
1384
+ CO[C@@H]1CCOP(=O)(NCCCl)N1CCCl
1385
+ CC(O)C(=O)CCC(=O)OCCc1ccccc1
1386
+ CC=CC#CC#CC=CC=CCCCCC(=O)OC
1387
+ CCCCCCCCCc1ccc(Oc2ccc(C)cc2CC(=O)O)c(Cl)c1
1388
+ C[C]1[CH][CH][C](C(=O)C[NH+]2[CH][C](C)[CH][CH][C]2N)[CH][CH]1
1389
+ CC(=O)OC1C2CC(CC(=O)O2)OC1c1ccccc1
1390
+ CN(C)CCOc1cc2c(c3ccccc13)-c1ccc(O)cc1C2=O
1391
+ Cc1c(C)c2c(cc(C)c3c(C)coc32)oc1=O
1392
+ CC1=C2C(O)CC2(C)C2CC(C)(CO)C=C2C1=O
1393
+ COc1cc2oc(=O)ccc2c2c1C=CC(C)(C)O2
1394
+ COC(CN(C)CC1C(=O)OC2CC3(C)CCCC4(CO4)C3CC21)OC
1395
+ CC1=CC(=C(c2ccc(N)cc2)c2ccc(N)cc2)C=CC1=N
1396
+ CC=C(C)C=C(C)c1oc(=O)cc(OC)c1C
1397
+ COC(C)=C1C(=O)C=CC1=O
1398
+ C=C1C(=O)OC2C1C(OC(=O)C(C)=CC)CC(CO)C1CC=C(C)C12
1399
+ CC=Cc1cc(O)c(C(O)C(O)C(C)O)c(=O)o1
1400
+ CCCCCC/N=c1\ccn(Cc2ccccc2)c2c(OC)cccc12
1401
+ CCCCCCCCC=CCCCCCCCC(=O)NCc1ccccc1
1402
+ COc1cc2c(cc1OC)C1C(CC=C3CCN(C)C31)OC2
1403
+ O=C(C=Cc1ccccc1)OC1C(O)OC(CO)C(O)C1O
1404
+ CN(C)CCCCN(C)C
1405
+ CCc1cc2c(=O)c(-c3ccc(Cl)cc3)coc2cc1O
1406
+ COc1ccc2nc3n(c2c1)C(CNC(C)C)COC3
1407
+ CC(C)NCC(O)COc1cccc2ccccc12
1408
+ Cc1ccc(OCCNC(=O)C(=O)NCCC(C)C)cc1
1409
+ O=c1cc2[nH]ccn2cn1
1410
+ CC(CCCCCCCCCCCCCC=CC(=O)O)OC1OC(C)C(O)CC1O
1411
+ Oc1[nH]cc2nnccc12
1412
+ CC#CC#CC#CC=CC=CCCCC=O
1413
+ CCC1CCCCC1C
1414
+ CC1=CCCC(C=O)=CC2CC2(C)C(O)CC1
1415
+ Oc1ccc(-c2nnc3n2NC(c2ccco2)S3)cc1
1416
+ CC1=CCCC2=CC(CC(CO)=CC=C(C(C)C)CC1)OC2=O
1417
+ CC=C(C)C(=O)CC
1418
+ COC1=CC(=O)C(C)=C(OC)C1=O
1419
+ CCCCCC=CCC=CCC=CCCCCC(=O)OCC(O)COP(=O)(O)OCCN
1420
+ COC(=O)c1c(C)oc2cc(Br)c(OCc3ccccc3Br)cc12
1421
+ CC=CC#CC#CC=CCOC(C)=O
1422
+ O=C(O)CC(NC(=O)c1ccccc1NC(=O)c1ccccc1)C(=O)O
1423
+ CC(C)(C)CC(C)(C)c1ccc(O)c(Cc2ccc(Cl)cc2Cl)c1
1424
+ CN(CC1CCCN2CCCCC12)C(=O)CCCc1nc(-c2cccnc2)no1
1425
+ COc1ccc(Oc2oc3cc(O)cc(O)c3c(=O)c2O)cc1
1426
+ CC(C)=CCc1c(O)cccc1C(=O)c1c(O)cc(O)c(CC=C(C)C)c1O
1427
+ O=C(O)Nc1ccc(Cl)cc1
1428
+ COc1ccc(O)c(-c2cc(=O)c3c(O)cc(O)cc3o2)c1
1429
+ O=C(OCC1CCCN2CCCCC12)c1ccc2c(c1)OCO2
1430
+ COC(=O)C1C[C]2[NH2+][CH]N(C)[C]2CN1C(=O)CCC(=O)[O-]
1431
+ C=C1C(O)CCC(CC(=O)c2cc(O)ccc2O)(C(=O)O)C1O
1432
+ COc1ccccc1N1CC(C(=O)Nc2ccc3oc(=O)ccc3c2)CC1=O
1433
+ CCC(O)C=CC1C(O)CC(=O)C1CC=CCC=CCC=CCCC(=O)O
1434
+ CC1=CC2OC3C(O)C(O)C(C)(C2(CO)CC1O)C31CO1
1435
+ CC(C)C(N)C(=O)N1CC(O)CC1C(O)C(=O)O
1436
+ CCOC(=O)Cc1nc(-c2ccc(Cl)cc2)oc1-c1ccc(Br)o1
1437
+ Oc1ccc2cccc(N3CCNCC3)c2c1
1438
+ CC(C)CC1c2cc(O)c(O)cc2C=C2c3ccc(O)c(O)c3CCC21
1439
+ CN(C=Cc1ccccc1)C(=O)C=Cc1ccccc1
1440
+ O=C(O)C(Cl)=CCl
1441
+ CC12CCC(C(=CN(O)c3ccccc3)C1=O)C2(C)C
1442
+ O=C1CC2(CCCCC2)Oc2ccc(O)cc21
1443
+ O=C(O)CCCNC(=O)OCc1ccccc1
1444
+ CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCC1(CC(=O)O)CC(=O)C=CC1=O
1445
+ Cc1cccc(OCC2CC(NC(=O)Nc3ccccc3)C(O)C2O)n1
1446
+ CC1(O)Cc2cccc(F)c21
1447
+ CC1C(OC2OC(CO)C(O)C(O)C2O)CC2(C(C)C)CC12
1448
+ CN1CCCC(n2nc(Cc3ccc(Cl)cc3)c3ccccc3c2=O)CC1
1449
+ O=C1CN(C(=O)NC(Cc2ccccc2)C(=O)NC(CO)C(=O)O)c2ccccc2N1
1450
+ CCOC(=O)c1c(C)n(C)c2c1cc(O)c1occ(C)c12
1451
+ COc1ccc2oc(S(N)(=O)=O)cc2c1
1452
+ NC(=O)NC(=O)Nc1ccccc1Cl
1453
+ C=C(c1ccccc1)c1ccccc1
1454
+ C#CCNC(=O)C1CCCN1C(C(=O)NC)c1ccccc1
1455
+ CNCC(C)CN1c2ccccc2Sc2ccccc21
1456
+ C=CC1(CO)CCC2(C)C(CCC3(O)C2CCCC3(C)C)C1
1457
+ C=C1OC(=O)C(CCCCCCCCCCCCCc2ccc3c(c2)OCO3)C1=O
1458
+ CNC(=O)OCc1c(COC(=O)NC)c(C)n(-c2ccccc2)c1C
1459
+ CCCOc1ccccc1-c1nc2nc[nH]c2c(=O)[nH]1
1460
+ CCC1CN(C(=O)c2ccc(F)cc2)CCC1CC(=O)NCCO
1461
+ NC(=O)NC=C(C=O)c1ccccc1
1462
+ COc1cccc(CN2CCN3C(=O)N(c4ccccc4)CC3C2)n1
1463
+ COc1ccc2c3c1OC1CC(O)C=CC31CCNC2
1464
+ CC(C)CCCCCCCCCCCCCCCCCCCC(O)CO
1465
+ COC(=O)C1=CC2CC(C)(C)CCC(C)(O)C2CC1
1466
+ CC(=O)N1CC2CN(C(C)C)CCN2C(C)(CO)C1
1467
+ CC(C)NC(=O)C1CN2CCC1CC2Cn1cc(CO)nn1
1468
+ CON(C)CCCCCCCCCCC=CC#CCCCCC1=CC=C[NH+](C)C1
1469
+ C=NC1(C)CCC2C(C)CC3CC(C)(C)CC4=C3C2C1CC4
1470
+ CCc1cn(C2OCC(O)C2CO)c(=O)nc1O
1471
+ CCC=CC#CC#CC=CCCC(=O)CCOC1OC(CO)C(O)C(O)C1O
1472
+ NCCCCCNC(=O)C(Cc1ccc(O)cc1)NC(=O)C1OC1C(=O)O
1473
+ COc1cccc(I)c1O
1474
+ CC#Cc1cn(C2OC(CO)C(O)C2O)c(=O)[nH]c1=O
1475
+ CCCCCCCCCCC(=O)CCCC(=O)OC
1476
+ O=C(NN1CCC=C(CCCO)C1)c1ccccc1
1477
+ CC=CC#CC#CC=CCOC(=O)C(C)=CC
1478
+ COc1cc2c(cc1OC)C(=O)C(=Cc1cccnc1)C2
1479
+ NC(=S)NN=Cc1ccccc1O
1480
+ CC1(C)C2CCC3(C)C(C=CCC13O)C2
1481
+ CCOC(=O)C=Cc1ccc(OC2OC(CO)C(O)C(O)C2O)c(OC)c1
1482
+ Nc1cc([NH+]([O-])O)ccc1Cl
1483
+ COc1ccccc1C
1484
+ CC1=CC(=O)C2(C)CC(=O)C(C(C)C)C2CCC(C)=CCC1
1485
+ CC1CC2C=CC3(O)COC(=O)C3(C)C2CC1O
1486
+ CCC=CCC=CC=CC(O)CCCCCCCC(=O)OC
1487
+ CCC1(C)CC(CCNCc2ccc(OC)c(OC)c2)(CCC(C)C)CCO1
1488
+ CCCCCCCC(O)CCCCCCCCCCCCCCCCCCCCCCCC(C)CC
1489
+ Oc1cc2c(cc1O)C(c1ccco1)=NCC2
1490
+ COc1cc(CC(C)NCC#N)c(OC)cc1I
1491
+ CC(C)NC(C)C(O)COc1ccc(Cl)c(Cl)c1
1492
+ O=C(O)C(=O)CC(=O)c1ccc(Cc2ccc(Cl)c(Cl)c2)cc1
1493
+ CC(=O)OCCC(C)OC(C)=O
1494
+ CC1(C)CCCC2(C)C(CO)=CC1C2CCO
1495
+ COc1cc(O)c2c(c1C)C(C)(O)C(C)OC2=O
1496
+ CC(C)(O)C(NC(=O)C(CS)NC(=O)CCCC(N)C(=O)O)C(=O)NCC(=O)O
1497
+ CC1CC2OC(=O)C(C)C2CC2(C)C(O)OCCC12
1498
+ CCCCCCCCCCCCCCCC(=O)N1CCc2c([nH]c3ccccc23)C1C
1499
+ CC(C)=CCCC(C)(O)C1CC=C(COC(=O)c2ccco2)CC1
1500
+ CCn1c(C(=O)NCCc2c[nH]c3ccccc23)cc2sccc21
1501
+ COc1ccc(NC(=O)N2CCC(CO)N(C)c3cccnc3C2)cc1
1502
+ C#CC(O)C=CCCCCCCC#Cc1ccc(C#CCCCCCCC=CC(O)C#C)o1
1503
+ Cc1csc(C)c1
1504
+ COc1cc(C=CCO)ccc1Oc1ccc(C(O)C(O)CO)cc1OC
1505
+ COC(=O)CNC(=O)c1ccccc1
1506
+ CCCCCCC(=O)C=CC(=O)CCCCCCCC(=O)O
1507
+ Cc1cc(C)c2oc(=O)cc(CN3CCCCC3)c2c1
1508
+ COc1cc2c(cc1C(=O)COC(C)=O)C=CC(C)(C)O2
1509
+ CN1CCc2cc3c(c(OCC(N)=O)c2C1=O)OCO3
1510
+ CC(=O)OCC1(C)CCCC2(C)C1CCC1(C)OC(CO)(C(O)CO)CCC12
1511
+ CC(C)=CC(=O)OCC=CC=CCCC=CC(=O)NCC(C)C
1512
+ OC1CCCOC1C=CC#Cc1cccs1
1513
+ CC=C1CC2CCC(C)C1(C)C2(C)C
1514
+ NC(=O)C(=Cc1ccc(Cl)cc1Cl)c1nc2ccccc2s1
1515
+ O=C(O)c1ccccc1C1C(=O)c2ccccc2C1=O
1516
+ CCCCCCCCCCCc1cc(=O)c2c(O)cccc2o1
1517
+ CC(O)C#CC1(C)CCCC(C)(C)O1
1518
+ CC(C)CC/C=C(\NC(=O)C1CC1(C)C)C(=O)O
1519
+ CC(C=CC(=O)O)C1CCC2C3C=CC4=CC(=O)CCC4(C)C3CCC12C
1520
+ CCc1cn(CC(NC(=O)c2ccco2)C(=O)O)c(=O)[nH]c1=O
1521
+ CC=CC(=O)OC1C=CC(C)OC(=O)CCC(OC(C)=O)C1O
1522
+ CC(C)CCCOC(=O)CCC(C)C
1523
+ CC1(C)C2C=CC(O2)C(Cl)C1O
1524
+ NCC(=O)Oc1c(-c2ccc(O)cc2)oc2cc(O)cc(O)c2c1=O
1525
+ CC=CC(N)=S
1526
+ C=Cc1c(N)ccc2cnccc12
1527
+ COC(=O)c1ccccc1NC(=O)NCCC(C)C
1528
+ CC(=O)C(O)Cc1ccc(O)cc1
1529
+ CC(=O)OC1(C(C)C)CC=C(C)C2CCC(C)=CC21
1530
+ NCC(O)P(=O)(O)O
1531
+ O=C1C(=Cc2ccc(O)cc2)C(=O)c2ccccc21
1532
+ COc1cc(-c2cc(=O)c3ccccc3o2)cc(I)c1OC
1533
+ N#Cc1ccc(Cn2ccnc2)cc1OCCc1ccc(-c2ccccc2)cc1
1534
+ COc1ccc(C(=O)OCc2cc3cc(OC)ccc3nc2O)cc1
1535
+ NC(=O)[C@H](Cc1ccccc1)NC(=O)Nc1nnc(S)s1
1536
+ CN(C)CCN(C)CC1CN2CCC1CC2CNC(=O)c1ccccc1
1537
+ CCCCCCCCCCCCCCCC(O)CCO
1538
+ COCC(=O)NC1C(c2cccs2)N(CCC(C)(C)C)CCC1(C)O
1539
+ C=C(C)C(=O)OCCOP(=O)(O)Oc1ccccc1
1540
+ CC(=O)NC[C@@H]1OC(=O)N2c3ccc(-c4ccccc4)cc3C[C@@H]12
1541
+ COc1c2c(cc3c1C(Nc1ccncc1)N(C)CC3)OCO2
1542
+ C=CCOc1ccc2c(C)cc(=O)oc2c1OCC=C
1543
+ C#CC(O)C=CCCCCCCCCCCCC#CCCCCCCCCCCCC(O)C#CC#CCO
1544
+ CNCC(O)CN1CCc2c(Br)cccc2C1
1545
+ COc1ccc(-n2cc(C(=O)O)c3c2C(c2cccnc2)CC(=O)N3)cc1
1546
+ CC(NC(=O)C1CC1)c1c(-c2ccc(Cl)cc2)noc1C(=O)O
1547
+ CCNc1c(C=O)c(=O)oc2ccccc12
1548
+ COc1ccc(NC2OC(=O)c3c2ccc(OC)c3OC)c(OC)c1
1549
+ O=C(Cl)CCC(=O)Cl
1550
+ O=C(O)C=CCCCCCCCCCC=C(Br)Br
1551
+ O=C(O)Cc1cc(O)cc(O)c1O
1552
+ CC1=CCC(C(C)CCC(O)C(C)(C)O)CC1=O
1553
+ CCOC(CNC(=O)c1cc(Br)c(Br)[nH]1)CC(=O)OC
1554
+ COc1c(O)ccc(CCC(=O)CC(O)CC(C)CCCO)c1-c1ccc(CN)cc1
1555
+ Cc1[nH]c2ccccc2c(=O)c1C(=O)C=Cc1ccccc1
1556
+ C/C=C(\NC(=O)CCCCCC)C(=O)O
1557
+ COc1c(O)cc2c(c1O)C(=O)c1ccccc1C2=O
1558
+ COc1ccc(C(=O)c2nccc3cc(OC)c(OC)cc23)cc1
1559
+ CC1=C2COCC(C)C2CC1O
1560
+ CCC(C)C(C)C(O)CN
1561
+ C=C(C)C1CCC2(C)CC(Br)CC(C)C2(O)C1
1562
+ C[C]1[CH][C](C)[C]2CCCC[C]2[NH2+]1
1563
+ COc1ccc(Cl)cc1C(=O)C=Cc1ccc(F)cc1
1564
+ O=C(OCC1CCCN2CCCCC12)c1cc2ccccc2oc1=O
1565
+ CCCNC(=O)NC1CC(O)(C(=O)NCC(N)=O)CC(O)C1O
1566
+ Cc1cc(C=C2C=CC(=O)O2)oc1C
1567
+ CC(=O)OCC1=C(C(=O)O)N2C(=O)C(NC(=O)C=Cc3ccccc3)C2SC1
1568
+ COC(=O)C1SCC(NC(=O)c2ccccc2)C1OC(C)=O
1569
+ NS(=O)(=O)c1ccc(NC(=O)CNCC(=O)O)c(Br)c1
1570
+ CC1CCCC2CCCCC12
1571
+ CCC(=O)c1cn(-c2ccc(F)cc2)c2ccc(Cl)cc12
1572
+ COc1cc(OC)c2c(c1)C(=O)c1cccc(O)c1C2=O
1573
+ COC(=O)c1cccc(Nc2nc(NCCO)c3ccccc3n2)c1
1574
+ O=S(=O)(O)OC1C(Oc2ccc(O)cc2)OC(CO)C(O)C1O
1575
+ CCN(CC)CCN=C(O)COc1ccc(OC)cc1
1576
+ CC1=CC(=O)C(C(C)CCC(O)C(C)(C)O)CC1
1577
+ O=C(Nc1ccc(OCCN2CCCC2)cc1)C1(c2ccccc2)CCOCC1
1578
+ CN1C(=O)c2ccccc2NC(=O)C12OC2c1ccccc1
1579
+ CC(=O)N[C@@H](CC(C)C)C(=O)O
1580
+ COc1ccc2oc3cccc(O)c3c(=O)c2c1
1581
+ O=C(Nc1cccc(C(=O)O)c1)NC(Cc1ccccc1)C(=O)O
1582
+ CN(C)CCc1c[nH]c2ccsc12
1583
+ C=C1C(O)CC2C(C)(C)CCCC2(C)C1CCC(C)=O
1584
+ COC(=O)c1ccccc1NC(=O)N1CCc2nc[nH]c2C1c1ccc(OC)cc1
1585
+ O=C(O)c1ccc2c(c1)OCO2
1586
+ O=C(NCCOC(=O)Nc1ccccc1)Nc1ccc(Cl)c(Cl)c1
1587
+ COc1c(O)cc2cc3c(c(O)c2c1C)C(=O)CC(O)C3
1588
+ NC(C(=O)O)c1ccc(C(=O)O)cc1
1589
+ CCCCCCCCC1C2C=CC3(O)C(C(=O)O)=CC(O)C4CC1C2C43
1590
+ O=c1cccc2n1CC1CCCN3CCCC2(O)C13
1591
+ CCCCCCCCCCCCCCCCC(C)CCCCCC
1592
+ COC(OC)C1(C)CCCC2(C)c3cc(O)c(C(C)C)cc3CCC21
1593
+ Cc1ccc(C(=O)NCC2OC(CC(=O)N3CCC(C)CC3)C(O)C2O)cc1
1594
+ CC1=C(O)C(=O)C23C(=O)C1C(C)(C)C2CCC3C
1595
+ O=C(O)C1OC(OCC2OC(O)C(O)C(O)C2O)C(O)C(O)C1O
1596
+ COc1ccc(CN2C(=O)NC(CC(=O)NC(CO)CCSC)C2=O)cc1
1597
+ c1ccc(-c2cc3c(cn2)CCCC3)cc1
1598
+ CC1=CCC2=C(C)CC(OC(=O)c3ccc(O)cc3)C(C(C)C)CC12
1599
+ CCC1CC2(CC)OC(=O)C3(CC)OC(=O)C1C23
1600
+ CC(=O)OCC1=C(C(=O)O)N2C(=O)C(NC(=O)Cc3ccccc3)C2SC1
1601
+ Oc1cccc2c1ccc1c3ccccc3ccc21
1602
+ CCCCCCCCCCC(=O)NCC(=O)O
1603
+ O=C(CCCCC1CCSS1)N1CCCC1c1nnc2ccccn12
1604
+ ClCCOC1CCCN(Cc2ccccc2)CO1
1605
+ COc1c(O)ccc2c1CC(O)C(c1ccccc1)O2
1606
+ Nc1nc(N)c2c3c(oc2n1)CN(Cc1ccccc1)CC3
1607
+ CCC12CCCN3CCC4(c5ccccc5NC4C(C(=O)O)C1)C32
1608
+ CN1c2ccccc2C(C)(C)C1O
1609
+ COC(CCC=CCCCCC(=O)CCCCCCC(=O)CCC=CCCCCCCO)C(=O)NCCc1ccc(O)cc1
1610
+ CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCC=CCCCCCCCC
1611
+ CCOC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(Br)cc3)cc12
1612
+ CC(=O)CC1=C(C)C(=O)c2c(O)cc(O)cc2C1=O
1613
+ COc1cc(CCCCCCCCC(=O)c2c(O)cccc2O)ccc1O
1614
+ COc1cc2c(cc1OC)CC(=O)N(CC(=O)NCCn1ccc3ccccc31)C=C2
1615
+ CCCc1cc(OC)cc(O)c1C(=O)O
1616
+ C=C1CCCC2C1(C)CCC(C)C2(C)CC1=CC(=O)C=C(N)C1=O
1617
+ COc1cccc2c1CC(Cc1ccncc1)C2
1618
+ COC(=O)Cc1c(C)c2ccc(OCC=C(C)C)cc2oc1=O
1619
+ C=CC=CC=CC(O)C(C)O
1620
+ CCC1C(=O)Nc2cc3[nH]c(-c4cccnc4)nc3cc21
1621
+ C=C1C=C(CCC(=O)O)C(=O)OC1C=CC=CC
1622
+ O=C(NC1C(c2ccccc2)CC(O)C1O)c1cccnc1
1623
+ Nc1cccc2c1ccc1ccccc12
1624
+ Cc1ccc(C)c(Cl)c1
1625
+ CNCc1cn(CC2OC(CC(=O)N3CCCC3)C(O)C2O)nn1
1626
+ COC(=O)C(C)(C)CCCOc1ccc(C)c(OCCCC(C)(C)C(=O)OC)c1
1627
+ CC(C)(O)c1cc(CC2(N)COC2)no1
benchmark/latent_visualization_legacy.py ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Latent Space Visualization for Molecule VAE Models
4
+ Integrated with existing benchmark pipeline structure
5
+ """
6
+
7
+ import os
8
+ import time
9
+ import random
10
+ import pandas as pd
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import seaborn as sns
14
+ from matplotlib.colors import ListedColormap
15
+ from pathlib import Path
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from torch.utils.data import DataLoader, Dataset
21
+
22
+ from sklearn.manifold import TSNE
23
+ from sklearn.decomposition import PCA
24
+ from tqdm import tqdm
25
+ from rdkit import Chem
26
+ from rdkit import RDLogger
27
+ RDLogger.DisableLog('rdApp.*')
28
+
29
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
30
+
31
+ # Import from existing benchmark code
32
+ from transformers import AutoTokenizer
33
+ try:
34
+ from FastChemTokenizer import FastChemTokenizer
35
+ except ImportError:
36
+ print("FastChemTokenizer not found. Please ensure it's in your PYTHONPATH.")
37
+ FastChemTokenizer = None
38
+
39
+ # Set seeds for reproducibility
40
def set_seed(seed=42):
    """Seed every RNG this pipeline touches so runs are reproducible.

    Covers the Python `random` module, NumPy, torch (CPU and all CUDA
    devices), and the interpreter hash seed, then forces deterministic
    cuDNN kernels (which also disables cuDNN autotuning).

    Args:
        seed: Integer seed applied to every RNG (default 42).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op when CUDA is unavailable
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Reproducibility over speed: fixed conv algorithms, no autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
48
+
49
+ set_seed(42)
50
+
51
+ # Device setup
52
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
53
+
54
class TokenizerWrapper:
    """Uniform facade over either a Hugging-Face tokenizer or a FastChemTokenizer.

    Normalizes encode/decode, vocab size, and special-token lookups so the
    rest of the pipeline does not need to know which backend is in use.
    """

    def __init__(self, tokenizer, name, bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token

        # HF tokenizers let us register the special tokens explicitly;
        # FastChemTokenizer is expected to know its own specials already.
        if hasattr(tokenizer, 'add_special_tokens'):
            tokenizer.add_special_tokens({
                'bos_token': bos_token,
                'eos_token': eos_token,
                'pad_token': pad_token,
                'unk_token': unk_token
            })

    def _is_fastchem(self):
        """True when the wrapped backend is a FastChemTokenizer.

        FIX: the original called isinstance() with FastChemTokenizer directly,
        which raises TypeError when the optional import failed and the module
        set the name to the None sentinel.
        """
        return FastChemTokenizer is not None and isinstance(self.tokenizer, FastChemTokenizer)

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize a SMILES string; returns a dict with an 'input_ids' list in both backends."""
        if self._is_fastchem():
            # FastChemTokenizer.encode returns raw ids without specials,
            # so BOS/EOS are added here.
            ids = self.tokenizer.encode(smiles)
            if add_special_tokens:
                ids = [self.tokenizer.bos_token_id] + ids + [self.tokenizer.eos_token_id]
            return {'input_ids': ids}
        # Hugging-Face style tokenizer
        return self.tokenizer(
            smiles,
            add_special_tokens=add_special_tokens,
            return_attention_mask=False,
            return_tensors=None
        )

    def decode(self, token_ids, skip_special_tokens=True):
        """Convert a list of token ids back into a SMILES string."""
        if not self._is_fastchem():
            return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
        # Map single ids -> token strings; unknown ids fall back to the UNK token.
        tokens = [self.tokenizer.id_to_token.get(tid, self.tokenizer.unk_token)
                  for tid in token_ids]
        if skip_special_tokens:
            specials = {self.tokenizer.bos_token,
                        self.tokenizer.eos_token,
                        self.tokenizer.pad_token,
                        self.tokenizer.unk_token}  # add any others you use
            tokens = [t for t in tokens if t not in specials]
        # Detokenize; chemistry tokens concatenate without separators.
        if hasattr(self.tokenizer, 'detokenize'):
            return self.tokenizer.detokenize(tokens)
        return "".join(tokens)

    def __len__(self):
        """Vocabulary size of the wrapped tokenizer."""
        if self._is_fastchem():
            # FastChemTokenizer stores its vocab as .vocab or ._vocab
            # depending on version.
            return len(getattr(self.tokenizer, 'vocab',
                               getattr(self.tokenizer, '_vocab', self.tokenizer)))
        return len(self.tokenizer)

    def get_vocab(self):
        """Return the token -> id mapping of the wrapped tokenizer."""
        if self._is_fastchem():
            return self.tokenizer.vocab
        return self.tokenizer.get_vocab()

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
137
+
138
def collate_fn(batch, tokenizer, max_length=128):
    """Collate a batch of SMILES strings into padded id tensors.

    Args:
        batch: list of SMILES strings.
        tokenizer: TokenizerWrapper providing .encode and .tokenizer.pad_token_id.
        max_length: hard cap on sequence length (truncation point).

    Returns:
        (input_ids, lengths):
            input_ids: LongTensor (batch, max_len), padded with the pad id.
            lengths:   LongTensor (batch,) of TRUE pre-padding lengths, as
                       required by pack_padded_sequence in the encoder.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    # Pad to the longest sequence in the batch, capped at max_length.
    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic, per-tokenizer

    padded = []
    lengths = []
    for ids in input_ids:
        ids = ids[:max_length]  # truncate over-long sequences (no-op otherwise)
        # FIX: record the true length BEFORE padding. The original measured
        # len(ids) after the pad append, so every entry collapsed to max_len
        # and pack_padded_sequence saw padding as real tokens.
        true_len = len(ids)
        padded.append(ids + [pad_token_id] * (max_len - true_len))
        lengths.append(true_len)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
157
+
158
+
159
class SmilesDataset(Dataset):
    """Minimal torch Dataset exposing a list of SMILES strings by index."""

    def __init__(self, smiles_list):
        # Kept as the raw list; the DataLoader's collate_fn does the tokenizing.
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
166
+
167
+
168
+
169
class MoleculeVAE(nn.Module):
    """Sequence VAE over tokenized SMILES.

    Encoder: embedding -> bidirectional LSTM -> (mu, logvar) linear heads.
    Decoder: latent vector -> initial LSTM (h, c) states -> autoregressive
    LSTM with a vocabulary projection head.
    """

    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, latent_dim=128, num_layers=2,
                 pad_token_id=0, bos_token_id=1, eos_token_id=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Shared embedding table for encoder and decoder inputs.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        self.encoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        # Bidirectional encoder -> hidden_dim * 2 features feed the latent heads.
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)

        self.decoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Project the latent vector into the decoder's initial hidden/cell states.
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)

        self._init_weights()

    def _init_weights(self):
        # Xavier for linear layers, orthogonal for LSTM weight matrices, zero biases.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.zeros_(param)

    def encode(self, x, lengths):
        """Encode token ids (batch, seq) with true lengths into (mu, logvar).

        `lengths` must be the pre-padding sequence lengths so packing
        excludes the pad positions from the LSTM.
        """
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (hidden, _) = self.encoder_lstm(packed)
        # hidden is (num_layers * 2, batch, hidden); the last two entries hold
        # the top layer's forward and backward final states.
        h_forward = hidden[-2]
        h_backward = hidden[-1]
        h = torch.cat([h_forward, h_backward], dim=1)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        # Sample z ~ N(mu, sigma^2) during training; use the mean at eval time.
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z, max_length=128, mode="greedy", temperature=1.0):
        """
        Decode latent vector z into a sequence.
        Returns full logits at each step.
        PATCHED: stops generation when EOS is predicted.

        Note: `temperature` only affects mode="sample"; greedy argmax ignores it.
        The returned sequence length may be shorter than max_length when every
        sequence in the batch emits EOS early.
        """
        batch_size = z.size(0)
        device = z.device

        # Initialize hidden states from latent
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        hidden = (h0, c0)

        # Start with BOS token — shape: (batch_size, 1)
        input_token = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        logits = []
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)  # ← TRACK FINISHED SEQS

        for _ in range(max_length):
            embedded = self.embedding(input_token)  # (batch, 1, embed_dim)
            output, hidden = self.decoder_lstm(embedded, hidden)
            logit = self.fc_out(output)  # (batch, 1, vocab)
            logits.append(logit)

            if mode == "greedy":
                input_token = logit.argmax(dim=-1)  # (batch, 1)
            elif mode == "sample":
                probs = torch.softmax(logit.squeeze(1) / temperature, dim=-1)  # (batch, vocab)
                input_token = torch.multinomial(probs, 1)  # (batch, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")

            # ← EARLY STOPPING AT EOS: once a sequence emits EOS it keeps
            # feeding PAD so its later predictions are inert.
            just_finished = (input_token.squeeze(1) == self.eos_token_id)
            finished |= just_finished
            input_token[finished] = self.pad_token_id  # pad finished sequences
            if finished.all():
                break

        return torch.cat(logits, dim=1)  # (batch, seq_len, vocab)

    def forward(self, input_ids, lengths, target_seq=None, teacher_forcing_ratio=0.0, temperature=1.0):
        """Full VAE pass: encode -> reparameterize -> decode.

        Returns (logits, mu, logvar). During training with target_seq and a
        positive teacher_forcing_ratio, the decoder is fed ground-truth tokens
        with that probability per step; otherwise it free-runs greedily.
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)

        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            # Training with teacher forcing
            batch_size, seq_len = target_seq.size()
            device = target_seq.device

            # Initialize hidden states
            h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
            c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
            hidden = (h0, c0)

            logits = []
            input_token = target_seq[:, 0].unsqueeze(1)  # BOS

            for t in range(1, seq_len):
                embedded = self.embedding(input_token)
                output, hidden = self.decoder_lstm(embedded, hidden)
                logit = self.fc_out(output)
                logits.append(logit)

                # One coin flip per STEP (not per sequence) decides whether the
                # whole batch sees the gold token or its own prediction.
                use_teacher = torch.rand(1).item() < teacher_forcing_ratio
                if use_teacher:
                    input_token = target_seq[:, t].unsqueeze(1)
                else:
                    input_token = logit.argmax(dim=-1)

            logits = torch.cat(logits, dim=1)
        else:
            # Inference mode
            max_len = target_seq.size(1) if target_seq is not None else 128
            logits = self.decode(z, max_length=max_len, mode="greedy", temperature=temperature)

        return logits, mu, logvar
305
+
306
class LatentSpaceVisualizer:
    """Visualize a trained MoleculeVAE's latent space for one tokenizer.

    Produces (a) 2-D latent "validity scans": decode grid points and color
    them by RDKit parseability, and (b) t-SNE/PCA projections of encoded
    molecule embeddings.
    """

    def __init__(self, model_path, tokenizer, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.tokenizer = tokenizer
        self.model = self.load_model(model_path)

    def load_model(self, model_path):
        """Load the trained VAE model from a checkpoint file."""
        checkpoint = torch.load(model_path, map_location=self.device)

        # Checkpoints may be a bare state dict or a training-state dict.
        if 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint

        vocab_size = len(self.tokenizer)
        pad_token_id = self.tokenizer.tokenizer.pad_token_id

        # FIX: pass the tokenizer's BOS/EOS ids instead of leaving the
        # MoleculeVAE defaults (1, 2); mismatched ids silently corrupt
        # decoding. NOTE(review): confirm these match the training config.
        model = MoleculeVAE(
            vocab_size=vocab_size,
            pad_token_id=pad_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )
        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()

        return model

    def encode_molecules(self, smiles_list, batch_size=32):
        """Encode molecules to latent mean vectors; returns an (N, latent_dim) array."""
        dataset = SmilesDataset(smiles_list)
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: collate_fn(batch, self.tokenizer, max_length=128)
        )

        all_mus = []
        with torch.no_grad():
            for input_ids, lengths in tqdm(dataloader, desc="Encoding molecules"):
                input_ids = input_ids.to(self.device)
                lengths = lengths.to(self.device)

                mu, logvar = self.model.encode(input_ids, lengths)
                all_mus.append(mu.cpu().numpy())

        return np.concatenate(all_mus, axis=0)

    def create_grid_latent_points(self, grid_size=100, z_range=4):
        """Create a square grid over [-z_range, z_range]^2 and a circular mask."""
        x = np.linspace(-z_range, z_range, grid_size)
        y = np.linspace(-z_range, z_range, grid_size)
        xx, yy = np.meshgrid(x, y)

        # Circular mask keeps only cells inside the inscribed circle.
        center = grid_size // 2
        radius = grid_size // 2
        y_coords, x_coords = np.ogrid[:grid_size, :grid_size]
        mask = (x_coords - center) ** 2 + (y_coords - center) ** 2 <= radius ** 2

        return xx, yy, mask

    def _grid_points_in_circle(self, grid_size, z_range):
        """Return (points, coords): latent (x, y) values and their grid indices
        for every grid cell inside the circular mask."""
        xx, yy, mask = self.create_grid_latent_points(grid_size, z_range)
        valid_points = []
        valid_coords = []
        for i in range(grid_size):
            for j in range(grid_size):
                if mask[i, j]:
                    valid_points.append([xx[i, j], yy[i, j]])
                    valid_coords.append([i, j])
        return np.array(valid_points), valid_coords

    def classify_latent_points(self, latent_points, dim1=0, dim2=1, additional_dim=None):
        """
        Classify latent points by chemical validity (RDKit parseable).
        Returns: 0 for invalid/unparseable molecules, 1 for valid molecules
        """
        classifications = []

        with torch.no_grad():
            # Process in batches to avoid memory issues.
            batch_size = 32
            for i in range(0, len(latent_points), batch_size):
                batch_points = latent_points[i:i + batch_size]

                # Fill the non-scanned latent dims with small noise so the
                # scanned plane dominates the decode.
                full_z = torch.randn(len(batch_points), self.model.latent_dim, device=self.device) * 0.1
                full_z[:, dim1] = torch.FloatTensor(batch_points[:, 0]).to(self.device)
                full_z[:, dim2] = torch.FloatTensor(batch_points[:, 1]).to(self.device)

                # Optionally pin extra dimensions (e.g. fixed-z2 slices).
                if additional_dim is not None and isinstance(additional_dim, dict):
                    for dim_idx, dim_val in additional_dim.items():
                        full_z[:, dim_idx] = dim_val

                try:
                    # Greedy decode (temperature is ignored in greedy mode).
                    logits = self.model.decode(full_z, max_length=64, temperature=0.8)
                    predictions = torch.argmax(logits, dim=-1)

                    # Hoisted out of the per-molecule loop: constant per batch.
                    pad_id = self.tokenizer.tokenizer.pad_token_id
                    eos_id = self.tokenizer.tokenizer.eos_token_id

                    for pred in predictions:
                        pred_ids = pred.cpu().tolist()

                        # Trim at the first EOS or PAD token.
                        for j, token_id in enumerate(pred_ids):
                            if token_id in [pad_id, eos_id]:
                                pred_ids = pred_ids[:j]
                                break

                        try:
                            decoded_smiles = self.tokenizer.decode(pred_ids, skip_special_tokens=True)
                            # Chemical validity check via RDKit.
                            mol = Chem.MolFromSmiles(decoded_smiles)
                            classifications.append(0 if mol is None else 1)
                        except Exception:
                            # Decoding or parsing failed -> invalid.
                            classifications.append(0)

                except Exception:
                    # Whole-batch decode failure: mark every point invalid.
                    classifications.extend([0] * len(batch_points))

        return np.array(classifications)

    def _plot_validity_panel(self, ax, classifications, valid_coords, grid_size, z_range,
                             cmap, xlabel, ylabel, title):
        """Render one validity scan onto `ax` with concentric reference circles."""
        # NaN outside the circular mask so imshow leaves it blank.
        class_grid = np.full((grid_size, grid_size), np.nan)
        for point_idx, (i, j) in enumerate(valid_coords):
            class_grid[i, j] = classifications[point_idx]

        ax.imshow(class_grid, extent=[-z_range, z_range, -z_range, z_range],
                  origin='lower', cmap=cmap, alpha=0.8, vmin=0, vmax=1)

        # FIX: define the reference radii here; the original's second plotting
        # loop relied on `circles` leaking out of the first loop's scope.
        for radius in [1, 2, 3, 4]:
            if radius <= z_range:
                circle = plt.Circle((0, 0), radius, fill=False, color='black',
                                    alpha=0.3, linewidth=0.5)
                ax.add_patch(circle)

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_xlim(-z_range, z_range)
        ax.set_ylim(-z_range, z_range)
        ax.set_aspect('equal')

    def plot_latent_space_interpolation(self, grid_size=100, z_range=4, save_path=None):
        """
        Create latent space interpolation plots similar to the reference images
        """
        fig, axes = plt.subplots(2, 4, figsize=(20, 10))
        axes = axes.flatten()

        # Red = invalid molecule, green = RDKit-valid molecule.
        colors = ['#FF4444', '#44AA44']
        cmap = ListedColormap(colors)

        plot_idx = 0

        # First row: scan four different latent dimension pairs.
        for dim1, dim2 in [(0, 1), (2, 3), (4, 5), (6, 7)]:
            valid_points, valid_coords = self._grid_points_in_circle(grid_size, z_range)

            print(f"Classifying latent space chemical validity for dimensions {dim1}, {dim2}...")
            classifications = self.classify_latent_points(valid_points, dim1, dim2)

            self._plot_validity_panel(
                axes[plot_idx], classifications, valid_coords, grid_size, z_range, cmap,
                xlabel=f'Latent dimension z{dim1}',
                ylabel=f'Latent dimension z{dim2}',
                title='SMILES')
            plot_idx += 1

        # Second row: scan the (z0, z1) plane at several fixed z2 values.
        for z2_val in [-2, -1, 1, 2]:
            valid_points, valid_coords = self._grid_points_in_circle(grid_size, z_range)

            print(f"Classifying latent space chemical validity for z0, z1 with z2 = {z2_val}...")
            classifications = self.classify_latent_points(
                valid_points, 0, 1,
                additional_dim={2: z2_val}
            )

            self._plot_validity_panel(
                axes[plot_idx], classifications, valid_coords, grid_size, z_range, cmap,
                xlabel='Latent dimension z0',
                ylabel='Latent dimension z1',
                title=f'SMILES; z2 = {z2_val}')
            plot_idx += 1

        plt.suptitle(f'Latent Space Chemical Validity - {self.tokenizer.name}\n(Red: Invalid molecules, Green: Valid molecules)', fontsize=16)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        plt.show()

    def plot_molecule_embeddings(self, smiles_list, method='tsne', save_path=None):
        """Plot actual molecule embeddings in 2D using dimensionality reduction."""
        # FIX: local import. The original used Chem.Descriptors.MolWt, which
        # only resolves when rdkit.Chem.Descriptors was imported elsewhere
        # (it was — but only inside the __main__ guard).
        from rdkit.Chem import Descriptors

        print(f"Encoding {len(smiles_list)} molecules...")
        embeddings = self.encode_molecules(smiles_list)

        # Simple binary label: molecular weight above/below 200 Da.
        labels = []
        for smiles in smiles_list:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                labels.append(0)
            else:
                labels.append(1 if Descriptors.MolWt(mol) > 200 else 0)

        labels = np.array(labels)

        # Reduce dimensionality.
        print(f"Computing {method.upper()} projection...")
        if method == 'tsne':
            reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, len(smiles_list)//4))
        else:
            reducer = PCA(n_components=2, random_state=42)

        embeddings_2d = reducer.fit_transform(embeddings)

        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                              c=labels, cmap='RdYlGn', alpha=0.7, s=20)
        plt.colorbar(scatter, label='Molecular Weight > 200')
        plt.title(f'{method.upper()} of Molecule Embeddings - {self.tokenizer.name}')
        plt.xlabel(f'{method.upper()} 1')
        plt.ylabel(f'{method.upper()} 2')

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        plt.show()
607
+
608
def load_data_and_tokenizers():
    """Load the SMILES dataset and build the tokenizer wrappers.

    Returns (valid_smiles, tokenizers); (None, None) when the CSV is missing.
    Tokenizers that fail to load are skipped with a printed warning.
    """
    data_path = "../data/sample_all_8k_smi.csv"
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        print("Please update the data_path in the script.")
        return None, None

    df = pd.read_csv(data_path)
    if 'SMILES' not in df.columns:
        raise ValueError("Expected column 'SMILES' in CSV")

    # Keep only RDKit-parseable molecules.
    valid_smiles = [s for s in df['SMILES'].dropna().tolist()
                    if Chem.MolFromSmiles(s) is not None]

    print(f"Loaded {len(valid_smiles)} valid SMILES")

    # Build wrappers in a fixed order; each backend loads independently.
    tokenizers = []

    try:
        hf_tok = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
        tokenizers.append(TokenizerWrapper(hf_tok, name="ChemBERTa",
                                           bos_token="<s>", eos_token="</s>",
                                           pad_token="<pad>", unk_token="<unk>"))
    except Exception as e:
        print(f"Failed to load ChemBERTa tokenizer: {e}")

    try:
        fast_tok = FastChemTokenizer.from_pretrained("../smitok")
        tokenizers.append(TokenizerWrapper(fast_tok, name="FastChemTokenizer",
                                           bos_token="[BOS]", eos_token="[EOS]",
                                           pad_token="[PAD]", unk_token="[UNK]"))
    except Exception as e:
        print(f"Failed to load FastChemTokenizer: {e}")

    return valid_smiles, tokenizers
653
+
654
def create_latent_visualizations():
    """Main function to create latent space visualizations"""

    smiles_list, tokenizers = load_data_and_tokenizers()
    if smiles_list is None or not tokenizers:
        print("Failed to load data or tokenizers. Please check your setup.")
        return

    # Restrict to a subset so the embedding plots stay fast.
    subset = smiles_list[:1000]

    model_paths = {
        'ChemBERTa': './checkpoints/ChemBERTa/best_model_ChemBERTa.pt',
        'FastChemTokenizer': './checkpoints/FastChemTokenizer/best_model_FastChemTokenizer.pt'
    }

    os.makedirs('latent_space_plots', exist_ok=True)

    banner = '=' * 60
    for tok in tokenizers:
        model_path = model_paths.get(tok.name)
        if model_path is None or not os.path.exists(model_path):
            print(f"Model not found for {tok.name}: {model_path}")
            continue

        print(f"\n{banner}")
        print(f"Creating visualizations for {tok.name}")
        print(f"{banner}")

        try:
            visualizer = LatentSpaceVisualizer(model_path, tok, device)

            print("Creating latent space interpolation plots...")
            visualizer.plot_latent_space_interpolation(
                save_path=f'latent_space_plots/{tok.name}_latent_interpolation.png')

            print("Creating t-SNE embedding plot...")
            visualizer.plot_molecule_embeddings(
                subset, method='tsne',
                save_path=f'latent_space_plots/{tok.name}_embeddings_tsne.png')

            print("Creating PCA embedding plot...")
            visualizer.plot_molecule_embeddings(
                subset, method='pca',
                save_path=f'latent_space_plots/{tok.name}_embeddings_pca.png')

        except Exception as e:
            # One failing tokenizer/model must not abort the others.
            print(f"Error processing {tok.name}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{banner}")
    print("Visualization complete! Check the 'latent_space_plots' directory for results.")
    print(f"{banner}")
712
+
713
if __name__ == "__main__":
    # Import RDKit descriptors for molecular property calculation.
    # NOTE(review): this import also registers the rdkit.Chem.Descriptors
    # submodule, which plot_molecule_embeddings accesses as Chem.Descriptors —
    # presumably relied upon; verify before removing this block.
    try:
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        print("RDKit Descriptors not available. Using simpler classification.")
        # Fallback sentinels; downstream code does not currently check these.
        Descriptors = None
        rdMolDescriptors = None

    create_latent_visualizations()
benchmark/sample_all_8k_smi.csv ADDED
The diff for this file is too large to render. See raw diff
 
latent_space_plots/ChemBERTa_latent_interpolation.png ADDED

Git LFS Details

  • SHA256: 3164da3b32584e4f19c219f95a7051424b1a1d0bbfafa06bf4871e3db48e6569
  • Pointer size: 131 Bytes
  • Size of remote file: 974 kB
latent_space_plots/FastChemTokenizerHF_latent_interpolation.png ADDED

Git LFS Details

  • SHA256: ac7fe41ed48165f293169ffd8a42282871a11b011f400f83d48ba9d13a71da10
  • Pointer size: 131 Bytes
  • Size of remote file: 956 kB