Upload benchmark script and set
Browse files- .gitattributes +2 -0
- benchmark/FastChemTokenizer.py +621 -0
- benchmark/benchmark_HF_efficient.py +1119 -0
- benchmark/benchmark_HF_simpler.py +895 -0
- benchmark/benchmark_legacy.py +1039 -0
- benchmark/data/chunk_1smi.csv +0 -0
- benchmark/data/test_smiles.txt +1628 -0
- benchmark/data/train_smiles.txt +0 -0
- benchmark/data/val_smiles.txt +1627 -0
- benchmark/latent_visualization_legacy.py +723 -0
- benchmark/sample_all_8k_smi.csv +0 -0
- latent_space_plots/ChemBERTa_latent_interpolation.png +3 -0
- latent_space_plots/FastChemTokenizerHF_latent_interpolation.png +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
latent_space_plots/ChemBERTa_latent_interpolation.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
latent_space_plots/FastChemTokenizerHF_latent_interpolation.png filter=lfs diff=lfs merge=lfs -text
|
benchmark/FastChemTokenizer.py
ADDED
|
@@ -0,0 +1,621 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Union, Optional, Tuple
|
| 5 |
+
from transformers.tokenization_utils_base import BatchEncoding
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
|
| 8 |
+
# Copyright 2025 Genta Pramillean Bayu (@gbyuvd)
|
| 9 |
+
#
|
| 10 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 11 |
+
# you may not use this file except in compliance with the License.
|
| 12 |
+
# You may obtain a copy of the License at
|
| 13 |
+
#
|
| 14 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 15 |
+
#
|
| 16 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 17 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 18 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
+
# See the License for the specific language governing permissions and
|
| 20 |
+
# limitations under the License.
|
| 21 |
+
|
| 22 |
+
class TrieNode:
    """Single node of the character trie used for longest-match tokenization."""
    __slots__ = ['children', 'token_id']
    def __init__(self):
        self.children = {}  # maps next character -> child TrieNode
        self.token_id = None  # If set, this node completes a valid token
| 29 |
+
class FastChemTokenizer:
|
| 30 |
+
def __init__(self, token_to_id, model_max_length=512):
|
| 31 |
+
self.token_to_id = token_to_id
|
| 32 |
+
self.id_to_token = {v: k for k, v in token_to_id.items()}
|
| 33 |
+
# No more self.token_set — replaced by trie
|
| 34 |
+
self.model_max_length = model_max_length
|
| 35 |
+
|
| 36 |
+
# Precompute max token length for possible use & clarity
|
| 37 |
+
self.max_token_len = max(len(t) for t in token_to_id.keys())
|
| 38 |
+
|
| 39 |
+
# Build trie for fast longest-match lookup
|
| 40 |
+
self.trie_root = self._build_trie(token_to_id)
|
| 41 |
+
|
| 42 |
+
# Validate required special tokens
|
| 43 |
+
required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
|
| 44 |
+
for tok in required_special_tokens:
|
| 45 |
+
if tok not in token_to_id:
|
| 46 |
+
raise KeyError(f"Required special token '{tok}' not found in vocab.")
|
| 47 |
+
|
| 48 |
+
# Special token IDs
|
| 49 |
+
self.bos_token_id = token_to_id["<s>"]
|
| 50 |
+
self.eos_token_id = token_to_id["</s>"]
|
| 51 |
+
self.pad_token_id = token_to_id["<pad>"]
|
| 52 |
+
self.unk_token_id = token_to_id["<unk>"]
|
| 53 |
+
self.mask_token_id = token_to_id["<mask>"]
|
| 54 |
+
|
| 55 |
+
# Special tokens for convenience
|
| 56 |
+
self.bos_token = "<s>"
|
| 57 |
+
self.eos_token = "</s>"
|
| 58 |
+
self.pad_token = "<pad>"
|
| 59 |
+
self.unk_token = "<unk>"
|
| 60 |
+
self.mask_token = "<mask>"
|
| 61 |
+
|
| 62 |
+
def _build_trie(self, token_to_id):
|
| 63 |
+
root = TrieNode()
|
| 64 |
+
for token, tid in token_to_id.items():
|
| 65 |
+
node = root
|
| 66 |
+
for char in token:
|
| 67 |
+
if char not in node.children:
|
| 68 |
+
node.children[char] = TrieNode()
|
| 69 |
+
node = node.children[char]
|
| 70 |
+
node.token_id = tid
|
| 71 |
+
return root
|
| 72 |
+
|
| 73 |
+
def __len__(self):
|
| 74 |
+
"""Return vocab size — REQUIRED for HF compatibility."""
|
| 75 |
+
return len(self.token_to_id)
|
| 76 |
+
|
| 77 |
+
def __call__(self, text: Union[str, List[str]], text_pair: Optional[Union[str, List[str]]] = None, **kwargs) -> BatchEncoding:
|
| 78 |
+
if isinstance(text, list):
|
| 79 |
+
batch = [(t, p) if p is not None else t for t, p in zip(text, text_pair)] if text_pair else text
|
| 80 |
+
return self.batch_encode_plus(batch, **kwargs)
|
| 81 |
+
else:
|
| 82 |
+
return self.encode_plus(text=text, text_pair=text_pair, **kwargs)
|
| 83 |
+
|
| 84 |
+
@lru_cache(maxsize=10000)
|
| 85 |
+
def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
|
| 86 |
+
return tuple(self._encode_core(s))
|
| 87 |
+
|
| 88 |
+
def _encode_core(self, text: str) -> List[int]:
|
| 89 |
+
"""Core encoding logic using Trie — no caching."""
|
| 90 |
+
tokens = text
|
| 91 |
+
result_ids = []
|
| 92 |
+
i = 0
|
| 93 |
+
n = len(tokens)
|
| 94 |
+
|
| 95 |
+
while i < n:
|
| 96 |
+
node = self.trie_root
|
| 97 |
+
j = i
|
| 98 |
+
last_match_id = None
|
| 99 |
+
last_match_end = i
|
| 100 |
+
|
| 101 |
+
# Traverse trie while characters match
|
| 102 |
+
while j < n and tokens[j] in node.children:
|
| 103 |
+
node = node.children[tokens[j]]
|
| 104 |
+
j += 1
|
| 105 |
+
if node.token_id is not None:
|
| 106 |
+
last_match_id = node.token_id
|
| 107 |
+
last_match_end = j # Remember end of valid token
|
| 108 |
+
|
| 109 |
+
if last_match_id is not None:
|
| 110 |
+
result_ids.append(last_match_id)
|
| 111 |
+
i = last_match_end
|
| 112 |
+
else:
|
| 113 |
+
# Fallback: encode single char
|
| 114 |
+
tok = tokens[i]
|
| 115 |
+
result_ids.append(self.token_to_id.get(tok, self.unk_token_id))
|
| 116 |
+
i += 1
|
| 117 |
+
|
| 118 |
+
return result_ids
|
| 119 |
+
|
| 120 |
+
def encode(self, text: str) -> List[int]:
|
| 121 |
+
"""Public encode method — strips input and uses cache."""
|
| 122 |
+
return list(self._cached_encode_str(text.strip()))
|
| 123 |
+
|
| 124 |
+
def decode(self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = False) -> str:
|
| 125 |
+
if isinstance(token_ids, torch.Tensor):
|
| 126 |
+
token_ids = token_ids.tolist()
|
| 127 |
+
|
| 128 |
+
if skip_special_tokens:
|
| 129 |
+
special_ids = {
|
| 130 |
+
self.bos_token_id,
|
| 131 |
+
self.eos_token_id,
|
| 132 |
+
self.pad_token_id,
|
| 133 |
+
self.mask_token_id,
|
| 134 |
+
}
|
| 135 |
+
else:
|
| 136 |
+
special_ids = set()
|
| 137 |
+
|
| 138 |
+
tokens = []
|
| 139 |
+
for tid in token_ids:
|
| 140 |
+
if tid in special_ids:
|
| 141 |
+
continue
|
| 142 |
+
token = self.id_to_token.get(tid, self.unk_token)
|
| 143 |
+
tokens.append(token)
|
| 144 |
+
|
| 145 |
+
return "".join(tokens)
|
| 146 |
+
|
| 147 |
+
def decode_with_trace(self, token_ids: List[int]) -> None:
|
| 148 |
+
print(f"\n🔍 Decoding {len(token_ids)} tokens:")
|
| 149 |
+
for i, tid in enumerate(token_ids):
|
| 150 |
+
token = self.id_to_token.get(tid, self.unk_token)
|
| 151 |
+
print(f" [{i:03d}] ID={tid:5d} → '{token}'")
|
| 152 |
+
|
| 153 |
+
def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
|
| 154 |
+
return [self.id_to_token.get(i, self.unk_token) for i in ids]
|
| 155 |
+
|
| 156 |
+
def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
|
| 157 |
+
return [self.token_to_id.get(t, self.unk_token_id) for t in tokens]
|
| 158 |
+
|
| 159 |
+
def encode_plus(
|
| 160 |
+
self,
|
| 161 |
+
text: str,
|
| 162 |
+
text_pair: Optional[str] = None,
|
| 163 |
+
add_special_tokens: bool = True,
|
| 164 |
+
padding: Union[bool, str] = False,
|
| 165 |
+
truncation: bool = False,
|
| 166 |
+
max_length: Optional[int] = None,
|
| 167 |
+
return_tensors: Optional[str] = None,
|
| 168 |
+
return_attention_mask: bool = True,
|
| 169 |
+
return_token_type_ids: bool = True,
|
| 170 |
+
) -> BatchEncoding:
|
| 171 |
+
if max_length is None:
|
| 172 |
+
max_length = self.model_max_length
|
| 173 |
+
|
| 174 |
+
ids_a = self.encode(text)
|
| 175 |
+
|
| 176 |
+
if text_pair is not None:
|
| 177 |
+
ids_b = self.encode(text_pair)
|
| 178 |
+
else:
|
| 179 |
+
ids_b = None
|
| 180 |
+
|
| 181 |
+
input_ids = []
|
| 182 |
+
token_type_ids = []
|
| 183 |
+
|
| 184 |
+
if add_special_tokens:
|
| 185 |
+
input_ids.append(self.bos_token_id)
|
| 186 |
+
token_type_ids.append(0)
|
| 187 |
+
if ids_b is not None:
|
| 188 |
+
input_ids.extend(ids_a)
|
| 189 |
+
token_type_ids.extend([0] * len(ids_a))
|
| 190 |
+
input_ids.append(self.eos_token_id)
|
| 191 |
+
token_type_ids.append(0)
|
| 192 |
+
|
| 193 |
+
input_ids.extend(ids_b)
|
| 194 |
+
token_type_ids.extend([1] * len(ids_b))
|
| 195 |
+
input_ids.append(self.eos_token_id)
|
| 196 |
+
token_type_ids.append(1)
|
| 197 |
+
else:
|
| 198 |
+
input_ids.extend(ids_a)
|
| 199 |
+
token_type_ids.extend([0] * len(ids_a))
|
| 200 |
+
input_ids.append(self.eos_token_id)
|
| 201 |
+
token_type_ids.append(0)
|
| 202 |
+
else:
|
| 203 |
+
input_ids = ids_a
|
| 204 |
+
token_type_ids = [0] * len(input_ids)
|
| 205 |
+
if ids_b is not None:
|
| 206 |
+
input_ids.extend(ids_b)
|
| 207 |
+
token_type_ids.extend([1] * len(ids_b))
|
| 208 |
+
|
| 209 |
+
if truncation and len(input_ids) > max_length:
|
| 210 |
+
input_ids = input_ids[:max_length]
|
| 211 |
+
token_type_ids = token_type_ids[:max_length]
|
| 212 |
+
|
| 213 |
+
if padding:
|
| 214 |
+
pad_len = max_length - len(input_ids)
|
| 215 |
+
if pad_len > 0:
|
| 216 |
+
input_ids.extend([self.pad_token_id] * pad_len)
|
| 217 |
+
token_type_ids.extend([0] * pad_len)
|
| 218 |
+
|
| 219 |
+
attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]
|
| 220 |
+
|
| 221 |
+
encoded_dict = {
|
| 222 |
+
"input_ids": input_ids,
|
| 223 |
+
"attention_mask": attention_mask,
|
| 224 |
+
}
|
| 225 |
+
if return_token_type_ids:
|
| 226 |
+
encoded_dict["token_type_ids"] = token_type_ids
|
| 227 |
+
|
| 228 |
+
if return_tensors == "pt":
|
| 229 |
+
output = {}
|
| 230 |
+
for k, v in encoded_dict.items():
|
| 231 |
+
tensor = torch.tensor(v, dtype=torch.long) # Fixed: use torch.tensor, not as_tensor
|
| 232 |
+
if tensor.ndim == 1:
|
| 233 |
+
tensor = tensor.unsqueeze(0)
|
| 234 |
+
output[k] = tensor
|
| 235 |
+
else:
|
| 236 |
+
output = encoded_dict
|
| 237 |
+
|
| 238 |
+
return BatchEncoding(output, tensor_type=return_tensors)
|
| 239 |
+
|
| 240 |
+
def batch_encode_plus(
|
| 241 |
+
self,
|
| 242 |
+
batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
|
| 243 |
+
**kwargs
|
| 244 |
+
) -> BatchEncoding:
|
| 245 |
+
all_input_ids = []
|
| 246 |
+
all_attention_masks = []
|
| 247 |
+
all_token_type_ids = []
|
| 248 |
+
|
| 249 |
+
for item in batch_text_or_text_pairs:
|
| 250 |
+
if isinstance(item, tuple):
|
| 251 |
+
text, text_pair = item
|
| 252 |
+
else:
|
| 253 |
+
text, text_pair = item, None
|
| 254 |
+
|
| 255 |
+
encoded = self.encode_plus(
|
| 256 |
+
text=text,
|
| 257 |
+
text_pair=text_pair,
|
| 258 |
+
**kwargs
|
| 259 |
+
)
|
| 260 |
+
all_input_ids.append(encoded["input_ids"])
|
| 261 |
+
all_attention_masks.append(encoded["attention_mask"])
|
| 262 |
+
if "token_type_ids" in encoded:
|
| 263 |
+
all_token_type_ids.append(encoded["token_type_ids"])
|
| 264 |
+
|
| 265 |
+
batched = {
|
| 266 |
+
"input_ids": all_input_ids,
|
| 267 |
+
"attention_mask": all_attention_masks,
|
| 268 |
+
}
|
| 269 |
+
if all_token_type_ids:
|
| 270 |
+
batched["token_type_ids"] = all_token_type_ids
|
| 271 |
+
|
| 272 |
+
if kwargs.get("return_tensors") == "pt":
|
| 273 |
+
def to_tensor_list(lst):
|
| 274 |
+
# Fixed: Handle both tensor and non-tensor items properly
|
| 275 |
+
return [item.clone().detach() if isinstance(item, torch.Tensor)
|
| 276 |
+
else torch.tensor(item, dtype=torch.long) for item in lst]
|
| 277 |
+
batched = {
|
| 278 |
+
k: torch.nn.utils.rnn.pad_sequence(
|
| 279 |
+
to_tensor_list(v),
|
| 280 |
+
batch_first=True,
|
| 281 |
+
padding_value=self.pad_token_id if k == "input_ids" else 0
|
| 282 |
+
)
|
| 283 |
+
for k, v in batched.items()
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
return BatchEncoding(batched, tensor_type=kwargs.get("return_tensors"))
|
| 287 |
+
|
| 288 |
+
# Save vocab to directory
|
| 289 |
+
def save_pretrained(self, save_directory: str):
|
| 290 |
+
"""
|
| 291 |
+
Save tokenizer vocab as `vocab.json` in target directory.
|
| 292 |
+
Mimics Hugging Face convention.
|
| 293 |
+
"""
|
| 294 |
+
if not os.path.exists(save_directory):
|
| 295 |
+
os.makedirs(save_directory)
|
| 296 |
+
|
| 297 |
+
vocab_file = os.path.join(save_directory, "vocab.json")
|
| 298 |
+
|
| 299 |
+
# Keys are strings, values are ints — JSON-safe
|
| 300 |
+
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 301 |
+
json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
|
| 302 |
+
|
| 303 |
+
print(f"✅ Tokenizer vocab saved to: {vocab_file}")
|
| 304 |
+
|
| 305 |
+
# Load from pretrained directory
|
| 306 |
+
@classmethod
|
| 307 |
+
def from_pretrained(cls, pretrained_directory: str, model_max_length=512):
|
| 308 |
+
"""
|
| 309 |
+
Load tokenizer from directory containing `vocab.json`.
|
| 310 |
+
"""
|
| 311 |
+
vocab_file = os.path.join(pretrained_directory, "vocab.json")
|
| 312 |
+
|
| 313 |
+
if not os.path.exists(vocab_file):
|
| 314 |
+
raise FileNotFoundError(f"Vocab file not found: {vocab_file}")
|
| 315 |
+
|
| 316 |
+
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 317 |
+
token_to_id = json.load(f)
|
| 318 |
+
|
| 319 |
+
# Convert keys to str (JSON loads as str anyway), values to int
|
| 320 |
+
token_to_id = {str(k): int(v) for k, v in token_to_id.items()}
|
| 321 |
+
|
| 322 |
+
return cls(token_to_id=token_to_id, model_max_length=model_max_length)
|
| 323 |
+
|
| 324 |
+
class FastChemTokenizerSelfies(FastChemTokenizer):
    """SELFIES variant of :class:`FastChemTokenizer`.

    The previous implementation duplicated the entire base class (~300
    lines); the two classes differ in exactly two behaviors, so this is now
    a subclass overriding only those:

      * ``_encode_core`` skips whitespace between tokens (SELFIES corpora
        are space-separated), and
      * ``decode`` re-inserts a single space between tokens to reconstruct
        the original space-separated format.

    Everything else (trie construction, caching, ``encode_plus``,
    ``batch_encode_plus``, save/load, special-token handling) is inherited
    unchanged, so the public interface is identical to the old copy.
    """

    def _encode_core(self, text: str) -> List[int]:
        """Core encoding logic using the trie — skips whitespace that is not
        part of any token, otherwise greedy longest match with single-char
        / ``<unk>`` fallback (same contract as the base class)."""
        result_ids = []
        i = 0
        n = len(text)

        while i < n:
            if text[i].isspace():  # Skip whitespace unless part of a token
                i += 1
                continue

            node = self.trie_root
            j = i
            last_match_id = None
            last_match_end = i

            # Traverse trie while characters match
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.token_id is not None:
                    last_match_id = node.token_id
                    last_match_end = j

            if last_match_id is not None:
                result_ids.append(last_match_id)
                i = last_match_end
            else:
                # Fallback: encode single char
                result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
                i += 1

        return result_ids

    def decode(self, token_ids: Union[List[int], "torch.Tensor"], skip_special_tokens: bool = False) -> str:
        """Decode ids back to a SPACE-separated token string.

        Joining with a space (instead of the base class's direct
        concatenation) reconstructs the original SELFIES input format.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        if skip_special_tokens:
            special_ids = {
                self.bos_token_id,
                self.eos_token_id,
                self.pad_token_id,
                self.mask_token_id,
            }
        else:
            special_ids = set()

        tokens = [
            self.id_to_token.get(tid, self.unk_token)
            for tid in token_ids
            if tid not in special_ids
        ]

        return " ".join(tokens)
benchmark/benchmark_HF_efficient.py
ADDED
|
@@ -0,0 +1,1119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Molecule Tokenizer Benchmark & VAE Training Pipeline
|
| 3 |
+
# PATCHED VERSION — Updated for FastChemTokenizerHF (HF compatible)
|
| 4 |
+
#
|
| 5 |
+
|
| 6 |
+
#
|
| 7 |
+
# Step 1.1 — Imports & Reproducibility
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
import random
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import torch
|
| 17 |
+
import numpy as np
|
| 18 |
+
# Tokenizers
|
| 19 |
+
from transformers import AutoTokenizer
|
| 20 |
+
from FastChemTokenizerHF import FastChemTokenizer
|
| 21 |
+
# Optional: for progress bars
|
| 22 |
+
from tqdm import tqdm
|
| 23 |
+
from rdkit import Chem
|
| 24 |
+
from sklearn.model_selection import train_test_split
|
| 25 |
+
import torch.nn as nn
|
| 26 |
+
import torch.nn.functional as F
|
| 27 |
+
from ranger21 import Ranger21
|
| 28 |
+
from torch.utils.data import DataLoader, Dataset
|
| 29 |
+
from scipy.stats import entropy
|
| 30 |
+
import json
|
| 31 |
+
import math
|
| 32 |
+
from typing import Optional, Tuple, Union
|
| 33 |
+
from rdkit import RDLogger
|
| 34 |
+
RDLogger.DisableLog('rdApp.*')
|
| 35 |
+
# Set seeds for reproducibility
|
| 36 |
+
def set_seed(seed=42):
    """Seed every RNG in play (stdlib, numpy, torch, CUDA) for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Trade a little cuDNN speed for deterministic kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
|
| 44 |
+
|
| 45 |
+
set_seed(42)

# Pick the compute device once, up front.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

#
# Step 1.2 — Load & Preprocess SMILES Corpus
#

data_path = "../data/sample_1k_smi_42.csv"
df = pd.read_csv(data_path)

required_column = 'SMILES'
if required_column not in df.columns:
    raise ValueError("Expected column 'SMILES' in CSV")

# Drop missing entries; the corpus is assumed already canonicalized upstream.
smiles_list = df[required_column].dropna().tolist()
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")
|
| 63 |
+
|
| 64 |
+
# Validate with RDKit
|
| 65 |
+
|
| 66 |
+
def is_valid_smiles(smiles):
    """Return True iff RDKit can parse *smiles* into a molecule object."""
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None
|
| 68 |
+
|
| 69 |
+
print("Validating SMILES with RDKit...")
# Parse every entry once; keep only the strings RDKit accepts.
keep_flags = [is_valid_smiles(s) for s in tqdm(smiles_list)]
smiles_list = [smi for smi, keep in zip(smiles_list, keep_flags) if keep]
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
|
| 73 |
+
|
| 74 |
+
#
|
| 75 |
+
# Step 1.3 — Train/Val/Test Split (80/10/10)
|
| 76 |
+
#
|
| 77 |
+
|
| 78 |
+
# 80/10/10 split: first peel off 20%, then halve it into val/test.
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)

print(f"Train: {len(train_smiles)}")
print(f"Val: {len(val_smiles)}")
print(f"Test: {len(test_smiles)}")

# Cache each split as a plain-text file, one SMILES per line.
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
for split_name, smiles in splits.items():
    out_path = f"../data/{split_name}_smiles.txt"
    with open(out_path, "w") as f:
        f.write("\n".join(smiles))
|
| 90 |
+
|
| 91 |
+
#
|
| 92 |
+
# Step 1.4 — Tokenizer Wrapper (Simplified for HF compatibility)
|
| 93 |
+
#
|
| 94 |
+
|
| 95 |
+
class TokenizerWrapper:
    """Thin adapter giving HF and FastChem tokenizers one shared interface.

    Forwards encode/decode/vocab/special-id lookups to the wrapped
    tokenizer and, when supported, registers the standard special tokens.
    """

    def __init__(self, tokenizer, name,
                 bos_token="<s>", eos_token="</s>",
                 pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name

        # Only register special tokens when the backend exposes the hook.
        add_specials = getattr(tokenizer, "add_special_tokens", None)
        if callable(add_specials):
            special_map = {
                "bos_token": bos_token,
                "eos_token": eos_token,
                "pad_token": pad_token,
                "unk_token": unk_token,
            }
            try:
                add_specials(special_map)
            except NotImplementedError:
                # FastChemTokenizerHF already defines these tokens internally.
                pass

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize one SMILES string; returns the backend's encoding dict."""
        return self.tokenizer(
            smiles,
            add_special_tokens=add_special_tokens,
            return_attention_mask=False,
            return_tensors=None,
        )

    def decode(self, token_ids, skip_special_tokens=True):
        """Map a list of token ids back to a SMILES string."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def __len__(self):
        return len(self.tokenizer)

    def get_vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
|
| 149 |
+
|
| 150 |
+
#
|
| 151 |
+
# Step 1.5 — Initialize Tokenizers
|
| 152 |
+
#
|
| 153 |
+
|
| 154 |
+
# Both tokenizers share the same special-token set; define it once.
_SPECIALS = dict(bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")

tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tok2_fast = FastChemTokenizer.from_pretrained("../smitok_core")

tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", **_SPECIALS)
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizerHF", **_SPECIALS)

TOKENIZERS = [tokenizer1, tokenizer2]
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
#
|
| 164 |
+
# Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
|
| 165 |
+
#
|
| 166 |
+
|
| 167 |
+
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
    """Benchmark a wrapped tokenizer on a SMILES corpus.

    Measured metrics:
      - vocab_size: tokenizer vocabulary size
      - avg_tokens_per_mol: mean encoded length (incl. special tokens)
      - compression_ratio: mean chars per molecule / mean tokens per molecule
      - percent_unknown: share of emitted tokens equal to the unk id
      - encode_throughput_smiles_per_sec / decode_throughput_smiles_per_sec
      - decode_reconstruction_accuracy: exact string round-trip rate (%)

    Args:
        tokenizer: a TokenizerWrapper instance.
        smiles_sample: list of SMILES strings; capped at 10k for timing.
        encode_only: skip the decode/round-trip pass when True.

    Returns:
        dict mapping metric name -> value.

    Raises:
        ValueError: if smiles_sample is empty (metrics would be undefined).
    """
    V = len(tokenizer)
    sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample
    if not sample:
        # FIX: an empty sample previously produced NaN means and a
        # ZeroDivisionError in the throughput calculation.
        raise ValueError("benchmark_tokenizer requires a non-empty SMILES sample")

    encode_times, token_counts, char_counts = [], [], []
    unk_counts, total_tokens = 0, 0
    unk_id = tokenizer.tokenizer.unk_token_id  # FIX: hoisted out of the hot loop

    for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
        char_counts.append(len(smiles))
        start = time.perf_counter()
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        end = time.perf_counter()
        encode_times.append(end - start)

        input_ids = enc['input_ids']
        token_counts.append(len(input_ids))
        total_tokens += len(input_ids)
        unk_counts += input_ids.count(unk_id)

    L_bar = np.mean(token_counts)
    C = np.mean(char_counts) / L_bar
    U = unk_counts / total_tokens if total_tokens > 0 else 0.0
    # FIX: epsilon floor so a timer that rounds to zero cannot divide-by-zero.
    Tenc = len(sample) / max(sum(encode_times), 1e-12)

    metrics = {
        'vocab_size': V,
        'avg_tokens_per_mol': L_bar,
        'compression_ratio': C,
        'percent_unknown': U * 100,
        'encode_throughput_smiles_per_sec': Tenc,
    }

    if encode_only:
        return metrics

    decode_times, reconstruction_ok = [], 0

    for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = enc['input_ids']
        start = time.perf_counter()
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        end = time.perf_counter()
        decode_times.append(end - start)
        if decoded == smiles:
            reconstruction_ok += 1

    # Same epsilon guard as the encode pass.
    Tdec = len(sample) / max(sum(decode_times), 1e-12)
    recon_acc = reconstruction_ok / len(sample)

    metrics.update({
        'decode_throughput_smiles_per_sec': Tdec,
        'decode_reconstruction_accuracy': recon_acc * 100,
    })

    return metrics
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
#
|
| 227 |
+
# Step 1.7 — Run Benchmark
|
| 228 |
+
#
|
| 229 |
+
|
| 230 |
+
benchmark_sample = train_smiles
results = []

# Benchmark every tokenizer, echo its metrics, and collect rows for a CSV.
for tokenizer in TOKENIZERS:
    print(f"\n=== Benchmarking {tokenizer.name} ===")
    metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
    metrics['tokenizer'] = tokenizer.name
    results.append(metrics)
    for k, v in metrics.items():
        if k == 'tokenizer':
            continue
        if isinstance(v, float):
            print(f"{k:35s}: {v:.4f}")
        else:
            print(f"{k:35s}: {v}")

df_results = pd.DataFrame(results)
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
|
| 245 |
+
|
| 246 |
+
#
|
| 247 |
+
# Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
|
| 248 |
+
#
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
import torch
|
| 252 |
+
import torch.nn as nn
|
| 253 |
+
import torch.nn.functional as F
|
| 254 |
+
from typing import Optional, Tuple, Union
|
| 255 |
+
|
| 256 |
+
import torch
|
| 257 |
+
import torch.nn as nn
|
| 258 |
+
import torch.nn.functional as F
|
| 259 |
+
from typing import Tuple, Optional
|
| 260 |
+
|
| 261 |
+
class MoleculeVAE(nn.Module):
    """
    Optimized MoleculeVAE with:
    - Bidirectional encoder (restored)
    - Proper latent2hidden + latent2cell (restored)
    - Adjustable dropout for small dataset
    - Attention pooling option
    - Quantization-ready hooks

    Sequence-to-sequence VAE over token ids: a bidirectional LSTM encoder
    pools to a latent Gaussian (mu, logvar); a unidirectional LSTM decoder
    is initialized from the latent vector and emits per-step vocabulary
    logits. Generation stops early once every sequence has produced EOS.
    """

    def __init__(self,
                 vocab_size: int,
                 embed_dim: int = 128,
                 hidden_dim: int = 256,
                 latent_dim: int = 128,
                 num_layers: int = 2,
                 pad_token_id: int = 0,
                 bos_token_id: int = 1,
                 eos_token_id: int = 2,
                 dropout: float = 0.2,
                 use_attention: bool = True,
                 quantize_ready: bool = False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.use_attention = use_attention

        # Shared embedding (used by both encoder and decoder); pad rows stay zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)

        # Bidirectional encoder -> per-step features of size hidden_dim * 2.
        self.encoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )

        # Attention pooling (optional): self-attention then a learned
        # per-step scalar weight for the weighted sum over time.
        if use_attention:
            self.attention = nn.MultiheadAttention(
                hidden_dim * 2, num_heads=4, dropout=dropout, batch_first=True
            )
            self.attention_linear = nn.Linear(hidden_dim * 2, 1)

        self.encoder_norm = nn.LayerNorm(hidden_dim * 2)

        # Latent bottleneck: pooled encoder state -> Gaussian parameters.
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)

        # Decoder init (restored): z -> initial hidden AND cell states,
        # one slice per decoder layer.
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)

        # Decoder (unidirectional).
        self.decoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0
        )
        self.decoder_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Weight tying: only shape-compatible when embed_dim == hidden_dim.
        if embed_dim == hidden_dim:
            self.fc_out.weight = self.embedding.weight

        self.dropout = nn.Dropout(dropout)

        # Quantization stubs; Identity no-ops when quantization is disabled.
        # NOTE(review): self.quant is applied to integer token ids in encode()
        # — presumably intended for a float input path; confirm before use.
        if quantize_ready:
            self.quant = torch.quantization.QuantStub()
            self.dequant = torch.quantization.DeQuantStub()
        else:
            self.quant = self.dequant = nn.Identity()

        self._init_weights()

    def _init_weights(self):
        """Xavier init for matrices, small normal for vectors, zeros for biases."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.ndim >= 2:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.normal_(param, 0, 0.01)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def _pool_sequence(self, packed_output, lengths):
        """Collapse the packed encoder output over time to one vector per sequence.

        Uses attention pooling when enabled, otherwise a length-masked mean.
        """
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        if self.use_attention:
            attn_out, _ = self.attention(output, output, output)
            # Softmax over the time dimension gives per-step mixing weights.
            weights = torch.softmax(self.attention_linear(attn_out), dim=1)
            pooled = (weights * output).sum(dim=1)
        else:
            # mean pooling with mask
            batch_size, max_len, _ = output.size()
            mask = torch.arange(max_len, device=output.device).expand(batch_size, max_len) < lengths.unsqueeze(1)
            masked_output = output * mask.unsqueeze(-1).float()
            pooled = masked_output.sum(dim=1) / lengths.unsqueeze(-1).float()
        return pooled

    def encode(self, x: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode token ids (with true lengths) into Gaussian parameters (mu, logvar)."""
        x = self.quant(x)
        embedded = self.dropout(self.embedding(x))
        # Pack so the LSTM skips pad positions; lengths must live on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.encoder_lstm(packed)
        h = self._pool_sequence(packed_out, lengths)
        h = self.encoder_norm(h)
        mu, logvar = self.fc_mu(h), self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """Sample z = mu + sigma * eps while training; return mu deterministically in eval."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def _init_decoder_state(self, z: torch.Tensor):
        """Project the latent vector into per-layer (h0, c0) decoder LSTM states."""
        batch_size = z.size(0)
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        return h0, c0

    def decode(self, z: torch.Tensor, max_length: int = 64, mode: str = "greedy", temperature: float = 1.0):
        """Autoregressively generate logits from z, starting at BOS.

        Stops as soon as every sequence in the batch has emitted EOS;
        finished sequences continue feeding PAD. Returns the concatenated
        logits [batch, steps, vocab] where steps <= max_length.
        """
        batch_size = z.size(0)
        device = z.device
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)

        input_ids = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
        logits_list = []

        for _ in range(max_length):
            embedded = self.embedding(input_ids)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)

            if mode == "greedy":
                next_tokens = logit.argmax(dim=-1)
            elif mode == "sample":
                # Temperature-scaled multinomial sampling.
                probs = F.softmax(logit.squeeze(1) / temperature, dim=-1)
                next_tokens = torch.multinomial(probs, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")

            # Latch EOS: once a sequence finishes, force PAD on later steps.
            just_finished = (next_tokens.squeeze(-1) == self.eos_token_id)
            finished |= just_finished
            next_tokens = torch.where(
                finished.unsqueeze(-1),
                torch.tensor(self.pad_token_id, device=device),
                next_tokens
            )
            input_ids = next_tokens
            if finished.all():
                break

        return self.dequant(torch.cat(logits_list, dim=1))

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor,
                target_seq: Optional[torch.Tensor] = None,
                teacher_forcing_ratio: float = 0.0,
                temperature: float = 1.0):
        """Full VAE pass: encode -> reparameterize -> decode.

        Returns (logits, mu, logvar). Uses teacher forcing only while
        training with a target sequence and a positive ratio; otherwise
        free-runs the decoder (greedy) up to the target length (or 64).
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)
        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            return self._forward_teacher_forcing(z, target_seq, teacher_forcing_ratio), mu, logvar
        else:
            max_len = target_seq.size(1) if target_seq is not None else 64
            return self.decode(z, max_length=max_len, temperature=temperature), mu, logvar

    def _forward_teacher_forcing(self, z: torch.Tensor, target_seq: torch.Tensor, teacher_forcing_ratio: float):
        """Decode with scheduled teacher forcing.

        At each step, with probability teacher_forcing_ratio the ground-truth
        token is fed next; otherwise the model's own argmax prediction is.
        Logits cover positions 1..seq_len-1 (position 0 is the given start token).
        """
        batch_size, seq_len = target_seq.size()
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        logits_list = []
        input_token = target_seq[:, 0:1]

        for t in range(1, seq_len):
            embedded = self.embedding(input_token)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)

            # Coin flip per step (shared across the batch) between truth and prediction.
            if torch.rand(1).item() < teacher_forcing_ratio:
                input_token = target_seq[:, t:t+1]
            else:
                input_token = logit.argmax(dim=-1)

        return torch.cat(logits_list, dim=1)
|
| 463 |
+
|
| 464 |
+
#
|
| 465 |
+
# Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
|
| 466 |
+
#
|
| 467 |
+
|
| 468 |
+
# PATCH 2: Fix VAE Loss Function - Ensure beta is properly applied
|
| 469 |
+
# Replace the existing vae_loss function:
|
| 470 |
+
|
| 471 |
+
def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
    """Masked token cross-entropy plus beta-weighted KL divergence.

    Pads logits/targets to a common sequence length, averages CE over
    non-pad positions only, and returns (total, ce, kl) where
    total = ce + beta * kl.
    """
    # Bring both tensors to the same sequence length before flattening.
    longest = max(logits.size(1), targets.size(1))
    if logits.size(1) < longest:
        logits = F.pad(logits, (0, 0, 0, longest - logits.size(1)))
    if targets.size(1) < longest:
        targets = F.pad(targets, (0, longest - targets.size(1)), value=pad_token_id)

    flat_logits = logits.view(-1, logits.size(-1))   # [B*L, V]
    flat_targets = targets.reshape(-1)               # [B*L]

    # Token-level CE, averaged over non-pad positions (epsilon avoids 0/0).
    pad_mask = (flat_targets != pad_token_id).float()
    per_token_ce = F.cross_entropy(flat_logits, flat_targets, reduction='none')
    ce_loss = (per_token_ce * pad_mask).sum() / (pad_mask.sum() + 1e-8)

    # Analytic KL(q(z|x) || N(0, I)): sum over latent dims, mean over batch.
    kl_per_sample = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)
    kl_loss = kl_per_sample.mean()

    # Beta scales only the KL term.
    total_loss = ce_loss + beta * kl_loss
    return total_loss, ce_loss, kl_loss
|
| 496 |
+
|
| 497 |
+
#
|
| 498 |
+
# Step 2.3 — KLAnnealer (Fixed Bug #5: double increment)
|
| 499 |
+
#
|
| 500 |
+
|
| 501 |
+
import math
|
| 502 |
+
|
| 503 |
+
class KLAnnealer:
    """Schedules the KL weight (beta) from 0 toward 1 over training steps.

    Supports linear / sigmoid / cosine ramps and optional cyclical
    annealing (n_cycle > 1). ``ratio`` is the fraction of each cycle spent
    ramping before beta saturates at 1.
    """

    def __init__(self, total_steps, n_cycle=1, ratio=0.3, mode="linear", per_epoch=False, steps_per_epoch=None):
        self.total_steps = total_steps
        self.n_cycle = n_cycle
        self.ratio = ratio
        self.mode = mode
        self.per_epoch = per_epoch
        self.steps_per_epoch = steps_per_epoch
        self.current_step = 0
        self.current_epoch = 0

    def get_beta(self, increment=True):
        """Get current KL weight (beta).
        Args:
            increment (bool): whether to advance the annealer (use False in validation).
        """
        if increment:
            self.current_step += 1

        # Fraction of total training completed, clamped to [0, 1].
        overall = min(self.current_step / max(self.total_steps, 1.0), 1.0)

        if self.n_cycle > 1:
            # Cyclical: the ramp restarts n_cycle times across training.
            cycle_len = self.total_steps / self.n_cycle
            within = self.current_step % cycle_len
            ramp = min(within / max(cycle_len * self.ratio, 1.0), 1.0)
        else:
            # Single cycle: ramp over the first `ratio` fraction of training.
            ramp = min(overall / self.ratio, 1.0) if self.ratio > 0 else 1.0

        if self.mode == "linear":
            value = min(ramp, 1.0)
        elif self.mode == "sigmoid":
            # Smooth S-curve centered at the ramp midpoint.
            k = 6
            value = 1 / (1 + math.exp(-k * (ramp - 0.5)))
        elif self.mode == "cosine":
            # Cosine ramp from 0 up to 1.
            value = 0.5 * (1 + math.cos(math.pi * (1 - ramp)))
        else:
            raise ValueError(f"Unknown mode: {self.mode}")

        return min(value, 1.0)

    def step(self):
        """Increment the step counter."""
        self.current_step += 1

    def epoch_step(self):
        """Increment the epoch counter."""
        self.current_epoch += 1
|
| 555 |
+
|
| 556 |
+
#
|
| 557 |
+
# Teacher forcing ratio
|
| 558 |
+
#
|
| 559 |
+
|
| 560 |
+
def get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3):
    """
    Linear decay of teacher forcing ratio (TFR).
    - Starts at 1.0
    - Decays to min_tfr by (warmup_fraction * num_epochs)
    - Then stays flat
    """
    warmup_epochs = int(num_epochs * warmup_fraction)
    # After warmup (or when there is no warmup at all), hold the floor value.
    if epoch >= warmup_epochs:
        return min_tfr
    decay = (1.0 - min_tfr) * (epoch / warmup_epochs)
    return 1.0 - decay
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
#
|
| 576 |
+
# Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
|
| 577 |
+
#
|
| 578 |
+
|
| 579 |
+
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize a batch of SMILES and pad to a common length.

    Args:
        batch: list of SMILES strings.
        tokenizer: a TokenizerWrapper (pad id read from its inner tokenizer).
        max_length: hard cap on sequence length; longer encodings are truncated.

    Returns:
        (input_ids, lengths) LongTensors. ``lengths`` holds each sequence's
        TRUE token count (after truncation, before padding).

    BUGFIX: the previous version appended ``min(len(ids), max_length)`` AFTER
    padding, so every recorded length equaled the padded width — defeating
    pack_padded_sequence / masked pooling downstream. Lengths are now taken
    before padding.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    # Common width: longest sequence in the batch, capped at max_length.
    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic, per tokenizer

    padded, lengths = [], []
    for ids in input_ids:
        true_len = min(len(ids), max_len)  # real token count, pre-padding
        ids = ids[:max_len]
        ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
        lengths.append(true_len)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
|
| 598 |
+
|
| 599 |
+
#
|
| 600 |
+
# Step 2.5 — Dataset & DataLoader
|
| 601 |
+
#
|
| 602 |
+
|
| 603 |
+
class SmilesDataset(Dataset):
    """Minimal Dataset over raw SMILES strings; tokenization happens in the collate."""

    def __init__(self, smiles_list):
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
|
| 610 |
+
|
| 611 |
+
#
|
| 612 |
+
# Step 3.x — Training Loop (PATCHED: per-tokenizer annealer, exponential TFR, device-safe eval, KL beta logging clarity)
|
| 613 |
+
#
|
| 614 |
+
|
| 615 |
+
# Training hyperparameters for the VAE runs below.
LEARNING_RATE = 1e-5        # optimizer learning rate
BATCH_SIZE = 16             # per-step minibatch size
ACCUMULATION_STEPS = 4      # gradient accumulation -> effective batch of 64
NUM_EPOCHS = 5              # total training epochs
MAX_SEQ_LEN = 128           # hard cap on token sequence length in collate_fn
KL_ANNEAL_RATIO = 0.3       # fraction of steps spent ramping KL beta 0 -> 1
|
| 621 |
+
|
| 622 |
+
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    kl_annealer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default"
):
    """Train the VAE with gradient accumulation, KL annealing and a
    teacher-forcing schedule; save the best checkpoint by validation loss.

    Writes a CSV log (one row per epoch) to
    `save_dir/training_log_<tokenizer_name>.csv` and the best model to
    `save_dir/best_model_<tokenizer_name>.pt`.

    Returns:
        float: the best (lowest) average validation loss seen.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")

    # Truncate/create the log and write the header once.
    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)

            # Teacher-forcing ratio is a function of the epoch only, so it is
            # constant within an epoch (per-epoch decay, not per-batch).
            tfr = get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3)

            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)
            # Advance the annealer once per TRAINING batch only.
            beta = kl_annealer.get_beta(increment=True)
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale so accumulated gradients average over accumulation_steps.
            loss = loss / accumulation_steps
            loss.backward()

            # Un-scale for logging so the reported loss is per-batch.
            total_train_loss += loss.item() * accumulation_steps
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Flush leftover gradients when the batch count is not a multiple of
        # accumulation_steps (otherwise the tail batches would be dropped).
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        # Capture beta AFTER training, BEFORE validation, so the logged value
        # is the one actually used this epoch (increment=False → no advance).
        current_beta = kl_annealer.get_beta(increment=False)

        # Validation: teacher forcing off, annealer untouched.
        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # Reuse the captured beta — do NOT query the annealer here.
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)

                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # Log cumulative batch count as the "step" column.
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")  # the beta used during this epoch's training

        # Checkpoint only on validation improvement.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss
|
| 722 |
+
|
| 723 |
+
#
|
| 724 |
+
# TRAINING LOOP OVER TOKENIZERS (PATCHED: KLAnnealer reset per tokenizer)
|
| 725 |
+
#
|
| 726 |
+
|
| 727 |
+
# Train one fresh model/optimizer/annealer per tokenizer so runs are independent.
for tokenizer in TOKENIZERS:
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")

    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id

    # Validate token IDs: every id produced must fit the embedding table.
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"

    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,  # same value as pad_token_id above (wrapper property)
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    ########################################################################
    # 1. CREATE A FRESH annealer FOR EVERY TOKENIZER
    ########################################################################

    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        # Ranger21 schedules on optimizer steps, i.e. batches / accumulation.
        num_batches_per_epoch=len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS),
        warmdown_active=False,
    )

    train_dataset = SmilesDataset(train_smiles)
    val_dataset = SmilesDataset(val_smiles)

    # NOTE(review): the collate lambdas close over the loop variable
    # `tokenizer`; safe here because each loader is fully consumed within
    # this same loop iteration.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    # The annealer is stepped once per training batch, so total_steps must be
    # counted in batches (not accumulated optimizer steps).
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * NUM_EPOCHS
    # total_steps = (len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS)) * NUM_EPOCHS
    kl_annealer = KLAnnealer(
        total_steps=total_steps,
        n_cycle=1,      # single annealing cycle over the whole run
        ratio=0.6,      # 60% of the cycle is beta warmup
        mode="linear",  # linear ramp is more predictable than sigmoid
        per_epoch=False
    )

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        kl_annealer=kl_annealer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name
    )
|
| 808 |
+
|
| 809 |
+
#
|
| 810 |
+
# Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
|
| 811 |
+
#
|
| 812 |
+
|
| 813 |
+
def canonicalize_smiles(smiles):
    """Return the canonical isomeric SMILES for *smiles*, or None when RDKit
    cannot parse it."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=True)
|
| 818 |
+
|
| 819 |
+
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """Measure how faithfully the VAE reconstructs its inputs.

    For every batch: encode → reparameterize → greedy decode, then compare
    predictions against the pad-trimmed targets.

    Args:
        model: VAE exposing encode / reparameterize / decode.
        dataloader: yields (input_ids, lengths) batches from collate_fn.
        tokenizer: TokenizerWrapper used to decode ids back to SMILES.
        device: torch device for the forward passes.
        max_length: decoding horizon (FIXED: was hard-coded to 128, ignoring
            this parameter).

    Returns:
        dict with token accuracy, exact-match rate, RDKit validity rate, and
        the generated/target SMILES lists.
    """
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []

    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the sequence at the first pad/eos token (exclusive).
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)

            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            logits = model.decode(z, max_length=max_length, mode="greedy")  # FIXED: honor the max_length parameter
            preds = logits.argmax(dim=-1)

            # Align predictions and targets to a common sequence length.
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]
            input_ids_eval = input_ids[:, :min_len]

            # Token accuracy is computed over non-pad target positions only.
            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()

            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()

                # Trim at the first special token before decoding to text.
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)

                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)

                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)

                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1

    # FIXED: guard all three rates against an empty dataloader — previously
    # only token_acc was guarded and the others raised ZeroDivisionError.
    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    exact_match_rate = exact_matches / total_samples if total_samples > 0 else 0.0
    validity_rate = valid_count / total_samples if total_samples > 0 else 0.0

    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")

    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
|
| 889 |
+
|
| 890 |
+
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Report the fraction of distinct generated strings (uniqueness) and of
    strings not present in the training set (novelty)."""
    total = len(generated_smiles)
    unique = len(set(generated_smiles))
    novel = sum(1 for s in generated_smiles if s not in train_smiles_set)
    # Empty generation lists yield 0.0 for both metrics instead of dividing by zero.
    uniqueness = unique / total if total > 0 else 0.0
    novelty = novel / total if total > 0 else 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({unique}/{total})")
    print(f"Novelty: {novelty:.4f} ({novel}/not in train)")
    return uniqueness, novelty
|
| 899 |
+
|
| 900 |
+
def kl_divergence_from_samples(samples, bins=512):
    """Mean per-dimension KL divergence between the empirical histogram of
    `samples` (array of shape [N, D]) and the standard-normal density.

    Each dimension is histogrammed independently; both densities are clipped
    away from zero so `entropy` stays finite.
    """
    per_dim = []
    for dim in range(samples.shape[1]):
        values = samples[:, dim]
        hist, edges = np.histogram(values, bins=bins, density=True)
        centers = (edges[:-1] + edges[1:]) / 2
        gauss = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * centers**2)
        clipped_hist = np.clip(hist, 1e-10, None)
        clipped_gauss = np.clip(gauss, 1e-10, None)
        per_dim.append(entropy(clipped_hist, clipped_gauss))
    return np.mean(per_dim)
|
| 912 |
+
|
| 913 |
+
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Encode the whole dataloader, sample z per batch, and measure how far
    the aggregate posterior drifts from N(0, I) via per-dimension histogram KL.

    NOTE(review): `latent_dim` is unused here — the latent width comes from
    the encoder output; confirm the parameter can be removed.
    """
    model.eval()
    all_z = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            # Use a sampled z (not mu) so the measured distribution matches
            # what the decoder actually sees during training.
            z = model.reparameterize(mu, logvar)
            all_z.append(z.cpu().numpy())
    all_z = np.concatenate(all_z, axis=0)
    kl_div = kl_divergence_from_samples(all_z, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
|
| 926 |
+
|
| 927 |
+
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Linearly interpolate between latent means of SMILES pairs and report
    the fraction of decoded points that RDKit parses as valid molecules.

    Pairs are formed from even/odd positions of `test_smiles`; `steps`
    interpolation points (including both endpoints) are decoded per pair.
    """
    model.eval()
    # min() keeps random.sample from raising when fewer pairs exist than requested.
    pairs = random.sample(list(zip(test_smiles[::2], test_smiles[1::2])), min(num_pairs, len(test_smiles)//2))
    valid_interps = total_interps = 0

    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b: continue

            enc_a = tokenizer.encode(smiles_a, add_special_tokens=True)
            enc_b = tokenizer.encode(smiles_b, add_special_tokens=True)

            ids_a = torch.tensor([enc_a['input_ids']], device=device)
            ids_b = torch.tensor([enc_b['input_ids']], device=device)
            len_a = torch.tensor([len(enc_a['input_ids'])], device=device)
            len_b = torch.tensor([len(enc_b['input_ids'])], device=device)

            # Interpolate between posterior means (logvar discarded).
            mu_a, _ = model.encode(ids_a, len_a)
            mu_b, _ = model.encode(ids_b, len_b)

            alphas = torch.linspace(0, 1, steps, device=device)
            for alpha in alphas:
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Ensure z_interp keeps a batch dimension [1, latent_dim].
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)

                # NOTE(review): decode runs in "sample" mode but tokens are
                # recovered via argmax over the returned logits — verify these
                # agree with the tokens actually sampled inside decode().
                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                preds = logits.argmax(dim=-1)
                # Drop the batch dimension before decoding to text.
                if preds.dim() > 1:
                    preds = preds[0]
                pred_smiles = tokenizer.decode(preds.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_interps += 1
                total_interps += 1

    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
|
| 967 |
+
|
| 968 |
+
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
    """Draw `num_samples` latent vectors from N(0, I) and decode them to SMILES.

    NOTE(review): the `device=device` default binds the module-level device at
    definition time, and BATCH_SIZE is read from module scope — confirm both
    are defined before this function is first called.
    """
    model.eval()
    generated_smiles = []
    with torch.no_grad():
        for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
            # The final batch may be smaller than BATCH_SIZE.
            current_batch_size = min(BATCH_SIZE, num_samples - len(generated_smiles))
            if current_batch_size <= 0: break
            z = torch.randn(current_batch_size, latent_dim, device=device)
            logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
            # NOTE(review): argmax over returned logits may not equal the
            # tokens sampled inside decode(mode="sample") — verify decode's contract.
            preds = logits.argmax(dim=-1)
            for i in range(current_batch_size):
                pred_ids = preds[i].cpu().tolist()
                smiles = tokenizer.decode(pred_ids, skip_special_tokens=True)
                generated_smiles.append(smiles)
                if len(generated_smiles) >= num_samples: break
    return generated_smiles
|
| 984 |
+
|
| 985 |
+
def measure_inference_throughput(model, tokenizer, test_smiles, device,
                                 max_length=128,
                                 batch_sizes=[1, 4, 8, 16]):
    """
    Benchmark inference speed & peak GPU memory across several batch sizes.
    Returns a JSON-serialisable dict:
        {batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}

    NOTE(review): `batch_sizes` is a mutable default argument (never mutated
    here, but a tuple would be safer). Also, `logits.numel()` includes the
    vocabulary dimension, so "tokens_per_sec" overcounts actual tokens by a
    factor of vocab_size — the ratio is still comparable across batch sizes.
    """
    model.eval()
    results = {}

    for bs in batch_sizes:
        # Each batch size processes 10 batches (bs * 10 samples), so the
        # number of forward passes — not the sample count — is held constant.
        subset = SmilesDataset(test_smiles[:bs * 10])
        loader = DataLoader(
            subset,
            batch_size=bs,
            shuffle=False,
            num_workers=0,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
        )

        total_tokens = 0
        if torch.cuda.is_available():
            # Reset so peak memory reflects this batch size only.
            torch.cuda.reset_peak_memory_stats(device)

        start_time = time.perf_counter()
        with torch.no_grad():
            for input_ids, lengths in loader:
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                mu, logvar = model.encode(input_ids, lengths)
                z = model.reparameterize(mu, logvar)
                logits = model.decode(z, max_length=max_length)
                total_tokens += logits.numel()  # number of float elements (includes vocab dim)
        duration = time.perf_counter() - start_time

        tokens_per_sec = total_tokens / duration
        peak_mem_mb = (
            torch.cuda.max_memory_allocated(device) / (1024 ** 2)
            if torch.cuda.is_available()
            else 0.0
        )

        # Store as plain Python floats so the dict serializes to JSON cleanly.
        results[bs] = {
            "tokens_per_sec": float(tokens_per_sec),
            "peak_mem_mb": float(peak_mem_mb),
        }
        print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")

    return results
|
| 1036 |
+
|
| 1037 |
+
#
|
| 1038 |
+
# FINAL EVALUATION PIPELINE
|
| 1039 |
+
#
|
| 1040 |
+
|
| 1041 |
+
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
    """Run every evaluation metric for one trained model/tokenizer pair and
    dump the combined results to `save_dir/evaluation_results.json`.

    Metrics: reconstruction (token/exact/validity), uniqueness & novelty of
    reconstructions, latent KL vs N(0, I), and interpolation validity.
    FCD and throughput stages are present but disabled (commented out).
    """
    print(f"\n FULL EVALUATION FOR: {tokenizer.name}")

    test_dataset = SmilesDataset(test_smiles)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
                             num_workers=0)

    # 1. Reconstruction
    recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)

    # 2. Uniqueness & Novelty — computed over the RECONSTRUCTED strings.
    train_set = set(train_smiles)
    uniqueness, novelty = compute_uniqueness_and_novelty(recon_metrics['generated_smiles'], train_set)

    # 3. KL Divergence of the aggregate posterior vs N(0, I)
    kl_div = evaluate_latent_kl(model, test_loader, device)

    # 4. Interpolation Validity
    interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)

    # 5. Latent Sampling (for FCD — optional)
    # gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8) # reduce for speed
    # fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None

    # 6. Throughput & Memory
    # throughput = measure_inference_throughput(model, tokenizer, test_loader, device)

    eval_results = {
        **recon_metrics,
        'uniqueness': uniqueness,
        'novelty': novelty,
        'kl_divergence': kl_div,
        'interpolation_validity': interp_validity,
        # 'fcd': fcd_score,
        # 'inference_throughput': throughput,
    }

    eval_path = os.path.join(save_dir, "evaluation_results.json")
    with open(eval_path, "w") as f:
        # default=str stringifies non-JSON types (e.g. numpy floats).
        json.dump(eval_results, f, indent=2, default=str)

    print(f" Evaluation saved to {eval_path}")
    return eval_results
|
| 1085 |
+
|
| 1086 |
+
#
|
| 1087 |
+
# RUN EVALUATION FOR EACH TOKENIZER
|
| 1088 |
+
#
|
| 1089 |
+
|
| 1090 |
+
# Reload each tokenizer's best checkpoint and run the full evaluation suite.
for tokenizer in TOKENIZERS:
    print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
    checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
    if not os.path.exists(checkpoint_path):
        # Skip (don't crash) tokenizers whose training never saved a model.
        print(f"⚠️ Checkpoint not found: {checkpoint_path}")
        continue

    # Rebuild the architecture exactly as in training before loading weights.
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    model = MoleculeVAE(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    # map_location keeps CUDA-saved checkpoints loadable on CPU-only hosts.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    full_evaluation_pipeline(
        model=model,
        tokenizer=tokenizer,
        train_smiles=train_smiles,
        test_smiles=test_smiles,
        device=device,
        save_dir=f"./checkpoints/{tokenizer.name}"
    )

print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
|
benchmark/benchmark_HF_simpler.py
ADDED
|
@@ -0,0 +1,895 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Molecule Tokenizer Benchmark & VAE Training Pipeline
|
| 3 |
+
# PATCHED VERSION — Updated for FastChemTokenizerHF (HF compatible)
|
| 4 |
+
# PATCHED: Simplified KL annealing (linear warmup), updated TFR schedule, updated training loop
|
| 5 |
+
#
|
| 6 |
+
#
|
| 7 |
+
# Step 1.1 — Imports & Reproducibility
|
| 8 |
+
#
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
import random
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
import torch
|
| 16 |
+
import numpy as np
|
| 17 |
+
# Tokenizers
|
| 18 |
+
from transformers import AutoTokenizer
|
| 19 |
+
from FastChemTokenizerHF import FastChemTokenizer
|
| 20 |
+
# Optional: for progress bars
|
| 21 |
+
from tqdm import tqdm
|
| 22 |
+
from rdkit import Chem
|
| 23 |
+
from sklearn.model_selection import train_test_split
|
| 24 |
+
import torch.nn as nn
|
| 25 |
+
import torch.nn.functional as F
|
| 26 |
+
from ranger21 import Ranger21
|
| 27 |
+
from torch.utils.data import DataLoader, Dataset
|
| 28 |
+
from scipy.stats import entropy
|
| 29 |
+
import json
|
| 30 |
+
import math
|
| 31 |
+
from typing import Optional, Tuple, Union
|
| 32 |
+
from rdkit import RDLogger
|
| 33 |
+
RDLogger.DisableLog('rdApp.*')
|
| 34 |
+
# Set seeds for reproducibility
|
| 35 |
+
def set_seed(seed=42):
    """Seed every RNG this pipeline touches (python, numpy, torch, hashing)
    and pin cuDNN to deterministic kernels for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Deterministic convolution algorithms; disable autotuning heuristics.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
|
| 43 |
+
set_seed(42)
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
#
# Step 1.2 — Load & Preprocess SMILES Corpus
#
data_path = "./data/chunk_1smi.csv"
df = pd.read_csv(data_path)
# Replace df with a 10% sample for prototyping (fixed seed keeps the subset stable).
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)
print(f"Prototype size: {len(df)} rows")
if 'SMILES' not in df.columns:
    raise ValueError("Expected column 'SMILES' in CSV")
# Drop NaN rows before validation; strings are assumed pre-canonicalized.
smiles_list = df['SMILES'].dropna().tolist()
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")
|
| 59 |
+
# Validate with RDKit
|
| 60 |
+
def is_valid_smiles(smiles):
    """True when RDKit can parse *smiles* into a molecule object."""
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None
|
| 62 |
+
print("Validating SMILES with RDKit...")
# Keep only molecules RDKit can parse; order is preserved.
valid_mask = [is_valid_smiles(s) for s in tqdm(smiles_list)]
smiles_list = [s for s, valid in zip(smiles_list, valid_mask) if valid]
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
#
# Step 1.3 — Train/Val/Test Split (80/10/10)
#
# 20% held out first, then split evenly into validation and test halves.
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)
print(f"Train: {len(train_smiles)}")
print(f"Val: {len(val_smiles)}")
print(f"Test: {len(test_smiles)}")
# Cache splits as newline-delimited text so later runs can skip preprocessing.
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
for split_name, smiles in splits.items():
    with open(f"./data/{split_name}_smiles.txt", "w") as f:
        f.write("\n".join(smiles))
|
| 79 |
+
#
|
| 80 |
+
# Step 1.4 — Tokenizer Wrapper (Simplified for HF compatibility)
|
| 81 |
+
#
|
| 82 |
+
class TokenizerWrapper:
    """Uniform facade over heterogeneous tokenizers (HF AutoTokenizer or
    FastChemTokenizerHF) so the rest of the pipeline can treat them alike."""

    def __init__(self, tokenizer, name,
                 bos_token="<s>", eos_token="</s>",
                 pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name
        # Register special tokens only on tokenizers that support it.
        add_specials = getattr(tokenizer, "add_special_tokens", None)
        if callable(add_specials):
            try:
                add_specials({
                    "bos_token": bos_token,
                    "eos_token": eos_token,
                    "pad_token": pad_token,
                    "unk_token": unk_token,
                })
            except NotImplementedError:
                # FastChemTokenizerHF already defines these tokens internally.
                pass

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize one SMILES string; returns the tokenizer's dict output."""
        return self.tokenizer(
            smiles,
            add_special_tokens=add_special_tokens,
            return_attention_mask=False,
            return_tensors=None
        )

    def decode(self, token_ids, skip_special_tokens=True):
        """Inverse of encode: token ids back to a SMILES string."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def __len__(self):
        return len(self.tokenizer)

    def get_vocab(self):
        return self.tokenizer.get_vocab()

    # Special-token ids are delegated straight to the wrapped tokenizer.
    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
|
| 125 |
+
#
# Step 1.5 — Initialize Tokenizers
#
# Both wrappers share the same special-token spelling.
_SPECIALS = dict(bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tok2_fast = FastChemTokenizer.from_pretrained("../smitok")
tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", **_SPECIALS)
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizerHF", **_SPECIALS)
TOKENIZERS = [tokenizer1, tokenizer2]
|
| 133 |
+
#
# Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
#
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
    """Benchmark one wrapped tokenizer on a list of SMILES strings.

    Args:
        tokenizer: a TokenizerWrapper-like object (``encode``/``decode``,
            ``name``, ``unk_token_id``, ``__len__``).
        smiles_sample: list of SMILES strings; capped at 10 000 entries.
        encode_only: when True, skip the decode/round-trip benchmark.

    Returns:
        dict of metrics: vocab size, average tokens per molecule,
        chars-per-token compression ratio, %UNK, encode throughput and
        (unless encode_only) decode throughput + reconstruction accuracy.
    """
    V = len(tokenizer)
    sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample
    if not sample:
        # Fail loudly instead of producing NaN means / ZeroDivisionError below.
        raise ValueError("benchmark_tokenizer received an empty sample")
    encode_times, token_counts, char_counts = [], [], []
    unk_counts, total_tokens = 0, 0
    # Hoisted out of the loop (was looked up once per SMILES); use the
    # wrapper's own property for consistency with the rest of the file.
    unk_id = tokenizer.unk_token_id
    for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
        char_counts.append(len(smiles))
        start = time.perf_counter()
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        end = time.perf_counter()
        encode_times.append(end - start)
        input_ids = enc['input_ids']
        token_counts.append(len(input_ids))
        total_tokens += len(input_ids)
        if unk_id is not None:  # guard: some tokenizers define no UNK token
            unk_counts += input_ids.count(unk_id)
    L_bar = np.mean(token_counts)                      # mean tokens per molecule
    C = np.mean(char_counts) / L_bar                   # chars per token (compression)
    U = unk_counts / total_tokens if total_tokens > 0 else 0.0
    # Bug #4 fix made explicit: epsilon prevents division by zero when the
    # timer resolution rounds very fast encodes down to 0.
    Tenc = len(sample) / max(sum(encode_times), 1e-12)
    metrics = {
        'vocab_size': V,
        'avg_tokens_per_mol': L_bar,
        'compression_ratio': C,
        'percent_unknown': U * 100,
        'encode_throughput_smiles_per_sec': Tenc,
    }
    if encode_only:
        return metrics
    decode_times, reconstruction_ok = [], 0
    for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
        enc = tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = enc['input_ids']
        start = time.perf_counter()
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        end = time.perf_counter()
        decode_times.append(end - start)
        if decoded == smiles:
            reconstruction_ok += 1
    Tdec = len(sample) / max(sum(decode_times), 1e-12)  # same epsilon guard
    recon_acc = reconstruction_ok / len(sample)
    metrics.update({
        'decode_throughput_smiles_per_sec': Tdec,
        'decode_reconstruction_accuracy': recon_acc * 100,
    })
    return metrics
|
| 182 |
+
#
# Step 1.7 — Run Benchmark
#
benchmark_sample = train_smiles
results = []
for tokenizer in TOKENIZERS:
    print(f"\n=== Benchmarking {tokenizer.name} ===")
    metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
    metrics['tokenizer'] = tokenizer.name
    results.append(metrics)
    # Pretty-print every metric except the tokenizer label itself.
    for k, v in metrics.items():
        if k == 'tokenizer':
            continue
        if isinstance(v, float):
            print(f"{k:35s}: {v:.4f}")
        else:
            print(f"{k:35s}: {v}")
df_results = pd.DataFrame(results)
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
|
| 198 |
+
#
|
| 199 |
+
# Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
|
| 200 |
+
#
|
| 201 |
+
import torch
|
| 202 |
+
import torch.nn as nn
|
| 203 |
+
import torch.nn.functional as F
|
| 204 |
+
from typing import Optional, Tuple, Union
|
| 205 |
+
import torch
|
| 206 |
+
import torch.nn as nn
|
| 207 |
+
import torch.nn.functional as F
|
| 208 |
+
from typing import Tuple, Optional
|
| 209 |
+
class MoleculeVAE(nn.Module):
    """
    Optimized MoleculeVAE with:
    - Bidirectional encoder (restored)
    - Proper latent2hidden + latent2cell (restored)
    - Adjustable dropout for small dataset
    - Attention pooling option
    - Quantization-ready hooks

    Sequence-to-sequence VAE over token ids: a bidirectional LSTM encoder
    pools the sequence into (mu, logvar), and an autoregressive LSTM
    decoder regenerates the token sequence from the sampled latent z.
    """

    def __init__(self,
                 vocab_size: int,
                 embed_dim: int = 64,
                 hidden_dim: int = 128,
                 latent_dim: int = 64,
                 num_layers: int = 2,
                 pad_token_id: int = 0,
                 bos_token_id: int = 1,
                 eos_token_id: int = 2,
                 dropout: float = 0.2,
                 use_attention: bool = True,
                 quantize_ready: bool = False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.use_attention = use_attention
        # Shared embedding (used by both encoder and decoder)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        # Bidirectional encoder — output feature size is hidden_dim * 2
        self.encoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )
        # Attention pooling (optional)
        if use_attention:
            self.attention = nn.MultiheadAttention(
                hidden_dim * 2, num_heads=4, dropout=dropout, batch_first=True
            )
            self.attention_linear = nn.Linear(hidden_dim * 2, 1)
        # encoder_norm is applied unconditionally in encode(), so it lives
        # outside the attention branch.
        self.encoder_norm = nn.LayerNorm(hidden_dim * 2)
        # Latent bottleneck: 2*hidden -> latent mean / log-variance
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)
        # Decoder init (restored): latent vector seeds h0/c0 of every layer
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)
        # Decoder (unidirectional, autoregressive)
        self.decoder_lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0
        )
        self.decoder_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        # Weight tying — only shape-compatible when embed_dim == hidden_dim
        # (with the defaults 64 vs 128 this branch is NOT taken).
        if embed_dim == hidden_dim:
            self.fc_out.weight = self.embedding.weight
        self.dropout = nn.Dropout(dropout)
        # Quantization stubs — identity no-ops unless quantize_ready=True
        if quantize_ready:
            self.quant = torch.quantization.QuantStub()
            self.dequant = torch.quantization.DeQuantStub()
        else:
            self.quant = self.dequant = nn.Identity()
        self._init_weights()

    def _init_weights(self):
        """Xavier for matrices, small normal for vectors, zeros for biases."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.ndim >= 2:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.normal_(param, 0, 0.01)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def _pool_sequence(self, packed_output, lengths):
        """Collapse the packed encoder output to one vector per sequence.

        Uses attention pooling when enabled, otherwise a length-masked
        mean. NOTE(review): the attention branch does not pass a padding
        mask, so padded positions can attend/be attended to — confirm this
        is intended.
        """
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        if self.use_attention:
            attn_out, _ = self.attention(output, output, output)
            weights = torch.softmax(self.attention_linear(attn_out), dim=1)
            pooled = (weights * output).sum(dim=1)
        else:
            # mean pooling with mask
            batch_size, max_len, _ = output.size()
            mask = torch.arange(max_len, device=output.device).expand(batch_size, max_len) < lengths.unsqueeze(1)
            masked_output = output * mask.unsqueeze(-1).float()
            pooled = masked_output.sum(dim=1) / lengths.unsqueeze(-1).float()
        return pooled

    def encode(self, x: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Map token ids [B, L] + true lengths [B] to (mu, logvar) [B, latent_dim]."""
        x = self.quant(x)
        embedded = self.dropout(self.embedding(x))
        # lengths must be on CPU for pack_padded_sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.encoder_lstm(packed)
        h = self._pool_sequence(packed_out, lengths)
        h = self.encoder_norm(h)
        mu, logvar = self.fc_mu(h), self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """Sample z ~ N(mu, sigma^2) while training; return mu at eval time."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def _init_decoder_state(self, z: torch.Tensor):
        """Project the latent vector into per-layer (h0, c0) decoder states."""
        batch_size = z.size(0)
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        return h0, c0

    def decode(self, z: torch.Tensor, max_length: int = 64, mode: str = "greedy", temperature: float = 1.0):
        """Autoregressively decode logits from latent z.

        Starts from BOS, feeds back its own predictions, and stops early
        once every sequence has emitted EOS (finished rows are fed PAD).
        Returns logits [B, T, V] with T <= max_length.
        NOTE(review): in "sample" mode the fed-back tokens are sampled but
        the returned logits are later argmax'ed by callers — the decoded
        string may differ from the sampled trajectory; confirm intent.
        """
        batch_size = z.size(0)
        device = z.device
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        input_ids = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
        logits_list = []
        for _ in range(max_length):
            embedded = self.embedding(input_ids)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)
            if mode == "greedy":
                next_tokens = logit.argmax(dim=-1)
            elif mode == "sample":
                probs = F.softmax(logit.squeeze(1) / temperature, dim=-1)
                next_tokens = torch.multinomial(probs, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")
            just_finished = (next_tokens.squeeze(-1) == self.eos_token_id)
            finished |= just_finished
            # Once a row has finished, keep feeding PAD so it stays inert.
            next_tokens = torch.where(
                finished.unsqueeze(-1),
                torch.tensor(self.pad_token_id, device=device),
                next_tokens
            )
            input_ids = next_tokens
            if finished.all():
                break
        return self.dequant(torch.cat(logits_list, dim=1))

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor,
                target_seq: Optional[torch.Tensor] = None,
                teacher_forcing_ratio: float = 0.0,
                temperature: float = 1.0):
        """Full VAE pass; returns (logits, mu, logvar).

        Uses teacher forcing only while training with a target sequence
        and a positive ratio; otherwise free-runs the decoder.
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)
        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            return self._forward_teacher_forcing(z, target_seq, teacher_forcing_ratio), mu, logvar
        else:
            max_len = target_seq.size(1) if target_seq is not None else 64
            return self.decode(z, max_length=max_len, temperature=temperature), mu, logvar

    def _forward_teacher_forcing(self, z: torch.Tensor, target_seq: torch.Tensor, teacher_forcing_ratio: float):
        """Decode with per-step coin-flip teacher forcing.

        Output covers steps 1..seq_len-1 (targets shifted by one), so the
        returned logits are [B, seq_len-1, V].
        """
        batch_size, seq_len = target_seq.size()
        h0, c0 = self._init_decoder_state(z)
        hidden = (h0, c0)
        logits_list = []
        input_token = target_seq[:, 0:1]
        for t in range(1, seq_len):
            embedded = self.embedding(input_token)
            output, hidden = self.decoder_lstm(embedded, hidden)
            output = self.decoder_norm(output)
            logit = self.fc_out(output)
            logits_list.append(logit)
            # One global coin flip per step (not per sample).
            if torch.rand(1).item() < teacher_forcing_ratio:
                input_token = target_seq[:, t:t+1]
            else:
                input_token = logit.argmax(dim=-1)
        return torch.cat(logits_list, dim=1)
|
| 383 |
+
|
| 384 |
+
# ============================
# Utility: Simple Linear KL Warmup (PATCHED IN)
# ============================
def linear_kl_beta(global_step: int, warmup_steps: int, start: float = 0.0, end: float = 1.0):
    """Linear schedule from start → end over warmup_steps. Caps at end."""
    if warmup_steps <= 0:
        # Degenerate schedule: jump straight to the final value.
        return float(end)
    progress = min(1.0, float(global_step) / float(max(1, warmup_steps)))
    return float(start + (end - start) * progress)
|
| 393 |
+
|
| 394 |
+
#
|
| 395 |
+
# Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
|
| 396 |
+
#
|
| 397 |
+
# PATCH 2: Fix VAE Loss Function - Ensure beta is properly applied
|
| 398 |
+
# Replace the existing vae_loss function:
|
| 399 |
+
def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
|
| 400 |
+
# 1. align lengths
|
| 401 |
+
max_len = max(logits.size(1), targets.size(1))
|
| 402 |
+
if logits.size(1) < max_len:
|
| 403 |
+
logits = F.pad(logits, (0, 0, 0, max_len - logits.size(1)))
|
| 404 |
+
if targets.size(1) < max_len:
|
| 405 |
+
targets = F.pad(targets, (0, max_len - targets.size(1)), value=pad_token_id)
|
| 406 |
+
logits_flat = logits.view(-1, logits.size(-1)) # [B*L, V]
|
| 407 |
+
targets_flat = targets.reshape(-1) # [B*L]
|
| 408 |
+
mask = (targets_flat != pad_token_id).float()
|
| 409 |
+
ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
|
| 410 |
+
mask_sum = mask.sum()
|
| 411 |
+
ce_loss = (ce_loss * mask).sum() / (mask_sum + 1e-8)
|
| 412 |
+
# FIXED: Raw KL loss computation
|
| 413 |
+
kl_loss_raw = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)
|
| 414 |
+
# Apply mask to KL loss if needed (but typically KL is per-sample)
|
| 415 |
+
kl_loss = kl_loss_raw.mean()
|
| 416 |
+
# CRITICAL FIX: Apply beta scaling correctly
|
| 417 |
+
total_loss = ce_loss + beta * kl_loss
|
| 418 |
+
return total_loss, ce_loss, kl_loss
|
| 419 |
+
|
| 420 |
+
# ============================
# Teacher Forcing Ratio Schedule (PATCHED IN)
# ============================
def get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3):
    """Linear schedule: 1.0 until warmup_epochs, then linear decay to min_tfr."""
    warmup_epochs = int(num_epochs * warmup_fraction)
    if epoch < warmup_epochs:
        return 1.0
    decay_span = max(1, num_epochs - warmup_epochs)
    progress = (epoch - warmup_epochs) / decay_span
    # Never decay below the floor.
    return max(min_tfr, 1.0 - (1.0 - min_tfr) * progress)
|
| 431 |
+
|
| 432 |
+
# REMOVED: KLAnnealer class (PATCHED OUT)
|
| 433 |
+
|
| 434 |
+
#
# Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
#
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize a batch of SMILES and pad to a common length.

    Args:
        batch: list of SMILES strings.
        tokenizer: TokenizerWrapper-like object whose ``encode`` returns a
            dict with an ``input_ids`` list and whose ``.tokenizer``
            exposes ``pad_token_id``.
        max_length: hard cap; longer encodings are truncated.

    Returns:
        (input_ids LongTensor [B, max_len], true unpadded lengths
        LongTensor [B]) — lengths are what pack_padded_sequence needs.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]
    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic, not hard-coded
    padded, lengths = [], []
    for ids in input_ids:
        # BUG FIX: record the TRUE sequence length BEFORE padding. The old
        # code measured len(ids) after padding, so every length equaled
        # max_len, which silently broke pack_padded_sequence downstream.
        true_len = min(len(ids), max_len)
        lengths.append(true_len)
        ids = ids[:max_len]
        if len(ids) < max_len:
            ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
|
| 452 |
+
#
# Step 2.5 — Dataset & DataLoader
#
class SmilesDataset(Dataset):
    """Thin Dataset view over an in-memory list of SMILES strings.

    Items are returned as raw strings; tokenization and padding happen in
    the DataLoader's collate_fn.
    """

    def __init__(self, smiles_list):
        # smiles_list: list[str], kept by reference (not copied)
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
|
| 462 |
+
|
| 463 |
+
# ============================
# Training Loop (PATCHED: Uses linear_kl_beta)
# ============================
LEARNING_RATE = 1e-5     # base LR handed to the Ranger21 optimizer
BATCH_SIZE = 16          # per-step batch size (effective batch = BATCH_SIZE * ACCUMULATION_STEPS)
ACCUMULATION_STEPS = 4   # gradient-accumulation steps per optimizer.step()
NUM_EPOCHS = 1
MAX_SEQ_LEN = 128        # hard cap on tokenized SMILES length (see collate_fn)
KL_WARMUP_FRAC = 0.1  # PATCHED: New parameter for KL warmup fraction (fraction of total steps)
|
| 472 |
+
|
| 473 |
+
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default",
    warmup_steps=100,  # PATCHED: New parameter for warmup steps
):
    """Train a MoleculeVAE with gradient accumulation and linear KL warmup.

    Writes a per-epoch CSV log to ``save_dir`` and checkpoints the model
    whenever validation loss improves. Returns the best validation loss.
    Note: ``global_step`` counts optimizer steps (not batches), so the KL
    beta warms up over optimizer updates.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")
    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')
    global_step = 0  # PATCHED: Initialize global step counter

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            # Teacher forcing decays with epoch; see get_teacher_forcing_ratio
            tfr = get_teacher_forcing_ratio(epoch, num_epochs, min_tfr=0.6, warmup_fraction=0.3)

            # Autoencoding objective: targets are the inputs themselves
            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)

            beta = linear_kl_beta(global_step, warmup_steps)  # PATCHED: Use linear_kl_beta
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale down so accumulated gradients match a full-batch step
            loss = loss / accumulation_steps
            loss.backward()

            # Undo the scaling for logging purposes
            total_train_loss += loss.item() * accumulation_steps
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1  # PATCHED: Increment global step

        # Flush the leftover partial accumulation window, if any
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1  # PATCHED: Increment global step

        current_beta = linear_kl_beta(global_step, warmup_steps)  # PATCHED: Get current beta after training

        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # No teacher forcing at validation time
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)  # PATCHED: Use current_beta
                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # current_step is the cumulative number of *batches* seen (log only)
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")

        # Checkpoint only on validation improvement
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss  # PATCHED: Return best_val_loss
|
| 569 |
+
|
| 570 |
+
#
# TRAINING LOOP OVER TOKENIZERS (PATCHED: Uses linear_kl_beta, calculates warmup_steps)
#
# For each wrapped tokenizer: build a fresh model + optimizer + loaders,
# then train with train_vae. Ranger21 is a third-party optimizer.
for tokenizer in TOKENIZERS:
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    # Validate token IDs
    # Sanity check on one sample: every id must fit inside the embedding table
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"
    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)
    ########################################################################
    # 1. CREATE A FRESH optimizer FOR EVERY TOKENIZER
    ########################################################################
    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,  # Keep Ranger21's LR warmup as-is
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        num_batches_per_epoch=len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS),
        warmdown_active=False,
    )
    train_dataset = SmilesDataset(train_smiles)
    val_dataset = SmilesDataset(val_smiles)
    # NOTE: the lambdas close over the loop variable `tokenizer` (late
    # binding) — safe here only because each loader is consumed within the
    # same loop iteration.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * NUM_EPOCHS
    # Calculate warmup steps based on total steps and fraction
    warmup_steps = int(total_steps * KL_WARMUP_FRAC)  # PATCHED: Calculate warmup steps

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name,
        warmup_steps=warmup_steps,  # PATCHED: Pass warmup_steps
    )
|
| 637 |
+
|
| 638 |
+
#
# Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
#
def canonicalize_smiles(smiles):
    """Return RDKit-canonical isomeric SMILES, or None when unparsable."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=True)
|
| 646 |
+
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """Measure how well the VAE reconstructs its inputs.

    Reports token-level accuracy (pad-masked), exact string match rate,
    and RDKit validity rate of the reconstructions. Also returns the raw
    decoded/target SMILES lists for downstream analysis.
    """
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []
    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the id list at the first PAD/EOS (exclusive).
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)
            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            # Greedy decode for reconstruction (deterministic)
            logits = model.decode(z, max_length=128, mode="greedy")  # FIXED #7 for reconstruction
            preds = logits.argmax(dim=-1)
            # FIXED: Align logits and targets to same sequence length
            # (decode may stop early at EOS, so T can be < input length)
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]  # trim predictions
            input_ids_eval = input_ids[:, :min_len]  # trim targets
            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()
            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()
                # FIXED BUG #6: Trim before decode
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)
                # skip_special_tokens=False because trimming already removed them
                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)
                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)
                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1
    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    exact_match_rate = exact_matches / total_samples
    validity_rate = valid_count / total_samples
    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")
    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
|
| 702 |
+
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Report the fraction of unique strings and of strings absent from training.

    Returns (uniqueness, novelty); both are 0.0 for an empty input list.
    """
    n_total = len(generated_smiles)
    n_unique = len(set(generated_smiles))
    n_novel = sum(1 for s in generated_smiles if s not in train_smiles_set)
    uniqueness = n_unique / n_total if n_total > 0 else 0.0
    novelty = n_novel / n_total if n_total > 0 else 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({n_unique}/{n_total})")
    print(f"Novelty: {novelty:.4f} ({n_novel}/not in train)")
    return uniqueness, novelty
|
| 711 |
+
def kl_divergence_from_samples(samples, bins=512):
|
| 712 |
+
dim_kls = []
|
| 713 |
+
for d in range(samples.shape[1]):
|
| 714 |
+
data = samples[:, d]
|
| 715 |
+
hist, bin_edges = np.histogram(data, bins=bins, density=True)
|
| 716 |
+
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
|
| 717 |
+
norm_pdf = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * bin_centers**2)
|
| 718 |
+
hist = np.clip(hist, 1e-10, None)
|
| 719 |
+
norm_pdf = np.clip(norm_pdf, 1e-10, None)
|
| 720 |
+
kl = entropy(hist, norm_pdf)
|
| 721 |
+
dim_kls.append(kl)
|
| 722 |
+
return np.mean(dim_kls)
|
| 723 |
+
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Collect latents for the whole dataloader and score them against N(0,1).

    NOTE(review): the ``latent_dim`` parameter is never used — the latent
    width is taken from whatever model.encode returns; consider removing.
    """
    model.eval()
    all_z = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            # At eval time reparameterize returns mu (no sampling noise)
            z = model.reparameterize(mu, logvar)
            all_z.append(z.cpu().numpy())
    all_z = np.concatenate(all_z, axis=0)
    kl_div = kl_divergence_from_samples(all_z, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
|
| 736 |
+
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Fraction of RDKit-valid molecules along latent interpolation paths.

    Pairs consecutive test SMILES (even/odd indices), linearly
    interpolates between their encoder means, decodes each point, and
    checks validity with RDKit.
    """
    model.eval()
    # zip(test_smiles[::2], test_smiles[1::2]) yields len//2 pairs
    pairs = random.sample(list(zip(test_smiles[::2], test_smiles[1::2])), min(num_pairs, len(test_smiles)//2))
    valid_interps = total_interps = 0
    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b: continue
            enc_a = tokenizer.encode(smiles_a, add_special_tokens=True)
            enc_b = tokenizer.encode(smiles_b, add_special_tokens=True)
            ids_a = torch.tensor([enc_a['input_ids']], device=device)
            ids_b = torch.tensor([enc_b['input_ids']], device=device)
            len_a = torch.tensor([len(enc_a['input_ids'])], device=device)
            len_b = torch.tensor([len(enc_b['input_ids'])], device=device)
            mu_a, _ = model.encode(ids_a, len_a)
            mu_b, _ = model.encode(ids_b, len_b)
            alphas = torch.linspace(0, 1, steps, device=device)
            for alpha in alphas:
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Ensure z_interp maintains batch dimension [1, latent_dim]
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)
                # NOTE(review): decode samples tokens internally, but the
                # returned logits are argmax'ed below — the decoded string
                # may not match the sampled trajectory. Confirm intent.
                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                preds = logits.argmax(dim=-1)
                # Handle batch dimension properly
                if preds.dim() > 1:
                    preds = preds[0]  # Take first (and only) batch item
                pred_smiles = tokenizer.decode(preds.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_interps += 1
                total_interps += 1
    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
|
| 769 |
+
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
|
| 770 |
+
model.eval()
|
| 771 |
+
generated_smiles = []
|
| 772 |
+
with torch.no_grad():
|
| 773 |
+
for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
|
| 774 |
+
current_batch_size = min(BATCH_SIZE, num_samples - len(generated_smiles))
|
| 775 |
+
if current_batch_size <= 0: break
|
| 776 |
+
z = torch.randn(current_batch_size, latent_dim, device=device)
|
| 777 |
+
logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
|
| 778 |
+
preds = logits.argmax(dim=-1)
|
| 779 |
+
for i in range(current_batch_size):
|
| 780 |
+
pred_ids = preds[i].cpu().tolist()
|
| 781 |
+
smiles = tokenizer.decode(pred_ids, skip_special_tokens=True)
|
| 782 |
+
generated_smiles.append(smiles)
|
| 783 |
+
if len(generated_smiles) >= num_samples: break
|
| 784 |
+
return generated_smiles
|
| 785 |
+
def measure_inference_throughput(model, tokenizer, test_smiles, device,
|
| 786 |
+
max_length=128,
|
| 787 |
+
batch_sizes=[1, 4, 8, 16]):
|
| 788 |
+
"""
|
| 789 |
+
Benchmark inference speed & peak GPU memory across several batch sizes.
|
| 790 |
+
Returns a JSON-serialisable dict:
|
| 791 |
+
{batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}
|
| 792 |
+
"""
|
| 793 |
+
model.eval()
|
| 794 |
+
results = {}
|
| 795 |
+
for bs in batch_sizes:
|
| 796 |
+
# Build a small fixed subset so every BS processes the same #samples
|
| 797 |
+
subset = SmilesDataset(test_smiles[:bs * 10])
|
| 798 |
+
loader = DataLoader(
|
| 799 |
+
subset,
|
| 800 |
+
batch_size=bs,
|
| 801 |
+
shuffle=False,
|
| 802 |
+
num_workers=0,
|
| 803 |
+
collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
|
| 804 |
+
)
|
| 805 |
+
total_tokens = 0
|
| 806 |
+
if torch.cuda.is_available():
|
| 807 |
+
torch.cuda.reset_peak_memory_stats(device)
|
| 808 |
+
start_time = time.perf_counter()
|
| 809 |
+
with torch.no_grad():
|
| 810 |
+
for input_ids, lengths in loader:
|
| 811 |
+
input_ids, lengths = input_ids.to(device), lengths.to(device)
|
| 812 |
+
mu, logvar = model.encode(input_ids, lengths)
|
| 813 |
+
z = model.reparameterize(mu, logvar)
|
| 814 |
+
logits = model.decode(z, max_length=max_length)
|
| 815 |
+
total_tokens += logits.numel() # number of float elements
|
| 816 |
+
duration = time.perf_counter() - start_time
|
| 817 |
+
tokens_per_sec = total_tokens / duration
|
| 818 |
+
peak_mem_mb = (
|
| 819 |
+
torch.cuda.max_memory_allocated(device) / (1024 ** 2)
|
| 820 |
+
if torch.cuda.is_available()
|
| 821 |
+
else 0.0
|
| 822 |
+
)
|
| 823 |
+
# Store as plain Python floats
|
| 824 |
+
results[bs] = {
|
| 825 |
+
"tokens_per_sec": float(tokens_per_sec),
|
| 826 |
+
"peak_mem_mb": float(peak_mem_mb),
|
| 827 |
+
}
|
| 828 |
+
print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")
|
| 829 |
+
return results
|
| 830 |
+
#
|
| 831 |
+
# FINAL EVALUATION PIPELINE
|
| 832 |
+
#
|
| 833 |
+
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
|
| 834 |
+
print(f"\n FULL EVALUATION FOR: {tokenizer.name}")
|
| 835 |
+
test_dataset = SmilesDataset(test_smiles)
|
| 836 |
+
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
|
| 837 |
+
collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
|
| 838 |
+
num_workers=0)
|
| 839 |
+
# 1. Reconstruction
|
| 840 |
+
recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)
|
| 841 |
+
# 2. Uniqueness & Novelty
|
| 842 |
+
train_set = set(train_smiles)
|
| 843 |
+
uniqueness, novelty = compute_uniqueness_and_novelty(recon_metrics['generated_smiles'], train_set)
|
| 844 |
+
# 3. KL Divergence
|
| 845 |
+
kl_div = evaluate_latent_kl(model, test_loader, device)
|
| 846 |
+
# 4. Interpolation Validity
|
| 847 |
+
interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)
|
| 848 |
+
# 5. Latent Sampling (for FCD — optional)
|
| 849 |
+
# gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8) # reduce for speed
|
| 850 |
+
# fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None
|
| 851 |
+
# 6. Throughput & Memory
|
| 852 |
+
# throughput = measure_inference_throughput(model, tokenizer, test_loader, device)
|
| 853 |
+
eval_results = {
|
| 854 |
+
**recon_metrics,
|
| 855 |
+
'uniqueness': uniqueness,
|
| 856 |
+
'novelty': novelty,
|
| 857 |
+
'kl_divergence': kl_div,
|
| 858 |
+
'interpolation_validity': interp_validity,
|
| 859 |
+
# 'fcd': fcd_score,
|
| 860 |
+
# 'inference_throughput': throughput,
|
| 861 |
+
}
|
| 862 |
+
eval_path = os.path.join(save_dir, "evaluation_results.json")
|
| 863 |
+
with open(eval_path, "w") as f:
|
| 864 |
+
json.dump(eval_results, f, indent=2, default=str)
|
| 865 |
+
print(f" Evaluation saved to {eval_path}")
|
| 866 |
+
return eval_results
|
| 867 |
+
#
|
| 868 |
+
# RUN EVALUATION FOR EACH TOKENIZER
|
| 869 |
+
#
|
| 870 |
+
for tokenizer in TOKENIZERS:
|
| 871 |
+
print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
|
| 872 |
+
checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
|
| 873 |
+
if not os.path.exists(checkpoint_path):
|
| 874 |
+
print(f"⚠️ Checkpoint not found: {checkpoint_path}")
|
| 875 |
+
continue
|
| 876 |
+
vocab_size = len(tokenizer)
|
| 877 |
+
pad_token_id = tokenizer.tokenizer.pad_token_id
|
| 878 |
+
model = MoleculeVAE(
|
| 879 |
+
vocab_size=vocab_size,
|
| 880 |
+
pad_token_id=pad_token_id,
|
| 881 |
+
bos_token_id=tokenizer.bos_token_id,
|
| 882 |
+
eos_token_id=tokenizer.eos_token_id
|
| 883 |
+
).to(device)
|
| 884 |
+
checkpoint = torch.load(checkpoint_path, map_location=device)
|
| 885 |
+
model.load_state_dict(checkpoint['model_state_dict'])
|
| 886 |
+
model.eval()
|
| 887 |
+
full_evaluation_pipeline(
|
| 888 |
+
model=model,
|
| 889 |
+
tokenizer=tokenizer,
|
| 890 |
+
train_smiles=train_smiles,
|
| 891 |
+
test_smiles=test_smiles,
|
| 892 |
+
device=device,
|
| 893 |
+
save_dir=f"./checkpoints/{tokenizer.name}"
|
| 894 |
+
)
|
| 895 |
+
print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
|
benchmark/benchmark_legacy.py
ADDED
|
@@ -0,0 +1,1039 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Molecule Tokenizer Benchmark & VAE Training Pipeline
|
| 3 |
+
# PATCHED VERSION — All 5 critical bugs fixed + KL Beta Logging Clarity
|
| 4 |
+
#
|
| 5 |
+
|
| 6 |
+
#
|
| 7 |
+
# Step 1.1 — Imports & Reproducibility
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
import random
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import torch
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
# Tokenizers
|
| 20 |
+
from transformers import AutoTokenizer
|
| 21 |
+
from FastChemTokenizer import FastChemTokenizer # assuming it's in PYTHONPATH
|
| 22 |
+
# Optional: for progress bars
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
from rdkit import Chem
|
| 25 |
+
from sklearn.model_selection import train_test_split
|
| 26 |
+
import torch.nn as nn
|
| 27 |
+
import torch.nn.functional as F
|
| 28 |
+
from ranger21 import Ranger21
|
| 29 |
+
from torch.utils.data import DataLoader, Dataset
|
| 30 |
+
from scipy.stats import entropy
|
| 31 |
+
import json
|
| 32 |
+
import math
|
| 33 |
+
|
| 34 |
+
from rdkit import RDLogger
|
| 35 |
+
RDLogger.DisableLog('rdApp.*')
|
| 36 |
+
# Set seeds for reproducibility
|
| 37 |
+
def set_seed(seed=42):
|
| 38 |
+
torch.manual_seed(seed)
|
| 39 |
+
torch.cuda.manual_seed_all(seed)
|
| 40 |
+
np.random.seed(seed)
|
| 41 |
+
random.seed(seed)
|
| 42 |
+
os.environ['PYTHONHASHSEED'] = str(seed)
|
| 43 |
+
torch.backends.cudnn.deterministic = True
|
| 44 |
+
torch.backends.cudnn.benchmark = False
|
| 45 |
+
|
| 46 |
+
set_seed(42)
|
| 47 |
+
|
| 48 |
+
# Device setup
|
| 49 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 50 |
+
print(f"Using device: {device}")
|
| 51 |
+
|
| 52 |
+
#
|
| 53 |
+
# Step 1.2 — Load & Preprocess SMILES Corpus
|
| 54 |
+
#
|
| 55 |
+
|
| 56 |
+
data_path = "../data/sample_1k_smi_42.csv"
|
| 57 |
+
df = pd.read_csv(data_path)
|
| 58 |
+
|
| 59 |
+
if 'SMILES' not in df.columns:
|
| 60 |
+
raise ValueError("Expected column 'SMILES' in CSV")
|
| 61 |
+
|
| 62 |
+
smiles_list = df['SMILES'].dropna().tolist()
|
| 63 |
+
print(f"Loaded {len(smiles_list)} SMILES (assumed pre-canonicalized)")
|
| 64 |
+
|
| 65 |
+
# Validate with RDKit
|
| 66 |
+
|
| 67 |
+
def is_valid_smiles(smiles):
|
| 68 |
+
return Chem.MolFromSmiles(smiles) is not None
|
| 69 |
+
|
| 70 |
+
print("Validating SMILES with RDKit...")
|
| 71 |
+
valid_mask = [is_valid_smiles(s) for s in tqdm(smiles_list)]
|
| 72 |
+
smiles_list = [s for s, valid in zip(smiles_list, valid_mask) if valid]
|
| 73 |
+
print(f"After RDKit filtering: {len(smiles_list)} valid SMILES")
|
| 74 |
+
|
| 75 |
+
#
|
| 76 |
+
# Step 1.3 — Train/Val/Test Split (80/10/10)
|
| 77 |
+
#
|
| 78 |
+
|
| 79 |
+
train_smiles, temp_smiles = train_test_split(smiles_list, test_size=0.2, random_state=42, shuffle=True)
|
| 80 |
+
val_smiles, test_smiles = train_test_split(temp_smiles, test_size=0.5, random_state=42, shuffle=True)
|
| 81 |
+
|
| 82 |
+
print(f"Train: {len(train_smiles)}")
|
| 83 |
+
print(f"Val: {len(val_smiles)}")
|
| 84 |
+
print(f"Test: {len(test_smiles)}")
|
| 85 |
+
|
| 86 |
+
# Cache splits
|
| 87 |
+
splits = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
|
| 88 |
+
for split_name, smiles in splits.items():
|
| 89 |
+
with open(f"../data/{split_name}_smiles.txt", "w") as f:
|
| 90 |
+
f.write("\n".join(smiles))
|
| 91 |
+
|
| 92 |
+
#
|
| 93 |
+
# Step 1.4 — Tokenizer Wrapper (Fixed Bug #2, #3, #6)
|
| 94 |
+
#
|
| 95 |
+
|
| 96 |
+
class TokenizerWrapper:
|
| 97 |
+
def __init__(self, tokenizer, name, bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>"):
|
| 98 |
+
self.tokenizer = tokenizer
|
| 99 |
+
self.name = name
|
| 100 |
+
self.bos_token = bos_token
|
| 101 |
+
self.eos_token = eos_token
|
| 102 |
+
self.pad_token = pad_token
|
| 103 |
+
self.unk_token = unk_token
|
| 104 |
+
|
| 105 |
+
if hasattr(tokenizer, 'add_special_tokens'):
|
| 106 |
+
tokenizer.add_special_tokens({
|
| 107 |
+
'bos_token': bos_token,
|
| 108 |
+
'eos_token': eos_token,
|
| 109 |
+
'pad_token': pad_token,
|
| 110 |
+
'unk_token': unk_token
|
| 111 |
+
})
|
| 112 |
+
|
| 113 |
+
def encode(self, smiles: str, add_special_tokens: bool = True):
|
| 114 |
+
if isinstance(self.tokenizer, FastChemTokenizer):
|
| 115 |
+
# 1. get ids directly
|
| 116 |
+
ids = self.tokenizer.encode(smiles) # ← no .tokenize() here
|
| 117 |
+
# 2. add specials ourselves
|
| 118 |
+
if add_special_tokens:
|
| 119 |
+
ids = [self.tokenizer.bos_token_id] + ids + [self.tokenizer.eos_token_id]
|
| 120 |
+
return {'input_ids': ids}
|
| 121 |
+
else:
|
| 122 |
+
# Hugging-Face style tokenizer
|
| 123 |
+
return self.tokenizer(
|
| 124 |
+
smiles,
|
| 125 |
+
add_special_tokens=add_special_tokens,
|
| 126 |
+
return_attention_mask=False,
|
| 127 |
+
return_tensors=None
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
def decode(self, token_ids, skip_special_tokens=True):
|
| 131 |
+
if isinstance(self.tokenizer, FastChemTokenizer):
|
| 132 |
+
# 1. map single ids → tokens
|
| 133 |
+
tokens = [self.tokenizer.id_to_token.get(tid, self.tokenizer.unk_token)
|
| 134 |
+
for tid in token_ids]
|
| 135 |
+
# 2. drop specials if requested
|
| 136 |
+
if skip_special_tokens:
|
| 137 |
+
specials = {self.tokenizer.bos_token,
|
| 138 |
+
self.tokenizer.eos_token,
|
| 139 |
+
self.tokenizer.pad_token,
|
| 140 |
+
self.tokenizer.unk_token} # add any others you use
|
| 141 |
+
tokens = [t for t in tokens if t not in specials]
|
| 142 |
+
# 3. detokenise
|
| 143 |
+
if hasattr(self.tokenizer, 'detokenize'):
|
| 144 |
+
return self.tokenizer.detokenize(tokens)
|
| 145 |
+
else:
|
| 146 |
+
return "".join(tokens) # chemistry tokens are atomic
|
| 147 |
+
else:
|
| 148 |
+
return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
| 149 |
+
|
| 150 |
+
def __len__(self):
|
| 151 |
+
if isinstance(self.tokenizer, FastChemTokenizer):
|
| 152 |
+
# FastChemTokenizer uses ._vocab or .vocab depending on version
|
| 153 |
+
return len(getattr(self.tokenizer, 'vocab',
|
| 154 |
+
getattr(self.tokenizer, '_vocab', self.tokenizer)))
|
| 155 |
+
else:
|
| 156 |
+
return len(self.tokenizer)
|
| 157 |
+
|
| 158 |
+
def get_vocab(self):
|
| 159 |
+
if isinstance(self.tokenizer, FastChemTokenizer):
|
| 160 |
+
return self.tokenizer.vocab
|
| 161 |
+
else:
|
| 162 |
+
return self.tokenizer.get_vocab()
|
| 163 |
+
|
| 164 |
+
@property
|
| 165 |
+
def bos_token_id(self):
|
| 166 |
+
return self.tokenizer.bos_token_id
|
| 167 |
+
|
| 168 |
+
@property
|
| 169 |
+
def eos_token_id(self):
|
| 170 |
+
return self.tokenizer.eos_token_id
|
| 171 |
+
|
| 172 |
+
@property
|
| 173 |
+
def pad_token_id(self):
|
| 174 |
+
return self.tokenizer.pad_token_id
|
| 175 |
+
|
| 176 |
+
@property
|
| 177 |
+
def unk_token_id(self):
|
| 178 |
+
return self.tokenizer.unk_token_id
|
| 179 |
+
|
| 180 |
+
#
|
| 181 |
+
# Step 1.5 — Initialize Tokenizers
|
| 182 |
+
#
|
| 183 |
+
|
| 184 |
+
tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
|
| 185 |
+
tok2_fast = FastChemTokenizer.from_pretrained("../smitok")
|
| 186 |
+
|
| 187 |
+
tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa", bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>")
|
| 188 |
+
tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizer", bos_token="[BOS]", eos_token="[EOS]", pad_token="[PAD]", unk_token="[UNK]")
|
| 189 |
+
|
| 190 |
+
TOKENIZERS = [tokenizer1, tokenizer2]
|
| 191 |
+
|
| 192 |
+
#
|
| 193 |
+
# Step 1.6 — Benchmarking Functions (Fixed Bug #4 implicitly via epsilon)
|
| 194 |
+
#
|
| 195 |
+
|
| 196 |
+
def benchmark_tokenizer(tokenizer, smiles_sample, encode_only=False):
|
| 197 |
+
V = len(tokenizer)
|
| 198 |
+
sample = smiles_sample[:10000] if len(smiles_sample) > 10000 else smiles_sample
|
| 199 |
+
|
| 200 |
+
encode_times = []
|
| 201 |
+
token_counts = []
|
| 202 |
+
char_counts = []
|
| 203 |
+
unk_counts = 0
|
| 204 |
+
total_tokens = 0
|
| 205 |
+
|
| 206 |
+
for smiles in tqdm(sample, desc=f"Encoding with {tokenizer.name}", leave=False):
|
| 207 |
+
char_counts.append(len(smiles))
|
| 208 |
+
|
| 209 |
+
start = time.perf_counter()
|
| 210 |
+
enc = tokenizer.encode(smiles, add_special_tokens=True)
|
| 211 |
+
end = time.perf_counter()
|
| 212 |
+
encode_times.append(end - start)
|
| 213 |
+
|
| 214 |
+
input_ids = enc['input_ids']
|
| 215 |
+
token_counts.append(len(input_ids))
|
| 216 |
+
total_tokens += len(input_ids)
|
| 217 |
+
|
| 218 |
+
if isinstance(tokenizer.tokenizer, FastChemTokenizer):
|
| 219 |
+
unk_id = tokenizer.tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
|
| 220 |
+
else:
|
| 221 |
+
unk_id = tokenizer.tokenizer.unk_token_id
|
| 222 |
+
|
| 223 |
+
unk_counts += input_ids.count(unk_id)
|
| 224 |
+
|
| 225 |
+
L̄ = np.mean(token_counts)
|
| 226 |
+
C = np.mean(char_counts) / L̄
|
| 227 |
+
U = unk_counts / total_tokens if total_tokens > 0 else 0.0
|
| 228 |
+
Tenc = len(sample) / sum(encode_times)
|
| 229 |
+
|
| 230 |
+
metrics = {
|
| 231 |
+
'vocab_size': V,
|
| 232 |
+
'avg_tokens_per_mol': L̄,
|
| 233 |
+
'compression_ratio': C,
|
| 234 |
+
'percent_unknown': U * 100,
|
| 235 |
+
'encode_throughput_smiles_per_sec': Tenc,
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
if encode_only:
|
| 239 |
+
return metrics
|
| 240 |
+
|
| 241 |
+
decode_times = []
|
| 242 |
+
reconstruction_ok = 0
|
| 243 |
+
|
| 244 |
+
for smiles in tqdm(sample, desc=f"Decoding with {tokenizer.name}", leave=False):
|
| 245 |
+
enc = tokenizer.encode(smiles, add_special_tokens=True)
|
| 246 |
+
input_ids = enc['input_ids']
|
| 247 |
+
|
| 248 |
+
start = time.perf_counter()
|
| 249 |
+
decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
|
| 250 |
+
end = time.perf_counter()
|
| 251 |
+
decode_times.append(end - start)
|
| 252 |
+
|
| 253 |
+
if decoded == smiles:
|
| 254 |
+
reconstruction_ok += 1
|
| 255 |
+
|
| 256 |
+
Tdec = len(sample) / sum(decode_times)
|
| 257 |
+
recon_acc = reconstruction_ok / len(sample)
|
| 258 |
+
|
| 259 |
+
metrics.update({
|
| 260 |
+
'decode_throughput_smiles_per_sec': Tdec,
|
| 261 |
+
'decode_reconstruction_accuracy': recon_acc * 100,
|
| 262 |
+
})
|
| 263 |
+
|
| 264 |
+
return metrics
|
| 265 |
+
|
| 266 |
+
#
|
| 267 |
+
# Step 1.7 — Run Benchmark
|
| 268 |
+
#
|
| 269 |
+
|
| 270 |
+
benchmark_sample = train_smiles
|
| 271 |
+
results = []
|
| 272 |
+
|
| 273 |
+
for tokenizer in TOKENIZERS:
|
| 274 |
+
print(f"\n=== Benchmarking {tokenizer.name} ===")
|
| 275 |
+
metrics = benchmark_tokenizer(tokenizer, benchmark_sample)
|
| 276 |
+
metrics['tokenizer'] = tokenizer.name
|
| 277 |
+
results.append(metrics)
|
| 278 |
+
for k, v in metrics.items():
|
| 279 |
+
if k != 'tokenizer':
|
| 280 |
+
print(f"{k:35s}: {v:.4f}" if isinstance(v, float) else f"{k:35s}: {v}")
|
| 281 |
+
|
| 282 |
+
df_results = pd.DataFrame(results)
|
| 283 |
+
df_results.to_csv("tokenizer_benchmark_results.csv", index=False)
|
| 284 |
+
print("\nTokenizer benchmark results saved to 'tokenizer_benchmark_results.csv'")
|
| 285 |
+
|
| 286 |
+
#
|
| 287 |
+
# Step 2.1 — VAE Model Class (PATCHED: decode stops at EOS)
|
| 288 |
+
#
|
| 289 |
+
|
| 290 |
+
class MoleculeVAE(nn.Module):
|
| 291 |
+
def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, latent_dim=128, num_layers=2,
|
| 292 |
+
pad_token_id=0, bos_token_id=1, eos_token_id=2):
|
| 293 |
+
super().__init__()
|
| 294 |
+
self.vocab_size = vocab_size
|
| 295 |
+
self.embed_dim = embed_dim
|
| 296 |
+
self.hidden_dim = hidden_dim
|
| 297 |
+
self.latent_dim = latent_dim
|
| 298 |
+
self.num_layers = num_layers
|
| 299 |
+
self.pad_token_id = pad_token_id
|
| 300 |
+
self.bos_token_id = bos_token_id
|
| 301 |
+
self.eos_token_id = eos_token_id
|
| 302 |
+
|
| 303 |
+
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
|
| 304 |
+
self.encoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
|
| 305 |
+
self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
|
| 306 |
+
self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)
|
| 307 |
+
|
| 308 |
+
self.decoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
|
| 309 |
+
self.fc_out = nn.Linear(hidden_dim, vocab_size)
|
| 310 |
+
|
| 311 |
+
self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
|
| 312 |
+
self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)
|
| 313 |
+
|
| 314 |
+
self._init_weights()
|
| 315 |
+
|
| 316 |
+
def _init_weights(self):
|
| 317 |
+
for m in self.modules():
|
| 318 |
+
if isinstance(m, nn.Linear):
|
| 319 |
+
nn.init.xavier_uniform_(m.weight)
|
| 320 |
+
if m.bias is not None:
|
| 321 |
+
nn.init.zeros_(m.bias)
|
| 322 |
+
elif isinstance(m, nn.LSTM):
|
| 323 |
+
for name, param in m.named_parameters():
|
| 324 |
+
if 'weight' in name:
|
| 325 |
+
nn.init.orthogonal_(param)
|
| 326 |
+
elif 'bias' in name:
|
| 327 |
+
nn.init.zeros_(param)
|
| 328 |
+
|
| 329 |
+
def encode(self, x, lengths):
|
| 330 |
+
embedded = self.embedding(x)
|
| 331 |
+
packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
|
| 332 |
+
packed_out, (hidden, _) = self.encoder_lstm(packed)
|
| 333 |
+
h_forward = hidden[-2]
|
| 334 |
+
h_backward = hidden[-1]
|
| 335 |
+
h = torch.cat([h_forward, h_backward], dim=1)
|
| 336 |
+
mu = self.fc_mu(h)
|
| 337 |
+
logvar = self.fc_logvar(h)
|
| 338 |
+
return mu, logvar
|
| 339 |
+
|
| 340 |
+
def reparameterize(self, mu, logvar):
|
| 341 |
+
if self.training:
|
| 342 |
+
std = torch.exp(0.5 * logvar)
|
| 343 |
+
eps = torch.randn_like(std)
|
| 344 |
+
return mu + eps * std
|
| 345 |
+
else:
|
| 346 |
+
return mu
|
| 347 |
+
|
| 348 |
+
def decode(self, z, max_length=128, mode="greedy", temperature=1.0):
|
| 349 |
+
"""
|
| 350 |
+
Decode latent vector z into a sequence.
|
| 351 |
+
Returns full logits at each step.
|
| 352 |
+
PATCHED: stops generation when EOS is predicted.
|
| 353 |
+
"""
|
| 354 |
+
batch_size = z.size(0)
|
| 355 |
+
device = z.device
|
| 356 |
+
|
| 357 |
+
# Initialize hidden states from latent
|
| 358 |
+
h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
|
| 359 |
+
c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
|
| 360 |
+
hidden = (h0, c0)
|
| 361 |
+
|
| 362 |
+
# Start with BOS token — shape: (batch_size, 1)
|
| 363 |
+
input_token = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
|
| 364 |
+
logits = []
|
| 365 |
+
finished = torch.zeros(batch_size, dtype=torch.bool, device=device) # ← TRACK FINISHED SEQS
|
| 366 |
+
|
| 367 |
+
for _ in range(max_length):
|
| 368 |
+
embedded = self.embedding(input_token) # (batch, 1, embed_dim)
|
| 369 |
+
output, hidden = self.decoder_lstm(embedded, hidden)
|
| 370 |
+
logit = self.fc_out(output) # (batch, 1, vocab)
|
| 371 |
+
logits.append(logit)
|
| 372 |
+
|
| 373 |
+
if mode == "greedy":
|
| 374 |
+
input_token = logit.argmax(dim=-1) # (batch, 1)
|
| 375 |
+
elif mode == "sample":
|
| 376 |
+
probs = torch.softmax(logit.squeeze(1) / temperature, dim=-1) # (batch, vocab)
|
| 377 |
+
input_token = torch.multinomial(probs, 1) # (batch, 1)
|
| 378 |
+
else:
|
| 379 |
+
raise ValueError(f"Unknown decode mode: {mode}")
|
| 380 |
+
|
| 381 |
+
# ← EARLY STOPPING AT EOS
|
| 382 |
+
just_finished = (input_token.squeeze(1) == self.eos_token_id)
|
| 383 |
+
finished |= just_finished
|
| 384 |
+
input_token[finished] = self.pad_token_id # pad finished sequences
|
| 385 |
+
if finished.all():
|
| 386 |
+
break
|
| 387 |
+
|
| 388 |
+
return torch.cat(logits, dim=1) # (batch, seq_len, vocab)
|
| 389 |
+
|
| 390 |
+
def forward(self, input_ids, lengths, target_seq=None, teacher_forcing_ratio=0.0, temperature=1.0):
|
| 391 |
+
mu, logvar = self.encode(input_ids, lengths)
|
| 392 |
+
z = self.reparameterize(mu, logvar)
|
| 393 |
+
|
| 394 |
+
if self.training and target_seq is not None and teacher_forcing_ratio > 0:
|
| 395 |
+
# Training with teacher forcing
|
| 396 |
+
batch_size, seq_len = target_seq.size()
|
| 397 |
+
device = target_seq.device
|
| 398 |
+
|
| 399 |
+
# Initialize hidden states
|
| 400 |
+
h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
|
| 401 |
+
c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
|
| 402 |
+
hidden = (h0, c0)
|
| 403 |
+
|
| 404 |
+
logits = []
|
| 405 |
+
input_token = target_seq[:, 0].unsqueeze(1) # BOS
|
| 406 |
+
|
| 407 |
+
for t in range(1, seq_len):
|
| 408 |
+
embedded = self.embedding(input_token)
|
| 409 |
+
output, hidden = self.decoder_lstm(embedded, hidden)
|
| 410 |
+
logit = self.fc_out(output)
|
| 411 |
+
logits.append(logit)
|
| 412 |
+
|
| 413 |
+
use_teacher = torch.rand(1).item() < teacher_forcing_ratio
|
| 414 |
+
if use_teacher:
|
| 415 |
+
input_token = target_seq[:, t].unsqueeze(1)
|
| 416 |
+
else:
|
| 417 |
+
input_token = logit.argmax(dim=-1)
|
| 418 |
+
|
| 419 |
+
logits = torch.cat(logits, dim=1)
|
| 420 |
+
else:
|
| 421 |
+
# Inference mode
|
| 422 |
+
max_len = target_seq.size(1) if target_seq is not None else 128
|
| 423 |
+
logits = self.decode(z, max_length=max_len, mode="greedy", temperature=temperature)
|
| 424 |
+
|
| 425 |
+
return logits, mu, logvar
|
| 426 |
+
|
| 427 |
+
#
|
| 428 |
+
# Step 2.2 — Loss Function (PATCHED: β applied OUTSIDE, not inside)
|
| 429 |
+
#
|
| 430 |
+
|
| 431 |
+
def vae_loss(logits, targets, mu, logvar, pad_token_id, beta=1.0):
|
| 432 |
+
# 1. align lengths
|
| 433 |
+
max_len = max(logits.size(1), targets.size(1))
|
| 434 |
+
if logits.size(1) < max_len:
|
| 435 |
+
logits = F.pad(logits, (0, 0, 0, max_len - logits.size(1)))
|
| 436 |
+
if targets.size(1) < max_len:
|
| 437 |
+
targets = F.pad(targets, (0, max_len - targets.size(1)), value=pad_token_id)
|
| 438 |
+
|
| 439 |
+
logits_flat = logits.view(-1, logits.size(-1)) # [B*L, V]
|
| 440 |
+
targets_flat = targets.reshape(-1) # [B*L]
|
| 441 |
+
|
| 442 |
+
mask = (targets_flat != pad_token_id).float()
|
| 443 |
+
ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
|
| 444 |
+
mask_sum = mask.sum()
|
| 445 |
+
ce_loss = (ce_loss * mask).sum() / (mask_sum + 1e-8)
|
| 446 |
+
|
| 447 |
+
kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1).mean()
|
| 448 |
+
# ← β is applied OUTSIDE — so return raw KL
|
| 449 |
+
return ce_loss + kl_loss, ce_loss, kl_loss
|
| 450 |
+
|
| 451 |
+
#
|
| 452 |
+
# Step 2.3 — KLAnnealer (Fixed Bug #5: double increment)
|
| 453 |
+
#
|
| 454 |
+
|
| 455 |
+
class KLAnnealer:
    """Cyclical KL-weight scheduler with a warmup fraction per cycle.

    The weight ramps from 0 toward 1 over the first `ratio` of each cycle
    (linear mode) or follows a sigmoid centred at mid-cycle, and stays at
    1.0 once `total_steps` has been exceeded.
    """

    def __init__(self, total_steps, n_cycle=1, ratio=0.3, mode="linear", per_epoch=False, steps_per_epoch=None):
        self.total_steps = total_steps
        self.n_cycle = n_cycle
        self.ratio = ratio
        self.mode = mode
        self.per_epoch = per_epoch
        self.steps_per_epoch = steps_per_epoch
        self.current_step = 0

    def get_beta(self, increment=True):
        """Return the current KL weight.

        Args:
            increment (bool): advance the schedule by one step before
                reading (pass False for read-only queries, e.g. validation).
        """
        if increment:
            self.current_step += 1

        # Past the end of the schedule: hold at full weight.
        if self.current_step > self.total_steps:
            return 1.0

        # Normalised position within the current cycle, in [0, 1].
        if self.per_epoch:
            assert self.steps_per_epoch is not None, "steps_per_epoch required if per_epoch=True"
            span = self.steps_per_epoch / self.n_cycle
            progress = (self.current_step % self.steps_per_epoch) / span
        else:
            span = self.total_steps / self.n_cycle
            progress = (self.current_step % span) / span
        progress = min(progress, 1.0)

        # Warmup: ramp over the first `ratio` of the cycle, then saturate.
        ramp = progress / self.ratio if progress < self.ratio else 1.0

        if self.mode == "linear":
            return min(ramp, 1.0)
        if self.mode == "sigmoid":
            # Map progress in [0, 1] through a sigmoid; value 0.5 exactly
            # at mid-cycle (progress == 0.5).
            steepness = 6
            return 1 / (1 + math.exp(-steepness * (progress - 0.5)))
        raise ValueError(f"Unknown mode: {self.mode}")
|
| 499 |
+
|
| 500 |
+
#
|
| 501 |
+
# Step 2.4 — Collate Function (Fixed Bug #2: dynamic pad id)
|
| 502 |
+
#
|
| 503 |
+
|
| 504 |
+
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize, truncate and pad a batch of SMILES strings.

    Args:
        batch: list of SMILES strings.
        tokenizer: wrapper exposing .encode() and .tokenizer.pad_token_id.
        max_length: hard cap on sequence length.

    Returns:
        (input_ids [B, L] long tensor, lengths [B] long tensor) where the
        lengths are the TRUE unpadded token counts, capped at max_length.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    # Pad only up to the longest sequence in this batch, capped at max_length.
    max_len = min(max(len(ids) for ids in input_ids), max_length)
    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic, not hard-coded

    padded = []
    lengths = []
    for ids in input_ids:
        # FIX: record the unpadded length BEFORE padding. The original
        # appended len(ids) after padding, so every reported length equalled
        # the padded width and pad positions were indistinguishable downstream.
        lengths.append(min(len(ids), max_len))
        if len(ids) > max_len:
            ids = ids[:max_len]
        else:
            ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
|
| 523 |
+
|
| 524 |
+
#
|
| 525 |
+
# Step 2.5 — Dataset & DataLoader
|
| 526 |
+
#
|
| 527 |
+
|
| 528 |
+
class SmilesDataset(Dataset):
    """Minimal Dataset wrapper over a plain list of SMILES strings."""

    def __init__(self, smiles_list):
        # Keep a reference to the raw strings; tokenization happens in collate_fn.
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
|
| 535 |
+
|
| 536 |
+
#
|
| 537 |
+
# Step 3.x — Training Loop (PATCHED: per-tokenizer annealer, exponential TFR, device-safe eval, KL beta logging clarity)
|
| 538 |
+
#
|
| 539 |
+
|
| 540 |
+
# ----- Training hyper-parameters (shared across all tokenizers) -----
LEARNING_RATE = 5e-6       # base learning rate passed to Ranger21
BATCH_SIZE = 16            # micro-batch size per forward pass
ACCUMULATION_STEPS = 4     # gradient accumulation -> effective batch of 64
NUM_EPOCHS = 5
MAX_SEQ_LEN = 128          # hard cap enforced by collate_fn
KL_ANNEAL_RATIO = 0.3      # NOTE(review): not referenced below — the KLAnnealer is constructed with ratio=0.25; confirm which value is intended
|
| 546 |
+
|
| 547 |
+
def train_vae(
    model,
    train_loader,
    val_loader,
    optimizer,
    kl_annealer,
    pad_token_id,
    device,
    num_epochs,
    accumulation_steps=4,
    save_dir="./checkpoints",
    tokenizer_name="default"
):
    """Train the VAE with gradient accumulation and an annealed KL weight.

    Writes a per-epoch CSV log to `save_dir`, checkpoints the model whenever
    validation loss improves, and returns the best validation loss seen.
    """
    os.makedirs(save_dir, exist_ok=True)
    log_file = os.path.join(save_dir, f"training_log_{tokenizer_name}.csv")

    # Fresh CSV header every run (overwrites any previous log).
    with open(log_file, "w") as f:
        f.write("epoch,step,train_loss,train_ce,train_kl,val_loss,val_ce,val_kl,kl_beta\n")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")
        model.train()
        total_train_loss = total_train_ce = total_train_kl = 0.0
        num_batches = 0

        optimizer.zero_grad()

        for step, (input_ids, lengths) in enumerate(tqdm(train_loader, desc="Training")):
            input_ids, lengths = input_ids.to(device), lengths.to(device)

            # Teacher-forcing ratio: exponential decay per epoch (constant
            # within an epoch), going 1.0 -> 0.5 over the run.
            tfr = 1.0 * (0.5 ** (epoch / max(1, num_epochs-1)))

            logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=tfr)
            # The annealer is advanced exactly once per training batch.
            beta = kl_annealer.get_beta(increment=True)
            loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=beta)

            # Scale down so accumulated gradients average rather than sum.
            loss = loss / accumulation_steps
            loss.backward()

            # Undo the accumulation scaling for logging.
            total_train_loss += loss.item() * accumulation_steps
            total_train_ce += ce_loss.item()
            total_train_kl += kl_loss.item()
            num_batches += 1

            # Apply the optimizer only once every `accumulation_steps` batches.
            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Flush leftover gradients from a partial accumulation window.
        if len(train_loader) % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        # Capture the beta actually used during training (read-only query,
        # increment=False) BEFORE validation, so the logged beta reflects
        # the training schedule and validation never advances the annealer.
        current_beta = kl_annealer.get_beta(increment=False)

        # Validation — uses the captured beta; the annealer is not queried again.
        model.eval()
        total_val_loss = total_val_ce = total_val_kl = 0.0
        val_batches = 0

        with torch.no_grad():
            for input_ids, lengths in tqdm(val_loader, desc="Validating"):
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                # teacher_forcing_ratio=0.0 -> fully free-running decode.
                logits, mu, logvar = model(input_ids, lengths, target_seq=input_ids, teacher_forcing_ratio=0.0)
                loss, ce_loss, kl_loss = vae_loss(logits, input_ids, mu, logvar, pad_token_id, beta=current_beta)

                total_val_loss += loss.item()
                total_val_ce += ce_loss.item()
                total_val_kl += kl_loss.item()
                val_batches += 1

        avg_train_loss = total_train_loss / num_batches
        avg_val_loss = total_val_loss / val_batches

        # Log one CSV row per epoch; `current_step` counts micro-batches.
        current_step = (epoch + 1) * len(train_loader)
        with open(log_file, "a") as f:
            f.write(f"{epoch+1},{current_step},{avg_train_loss:.6f},{total_train_ce/num_batches:.6f},{total_train_kl/num_batches:.6f},"
                    f"{avg_val_loss:.6f},{total_val_ce/val_batches:.6f},{total_val_kl/val_batches:.6f},{current_beta:.6f}\n")

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"KL Beta: {current_beta:.4f}")  # the beta used during this epoch's training

        # Checkpoint only on validation improvement.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            checkpoint_path = os.path.join(save_dir, f"best_model_{tokenizer_name}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"→ Saved best model to {checkpoint_path}")

    return best_val_loss
|
| 647 |
+
|
| 648 |
+
#
|
| 649 |
+
# TRAINING LOOP OVER TOKENIZERS (PATCHED: KLAnnealer reset per tokenizer)
|
| 650 |
+
#
|
| 651 |
+
|
| 652 |
+
for tokenizer in TOKENIZERS:
    # One full, independent training run per tokenizer: fresh model,
    # optimizer, and KL annealer each time so no state leaks across runs.
    print(f"\n STARTING TRAINING FOR: {tokenizer.name}\n")

    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id

    # Sanity check: ids produced by this tokenizer fit the embedding table.
    sample_ids = tokenizer.encode(train_smiles[0], add_special_tokens=True)['input_ids']
    max_id_in_sample = max(sample_ids)
    assert max_id_in_sample < vocab_size, f"Token ID {max_id_in_sample} >= vocab size {vocab_size} in {tokenizer.name}"

    # NOTE(review): the pad id is read as tokenizer.pad_token_id here but as
    # tokenizer.tokenizer.pad_token_id above (and in collate_fn) — confirm
    # both attributes exist and agree on every tokenizer in TOKENIZERS.
    model = MoleculeVAE(
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    ########################################################################
    # 1. CREATE A FRESH annealer FOR EVERY TOKENIZER
    ########################################################################
    # total_steps counts optimizer steps (after accumulation), not micro-batches.
    total_steps = (len(train_smiles) // (BATCH_SIZE*ACCUMULATION_STEPS)) * NUM_EPOCHS
    kl_annealer = KLAnnealer(
        total_steps=total_steps,
        n_cycle=4,       # 4 cycles across all epochs → real cyclical schedule
        ratio=0.25,      # 25% of each cycle is warmup
        mode="sigmoid",
        per_epoch=False
    )

    optimizer = Ranger21(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=0.01,
        use_adabelief=True,
        use_warmup=True,
        use_madgrad=True,
        num_epochs=NUM_EPOCHS,
        num_batches_per_epoch=len(train_smiles) // (BATCH_SIZE * ACCUMULATION_STEPS),
        warmdown_active=False,
    )

    train_dataset = SmilesDataset(train_smiles)
    val_dataset = SmilesDataset(val_smiles)

    # num_workers=0: the lambda collate_fn closes over `tokenizer` and is
    # not picklable on spawn-based worker platforms.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, max_length=MAX_SEQ_LEN),
        num_workers=0,
        pin_memory=True
    )

    train_vae(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        kl_annealer=kl_annealer,
        pad_token_id=pad_token_id,
        device=device,
        num_epochs=NUM_EPOCHS,
        accumulation_steps=ACCUMULATION_STEPS,
        save_dir=f"./checkpoints/{tokenizer.name}",
        tokenizer_name=tokenizer.name
    )
|
| 728 |
+
|
| 729 |
+
#
|
| 730 |
+
# Step 4.x — Evaluation Pipeline (Fixed Bug #6, #7, #8)
|
| 731 |
+
#
|
| 732 |
+
|
| 733 |
+
def canonicalize_smiles(smiles):
    """Return the RDKit canonical (isomeric) form of *smiles*, or None if unparsable."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=True)
|
| 738 |
+
|
| 739 |
+
def evaluate_reconstruction(model, dataloader, tokenizer, device, max_length=128):
    """Greedy-decode each batch from its sampled latent and score reconstructions.

    Reports token-level accuracy (pad positions masked out), exact string
    match rate, and RDKit validity of the decoded SMILES. Also returns the
    decoded and target strings for downstream uniqueness/novelty stats.
    Note: `max_length` is not used for decoding here — decode is called with
    a literal 128.
    """
    model.eval()
    total_token_correct = total_tokens = exact_matches = valid_count = total_samples = 0
    all_generated, all_targets = [], []

    pad_id = tokenizer.tokenizer.pad_token_id
    eos_id = tokenizer.tokenizer.eos_token_id
    special_ids = {pad_id, eos_id}

    def trim_to_special(ids, specials):
        # Cut the id sequence at the first pad/eos token (exclusive).
        for i, id_ in enumerate(ids):
            if id_ in specials:
                return ids[:i]
        return ids

    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Evaluating Reconstruction"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            B = input_ids.size(0)

            # Encode → sample z → greedy decode (no teacher forcing).
            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            logits = model.decode(z, max_length=128, mode="greedy")
            preds = logits.argmax(dim=-1)

            # Align predictions and targets to a common length before scoring.
            min_len = min(logits.size(1), input_ids.size(1))
            preds = preds[:, :min_len]               # trim predictions
            input_ids_eval = input_ids[:, :min_len]  # trim targets

            # Token accuracy counted over non-pad positions only.
            mask = (input_ids_eval != pad_id)
            token_correct = ((preds == input_ids_eval) & mask).sum().item()
            total_token_correct += token_correct
            total_tokens += mask.sum().item()

            for i in range(B):
                target_ids = input_ids_eval[i].cpu().tolist()
                pred_ids = preds[i].cpu().tolist()

                # Trim at the first special token before decoding to text.
                target_ids_trim = trim_to_special(target_ids, special_ids)
                pred_ids_trim = trim_to_special(pred_ids, special_ids)

                target_smiles = tokenizer.decode(target_ids_trim, skip_special_tokens=False)
                pred_smiles = tokenizer.decode(pred_ids_trim, skip_special_tokens=False)

                all_targets.append(target_smiles)
                all_generated.append(pred_smiles)

                if pred_smiles == target_smiles:
                    exact_matches += 1
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_count += 1
                total_samples += 1

    token_acc = total_token_correct / total_tokens if total_tokens > 0 else 0.0
    # NOTE(review): these two divide by zero when the dataloader is empty —
    # confirm callers always pass a non-empty loader.
    exact_match_rate = exact_matches / total_samples
    validity_rate = valid_count / total_samples

    print(f"Token-level Accuracy: {token_acc:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f}")
    print(f"Validity Rate: {validity_rate:.4f}")

    return {
        'token_accuracy': token_acc,
        'exact_match_rate': exact_match_rate,
        'validity_rate': validity_rate,
        'generated_smiles': all_generated,
        'target_smiles': all_targets
    }
|
| 809 |
+
|
| 810 |
+
def compute_uniqueness_and_novelty(generated_smiles, train_smiles_set):
    """Fraction of unique generated strings and of strings absent from training.

    Novelty counts raw (possibly duplicated) entries; both ratios are 0.0
    for an empty input. Returns (uniqueness, novelty).
    """
    total = len(generated_smiles)
    unique = len(set(generated_smiles))
    novel = sum(1 for s in generated_smiles if s not in train_smiles_set)
    uniqueness = unique / total if total > 0 else 0.0
    novelty = novel / total if total > 0 else 0.0
    print(f"Uniqueness: {uniqueness:.4f} ({unique}/{total})")
    print(f"Novelty: {novelty:.4f} ({novel}/not in train)")
    return uniqueness, novelty
|
| 819 |
+
|
| 820 |
+
def kl_divergence_from_samples(samples, bins=512):
    """Mean per-dimension KL( hist(samples[:, d]) || N(0, 1) ) via histograms.

    samples: [N, D] array. Each dimension is histogrammed (density=True),
    compared against the standard-normal pdf at the bin centers, and the
    per-dimension KLs are averaged.
    """
    dim_kls = []
    for d in range(samples.shape[1]):
        data = samples[:, d]
        hist, bin_edges = np.histogram(data, bins=bins, density=True)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        # Standard-normal pdf evaluated at the bin centers.
        norm_pdf = (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * bin_centers**2)
        # Clip both densities away from zero so the KL stays finite.
        hist = np.clip(hist, 1e-10, None)
        norm_pdf = np.clip(norm_pdf, 1e-10, None)
        dim_kls.append(entropy(hist, norm_pdf))
    return np.mean(dim_kls)
|
| 832 |
+
|
| 833 |
+
def evaluate_latent_kl(model, dataloader, device, latent_dim=128, bins=512):
    """Measure how Gaussian the aggregate latent distribution is.

    Encodes every batch, samples z via the reparameterization trick, and
    compares the empirical per-dimension distribution against N(0, 1)
    using kl_divergence_from_samples. `latent_dim` is unused here — the
    dimensionality comes from the encoder output.
    """
    model.eval()
    all_z = []
    with torch.no_grad():
        for input_ids, lengths in tqdm(dataloader, desc="Sampling Latents"):
            input_ids, lengths = input_ids.to(device), lengths.to(device)
            mu, logvar = model.encode(input_ids, lengths)
            z = model.reparameterize(mu, logvar)
            all_z.append(z.cpu().numpy())
    all_z = np.concatenate(all_z, axis=0)
    kl_div = kl_divergence_from_samples(all_z, bins=bins)
    print(f"KL Divergence (empirical vs N(0,1)): {kl_div:.4f}")
    return kl_div
|
| 846 |
+
|
| 847 |
+
def evaluate_interpolation_validity(model, tokenizer, test_smiles, device, num_pairs=100, steps=10, max_length=128):
    """Decode along straight lines between latent means of SMILES pairs.

    Returns the fraction of decoded interpolation points that RDKit parses
    as valid molecules.
    """
    model.eval()
    # Pair up consecutive even/odd test molecules and sample num_pairs pairs.
    pairs = random.sample(list(zip(test_smiles[::2], test_smiles[1::2])), min(num_pairs, len(test_smiles)//2))
    valid_interps = total_interps = 0

    with torch.no_grad():
        for smiles_a, smiles_b in tqdm(pairs, desc="Interpolation Validity"):
            if not smiles_a or not smiles_b: continue

            enc_a = tokenizer.encode(smiles_a, add_special_tokens=True)
            enc_b = tokenizer.encode(smiles_b, add_special_tokens=True)

            ids_a = torch.tensor([enc_a['input_ids']], device=device)
            ids_b = torch.tensor([enc_b['input_ids']], device=device)
            len_a = torch.tensor([len(enc_a['input_ids'])], device=device)
            len_b = torch.tensor([len(enc_b['input_ids'])], device=device)

            # Use the posterior means (no sampling noise) as endpoints.
            mu_a, _ = model.encode(ids_a, len_a)
            mu_b, _ = model.encode(ids_b, len_b)

            alphas = torch.linspace(0, 1, steps, device=device)
            for alpha in alphas:
                # Linear interpolation in latent space.
                z_interp = alpha * mu_b + (1 - alpha) * mu_a
                # Ensure z_interp maintains batch dimension [1, latent_dim]
                if z_interp.dim() == 1:
                    z_interp = z_interp.unsqueeze(0)

                # NOTE(review): decode is called with mode="sample" but the
                # result is argmax'ed — if decode() samples internally, the
                # argmax of the returned logits may not match the sampled
                # trajectory; confirm decode()'s contract.
                logits = model.decode(z_interp, max_length=max_length, mode="sample", temperature=0.8)
                preds = logits.argmax(dim=-1)
                # Handle batch dimension properly (single-item batch).
                if preds.dim() > 1:
                    preds = preds[0]
                pred_smiles = tokenizer.decode(preds.cpu().tolist(), skip_special_tokens=True)
                if Chem.MolFromSmiles(pred_smiles) is not None:
                    valid_interps += 1
                total_interps += 1

    interp_validity = valid_interps / total_interps if total_interps > 0 else 0.0
    print(f"Interpolation Validity: {interp_validity:.4f}")
    return interp_validity
|
| 887 |
+
|
| 888 |
+
def sample_from_latent(model, tokenizer, num_samples=30000, latent_dim=128, max_length=128, device=device, temperature=0.8):
    """Decode `num_samples` molecules from z ~ N(0, I), in BATCH_SIZE chunks.

    NOTE(review): the `device=device` default binds the module-level
    `device` at definition time — confirm that is intended (a later
    reassignment of the global would not be picked up).
    """
    model.eval()
    generated_smiles = []
    with torch.no_grad():
        for _ in tqdm(range(0, num_samples, BATCH_SIZE), desc="Sampling from Latent"):
            # The final batch may be smaller than BATCH_SIZE.
            current_batch_size = min(BATCH_SIZE, num_samples - len(generated_smiles))
            if current_batch_size <= 0: break
            z = torch.randn(current_batch_size, latent_dim, device=device)
            # NOTE(review): mode="sample" followed by argmax over the
            # returned logits — verify decode() returns logits consistent
            # with its internally sampled tokens.
            logits = model.decode(z, max_length=max_length, mode="sample", temperature=temperature)
            preds = logits.argmax(dim=-1)
            for i in range(current_batch_size):
                pred_ids = preds[i].cpu().tolist()
                smiles = tokenizer.decode(pred_ids, skip_special_tokens=True)
                generated_smiles.append(smiles)
                if len(generated_smiles) >= num_samples: break
    return generated_smiles
|
| 904 |
+
|
| 905 |
+
def measure_inference_throughput(model, tokenizer, test_smiles, device,
                                 max_length=128,
                                 batch_sizes=(1, 4, 8, 16)):
    """
    Benchmark inference speed & peak GPU memory across several batch sizes.

    Args:
        model: VAE exposing encode / reparameterize / decode.
        tokenizer: tokenizer forwarded to collate_fn.
        test_smiles: list of SMILES strings to draw benchmark batches from.
        device: torch device to run on.
        max_length: decode length cap.
        batch_sizes: batch sizes to benchmark. FIX: was a mutable list
            default (`[1, 4, 8, 16]`); a tuple avoids the shared-mutable
            default pitfall while accepting the same call sites.

    Returns a JSON-serialisable dict:
        {batch_size: {'tokens_per_sec': <float>, 'peak_mem_mb': <float>}, ...}
    """
    model.eval()
    results = {}

    for bs in batch_sizes:
        # Build a small fixed subset so every BS processes the same #samples.
        subset = SmilesDataset(test_smiles[:bs * 10])
        loader = DataLoader(
            subset,
            batch_size=bs,
            shuffle=False,
            num_workers=0,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_length=max_length),
        )

        total_tokens = 0
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(device)

        start_time = time.perf_counter()
        with torch.no_grad():
            for input_ids, lengths in loader:
                input_ids, lengths = input_ids.to(device), lengths.to(device)
                mu, logvar = model.encode(input_ids, lengths)
                z = model.reparameterize(mu, logvar)
                logits = model.decode(z, max_length=max_length)
                # FIX: count generated tokens (batch * seq_len). The original
                # used logits.numel(), which multiplied in the vocab dimension
                # and inflated "tokens_per_sec" by |V|.
                total_tokens += logits.size(0) * logits.size(1)
        duration = time.perf_counter() - start_time

        # Guard against a pathological near-zero duration.
        tokens_per_sec = total_tokens / max(duration, 1e-12)
        peak_mem_mb = (
            torch.cuda.max_memory_allocated(device) / (1024 ** 2)
            if torch.cuda.is_available()
            else 0.0
        )

        # Store as plain Python floats so the dict stays JSON-serialisable.
        results[bs] = {
            "tokens_per_sec": float(tokens_per_sec),
            "peak_mem_mb": float(peak_mem_mb),
        }
        print(f"BS {bs:3d} → {tokens_per_sec:8.2f} tok/s | Peak Mem: {peak_mem_mb:.2f} MB")

    return results
|
| 956 |
+
|
| 957 |
+
#
|
| 958 |
+
# FINAL EVALUATION PIPELINE
|
| 959 |
+
#
|
| 960 |
+
|
| 961 |
+
def full_evaluation_pipeline(model, tokenizer, train_smiles, test_smiles, device, save_dir):
    """Run every evaluation metric for one tokenizer and dump them to JSON.

    Covers reconstruction, uniqueness/novelty, aggregate-latent KL, and
    interpolation validity; FCD and throughput stages are left commented out.
    """
    print(f"\n FULL EVALUATION FOR: {tokenizer.name}")

    test_dataset = SmilesDataset(test_smiles)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             collate_fn=lambda b: collate_fn(b, tokenizer, max_length=MAX_SEQ_LEN),
                             num_workers=0)

    # 1. Reconstruction metrics (also yields generated/target strings).
    recon_metrics = evaluate_reconstruction(model, test_loader, tokenizer, device)

    # 2. Uniqueness & novelty of reconstructions vs the training set.
    train_set = set(train_smiles)
    uniqueness, novelty = compute_uniqueness_and_novelty(recon_metrics['generated_smiles'], train_set)

    # 3. How Gaussian the aggregate latent distribution is.
    kl_div = evaluate_latent_kl(model, test_loader, device)

    # 4. Validity of molecules decoded along latent interpolations.
    interp_validity = evaluate_interpolation_validity(model, tokenizer, test_smiles, device)

    # 5. Latent Sampling (for FCD — optional, disabled for speed)
    # gen_smiles_30k = sample_from_latent(model, tokenizer, num_samples=10000, temperature=0.8)  # reduce for speed
    # fcd_score = compute_fcd(test_smiles, gen_smiles_30k) if 'get_fcd' in globals() else None

    # 6. Throughput & Memory (disabled)
    # throughput = measure_inference_throughput(model, tokenizer, test_loader, device)

    eval_results = {
        **recon_metrics,
        'uniqueness': uniqueness,
        'novelty': novelty,
        'kl_divergence': kl_div,
        'interpolation_validity': interp_validity,
        # 'fcd': fcd_score,
        # 'inference_throughput': throughput,
    }

    # default=str keeps non-JSON values (e.g. numpy scalars) serialisable.
    eval_path = os.path.join(save_dir, "evaluation_results.json")
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2, default=str)

    print(f" Evaluation saved to {eval_path}")
    return eval_results
|
| 1005 |
+
|
| 1006 |
+
#
|
| 1007 |
+
# RUN EVALUATION FOR EACH TOKENIZER
|
| 1008 |
+
#
|
| 1009 |
+
|
| 1010 |
+
# Reload each tokenizer's best checkpoint and run the full evaluation suite.
for tokenizer in TOKENIZERS:
    print(f"\n🔄 LOADING BEST MODEL FOR: {tokenizer.name}")
    checkpoint_path = f"./checkpoints/{tokenizer.name}/best_model_{tokenizer.name}.pt"
    if not os.path.exists(checkpoint_path):
        # Skip tokenizers whose training run never produced a checkpoint.
        print(f"⚠️ Checkpoint not found: {checkpoint_path}")
        continue

    # Rebuild the architecture from scratch; weights come from the checkpoint.
    vocab_size = len(tokenizer)
    pad_token_id = tokenizer.tokenizer.pad_token_id
    model = MoleculeVAE(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id
    ).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    full_evaluation_pipeline(
        model=model,
        tokenizer=tokenizer,
        train_smiles=train_smiles,
        test_smiles=test_smiles,
        device=device,
        save_dir=f"./checkpoints/{tokenizer.name}"
    )

print("\n🎉 PIPELINE COMPLETE — ALL TOKENIZERS BENCHMARKED, TRAINED, AND EVALUATED!")
|
benchmark/data/chunk_1smi.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
benchmark/data/test_smiles.txt
ADDED
|
@@ -0,0 +1,1628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CN(CCc1ccccc1)C(=O)C=Cc1ccccc1
|
| 2 |
+
OCc1cc(CC2(NCC3CCCCC3)COC2)no1
|
| 3 |
+
COc1ccc(C23CCC(=O)C=C2N(C)CC3)cc1OC
|
| 4 |
+
CC1(C)CC2C(=O)CCC3OC3(C)CCC21
|
| 5 |
+
CCOC(=O)CC(c1c(O)c2ccccc2[nH]c1=O)C(C)C
|
| 6 |
+
CC(OC(=O)Cn1cnc2ccccc2c1=O)C(N)=O
|
| 7 |
+
CC(C)NC(=O)NC1C2COC(O2)C(n2ccnc2)C1O
|
| 8 |
+
O=C(O)CNC(=O)c1cccc(Cl)c1
|
| 9 |
+
Cc1ccc(C(C)CC=CC(C)(C)O)cc1O
|
| 10 |
+
NC1CCN(Cc2ccc(OCc3ccccc3)cc2)CC1
|
| 11 |
+
O=C1N=CN=C2C1=NC(=S)N2C1OC(CO)C(O)C1O
|
| 12 |
+
CC1(C)NCCc2cc(O)c(O)cc21
|
| 13 |
+
CC(=O)c1c(C)cc2c(c1O)C(=O)C=CC2=O
|
| 14 |
+
COC1C(O)OC(C)C(N)C1O
|
| 15 |
+
Nc1cc2nncoc-2c1
|
| 16 |
+
Cc1ccc(Nc2nc(Cl)nc(NC(C)(C)C)n2)cc1
|
| 17 |
+
CC(C)CCC(=O)OCCCc1ccc(O)c(O)c1
|
| 18 |
+
C=C(C)C=CC12CC(C)C3CCC(C)([NH2+][CH2-])C(CCC1C)C32
|
| 19 |
+
C[C@]12Cc3ccccc3C[C@](C)(N1)c1ccccc12
|
| 20 |
+
CCOC(=O)c1ccc(NC(=O)CCCCC2SCC3NC(=O)NC32)cc1
|
| 21 |
+
CCCCCCCCCCCCCCCCCCCC(C)CC
|
| 22 |
+
CC1=C(CCC(C)(O)C(O)CO)C2(C)CCCC(C)(C)C2CC1
|
| 23 |
+
CN(C)CCOc1ccc2c(=O)cc(-c3ccccc3)oc2c1
|
| 24 |
+
COc1cc(O)c2c(c1)Cc1cc(C)cc(O)c1C2=O
|
| 25 |
+
CC1=CCC2CC3C(C)CCC13C2(C)C
|
| 26 |
+
Cc1nccn1C1C2OCC(O2)C(NCc2ccccn2)C1O
|
| 27 |
+
C=C1CCC2OC1C1C(C(C)C)CCC21C
|
| 28 |
+
CC1CC2CCCN2C(CC(=O)CC2N3CCCC3CC(C)C2(O)c2ccccc2)C1(O)c1ccccc1
|
| 29 |
+
CCC(C)Cc1ccc(C(C)O)oc1=O
|
| 30 |
+
Cc1cccc2c1sc1c(C)cccc12
|
| 31 |
+
CC1(C)CC2C=C(C=O)C34CC3(C(=O)OC4O)C2C1
|
| 32 |
+
CCC(C)C(NC(=O)C(N)CCSC)C(=O)NC(CCCN=C(N)N)C(=O)O
|
| 33 |
+
O=C(Oc1ccc2c(c1)OC(=Cc1ccco1)C2=O)c1ccccc1F
|
| 34 |
+
N#CCCCC1CS1
|
| 35 |
+
CC(C)CC(C(=O)NCC1CCCN2CCCCC12)n1cccc1
|
| 36 |
+
c1csc(-c2nnc3n2C2(CCCC2)Cc2ccccc2-3)c1
|
| 37 |
+
C#CC=CC1C(O)CCCC12CCCC(CC=C)N2
|
| 38 |
+
CC(C)C(CCNCc1ccc(N(C)C)cc1)c1ccco1
|
| 39 |
+
NC(=O)C(CCC(=O)O)NC(=O)C1=CC(NC(=O)NC2CCCCC2)C(O)C(O)C1
|
| 40 |
+
O=c1c(O)cccc2ccc(O)c(O)c12
|
| 41 |
+
COC=Cc1cc2ccc(=O)oc2cc1OC
|
| 42 |
+
C#CC=CC(Cl)C(O)C1CC2OC2CC(Br)C(CC)O1
|
| 43 |
+
C=C1CCC2C(C3CC(C4CCCCC4)CC13)C2(CN)CCC
|
| 44 |
+
C=C(C)CC1CCC=C2C(=O)CC(C)(C)C21
|
| 45 |
+
CCCCCCCCCCCCCCCC(C)(C)C
|
| 46 |
+
CN1CCN(CC2OCC(NCc3nccn3C)C2O)CC1
|
| 47 |
+
COc1cc(Cc2cnc(N)nc2N)cc(OCCC(=O)O)c1OC
|
| 48 |
+
CC(=NOCCSC(=N)N)c1ccc(Cl)c(Cl)c1
|
| 49 |
+
CC1=Cc2cc3c(c(O)c2C(C)O1)C(=O)C=C(O)C3=O
|
| 50 |
+
CN1C(=O)Nc2cccc(CN)c2S1(=O)=O
|
| 51 |
+
c1ccc2c(CNCCCNCc3cccc4ccccc34)cccc2c1
|
| 52 |
+
O=P(O)(O)c1ccccc1O
|
| 53 |
+
CC(C)(C)NCC(=O)Nc1c2c(nc3c1CCC3)CCC2
|
| 54 |
+
COC(=O)CC(C)CCC1C(C)=CCC2C(C)(C)C(=O)CCC12C
|
| 55 |
+
CC(C)CCNC1(Cc2cc(-c3cccc(O)c3)on2)COC1
|
| 56 |
+
CCc1c(O)cc(CCC(C)C)oc1=O
|
| 57 |
+
CC(C)C1Oc2cc3oc(=O)ccc3cc2C1=O
|
| 58 |
+
O=NN1CCCc2cc3c(cc21)N(NO)CCC3
|
| 59 |
+
CCNC(=O)Nc1ccc(C(O)C2COCC(=O)N2C)cc1
|
| 60 |
+
CC=CC=Cc1cc(O)cc(O)c1C=O
|
| 61 |
+
O=C(Nc1ccccc1[NH+]([O-])O)C(F)(F)F
|
| 62 |
+
CCOC(=O)C1=C(C)N=C(C)/C(=C(/O)OCC)C1c1cccc(I)c1
|
| 63 |
+
COC1COC2C(NS(=O)(=O)c3cccs3)COC12
|
| 64 |
+
CCCCCCCc1ccc(C#Cc2ccc(OCCCC)cc2)nc1
|
| 65 |
+
COC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(F)cc3)cc12
|
| 66 |
+
Clc1cc(Cl)c(Cl)c(-c2cccc(Cl)c2Cl)c1
|
| 67 |
+
COc1cc(OC)c(C(=O)C=Cc2ccccc2OC)c(OC)c1
|
| 68 |
+
Cc1c(O)cc2c(c1C)OC(C)(CCCC(C)CCCC(C)CCCC(C)C)CC2
|
| 69 |
+
COc1cc(C(O)C(O)c2ccccc2)oc(=O)c1
|
| 70 |
+
COc1ccccc1CC[C@H](O)CC[C@@H]1[C@@H](CCCCCCC(=O)O)[C@@H](O)C[C@H]1O
|
| 71 |
+
CC1=CC(c2ccccc2)CC(=O)O1
|
| 72 |
+
CC1C(O)CCC2(C)CC(=O)C(C(C)(C)O)=CC12
|
| 73 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCCCCCCCCCCCCC
|
| 74 |
+
CC(C)=CCc1ccc2[nH]c3c(CC(C)O)c(C)c(O)cc3c2c1
|
| 75 |
+
CCC(C)=CC(=O)OC1C(O)C(C2(C)CO2)CC2(C)C(C)CC(=O)CC12
|
| 76 |
+
CCCCCCCCCCCCCCCC=CCCC=CCCCC(=O)OC
|
| 77 |
+
COc1ccc2c3c([nH]c2c1)-c1cc(C)cc(=O)n1CC3
|
| 78 |
+
CCCCCCC(Br)(Br)C(=O)C(Br)Br
|
| 79 |
+
O=CCCCCCCCC1OC1CCCCCCCC(=O)O
|
| 80 |
+
C1=CC(=Nn2cccc2)C=CC1=Nn1cccc1
|
| 81 |
+
CCCCCC(=O)CCCC(=O)CCCCCC(=O)CCCC(=O)CCCCC
|
| 82 |
+
CC(C)CC1=C(O)N(O)C(CC(C)C)C=N1
|
| 83 |
+
c1ccc2ncncc2c1
|
| 84 |
+
CC#CC#CC#Cc1ccc(-c2ccccc2)s1
|
| 85 |
+
COc1ccc(C2Oc3cc(OC)cc(O)c3C(=O)C2O)cc1
|
| 86 |
+
CCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)C(O)CCCCCCCCCCCCCCCC
|
| 87 |
+
COc1ccc(-c2cn3nccnc3n2)c(OC)c1
|
| 88 |
+
OCC1OC(n2nnc3ccccc32)C(O)C(O)C1O
|
| 89 |
+
CC(C)(C)c1ccc(-c2nc(I)ccc2O)cc1
|
| 90 |
+
C(=NC(N=Cc1ccco1)c1ccco1)c1ccco1
|
| 91 |
+
CC(=O)OCC1(O)CC23CCC4C(C)(C)CCCC4(C)C2CCC1C3
|
| 92 |
+
CC1=CCC(C)(C)C(O)C2CC2(C)C(O)CC1
|
| 93 |
+
CC12CCC3c4ccc(O)cc4CCC3C1CCC2O
|
| 94 |
+
COc1c2c(cc3ccoc13)C=CC(O)O2
|
| 95 |
+
O=C1C2CC(CN3CCC(O)CC23)C2=CC(O)CCN12
|
| 96 |
+
C=C(C(=O)O)C1CC=C2CCCC(C)C2(C)C1
|
| 97 |
+
CC1(C)CCCC1(C)c1cc(C=O)cc(O)c1O
|
| 98 |
+
CCCCCCCCCCCCCCCC1CCNCCCN(C)CCCCNCCCN1
|
| 99 |
+
CN(Cc1ccccc1)Cc1cc(CC2CNCCC2CC(=O)N2CCc3ccccc3C2)no1
|
| 100 |
+
CCCCCCC(=O)/C=C\C=C\C(=O)c1ccc(C(=O)OC)cc1
|
| 101 |
+
CC1=C(C(=O)O)C(c2ccccc2)N(C)C(=O)N1C
|
| 102 |
+
CCC(C)C(O)(CC(=O)O)C(=O)O
|
| 103 |
+
CC(=NO)C(CC(C)C)=NO
|
| 104 |
+
C=C(C)C(CC=C(C)C)Cc1c(O)ccc(C(=O)C=Cc2ccccc2O)c1O
|
| 105 |
+
CCCCCCCCCCCCCC(=O)OC(CO)CO
|
| 106 |
+
CCCCCCCc1cc(=O)c2ccccc2n1C
|
| 107 |
+
O=C(O)C(CCCc1ccccc1)c1ccccc1
|
| 108 |
+
C=C=Cn1nc(C)c2c(C)nc(CCC)n2c1=O
|
| 109 |
+
c1ccc(CNCCCNCCCCCCCNCCCNCc2ccccc2)cc1
|
| 110 |
+
C=C1C(=O)OC2C1CCC(C)C1CCC(OC(C)=O)C12C
|
| 111 |
+
CC=CC#CC#CC=CC=CCCCC
|
| 112 |
+
CCC=CC=CC1(C)OC(CC(CO)OC)=CC1=O
|
| 113 |
+
CN1CCc2nc(N(C)C)cc(N)c2C1
|
| 114 |
+
C#CCOC(=O)C=C(C)C=CCC(C)CCCC(C)C
|
| 115 |
+
COc1ccc(O)c2oc3ccc(O)cc3c(=O)c12
|
| 116 |
+
COC(=O)Cc1c(C)c2ccc(OCc3ccc(C)cc3)cc2oc1=O
|
| 117 |
+
CCC1CN(C(C)=O)CCC1CC(=O)Nc1ccccc1
|
| 118 |
+
CC=CC#CC#CC=CCC(CCOC(C)=O)OC(C)=O
|
| 119 |
+
C=CCOC(=O)COc1ccc2c(=O)c(Oc3ccc(OC)cc3)coc2c1
|
| 120 |
+
O=C(CCc1c[nH]c2ccccc12)NCCn1ccc2ccccc21
|
| 121 |
+
CCCCCC(O)c1cccc(OCc2ccccn2)c1
|
| 122 |
+
CC1(C)CC(CCNC(=O)c2ccccc2C(=O)O)(Cc2ccccc2)CCO1
|
| 123 |
+
Cc1ccc(Br)cc1F
|
| 124 |
+
Cc1c(CC(=O)NC(C)C)c(=O)oc2cc(O)cc(O)c12
|
| 125 |
+
C=CC1(C)C=C2C(=O)OC34CCCC(C)(C)C3C(=O)OC24CC1
|
| 126 |
+
COc1cc(C(Br)=CC=CC=CC=CC=CC=CC=CC=CC(=O)O)ccc1Br
|
| 127 |
+
O=C1CCC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCN1
|
| 128 |
+
CCCCCCCCCCCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCCCC
|
| 129 |
+
COc1ccc(C(=O)CSC(=N)N)cc1[NH+]([O-])O
|
| 130 |
+
CNS(=O)(=O)Cc1ccc2[nH]cc(CCN(C)C)c2c1
|
| 131 |
+
ON=C1C2CCCC1C1(O)CCCCC1C2
|
| 132 |
+
C=C(C)C1CC=C(C)C(=O)C1O
|
| 133 |
+
c1ccc(CNC2CC2)cc1
|
| 134 |
+
C=C(C)C1CC(=O)C2CCC(O)C(C)C2(C)C1
|
| 135 |
+
CC1(C)CC=CC23CCC(C=C12)C3(C)C
|
| 136 |
+
C=C(C(=O)O)C(CCCCCCCCCCCCCCC(C)O)C(=O)O
|
| 137 |
+
CCOc1ccc2[nH]c([S+]([O-])Cc3ccccc3N)nc2c1
|
| 138 |
+
Cc1ccccc1-n1c(=O)[nH]c(O)c(C2NCCc3ccccc32)c1=O
|
| 139 |
+
COc1ccc(C(=O)Nc2ccccc2Cl)cc1OC(C)=O
|
| 140 |
+
C=C1C(=O)OC2CC3(COC(C)=O)C(CC12)C(=C)C1OC(O)C3O1
|
| 141 |
+
c1cnc2c(C3NCCc4c3[nH]c3ccccc43)cccc2c1
|
| 142 |
+
CC(=O)OC1C(C(C)C)C(O)CC(C)=C2CCC(C)(O)C21
|
| 143 |
+
O=C1NCCc2c1[nH]c1ccc([NH+]([O-])O)cc21
|
| 144 |
+
CC(C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
|
| 145 |
+
COc1ccc2c3ccnc4c3n(c2c1O)C(=O)CC4
|
| 146 |
+
C=C(CCC(C(=O)O)C1CCC2C3=CCC4CC(O)CCC4(C)C3CCC21C)C(C)C
|
| 147 |
+
COCCC(=O)Nc1ccc2n(c1=O)CC1CC2CN(C(C)=O)C1
|
| 148 |
+
CNCC(O)c1cc(O)c(O)cc1F
|
| 149 |
+
C=C(C(=O)OC)C(C)O
|
| 150 |
+
CC1=CC(=O)CC2(C)CCC(=O)C=C12
|
| 151 |
+
COc1ccc(-c2cc3ccc(OC(=O)c4ccccc4OC)cc3oc2=O)cc1
|
| 152 |
+
Clc1ccc(Sc2ccc(NC3=NCCN3)cc2)cc1
|
| 153 |
+
C=C(C)C(=O)OC1CC(C(=O)OC)=CCCC(C)=CC2OC(=O)C(=C)C21
|
| 154 |
+
CCCCCCCC1CC=CC(=O)O1
|
| 155 |
+
Cn1c(O)c(C(=O)C(Cl)Cl)c(=O)c2ccccc21
|
| 156 |
+
CCCCCCCCC=CCCCCCCCC(=O)C1=C(O)CCC(O)C1=O
|
| 157 |
+
O=C(NCCCn1ccnc1)c1cccs1
|
| 158 |
+
CCCCC(CC)CC1(CC)C=C(CC)C(CC(=O)O)OO1
|
| 159 |
+
C=c1[nH]c2onnc2c1=C
|
| 160 |
+
CCCCCCC=CCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCC=CCCCCCCCC)OC(=O)CCCCCCCC=CCCCCCCCC
|
| 161 |
+
CC(O)CC(=O)CCCCCCCCCCCCCCCCCCCCCCCC(O)CCOC1OC(CO)C(O)C(O)C1O
|
| 162 |
+
c1coc(Cc2ccc(Cc3ccco3)o2)c1
|
| 163 |
+
O=C(O)CCC(O)C=C(O)C(=O)O
|
| 164 |
+
CCCCCCCCCCCCCC=CC=CC=CC=CC(=O)O
|
| 165 |
+
CC(C)=CCOc1ccc(C=O)cc1
|
| 166 |
+
CC1CC(O)CC(C)(C)C1
|
| 167 |
+
CC=CC#CC#CC(=O)O
|
| 168 |
+
C=Cc1ccc(OC)c(OC)c1
|
| 169 |
+
C=CCCCCCCCCC1CC(CC(COC)OC)C(=O)O1
|
| 170 |
+
COc1cc(Br)cc2c(O)cc(C(=O)O)nc12
|
| 171 |
+
CC(C)(C)OC(=O)NCC1OCC(N)C1O
|
| 172 |
+
CCOC(=O)c1c(C)oc2ccc(OCC(=O)OC(C)(C)C)cc12
|
| 173 |
+
O=C(Cl)ON1C(=O)CCC1=O
|
| 174 |
+
COc1ccccc1CNC1C2COC(O2)C(N(C)CCc2ccccn2)C1O
|
| 175 |
+
S=c1c2ccccc2oc2ccccc12
|
| 176 |
+
CC(C)=CCCC(C=O)=CC=O
|
| 177 |
+
Oc1ccc(C=Cc2c(O)c(O)c3c(c2O)CCCC3)cc1O
|
| 178 |
+
C=CC(C)(CCC=C(C)Cc1cc(C)co1)CC(=O)c1ccc(O)cc1O
|
| 179 |
+
COc1cc(C2(COC(=O)C(C)C)CO2)c(OC(=O)C(C)C)cc1C
|
| 180 |
+
CCCCCC(=O)c1ccc(O)c(C(=O)Nc2ccc(Br)cc2)c1
|
| 181 |
+
COc1ccc(C(=O)C=Cc2cc3ccccc3o2)c(OCc2ccccc2)c1
|
| 182 |
+
CCCCCCC1CC1CCCC(=O)O
|
| 183 |
+
CC(CCCCCCCCCCCCCCCC(O)CC(=O)O)OC1OC(C)C(O)CC1O
|
| 184 |
+
CCC=CCC=CCC=CCCCCCCCC(=O)O[Si](C)(C)C
|
| 185 |
+
O=C(O)CCCCC1C2NC(=O)NC2CS1(=O)=O
|
| 186 |
+
O=C1CC(=Cc2ccc3[nH]ccc3c2)C(=O)N1
|
| 187 |
+
CC12CCCCC1CCC1C2CCC2(C)C(C3=CC(=O)OC3)CCC12
|
| 188 |
+
CCCCCCCCCCCCCCCC(=O)CC(=O)CCC
|
| 189 |
+
O=C(CN1CCCCC1CCO)c1c[nH]c2ccccc12
|
| 190 |
+
CS(=O)(=O)c1ccc(-c2ccccc2-c2ccc(F)c(Cl)c2)cc1
|
| 191 |
+
Cc1nn(-c2ccccc2)c(Cl)c1C=NO
|
| 192 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=O
|
| 193 |
+
COc1ccc2nc(CC3CN(C4CCOCC4)CCC3CC(=O)O)[nH]c2c1
|
| 194 |
+
O=P(Cc1ccccc1OCCCCCOc1ccccc1)(c1ccccc1)c1ccccc1
|
| 195 |
+
CC1N=CC23CCC4C(CCC5CC(N)CCC54C)C2CCC13
|
| 196 |
+
CCC(O)CCC=CC#Cc1cccs1
|
| 197 |
+
COC1C(O)COC(O)C1O
|
| 198 |
+
C=C1C(=O)OC(CCCCCCCCCCC)C1C(=O)O
|
| 199 |
+
CCCCCCCCCCCCCCCCCCCC(=O)OCC(O)CO
|
| 200 |
+
CC(=O)OCC(COC(C)=O)=C1C=CC2(C)CC=C(C)CCC3OC3(C)CCC12
|
| 201 |
+
COC(=O)C1C(NC(=O)C2CCCC2)CCN1C(C)=O
|
| 202 |
+
C[Si](C)(C)OC1CSSCC1O[Si](C)(C)C
|
| 203 |
+
Nc1ccn(C2OC(CO)C(O)C2O)c(=O)n1
|
| 204 |
+
C=C(C)C1CC=C(C=NO)CC1
|
| 205 |
+
Nc1ccc(O)c(F)c1
|
| 206 |
+
CN(C)Cc1c(O)ccc2c1OC(=Cc1ccccc1Cl)C2=O
|
| 207 |
+
SCOC(OCS)c1ccccc1
|
| 208 |
+
Nc1cc(C2CC3CCC2N3)cnc1Cl
|
| 209 |
+
O=C(O)c1cn2c(n1)COc1ccccc1-2
|
| 210 |
+
CCCCCCCCCCCCCCCCCCN(CC)c1ccc(C(=O)O)cc1
|
| 211 |
+
N=C(N)c1cccc(OCCNC(=O)c2ccc(C(=O)N3CCCC3)cc2)c1
|
| 212 |
+
CCCCOc1cc(OCCCC)c2c3c(c(=O)oc2c1)CCC3
|
| 213 |
+
COc1ccc2cc([NH+]([O-])O)ccc2c1C(=O)O
|
| 214 |
+
COC(=O)CCC(C)CCCCCCC(C)C
|
| 215 |
+
COC(=O)COCCOCC(=O)OC
|
| 216 |
+
C=C(C)C1(O)CCC(C)=CC1=O
|
| 217 |
+
COc1cc2c(cc1OC)C(=CC(=O)c1ccccn1)NCC2
|
| 218 |
+
CC1=CCCC(C)(O)C=CC(C(C)C)CCC(C)(CO)C(=O)C1
|
| 219 |
+
CCNC=C1C(=O)C=C(C2C(C)C=CC3CCCCC32)OC1=O
|
| 220 |
+
O=C(C(c1ccccc1)n1cnnn1)N1CCCCC1c1cccnc1
|
| 221 |
+
CC(=O)C=Cc1ccc(C)c(C)c1C
|
| 222 |
+
COc1cccc2c1C(O)(CC#N)C(O)=N2
|
| 223 |
+
CC(=O)c1ccc(C)cc1OC(=O)c1ccccc1OC(C)C
|
| 224 |
+
CCn1cc(Br)ccc1=O
|
| 225 |
+
C=C(C)c1oc2ccc(C(=O)COC(C)=O)cc2c1OC
|
| 226 |
+
COc1cc(C=Cc2ccccc2)cc(O)c1O
|
| 227 |
+
CC12CC(CCl)C3c4ccc(O)cc4CCC3C1CCC2O
|
| 228 |
+
C=C(C)C1CC=C2C=C(C(C)C)CCC2(O)C1(C)CCC(=O)O
|
| 229 |
+
OC1CCOC1Cc1ccccc1
|
| 230 |
+
CCNc1ccccc1C(=O)O
|
| 231 |
+
OB(O)c1cccc(-c2ccccc2)c1
|
| 232 |
+
CCCCCCC=CCCC=CCCCCCCCCCCCCCCCCCCCC(=O)O
|
| 233 |
+
CCCCCCCC(C)CCCCC
|
| 234 |
+
CCn1c(=O)[nH]c2cc(C(=O)O)ccc2c1=O
|
| 235 |
+
CC(CCO)CCC1C(C)(O)CCC2C(C)(C)CCCC21C
|
| 236 |
+
COc1ccc2c3ccnc(C)c3n(C)c2c1
|
| 237 |
+
CC1(C)C2CCC(C(=O)O)(C2)C1O
|
| 238 |
+
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)Nc1ccc(O)cc1
|
| 239 |
+
COC(=O)C1(c2cc3ccccc3[nH]2)COCC=C2CNCCC21
|
| 240 |
+
Cc1cc(O)c(O)c2c(=O)c(O)c(-c3ccc(O)cc3)oc12
|
| 241 |
+
CCCC1CC(O)C(Cl)C(O)(C(Br)Br)O1
|
| 242 |
+
COc1cc2c(cc1O)C(Cc1ccccc1)NCC2
|
| 243 |
+
CCOC(=O)C=C1CCC2C(O)(CCC3C(C)(C)CCCC32C)C1
|
| 244 |
+
O=C(O)Cc1c(O)cccc1O
|
| 245 |
+
CC(C)CCCC(C)CCCC(C)CCCC(C)C(O)CO
|
| 246 |
+
C[C]1[CH][CH][C](N)[NH+](C[C]2[CH][CH][CH][CH][C]2C)[CH]1
|
| 247 |
+
COc1cc(-c2cc3ccccc3o2)c(C)c(O)c1C
|
| 248 |
+
c1ccc(N=Nc2ccccc2N=Nc2ccccc2)cc1
|
| 249 |
+
N=C(N)NCCCC(NC(CCC(=O)O)C(=O)O)C(=O)O
|
| 250 |
+
Cc1ccc2oc(-c3cc(NCCCC#N)ccc3Cl)nc2c1
|
| 251 |
+
COc1ccc(CC(C)N)c(OC)c1OC
|
| 252 |
+
CCc1cc(C(C)=O)c(O)cc1OCCCCCCC(=O)NC
|
| 253 |
+
NC(Cc1ccc(-c2ccc(CC(N)C(=O)O)cc2O)c(O)c1)C(=O)O
|
| 254 |
+
CCCCCc1ncc(C)s1
|
| 255 |
+
CCCCCCCCCCCCCCCC(=O)OCC(O)COP(=O)(O)OC1C(O)C(O)C(O)C(O)C1O
|
| 256 |
+
COC1=CC(=O)OC(CC(O)c2ccccc2)C1
|
| 257 |
+
CCOC(=O)C(NCC(O)COc1ccccc1C)C(=O)OCC
|
| 258 |
+
Cn1ccc2c(c1=O)C(=O)OC2(C)C
|
| 259 |
+
Cc1cc2cnnc-2no1
|
| 260 |
+
N=C(N)SCc1cccc(CSC(=N)N)c1
|
| 261 |
+
C=C1CCC2C(C)(C)CCCC2(C)C1CC=C1CC(OCC)OC1=O
|
| 262 |
+
CCCCCCC(O)C(=O)O
|
| 263 |
+
CC1=CC2OC(O)C3(C)OC23CCC(C)=CCCC2(C)OC2CC1
|
| 264 |
+
COCCNc1ncccc1-c1noc(C2CCCN2CC(C)C)n1
|
| 265 |
+
CNCc1cccc(OC)c1O
|
| 266 |
+
CCCC=C1OC(=O)c2ccccc21
|
| 267 |
+
Cc1c(CCC(=O)NCCC(=O)O)c(=O)oc2cc3occ(C(C)(C)C)c3cc12
|
| 268 |
+
CC(C)C(Nc1nc(N)nc2[nH]cnc12)C(=O)O
|
| 269 |
+
Cc1c[nH]cc1-c1ccccc1
|
| 270 |
+
O=C1CC2C=CC1CC2
|
| 271 |
+
Cc1ccc(C(=O)NC(=O)CSc2nccc(O)n2)cc1
|
| 272 |
+
COc1cc(O)ccc1-c1oc2cc(O)cc(O)c2c(=O)c1CC=C(C)C
|
| 273 |
+
COc1ccc2nc(C)c(C(N)=O)cc2c1
|
| 274 |
+
CC1C(=O)Oc2cc3c(c(O)c21)CCC1C(C)(C)CCCC31C
|
| 275 |
+
CC(O)C(O)(C(=O)OCC1CCN2CCCC12)C(C)(C)O
|
| 276 |
+
CCCCc1cccc(CCC)c1O
|
| 277 |
+
O=Cc1ccc2ccoc2c1
|
| 278 |
+
CC(C)NC(C)C(O)COc1ccc(CC(N)=O)cc1
|
| 279 |
+
CC(=O)OC1C=C(CCO)C(C)CC2OC(=O)C(C)C12
|
| 280 |
+
CC(=O)OC1CC(C)C2(CCCC(=O)O2)C2(C)CCCC(C)(C)C12
|
| 281 |
+
CC(=CC(=O)O)C1CC2C(C)(CCC3C(C)(C)CCCC32C)O1
|
| 282 |
+
COc1cc2c3c(c1O)C1(CCC(O)CC1)CCC3N(C)CC2
|
| 283 |
+
Oc1nnc2c[nH]ccc1-2
|
| 284 |
+
CCCCCC=CCC=CCCCC1CC(=O)C2CCCCC2N1C
|
| 285 |
+
O=C1C(=CC=Cc2ccccc2)CCC1=C1SCCCS1
|
| 286 |
+
CCN(CC)C(=O)C1CCCN(CCCCCCCCCCCCN2CCCC(C(=O)N(CC)CC)C2)C1
|
| 287 |
+
CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)NC(C)C(=O)O
|
| 288 |
+
O=C(O)CCCCCCCCC(=O)O
|
| 289 |
+
COc1cccc(CCc2ccccc2)c1
|
| 290 |
+
COc1cc2oc(=O)c(CC(=O)NCCCCCC(=O)O)c(C)c2cc1Cl
|
| 291 |
+
COCCCN1C(=O)C(O)=C(C(=O)c2ccc(OCC(C)C)cc2C)C1c1ccncc1
|
| 292 |
+
c1cn(-c2ccon2)cn1
|
| 293 |
+
CC(=O)Nc1nc2ncc(C(=O)O)nc2c(=O)[nH]1
|
| 294 |
+
COc1ccc(O)c2c1C(=O)OC(CCCC(C)=O)C2
|
| 295 |
+
C=C(C)C#Cc1cc(O)c(C#CC(=C)C)c(CO)c1O
|
| 296 |
+
NCc1ccc2c(Br)cccc2c1
|
| 297 |
+
COc1ccc(C=C2Cc3cc(OC)c(OC)cc3C2=O)c(OC)c1
|
| 298 |
+
OC1CNC(c2nc(-c3cnccn3)no2)C1
|
| 299 |
+
CCC=CC=C1COC2(CCC(CO)O2)C1O
|
| 300 |
+
C#CCOC(=O)C(F)(F)F
|
| 301 |
+
CC1CCC(C(C)C)C(OC2OC(CO)C(O)C(O)C2O)C1
|
| 302 |
+
Cc1cc(=O)c2c(o1)-c1cc(O)c(O)cc1OC2
|
| 303 |
+
CC(N)C(=O)NC(CCCCN)C(=O)NC(CCCCN)C(=O)O
|
| 304 |
+
NC(CC[Se][Se]CCC(N)C(=O)O)C(=O)O
|
| 305 |
+
CC(C)=CCCC=C(C)COc1ccc2ccc(=O)oc2c1
|
| 306 |
+
NC(=O)c1cc(Br)ccc1F
|
| 307 |
+
C=CC=C(C)COCC(=O)C#CC
|
| 308 |
+
CC(NC(=O)c1c(O)c2cccc3c2n(c1=O)CC3)c1ccccc1
|
| 309 |
+
O=C(O)Cc1ccc(C(F)(F)F)cc1Br
|
| 310 |
+
CCCCCC1(O)C(C)=C(C)C(=O)C1CC(=O)O
|
| 311 |
+
Cc1cc(=O)c(O)c(C(CC(N)=O)c2cccnc2)o1
|
| 312 |
+
CCOC(=O)Cc1nc(-c2ccc(OC)cc2)oc1-c1ccco1
|
| 313 |
+
CCCCNC(=O)[C@H](C)C[C@H](O)[C@@H](N)C[C@@H](C)Cc1ccccc1
|
| 314 |
+
CCCN=C(C)c1ccccc1
|
| 315 |
+
OCC1OC(n2nnc3c(O)ncnc32)C(O)C1O
|
| 316 |
+
CC(C)=CCCC1(C)C=Cc2c(c(C=O)cc3c2[nH]c2ccccc23)O1
|
| 317 |
+
CCCCCCC=CCCCCCC(OC(C)=O)C(C)NC(C)=O
|
| 318 |
+
C=C1C(=O)CC2CC3(C)CCCC(=C)C3CC12
|
| 319 |
+
COc1cccc(C=CC(=O)O)c1C(C)C
|
| 320 |
+
CCCCCCCCCCCCC(=O)C1(O)C(O)C=CC1OC(C)=O
|
| 321 |
+
CC(=O)C(C)=CC1C(C)=CCCC1(C)C
|
| 322 |
+
CCCCSCC(NC(=O)CC[C@H](N)C(=O)O)C(=O)NCC(=O)O
|
| 323 |
+
O=C1NC2c3ccccc3C3CCC2C3NC(=O)N1c1ccccc1
|
| 324 |
+
CC(=O)OCC1=CCN2CCC(OC(C)=O)C12
|
| 325 |
+
COc1ccccc1C(CCN=Cc1ccc(N(C)C)cc1)C1CCOC(C)(C)C1
|
| 326 |
+
CC(=O)OCC(C)(O)c1ccc(C)cc1
|
| 327 |
+
O=C(O)c1ccc(CN2CCN3C(=O)N(c4ccccc4)CC3C2)cc1
|
| 328 |
+
CCCCCCCCCCCCCCCCCc1cc(O)cc(O)c1
|
| 329 |
+
Cc1ccccc1-c1nc(-c2ccccc2)nc(N)c1CN
|
| 330 |
+
COC(=O)C(C)COC(=O)c1ccccc1
|
| 331 |
+
O=C1C=CCCCc2ccc(O)c(c2)-c2cc(ccc2O)C1
|
| 332 |
+
CCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)C(O)CCCCCCCCCCCCCCCCCCCCCC
|
| 333 |
+
N=c1ncoc2[nH]ccc12
|
| 334 |
+
COc1ccccc1CCNCC1C(=O)OC2CC3=CCCC(C)C3(C)C(O)C21
|
| 335 |
+
COc1cc(OC(C)=O)c2c(=O)cc(C)oc2c1OC
|
| 336 |
+
CCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)O
|
| 337 |
+
CNCCN1CCN([NH+](O)NOc2ccnc(Cl)n2)CC1
|
| 338 |
+
C[C]1[CH][CH][CH][NH+](C[C]2[CH][CH][CH][CH][C]2F)[CH]1
|
| 339 |
+
C=C1C(=O)OC2CC(C)(C3CO3)C(C(=C)C=O)CC12
|
| 340 |
+
CC=CC(=O)OC(=O)C=CC
|
| 341 |
+
C=C1CCC2C(C3C(=C)C(OC)OC(OC(C)=O)C13)C2(C)C
|
| 342 |
+
C=C1CCC2C(=C)C(=O)OC2C=C(C)CCC1=O
|
| 343 |
+
O=S(=O)(O)c1ccc2c3c1-c1cc(ccc1O)CCC=CC3CC2
|
| 344 |
+
CC(=O)OC1CCC2(C)C(CCC(C)=CCO)C(C)=CCC2C1(C)C
|
| 345 |
+
CC(=O)OC1CC(C)C(C=O)=C2C(C)CC(C)(C)C21
|
| 346 |
+
O=C(O)CCCC/C=C(\c1ccccc1)c1cccnc1
|
| 347 |
+
COC(=O)CNC(=O)N1CCc2nc[nH]c2C1c1ccncc1
|
| 348 |
+
COc1cc(CCc2ccc(O)c(O)c2)cc(O)c1OC
|
| 349 |
+
CCCCC=CCCCCCCCc1cc(OC)cc(OC)c1
|
| 350 |
+
C=CCCCCCCCCCCCC=CCCCCCCCC
|
| 351 |
+
N=C(N)c1ccc(CN2CCN(c3cccc(OCC(=O)O)c3)CC2)cc1
|
| 352 |
+
CCCCCCCCCCCCC(C)CCCCCCCCCCCC
|
| 353 |
+
CNC1=CC(=O)CCC1
|
| 354 |
+
Cc1cccc2c1ccn2CCNC(=O)CCC1NC(=O)c2ccccc2NC1=O
|
| 355 |
+
CSC=CC(=O)NCCCCNC(=O)C=Cc1ccccc1
|
| 356 |
+
O=C(O)c1cc2ccc(O)cc2oc1=O
|
| 357 |
+
CC(C(O)c1ccccc1)N(C)Cc1ccccc1
|
| 358 |
+
CCCCCC1C=CC(=O)CCCCCCCCC(=O)O1
|
| 359 |
+
C=C(C)C=Cc1cccc2c1NC1ON=C(C(=O)OC)CC21O
|
| 360 |
+
CCCCC=CCCCCCCCCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O
|
| 361 |
+
CC(=O)C(C=NCC(C)C)C(C)=O
|
| 362 |
+
COCCc1nccc(CC2C(NCc3ccc(F)c(F)c3)CC(O)C2CO)n1
|
| 363 |
+
Nc1ccc2ccccc2n1
|
| 364 |
+
CCCCc1ccccc1-c1n[nH]c(-c2cccc(OC)c2)n1
|
| 365 |
+
CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCC)OC(=O)CCCCCCCC=CCC=CCCCCC
|
| 366 |
+
CC=C(C)C=CCC(C)CCC=C(C)CCC=C(C)C
|
| 367 |
+
COc1ccc2c(c1)cc(C(=O)NC(C(=O)NC(C(=O)O)C(C)C)C(C)C)n2C
|
| 368 |
+
NS(=O)(=O)Oc1cccc(Br)c1
|
| 369 |
+
COc1ccc(C2=CC=C3C=CC=CC3[OH+]2)cc1OC
|
| 370 |
+
CNCCCCCCCCCCCCCc1cccnc1
|
| 371 |
+
c1ccc(C2Nc3ccccc3C3OCCC23)cc1
|
| 372 |
+
O=C1CCCC2=C1C1(CC3c4ccccc4CCN23)SCCS1
|
| 373 |
+
CC(=O)OCCC#Cc1ccc(-c2cccs2)s1
|
| 374 |
+
O=C(c1ccccc1)c1ccc(O)cc1O
|
| 375 |
+
CCCCN(C)CC1OCC(NC(C)=O)C1O
|
| 376 |
+
CN(C=Cc1ccccc1)C(=O)C1OC1c1ccccc1
|
| 377 |
+
CC(=CC(=O)O)CCC1(CO)C(C)CCC2(C)C(C)=CCCC21
|
| 378 |
+
C=C(C)C1CC2(C)C(=CC1=O)CCC(OC(=O)C=CC(C)COC(C)=O)C2C
|
| 379 |
+
Cc1c(OCC(=O)Nc2ccc(C(N)=O)cc2)c(=O)ccn1CCC(C)C
|
| 380 |
+
CC=C1CN2CCc3c([nH]c4ccccc34)C2CC1C(C)=O
|
| 381 |
+
CSC(C)CC(=O)C1=C(C)CCCC1(C)C
|
| 382 |
+
CCCc1nc(OC(C)=O)n(-c2ccccc2)n1
|
| 383 |
+
CCCCn1c(=O)c2cc(C(=O)O)cn2c2ccccc21
|
| 384 |
+
CNC(=O)C(C)(C)N1CCCC1C(=O)NCCN1CCOCC1
|
| 385 |
+
COC1(C)C=CC(C(C)C)CCC2=CC(CC(C)=CCC1)OC2=O
|
| 386 |
+
COc1ccc(CC(C(=O)O)C(=O)O)cc1OC
|
| 387 |
+
CCOC1(C)CCC2C1C1C(CCC2(C)O)C1(C)C
|
| 388 |
+
CCCCCC1NCCS1
|
| 389 |
+
CC(C)n1c(=O)nc(-c2ccccc2)c2cc3c(cc21)OCO3
|
| 390 |
+
CC(=O)OCC1=CCCC2C1(C)CCC(C)C2(C)CCC(C)=CC(=O)O
|
| 391 |
+
Fc1ccc(C2CC3CCC2N3)cn1
|
| 392 |
+
CC(C)C(=O)C1CCC2(O)CCCC(C)C12C
|
| 393 |
+
CC1Cc2ccccc2C1(O)c1ccccc1
|
| 394 |
+
CC1(O)CCC(C2=CCCC3CCCCC23)CC2C1CCC2(C)O
|
| 395 |
+
COc1ccc2oc3c(OC)c(O)cc(O)c3c(=O)c2c1
|
| 396 |
+
OCc1cc(CC2(NCc3ccccc3)COC2)no1
|
| 397 |
+
CC(=O)OCN1C(=O)c2ccccc2S1(=O)=O
|
| 398 |
+
COC(C(O)C=O)C(O)C(O)CO
|
| 399 |
+
COc1ccc(C2COc3cc(O)ccc3C2)cc1
|
| 400 |
+
CC(=O)OCC=C(C)C(=O)C=CC(C)(C)OO
|
| 401 |
+
COCc1cn(C2COC3C(NC(=O)C4CCCCC4)COC32)nn1
|
| 402 |
+
COc1nc(N)nc2c1ncn2C1OC(CO)C(O)C1O
|
| 403 |
+
CC(=O)OC1OCC=C2OC(=O)C=C21
|
| 404 |
+
COC(=O)C=CCCCCC1OC2(CCCCCCCCCCCCCCCOCC(N)CO)CCC(O)C1O2
|
| 405 |
+
CCCCc1ncc(C)nc1C
|
| 406 |
+
NC(CCC(=O)NCCC(=O)O)C(=O)O
|
| 407 |
+
COC1C=C(CCC2(C)C(C)=CC(=O)CC2C)C(=O)O1
|
| 408 |
+
C=C1CC(O)C(O)C2(C)CCC(C(C)(O)CO)CC12
|
| 409 |
+
CC(C)(C)c1nnc(C2CCN(Cc3ccncc3)C2)o1
|
| 410 |
+
CCCC1(CCC)C(=N)NC(=S)N=C1O
|
| 411 |
+
Brc1ccc2n(-c3ccccc3)nc3ccccc3c1-2
|
| 412 |
+
COC(=O)C1Cc2c([nH]c3ccccc23)C2CCC(=O)N12
|
| 413 |
+
O=C(C=Cc1cccc(O)c1O)OC1CC(O)(C(=O)O)CC(O)C1O
|
| 414 |
+
COc1cccc(NC(=O)N2CCOCC(OC)C2)c1
|
| 415 |
+
O=CCCCCC(=O)O
|
| 416 |
+
COc1ccccc1OC
|
| 417 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCC(C)(O)CCO
|
| 418 |
+
CC(C)(C)NCC(O)COc1nsnc1N(CCO)CC(=O)O
|
| 419 |
+
CCCCCCCCCCCC(=O)C1C(=O)CCC(C(=O)OC)=C1O
|
| 420 |
+
O=C1C(=Cc2ccccc2F)Oc2c1ccc(O)c2CN1CCOCC1
|
| 421 |
+
CC(=O)NCC1OC(CO)C(O)C1N(C)CCc1ccccc1
|
| 422 |
+
C#CC=CCC(Br)=C1CC2CC(CC)C(Br)CC2O1
|
| 423 |
+
COc1ccccc1COc1ccc2c3c(c(=O)oc2c1C)CCC3
|
| 424 |
+
CC1(C)CCCC2CC3CC21CC=C3C(=O)O
|
| 425 |
+
C=C(C)C1CCC(C)CC1=O
|
| 426 |
+
CC1CCCc2c(O)c(O)c3c(c21)COC(=O)C3(C)O
|
| 427 |
+
Oc1ccncc1-c1ncccc1O
|
| 428 |
+
[O-][N+]12CCCCC1C(CO)CCC2
|
| 429 |
+
COC(=O)CCC(C)C1CCC2C3CCC4CCCCC4(C)C3CC(O)C12C
|
| 430 |
+
COCCCCCCN/N=C(\C)C(=O)O
|
| 431 |
+
CCCCCCCCC1OCCCC1CCCCCCC
|
| 432 |
+
COC(=O)C1CN(C(=O)c2ccccc2)CCN1C(C)=O
|
| 433 |
+
Cc1ccc(S(=O)(=O)NC(=O)Nc2ccc(Cl)cc2)cc1
|
| 434 |
+
COc1ccc(O)c2c(=O)c3c(O)cc(O)cc3oc12
|
| 435 |
+
CC(C)CCCCCCCCCCCCCCCCCCC(O)C(=O)O
|
| 436 |
+
C=C(C)CCCC(C)=C1CC=C(C)CC1
|
| 437 |
+
c1ccc2oc(C3CCN(C4CCC4)C3)nc2c1
|
| 438 |
+
COc1c(O)cccc1C(=O)OCc1ccccc1
|
| 439 |
+
C=C(CC(=O)C(C)=CCCC(=CCCC(C)=CCO)CO)C(C)C
|
| 440 |
+
CC1=CCCC2C1(C)CCC(C)C2(C)CCC1CC(=O)OC1O
|
| 441 |
+
COC=C(C(=O)OC)C(=CC=Cc1ccccc1)CO
|
| 442 |
+
NC(=O)CC[C@H](NC(=O)OCc1ccccc1)c1nc2ccsc2c(=O)o1
|
| 443 |
+
CC(=O)OC1C=C2C(C)(C)OOC2(O)CC1C
|
| 444 |
+
Cc1cccc(C)c1C(=O)OC1OC(CO)C(O)C(O)C1O
|
| 445 |
+
CN(O)C(=O)Cc1ccccc1
|
| 446 |
+
COc1ccc(C(CC(=O)NCCCNc2ccccc2)c2cc3c(cc2O)OCO3)cc1
|
| 447 |
+
COc1cc(C2c3cc(OC)c(O)cc3CC(C)C2C)ccc1O
|
| 448 |
+
CC(C)=CCc1cc(C(=O)O)ccc1OC(=O)C=C(C)C
|
| 449 |
+
CC(C)CC(N)C(=O)NC(CO)C(=O)O
|
| 450 |
+
OC1C(NCC2CCCCC2)C2COC(O2)C1n1ccnc1
|
| 451 |
+
Cc1cc2c(c(=O)o1)C(c1ccsc1)CC(=O)O2
|
| 452 |
+
CC(=O)OCCI
|
| 453 |
+
CCCCCCCCC(C)CCCCCCCC=CCCCCCCC1OCC(N)C1O
|
| 454 |
+
CC(C)=CCCOc1c2ccoc2cc2oc(=O)ccc12
|
| 455 |
+
O=C1c2[nH]cnc2N(Cc2ccc(F)cc2)C2=NCCN12
|
| 456 |
+
CN1C(=O)c2ccccc2NC(=O)C12OC2c1cccc(O)c1
|
| 457 |
+
CC(=O)OCC1(O)CCC2C1CC(C)(C)CC1CC12C
|
| 458 |
+
Cc1cccc(CC(=O)O)c1
|
| 459 |
+
COc1cc(CC2COCC2C(O)c2ccc(O)c(OC)c2)ccc1O
|
| 460 |
+
CCC=CCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCC=CCC=CCCCCC)OC(=O)CCCCCCCCCCCCC
|
| 461 |
+
CC1(C)C=Cc2cc(C=Cc3cc(O)cc(O)c3)ccc2O1
|
| 462 |
+
NC(CC(O)C(O)C(=O)O)C(=O)O
|
| 463 |
+
CCOC1c2c(ccc3ccc(=O)oc23)OC1C(C)C
|
| 464 |
+
Nc1nc(Cl)nc2c1ncn2C1CCC(CO)O1
|
| 465 |
+
COC(=O)CN1C(=O)C2CC(O)CN2C2(CN(CC(C)C)C2)C1=O
|
| 466 |
+
COC1CC(NC(C)=O)C(O)C(C)O1
|
| 467 |
+
On1cc2nccc-2cn1
|
| 468 |
+
C=CCCCC=CC=C(C)CCCCC=CCCC=CC(=O)NC(CO)CO
|
| 469 |
+
COc1ccccc1C=C1Oc2c(ccc(O)c2CN2CCCC2)C1=O
|
| 470 |
+
COc1ccc2ccc(=O)oc2c1C(O)C(O)C(C)C
|
| 471 |
+
C=CC1CN2CCC1CC2CNC(=O)c1ccc2c(c1)OCO2
|
| 472 |
+
CN1CCCN2CCN(CCCN(CCC#N)CC1)CC2
|
| 473 |
+
Cc1c([N+](=O)[O-])oc2ccccc12
|
| 474 |
+
O=C(NCC1CCCCC1)c1cccc2nc(CCl)cn12
|
| 475 |
+
COC(=O)c1cncc(C(C)OC)c1
|
| 476 |
+
O=C(C=Cc1ccccc1)NCCc1ccc(O)cc1
|
| 477 |
+
CN(C)c1ccc(C=C(C#N)c2nc(O)c3ccccc3n2)cc1
|
| 478 |
+
O=C(CCCN1Cc2ccccc2C1=O)NCC1CCCN2CCCCC12
|
| 479 |
+
CCCCCCCCCCCCCCCCCCCCCCC(=O)OCC=C(C)CCCC(C)CCCC(C)CCCC(C)C
|
| 480 |
+
C=C1CCC(=O)C(C)CCC2C1CC2(C)C(=O)CCC(C)O
|
| 481 |
+
COc1ccc(C2OC(=O)C(C)(C)C(=O)C2C)cc1OC
|
| 482 |
+
Cc1ccccc1NC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)C1N2
|
| 483 |
+
C=C1CCC2C(C)(C(=O)O)CCCC2(C)C1CCC1COC(OC)C1
|
| 484 |
+
CC12CCC3c4ccc(O)cc4C(=O)CC3C1CCC2O
|
| 485 |
+
NC(CCNC(CNC(Cc1c[nH]cn1)C(=O)O)C(=O)O)C(=O)O
|
| 486 |
+
COc1cc2[nH]c(C(=O)O)c(C=O)c2cc1OC
|
| 487 |
+
COc1cccc(C2CC(CO)C3CC(O)CCN3C2)n1
|
| 488 |
+
CC(C)N1CCN2C(=O)N(C3CCCCC3)CC2C1
|
| 489 |
+
CC(NC(=O)C(N)CC(=O)O)NC(=O)N(C)C(C)(C)C
|
| 490 |
+
CCCCCC=CCC=CCCCCCCCC(=O)OC(COC(=O)CCCCCCCCCCCCCC)COC(=O)CCCCCCCCCCCCCCC
|
| 491 |
+
CC(C)COC(=O)c1ccccc1C(=O)OCC1CCCCC1
|
| 492 |
+
COc1cc(NCCCN)c2nccc(C)c2c1Oc1ccccc1
|
| 493 |
+
Clc1ccc(-c2nnc(-c3ccc(Cl)cc3)s2)cc1
|
| 494 |
+
O=C1Nc2ccccc2-c2cccn2[C@H]1Cc1ccc(O)cc1
|
| 495 |
+
CC(C=O)=CCCC(C)(O)C=Cc1cc(O)ccc1O
|
| 496 |
+
CC(=O)c1ccc2[nH]c3c(c2c1)CCCC3=NCCO
|
| 497 |
+
CC1=CCC(C(C)=CC(O)C(O)C(C)(C)O)CC1
|
| 498 |
+
CC1=CCC2(O)C(C)=CCC3C(C)C(=O)OC3C12
|
| 499 |
+
C=C(C(=O)OC)C1CCC2(C)C(O)C(O)CC(=C)C2C1O
|
| 500 |
+
CCCCCNC(=O)COc1ccc2nc3n(c(=O)c2c1)CCCCC3
|
| 501 |
+
CCOc1cccnc1
|
| 502 |
+
CC(=O)NCCCCC(=O)O
|
| 503 |
+
CN(C)c1ccc(-c2nc3ccc(I)cc3s2)cc1
|
| 504 |
+
CCCCCC=CCC=CCCC1OC1CCCC(=O)O
|
| 505 |
+
c1ccc2c(c1)NCC(C1=NCCN1)O2
|
| 506 |
+
COc1cc2c(c3c1C=COC=C3)OC(C(C)(C)O)C2
|
| 507 |
+
O=C1NC(=O)C2=C1CCC2O
|
| 508 |
+
CCC(CC=CCCC(=O)O)CCCCCCC1C=CCC1
|
| 509 |
+
CCCCCC(O)CCC(=O)Cc1ccc(O)c(OC)c1
|
| 510 |
+
CC(=O)OC(C)(C)C1CCC(C)=CCCC(C)=CCCC2(C)OC2C1
|
| 511 |
+
CC(C)CNC1C2COC(O2)C(n2cncn2)C1O
|
| 512 |
+
O=CCS(=O)(=O)O
|
| 513 |
+
CC(C)C(NC(=O)C(CS)NC(=O)CCCC(N)C(=O)O)C(=O)O
|
| 514 |
+
Clc1ccc(C2N(c3ccccc3)CCN2c2ccccc2)cc1
|
| 515 |
+
CC1(C)SC2C(N=C(O)CCCCO)C(=O)N2C1C(=O)O
|
| 516 |
+
CC(=O)C=CCC(C)C1CCC(C)c2c(O)cc(C(=O)O)cc21
|
| 517 |
+
CNC1CCC2(C)C(=CCC3C2CCC24C(=O)OC(C)C2CCC34)C1
|
| 518 |
+
C=CCn1ncc2c(CC)ncn2c1=S
|
| 519 |
+
CCC(=O)C=CC=CC(C)C(O)CC
|
| 520 |
+
N=c1cn[nH]c2cocc12
|
| 521 |
+
COP(=O)(O)N(C)N=C(O)C(N)CC(C)C
|
| 522 |
+
COc1cccc2ccn(CCC(=O)N3CCC(C(=O)O)CC3)c12
|
| 523 |
+
C=CC1C2C=C(CO)CC1(NC)c1ccc(=O)[nH]c1C2
|
| 524 |
+
COc1cccc2c1C(=O)c1ccccc1C2=O
|
| 525 |
+
CCCCCCC(=O)C(Br)=C(Br)Br
|
| 526 |
+
O=C(O)c1cc(C=CC(=O)c2ccccc2O)c2c(c1)COCO2
|
| 527 |
+
CC=C1CN(C)CC2CCc3c([nH]c4ccccc34)C(=O)C12
|
| 528 |
+
N#CC(C#N)=Cc1ccc2c(c1)OCO2
|
| 529 |
+
Nc1ccc(-c2nc3cc(F)ccc3s2)cc1Cl
|
| 530 |
+
CCn1c(SCC(=O)Nc2cccc(C)c2C)nc2c(=O)[nH]cnc21
|
| 531 |
+
CCCC=CCC1CC=CC=C(C)C=CC=CC(O)CC=C(C)C=CC=CC(O)=N1
|
| 532 |
+
Oc1ccccc1CNC1C2COC(O2)C(n2cnc3ccccc32)C1O
|
| 533 |
+
CC(C)[C@H](N)C(=O)O
|
| 534 |
+
C=C(CC=CC(C)(C)OO)C1CC=C(C)CC1
|
| 535 |
+
O=C(NCc1ccc2c(c1)OCO2)NC(Cc1ccccc1)C(=O)O
|
| 536 |
+
COC(=O)/C=C/c1ccccc1OCC(O)CNCCNC(=O)C(C)C
|
| 537 |
+
O=C(O)c1ccccc1C(=O)NCCC(c1ccccc1)c1ccco1
|
| 538 |
+
COc1cccc(C=CC2=NCCCC2)c1
|
| 539 |
+
CCCCCCCCCCCCC(SCCC(=O)O)SCCC(=O)O
|
| 540 |
+
CC(C)(C)CCNC1CC(O)C(CO)C1Cc1ccnc(-c2ccccc2)n1
|
| 541 |
+
NC(N)=NCCCC(N)C(=O)NC(Cc1ccccc1)C(=O)O
|
| 542 |
+
CC(=O)c1ccccc1OCC(=O)O
|
| 543 |
+
Cc1c(O)cc2c3c1C(=O)OCC3(C)CC2(C)C
|
| 544 |
+
O=C(Cc1ccccc1)NC1COC(CN2CCC(F)(F)CC2)C1O
|
| 545 |
+
N#Cn1c(N)nc2ccccc21
|
| 546 |
+
CCCCCCCCCC(O)CNC(C)=O
|
| 547 |
+
COc1cc(CCC(=O)CC(O)CC(C)CCCO)ccc1O
|
| 548 |
+
CC1CCC2(O)C(C)(C)C3CC(O)C2(C)C1C3
|
| 549 |
+
O=C(O)c1ccc(C(=O)CBr)cc1
|
| 550 |
+
CCCC=CC=CC1CC(O)C(O)C(O)C1
|
| 551 |
+
C=C1CC(=O)OC1=O
|
| 552 |
+
CCCc1ncc(C)nc1C
|
| 553 |
+
CCCCCCCCCCCCCC(=O)CC(O)CCCCC
|
| 554 |
+
C=CCC=CCC=CCCCCCCC=CCCCC(=O)O
|
| 555 |
+
O=Cc1ccc(COCCc2ccc(O)cc2)[nH]1
|
| 556 |
+
CCCCCCCCCCC(O)CCCCCCCCCCCCCC(=O)O
|
| 557 |
+
COc1ccc(C(=O)COC(C)=O)cc1OC
|
| 558 |
+
CN[C@H](CS)CCC(=O)O
|
| 559 |
+
CC1OC(O)CC(O)C1O
|
| 560 |
+
C=CCOC(=O)CCCCC
|
| 561 |
+
O=C(O)CCSCCSCCC(=O)O
|
| 562 |
+
CC12CCC3C4CCC(=O)C=C4CCC3C1CCC2OC(=O)CCC1CCCC1
|
| 563 |
+
OCc1ccc(COCCc2ccccc2)o1
|
| 564 |
+
COC(=O)c1ccccc1NC(=O)N1CCc2nc[nH]c2C1c1cccnc1
|
| 565 |
+
CCCCCCCCCCCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCCCCCC
|
| 566 |
+
C=C1CCC2C(C)(C)CCCC2(C)C1COC(C)=O
|
| 567 |
+
O=C1NCCCC1=CC(=O)c1c[nH]c2ccccc12
|
| 568 |
+
C=C(C)C1CC=C(C)CCC=C(C)CC1
|
| 569 |
+
O=C1C(O)=CC(=O)c2c(O)cccc21
|
| 570 |
+
O=C(CC1Sc2ccccc2N(CCC2OCCO2)C1=O)NO
|
| 571 |
+
O=c1ccc2c(OC3OC(CO)C(O)C(O)C3O)cccc2o1
|
| 572 |
+
C=C1CCC2C(C3CC(C)CC13)C2(C(N)C=O)C1CCCCC1
|
| 573 |
+
CCCCOc1ccc([C@@H]2CC[C@H](NC)c3ccccc32)cc1
|
| 574 |
+
O=C(O)[C@H]1C[C@@H](CO)N1
|
| 575 |
+
N=S(=O)(O)c1ccccc1
|
| 576 |
+
CC(C)CC(NCC=Cc1ccccc1)C(=O)O
|
| 577 |
+
O=CNCCCCN(CCCNC(=O)c1ccccc1)C(=O)C=Cc1ccccc1
|
| 578 |
+
CC1C(c2ccccc2)OC(C#Cc2ccccc2)N1C
|
| 579 |
+
CCC(C)CC(C)C
|
| 580 |
+
N#CCCC1CS1
|
| 581 |
+
COc1c2c(cc3c1[nH]c1ccccc13)C(=O)CC2O
|
| 582 |
+
CCCCCCC=CCCCC1=NC(=Cc2ccc(O)cc2)C(=O)O1
|
| 583 |
+
OCCN(CCO)CCc1ccc(CSc2ccccc2)cc1
|
| 584 |
+
CC12CCCC(O)(C1)C1CCC1C(O)C2
|
| 585 |
+
OCC(O)CC#CC#Cc1ccccc1
|
| 586 |
+
COc1cc(O)cc2c1C(=O)C(O)C(c1ccccc1)O2
|
| 587 |
+
CCc1cc2c(=O)c(-c3nc(C)cs3)c(C(=O)O)oc2cc1O
|
| 588 |
+
CCCCCC(=O)NCC(=O)c1ccc(O)cc1
|
| 589 |
+
CC(C)(C)N=C1C(=O)N=C2C=CC=CN21
|
| 590 |
+
C=CCc1cc(OC)c(OC)c(OC)c1OC
|
| 591 |
+
COc1ccc2c(c1)C=CC(c1ccccc1)O2
|
| 592 |
+
OC(c1ccccc1)C1CCCCN1CCc1ccccc1
|
| 593 |
+
CCCCCCCCCCCCOc1ccc(C(=N)N)cc1
|
| 594 |
+
CCOc1ccc2c(=O)c(Oc3ccccc3)c(C)oc2c1
|
| 595 |
+
CC#CC=C1C=CC2(CCCO2)O1
|
| 596 |
+
Cc1cc2ncccc2c2nc(N)n(C)c12
|
| 597 |
+
O=C(C1=C(O)CCC1)C1CC1
|
| 598 |
+
COC=C(C(=O)OC)C(C)C(C=Cc1ccccc1)OC
|
| 599 |
+
COC(=O)C(CC=Cc1ccccc1)NC(C)=O
|
| 600 |
+
Nc1ccc(-c2nc3ccc(F)cc3s2)cc1I
|
| 601 |
+
CCCCCCCCCCCCCc1cc(=O)c2c(O)cc(O)cc2o1
|
| 602 |
+
CC(=O)NCCc1c(Br)[nH]c2ccccc12
|
| 603 |
+
C#CCN(C)Cc1nc(C2(O)CCN(C(=O)CCc3ccccc3)CC2)cs1
|
| 604 |
+
Cc1c(C)c(CCl)c2ccccc2c1CCl
|
| 605 |
+
CNCC1(c2ccc(OC)c(OC)c2)CCCC1
|
| 606 |
+
CC(c1cc2ncccc2s1)N(O)C(N)=O
|
| 607 |
+
CCC(=O)c1ccc2c(c1)N(CCCN(C)C)c1ccccc1S2
|
| 608 |
+
CCCc1scnc1CC
|
| 609 |
+
O=C(N[C@H]1CN2CCC1CC2)c1ccccc1
|
| 610 |
+
CC(C)COC(=O)C1C(=O)CC(C)(O)C(C(=O)OCC(C)C)C1c1ccc(O)cc1
|
| 611 |
+
COC1=CC23CCN(C)C(Cc4ccc(O)c(OC)c42)C3=CC1=O
|
| 612 |
+
COc1cc2[nH]c3cccc(OC)c3c2cc1C
|
| 613 |
+
CC(=O)Nc1ccc(F)c(Cl)c1
|
| 614 |
+
CCCCCCCC(C)CCCCCCCC(O)CC(=O)OCCCc1cc(O)c(O)c(OC)c1
|
| 615 |
+
Nc1cccc(OC2OC(C(=O)O)C(O)C(O)C2O)c1
|
| 616 |
+
CC(=CCOc1ccc2ccc(=O)oc2c1)CCC(=O)C(C)C
|
| 617 |
+
CC/C(=C(/c1ccccc1)c1ccc(O)cc1)c1ccccc1
|
| 618 |
+
C=CCC/C=C(\NC(=O)C1CC1(C)C)C(=O)O
|
| 619 |
+
CCc1ccc2cc(-c3ccc(Cl)cc3)cn2c1
|
| 620 |
+
CCC(CC)C(C)C
|
| 621 |
+
O=C1OC(=O)C2C3OC(C=C3COC3CCCCC3)C12
|
| 622 |
+
C=CC1(C)Cc2c(O)ccc(OC)c2CC1C(=C)C
|
| 623 |
+
C=C1CC(OC(=O)C(O)=CCO)C2C(=C)C(=O)OC2C2C(C)=CCC12
|
| 624 |
+
O=NN(CCF)C(=O)NCCF
|
| 625 |
+
Cc1ccc2[nH]c(C3CCN(C(=O)c4ccccc4)C3)nc2c1
|
| 626 |
+
CC1=CCCC2C1(C)CCC1(C)C3=C(CC21C)C(=O)C=C(NCC(C)C)C3=O
|
| 627 |
+
Cc1ccc(C(=O)Oc2ccccc2)cc1
|
| 628 |
+
c1ccc(CCc2ccccc2OCCCCN2CCNCC2)cc1
|
| 629 |
+
Cc1ncc2n1-c1ccc(Cl)cc1C(c1ccccc1F)=NC2
|
| 630 |
+
O=C(NC(=O)c1ccccc1Cl)Nc1ccc(OC(F)(F)F)cc1
|
| 631 |
+
CNCCc1cc(Br)c(OCCCN)c(Br)c1
|
| 632 |
+
CCCCC(C)CC(C)CC(C)C(=O)OC
|
| 633 |
+
COc1ccc(C(=O)C=Cc2cccs2)c(OC(=O)c2ccccc2)c1
|
| 634 |
+
COc1ccccc1CCNC(=O)Cn1cc(OC)c(=O)cc1C
|
| 635 |
+
CC=C(CC(C)C(O)(CO)C(=O)O)C(=O)O
|
| 636 |
+
CCC=CCC(O)CCO
|
| 637 |
+
CCC=CCC=CCC(O)C(O)C(O)C=CCC=CCC=CCCC(=O)O
|
| 638 |
+
CCOC(=O)C1(c2ccccc2)CCN(C)CC1
|
| 639 |
+
C=C1C=Cc2c(c(O)c3occc3c2CC=C(C)C)O1
|
| 640 |
+
N=C(N)NCCCCNCCCNCCCNCCCNC(=O)c1ccc(O)cc1
|
| 641 |
+
CC(=O)N(C)CCc1c[nH]c2ccccc12
|
| 642 |
+
CCCCCCCCCCCCC1(O)C(O)C=CC(=O)C1O
|
| 643 |
+
CCC(C)CC(C)CCCCCCCCCCC(O)C(C)N
|
| 644 |
+
O=C1CCCCCCCCCCC(=O)OCCC1
|
| 645 |
+
C=C1C=CC(C(C)C)C12CC=C(C(=O)O)CC2
|
| 646 |
+
COc1cc(C)cc(OC)c1
|
| 647 |
+
C=CC1(C)CCC(C(=C)C)C2C3(C(=O)Nc4ccccc43)C21NC
|
| 648 |
+
COC(=O)C(C)COc1coc2c1c(O)cn2Cc1ccccc1
|
| 649 |
+
CCN(CC)CCCNc1nccc2c(C)c3[nH]c4ccc(O)cc4c3cc12
|
| 650 |
+
O=C(NCC1CCCN2CCCCC12)c1n[nH]c2ccccc12
|
| 651 |
+
CCC(CC(C)=O)N1CNc2nc[nH]c(=O)c21
|
| 652 |
+
O=C1CCOC(c2ccccc2)C1
|
| 653 |
+
O=C(O)C(c1ccccc1)c1ccccn1
|
| 654 |
+
O=C1Nc2ccc(Br)cc2C(=O)N2CCN(C(=O)NC3CCCCC3)CC12
|
| 655 |
+
CC(C)(C)OC(=O)NNC(=NCC(=O)O)NNC(=O)OC(C)(C)C
|
| 656 |
+
O=C1c2scc(-c3ccc(F)cc3)c2-n2cccc21
|
| 657 |
+
Cc1cc(C)c2nc(C)cc(O)c2c1
|
| 658 |
+
CC(=O)Nc1nc(C)c(O)c(C)c1C
|
| 659 |
+
ON(Cc1c(F)cccc1Cl)Cc1c(F)cccc1Cl
|
| 660 |
+
CCCCCCCCCCCCC(C)=O
|
| 661 |
+
O=C(O)C1CCCN1C(=O)OCc1ccccc1
|
| 662 |
+
COc1ccc(CN2Cc3nc[nH]c3CC2C(=O)NC2CCCCC2)c2ccccc12
|
| 663 |
+
CC(=O)NC(Cc1cn(C)cn1)C(=O)O
|
| 664 |
+
COC(=O)c1c(C)c(C)c(O)c(C)c1O
|
| 665 |
+
c1ccc(Cc2nnc(C3CCN(C4CCCCC4)C3)o2)cc1
|
| 666 |
+
CN(C)CCN(Cc1cccs1)c1ccccc1
|
| 667 |
+
COc1ccc(-c2coc3cc(O)c(O)cc3c2=O)cc1O
|
| 668 |
+
C=CCC(=NOS(=O)(=O)O)SC1OC(O)C(O)C(O)C1CO
|
| 669 |
+
CCN1c2ncccc2-c2nccn2-c2cccnc21
|
| 670 |
+
CCCC(O)C(O)C1CC(OC)=CC(=O)O1
|
| 671 |
+
CC1=CC(=O)C2C(C)CCC(=C(C)C)C2C1
|
| 672 |
+
COc1ccc(C2NC(CO)C(O)C2O)cc1O
|
| 673 |
+
CCOCCOCCO
|
| 674 |
+
CCCCCCCCCC=CCCC=CC(O)C(CO)NC(=O)C(O)CCCCCCCCC
|
| 675 |
+
CCNC(=O)CNC(=O)COc1ccccc1C(=O)OC
|
| 676 |
+
CCOC1CC(CCC2(C)C(C)CCC3(C)C(CO)=CCCC32)CO1
|
| 677 |
+
O=C(CC(=O)OCc1ccccc1)OCc1ccccc1
|
| 678 |
+
Cc1cc2nnncc2o1
|
| 679 |
+
CCCC=CC(CC)CC1C=C(CC)C(=CC(=O)OC)O1
|
| 680 |
+
CCn1c2ccccc2c2ccc(N)nc21
|
| 681 |
+
[CH2-][NH2+]C1C(C(C)C)CCC(C)C12C=C(C)CC2
|
| 682 |
+
CC(C)(C)CC(=O)NCC1COCc2nc3cccnc3n21
|
| 683 |
+
COCCN(CC1CCCN2CCCCC12)C(=O)c1cc(CCC(C)C)n[nH]1
|
| 684 |
+
COc1c2occc2cc2c(=O)cc(-c3ccccc3)oc12
|
| 685 |
+
CC(=NNC(N)=S)C(=O)Nc1ccc(Br)cc1
|
| 686 |
+
Oc1cc(-c2ccccc2Cl)nc2cc3c(cc12)OCO3
|
| 687 |
+
N#Cc1cncc(/C=C/c2ccccc2)c1Oc1ccc2[nH]ccc2c1
|
| 688 |
+
CCc1ccc(C(C)NC(=S)NC2CCCCC2)cc1
|
| 689 |
+
COC(=O)C=Cc1ccc(OC2OC(CO)C(O)C(O)C2O)c(OC)c1
|
| 690 |
+
C=C1CCC=C(C)CCC2C1C(=O)OC2(C=CC=C(C)C)CO
|
| 691 |
+
Cc1cc(Nc2cccnc2)c2c(ccc3c[nH]nc32)n1
|
| 692 |
+
C#CC(Br)C1CC(O)C(CC(OC(C)=O)C(Br)CC=CCC)O1
|
| 693 |
+
CC(C)C1=CC(O)C(C)(O)CC1O
|
| 694 |
+
CC1(C)CCCC2(C)C1CCC1(C)C3COC(=O)C3=CC(O)C12
|
| 695 |
+
COc1ccc(C(=O)Nc2cc(C)ccc2C)cc1OC
|
| 696 |
+
CC1=CCCC(=O)C=CC(C)(C)CCC1
|
| 697 |
+
CCCCCCSc1cc(C(N)=S)cc(Cl)n1
|
| 698 |
+
C=CC(C)(O)CCC1(C)C2=CC(=O)CC(C)(C(=O)O)C2CCC1C
|
| 699 |
+
COc1cc(OC)c2ccc(=O)oc2c1OCC=C(C)C
|
| 700 |
+
C#CCN(C)CC1CN2CCC1CC2CNC(=S)NCCN1CCOCC1
|
| 701 |
+
CC(C)(C)CC(=O)NC1C(c2cccs2)N(C(=O)c2ccccc2)CCC1(C)O
|
| 702 |
+
COc1ccc2c(C)c(CCC(=O)NC(C)C(=O)O)c(=O)oc2c1
|
| 703 |
+
CC(C)Nc1ncccn1
|
| 704 |
+
C=CCC1CC2(CC=C(C)C)OCOC2=CC1=O
|
| 705 |
+
COc1ccc2c(c1)OC=C(c1ccc(OC)c(OC)c1)C2O
|
| 706 |
+
O=C(NCCCN1CCC(Cc2ccc(F)cc2)CC1)NC1CCCCC1
|
| 707 |
+
C=C(C)C1CC2C(C)(C)C(Br)=CC(O)C2(C)OCO1
|
| 708 |
+
O=C(CC(CN1CCN(CC=Cc2ccccc2)CC1)C(=O)O)Nc1cccc(O)c1
|
| 709 |
+
Cc1[nH]c(O)nc1C(=O)c1ccc(Cl)cc1
|
| 710 |
+
CC(=O)OCC(=CCCC(C)=CCO)CCCC(C)C(=O)CC=C(C)C
|
| 711 |
+
Oc1c2ccccc2cc2ccccc12
|
| 712 |
+
CC(O)C12CCCN3CCc4c(n(c5ccccc45)CC1)C32
|
| 713 |
+
CCCCCCCCCCCCCCCCCCC(O)CC(=O)c1ccccc1
|
| 714 |
+
Cc1ccc2c(c1C)CCc1cc(C(C)C)c(O)cc1-2
|
| 715 |
+
Cc1ncc(COP(=O)(O)O)c(CO)c1O
|
| 716 |
+
COc1c(C=O)c(CC(C)=O)cc(O)c1CO
|
| 717 |
+
CC(C)C=CC=C(CO)C1CCC2(C)CC(O)(CCC2O)C1CO
|
| 718 |
+
CNC1=CC2=NCCc3c[nH]c(c32)C1=O
|
| 719 |
+
CC1=CC2CC(C)C1(C)CCC1=CC(=O)N(CC(=O)O)C12
|
| 720 |
+
C=CCc1cc(C(C)=O)c(O)cc1OCCCCCC(=O)OC
|
| 721 |
+
CCCCCCCCCC=CC=C(C)C(=O)O
|
| 722 |
+
COc1ccc(CNc2nnc(-c3ccccc3)c3ccccc23)cc1
|
| 723 |
+
CC1=NN(c2cc(C)ccn2)CC1
|
| 724 |
+
COc1cc2c(c3oc(CO)cc(=O)c13)C=CC(C)(C)O2
|
| 725 |
+
CC1CC2C=CC3COC(=O)C3(C)C2CC1O
|
| 726 |
+
CCOC(=O)c1ccc2[nH]c(O)c(Cc3ccccc3)c(=O)c2c1
|
| 727 |
+
CCC(C(=O)c1ccc(OC)cc1)C1CCC(OC)CC1
|
| 728 |
+
CC1(C)CCn2nc(COc3ccccc3)cc2C1=O
|
| 729 |
+
CCCN(CCC)C(=O)c1ccccc1CN(CC)Cc1ccccc1
|
| 730 |
+
CCCCCCCCCCCCCCCCC(C)=O
|
| 731 |
+
CCC=CCC=CCC=CC=CC(O)CC=CCCCCCC(=O)OCC
|
| 732 |
+
CC=C1CN2CCC3(C(=O)Nc4ccccc43)C2CC1CCO
|
| 733 |
+
CCC1(C)COc2ccc(C(=O)CCCC3CC3)cc21
|
| 734 |
+
COc1cccc2c1ccn2CCNC(C)=O
|
| 735 |
+
Cc1ccc(C(C)(C)O)cc1
|
| 736 |
+
CC1(C)S[C@@H]2[C@H](S)C(=O)N2[C@H]1C(=O)O
|
| 737 |
+
COc1cccc(CN=C(O)CCCCCc2ccccc2)c1
|
| 738 |
+
COc1c(C)c(CCCCCCCCCCSC(C)=O)oc(=O)c1OC
|
| 739 |
+
CCCCC1=CC(=O)C=C(OC)C1=O
|
| 740 |
+
O=C(CCC1NC(=O)c2ccccc2NC1=O)Nc1ccc2[nH]ccc2c1
|
| 741 |
+
CC(C)C=C(NC(=O)c1ccccc1)C(=O)O
|
| 742 |
+
C=CCCCC=CCC=CCC=CCC=CCCCCC
|
| 743 |
+
CC(C)Oc1ccc(C(=O)NC(Cc2c[nH]c3ccccc23)C(=O)O)cc1
|
| 744 |
+
C#CC1(O)CCC2C3CCC4=CC(=NO)CCC4C3CCC21CC
|
| 745 |
+
COc1c(C(C)C)cc2c3c1OC(=O)C31CCCC(C)(C)C1CC2
|
| 746 |
+
CNC1=CC(=CCC(=O)O)CC1
|
| 747 |
+
CC1=CCSS1
|
| 748 |
+
COC(=Cc1ccc(O)cc1)C(=O)NC=Cc1ccc(O)cc1
|
| 749 |
+
Nc1ccccc1C(=O)OC1CCCCC1
|
| 750 |
+
O=C(CCn1ccc2c(Br)cccc21)NC(Cc1ccccc1)C(=O)O
|
| 751 |
+
Oc1cccc2[nH]ccc12
|
| 752 |
+
COc1cccc2nc3c(O)cccc3nc12
|
| 753 |
+
COc1cc(CC(C)C(C)C(OC(C)=O)c2ccc3c(c2)OCO3)ccc1O
|
| 754 |
+
COCC(=O)NCC1C=C(C)C(CC(=O)N2CCN(C)CC2)CC1C(C)C
|
| 755 |
+
CCC(=O)OC1c2c(C)coc2C(=O)C2C(O)CCC(C)C12C
|
| 756 |
+
Oc1cc(-c2ccccc2)c(O)c2c1-c1ccccc1CO2
|
| 757 |
+
C=C(C=CC=CC=CCCC)CC
|
| 758 |
+
Oc1cccc(CCC=CCC=CCCCCCCCc2cccc(O)c2O)c1O
|
| 759 |
+
COc1ccccc1C(CCNC(C)c1ccccc1)C1CCOC(C)(C)C1
|
| 760 |
+
CC1CCC(C)C12CCC1C2=CC2(C)CCC(C)(C)C12
|
| 761 |
+
O=C(O)CCCCCCCCC(=O)Nc1ccc(Cl)c(Cl)c1
|
| 762 |
+
COc1ccc(C(=O)CCN2CCCCCC2)c(OC)c1
|
| 763 |
+
CCCCCCCCCCC=CCC=CCCCCCC1=CC(C)(O)OC1=O
|
| 764 |
+
C=C1CCCC2(C)CC3OC(=O)C(CN(C)CC(O)c4ccc(O)cc4)C3CC12
|
| 765 |
+
CC(=O)NC1C(SCC(O)CN)OC(CO)C(O)C1O
|
| 766 |
+
Cc1coc2cc3oc(=O)c(CC(=O)N4CCOCC4)c(C)c3cc12
|
| 767 |
+
CC12CCC(C(OC(=O)C=Cc3ccc(O)cc3)C1)C2(C)C
|
| 768 |
+
CC(C)=CCCC(C)=CCCC1(C)OC1CCC(=CCO)CO
|
| 769 |
+
CCCCCCCCCCCCCCCCCCCC(=O)OCCCCCCCCCCCCCCCCC
|
| 770 |
+
CC(C)c1cc(C=O)c(C2CCCC(C)(C)C2C=O)cc1O
|
| 771 |
+
CCCCCCCC(C)=CC(CC=CCCC(=O)OC)OC
|
| 772 |
+
CCCCCC(=O)NC1COC(CN2CCN(C)CC2)C1O
|
| 773 |
+
CCCCCC(O)C(O)C(O)C=CCCCCCCCC(=O)O
|
| 774 |
+
CC=C(COC(=O)C(C)=CC)OC(=O)CC=Cc1cc(OC)c2c(c1)OCO2
|
| 775 |
+
CC#CC#Cc1ccc(C#CC(O)COC(C)=O)s1
|
| 776 |
+
CCNC1=NC(=O)C2(CC(C)(C)Oc3ccc(F)cc32)N1
|
| 777 |
+
COC(=O)C1(C)CCCC2(C)C(Cc3ccoc3)CCCC12
|
| 778 |
+
Clc1ccc(C2OCC3(CO2)CC2C=CC3C2)cc1
|
| 779 |
+
CC(C)C(=O)OCC1(c2ccc(CO)cc2OC(=O)C(C)C)CO1
|
| 780 |
+
Cc1cc(C)c(C=CC(=O)c2ccccc2)c(C)c1
|
| 781 |
+
CC(=CCCC12C(=O)OC3(O)CC1CC2C3C)C(N)=O
|
| 782 |
+
c1ccc2c3c([nH]c2c1)CNCC3
|
| 783 |
+
OC1C(NC2CCCC2)C2COC(O2)C1n1cnc2ccccc21
|
| 784 |
+
CCCCNC(=O)NNC(=O)OCC
|
| 785 |
+
CC(=O)OCC(C)=CC(=O)OC1C=CC(C)(C)C(C)=C1C=O
|
| 786 |
+
O=C(O)C1CCCN2CCCCC12
|
| 787 |
+
CC(C)N1CC2OCC(=O)N(C(C)C)C2C1
|
| 788 |
+
c1ccc(-c2c[nH]c(C3COCCN3C3CCC3)n2)cc1
|
| 789 |
+
Cc1cncc(SCc2ccco2)n1
|
| 790 |
+
C=CC(C)(C)OC1OC(CO)C(O)C(O)C1O
|
| 791 |
+
C=CC(C)(CCC=C(C)CCC=C(C)C(O)C(O)C=C(C)C)Oc1ccc(O)cc1CC(=O)OC
|
| 792 |
+
CCOC(=O)c1ccc(NC(=O)CSc2nc3c(=O)[nH]cnc3n2C)cc1
|
| 793 |
+
COc1c(O)ccc2c1OC1c3cc(O)cc(O)c3COC1C2=O
|
| 794 |
+
C=C1C(=O)OC2CC(C)C3CCC(O)C3(C)C(O)C12
|
| 795 |
+
CC1(C)CCCC2(C)C(O)C(C=O)=CC(OC(=O)C=Cc3ccccc3)C12
|
| 796 |
+
Oc1cc(O)cc(CCc2ccc(O)cc2O)c1
|
| 797 |
+
CC(C)=CCCC(C)=CC(O)CC(C)=CCCC(C)(O)C=Cc1cc(O)c(C)cc1O
|
| 798 |
+
CC1CCC2(O)C(CCCC2(C)C)C1(C)CCc1ccoc1
|
| 799 |
+
O=C(O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21
|
| 800 |
+
C=C(Cl)C(Cl)(CBr)CCC(Cl)C(C)(C)Br
|
| 801 |
+
CCCCCCC=CCCCCCC(CC(=O)O)CC1(c2ccccc2)CCC2(CCCC2)C1
|
| 802 |
+
CC1=CCCC2C(=O)OC3C2C(C1=O)C1CC13C
|
| 803 |
+
CCC12CN3CC(C)(CN(C1)C3c1cccnc1)C2O
|
| 804 |
+
CCCCNc1ccc(C(=O)OCCN(C)C)cc1
|
| 805 |
+
Cn1c(-c2cc3ccc(O)c(CN4CCCC4)c3oc2=O)nc2ccccc21
|
| 806 |
+
COC(=O)C(C)Oc1ccc2c3c(c(=O)oc2c1C)CCC3
|
| 807 |
+
Cc1cc(C)c2c(c1)Oc1ccccc1C(=O)N2
|
| 808 |
+
COc1cc2c(cc1O)C1C=c3cc(O)c(OC)cc3=C[NH+]1CC2
|
| 809 |
+
O=c1[nH]c(O)c(C2NCCc3ccccc32)c(=O)n1CCc1ccccc1
|
| 810 |
+
OC1CSC(O)CS1
|
| 811 |
+
COc1ccc(C=C2COc3cc(O)c(OC)c(O)c3C2=O)cc1
|
| 812 |
+
CCC12CCc3[nH]c4ccccc4c3CCN(CC(O)C1)C2
|
| 813 |
+
COCCC(NC(=O)OC(C)(C)C)C(=O)O
|
| 814 |
+
CC=CC(=O)C1=C(C)C=CCC1(C)C
|
| 815 |
+
COc1ccc2cc(C(C)=O)ccc2c1
|
| 816 |
+
O=S1(=O)CSCSSC1
|
| 817 |
+
CC1CC(=O)O1
|
| 818 |
+
C=CCCCCCC(OC(C)=O)C(OC(C)=O)C(O)C#CC#CCCC
|
| 819 |
+
CC(O)(C=CC1C(C)(O)C(O)C(O)C2C(C)(C)CCCC21C)C(O)CO
|
| 820 |
+
COC(=O)/C=C/C(Cc1ccccc1)NC(=O)CN
|
| 821 |
+
O=C(CCCCC1SCC2NC(=O)NC21)Nc1nccs1
|
| 822 |
+
C=C(C)C1CC2OC(=O)C3(CCC(O)C3(C)C1)C2O
|
| 823 |
+
CC1NC(=O)c2ccccc2N(CC(=O)NCCSc2ccccc2)C1=O
|
| 824 |
+
c1c2c(c3c4c1C15CCNC1CCC(O3)C5OC4)OCO2
|
| 825 |
+
O=C(CSc1ccc2nncn2n1)NCC1CCCN2CCCCC12
|
| 826 |
+
CCc1ccc(C(=O)c2cncc(Br)c2)cc1
|
| 827 |
+
O=c1c2ccccc2nc2n1CCC2=Cc1ccc(Br)cc1
|
| 828 |
+
CCCCCC=CCCC(O)CCCCCCCC(=O)O
|
| 829 |
+
O=P(O)(O)OCC1OC(O)CC1O
|
| 830 |
+
CC1=CC(=O)C2CC1C2(C)COC1OC(CO)C(O)C(O)C1O
|
| 831 |
+
COc1ccc(Br)c(CCN(C)CC(C)O)c1Br
|
| 832 |
+
CCCC1OC1c1cc(OC)cc(=O)o1
|
| 833 |
+
C=C(C)C1CCC2(C)C(CC=C3C4CC(C)(C)CCC4(C)CCC32C)C1(C)CCC(=O)O
|
| 834 |
+
CN(CC#CCN1CCCC1=O)CCCCl
|
| 835 |
+
CC(O)C(O)CO
|
| 836 |
+
CCCCCC=CCC=CCC=CCC=CCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCCCCCCCCCCCC
|
| 837 |
+
Cc1nnc(C2CCN(Cc3ccc4c(c3)OCO4)C2)o1
|
| 838 |
+
CC(=Nc1ccccn1)c1cc2ccccc2oc1=O
|
| 839 |
+
CC1(C)C2CCC3=CCCC(C)(O)C3(C)C21
|
| 840 |
+
C=C(C)CN1CCN(Cc2oc(C)cc(=O)c2O)CC1
|
| 841 |
+
CC(c1sc2ccccc2c1Cl)N(O)C(N)=O
|
| 842 |
+
CCCCNC(=O)Oc1ccc2c(c1)[C@@H]1CCN(CC)C1C2
|
| 843 |
+
S=C=Nc1cc(-c2ncon2)ccc1Cl
|
| 844 |
+
CC1CCCCC1
|
| 845 |
+
CC(C)CCCCCCOC(=O)C1CCCCC1C(=O)OCCCCCCC(C)C
|
| 846 |
+
CN(C)c1ccc(CNCCCCNCc2ccc(N(C)C)cc2)cc1
|
| 847 |
+
CC1(C)CC2C=C(C(=O)O)C3CC(O)C(C)(O)C23C1
|
| 848 |
+
COC(=O)c1[nH]c(=O)c2ccccc2c1-c1ccc[nH]1
|
| 849 |
+
COc1cc(C2Oc3ccc(CCCOC(C)=O)cc3C2COC(C)=O)ccc1OC(C)=O
|
| 850 |
+
COc1ccc(C2CCc3ccc(O)cc3O2)cc1O
|
| 851 |
+
Cc1coc2c1C(O)C1(C)C(CCC(O)C1C)C2
|
| 852 |
+
CC1(C)CCc2cc(CC3OC(=O)C(O)=C3c3ccc(O)cc3)ccc2O1
|
| 853 |
+
COc1cc(N2C(=O)NC(CC(=O)O)C2=O)cc(OC)c1OC
|
| 854 |
+
COc1ccc(O)c(C=Cc2cc(O)cc(OC)c2Cc2ccc(O)cc2)c1
|
| 855 |
+
CC(=O)C(C)=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CCC(O)CC(=O)O
|
| 856 |
+
C=C(C)C1C=C(C(C)C)CCC2(C)OC2CCC(C)CC1=O
|
| 857 |
+
C=C(C)C1CCC(C)C2CCC(C(=O)O)=CC12
|
| 858 |
+
CC1(C(=O)O)CSC(c2ccccc2O)=N1
|
| 859 |
+
O=C1c2ccccc2NS(=O)(=O)N1COCc1ccccc1
|
| 860 |
+
CN1CC(O)=C(C(=O)/C=C/C=C/c2ccccc2)C1=O
|
| 861 |
+
CCCCCC(=O)CCCCC=CCCCCCC(=O)OC
|
| 862 |
+
Cc1oc2c(C)c(O)ccc2c(=O)c1-c1ccc2c(c1)OCCO2
|
| 863 |
+
CC(=O)OC(C)(C)C1CC=C(C)CC1
|
| 864 |
+
COc1ccc(-c2coc3c(OC)c(O)c(OC)c(O)c3c2=O)cc1
|
| 865 |
+
CCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCC
|
| 866 |
+
CC(O)C1OC(O)C(O)C1O
|
| 867 |
+
CC=CC=CC(=O)c1cc(C(=O)O)c(OC)cc1O
|
| 868 |
+
COc1cccc2c1CCC[C@H]2CN(C)CCc1ccc2ocnc2c1
|
| 869 |
+
Oc1c(C2OCCc3ccccc32)ccc2cccnc12
|
| 870 |
+
O=C(CCCCCNC(=O)N1CCn2c1nc1ccccc12)NC(C(=O)O)c1ccccc1
|
| 871 |
+
CCCCCCCCCC=CC(O)C(C)N
|
| 872 |
+
Cn1cnc(CC(N)C(=O)O)c1SSc1c(CC(N)C(=O)O)ncn1C
|
| 873 |
+
Clc1nssc1=NC1=NCCS1
|
| 874 |
+
OC(c1cccnc1)c1cccs1
|
| 875 |
+
Oc1nc2ccccc2nc1N1CCNCC1
|
| 876 |
+
NS(=O)(=O)CC/N=C(\S)Nc1c(Cl)cccc1Cl
|
| 877 |
+
N=c1c2c(n(Cc3c(Cl)cccc3Cl)c3c1CCC3)CCCC2
|
| 878 |
+
COc1c2c(c(COC(=O)CC(C)C)c3c(C)coc13)C(C)CC=C2
|
| 879 |
+
OCC(CO)(CO)NCCCNC(CO)(CO)CO
|
| 880 |
+
CCCCCCCCOC(=O)c1cccc(N)c1
|
| 881 |
+
COc1c(-c2ccc(O)c(O)c2)cc2oc3cc(O)c(O)cc3c2c1O
|
| 882 |
+
CC(O)=Nc1nc(O)c2nc[nH]c2n1
|
| 883 |
+
Oc1ccc2cncn2n1
|
| 884 |
+
Cc1cc2cc(O)cc(O)c2c2oc(=O)cc(O)c12
|
| 885 |
+
O=C(CCC1NC(=O)c2ccccc2NC1=O)Nc1cccc(O)c1
|
| 886 |
+
CCCCCCCCCCCCCCCCCCN=C(S)NN=Cc1ccccc1[NH+]([O-])O
|
| 887 |
+
CC(C)=CCCC(C)C1CCC(C)=C2CC=C(C)C2C1
|
| 888 |
+
Cc1cc(C(=O)O)cc2c1OC(C(C)(O)CO)C2
|
| 889 |
+
CN1CCC(NCC2OC(CO)C(O)C2N2CCCCC2)CC1
|
| 890 |
+
COc1cc(C(=O)n2ccc(C)n2)cc(OC)c1OC
|
| 891 |
+
O=C(O)c1cc2cc3ccccc3cc2ccc1=O
|
| 892 |
+
COc1ccc(C2CC(=O)Oc3cc(OC)cc(O)c32)cc1
|
| 893 |
+
COc1ccc(C=CC(=O)N2CCC3(O)CCCCC3C2)cc1OC
|
| 894 |
+
CC(=O)Nc1ccccc1C(=O)C(=O)NCCc1c[nH]c2ccccc12
|
| 895 |
+
Cc1ccc2c(COC(=O)C(C)C)coc2c1
|
| 896 |
+
CCCCCC=CCC=CCCCCCCCCCCCC(=O)OC(CO)COP(=O)(O)OCCN
|
| 897 |
+
COC(=O)Oc1ccccc1C(=O)O
|
| 898 |
+
CN1C=CCC=C1C=NO
|
| 899 |
+
Cc1ccc(O)c(C(=O)CCC(=O)c2cc(C)ccc2O)c1
|
| 900 |
+
CN1C(=O)Nc2cc(CN)ccc2S1(=O)=O
|
| 901 |
+
CCCC1=C(C)C(=O)C(O)O1
|
| 902 |
+
c1cc(-c2conn2)c[nH]1
|
| 903 |
+
COC1C=CC(O)CC(O)CC=CC=CC(O)CC=CC=CC(=O)OC(C)CCCC1O
|
| 904 |
+
Cc1cc(=O)n(C)c2ccccc12
|
| 905 |
+
C/C=C/COc1noc2c1CNCC2.Cl
|
| 906 |
+
COc1cccc2[nH]cc(CC3(O)C(=O)OC4C(O)COC43O)c12
|
| 907 |
+
CC1(O)CCC(C(C)(C)O)CC1
|
| 908 |
+
CCCCCCCCCCCCCCc1cc(=O)c2ccccc2n1C
|
| 909 |
+
COc1ccc2[nH]c3c(c2c1)CN(C(=O)C1CCCCC1)CC3
|
| 910 |
+
CC1CCC23C1CCC2(C)CC(C)(C)C3O
|
| 911 |
+
CCCCC1OCC2(COC(=O)NC(C)C)C(C)C=C(C)C1C2C
|
| 912 |
+
CCCCS(=O)(=O)Nc1ccc2[nH]c(=O)c3ccccc3c2c1
|
| 913 |
+
O=C(CNC(=O)c1ccccc1)Oc1ccc([N+](=O)[O-])cc1
|
| 914 |
+
COc1ccc2[nH]c3c(C)c4cc[n+](C)c(N(C)C)c4cc3c2c1
|
| 915 |
+
CC1CC2OC2C=CC=CC(=O)CC2C(Cl)=C(O)C=C(O)C2C(=O)O1
|
| 916 |
+
CC1OC2OC(=O)C1(O)C2O
|
| 917 |
+
CC=CC=CC1CC2=C(C(O)O1)C(O)C(C)(O)C(O)C2=O
|
| 918 |
+
COc1cc(OC)c(C(C)O)c2c1C=CC(C)(C)O2
|
| 919 |
+
COC1=CC(=O)C23CCN(Cc4c2cc2c(c4OC)OCO2)C3C1
|
| 920 |
+
O=C1NC(=O)c2ccccc2C1=Cc1ccco1
|
| 921 |
+
Oc1ccccc1-c1cc[nH]n1
|
| 922 |
+
Cc1cc2c(cc1O)OC(c1ccc(O)cc1)C(O)C2=O
|
| 923 |
+
CC(=O)C(C)CCC=C(C)C1OC(=O)C(C)CC1C
|
| 924 |
+
CCCCOCc1ccc(O)c(OC)c1
|
| 925 |
+
CCC1(C)C=C2C(=O)C(C)(C)C(=O)C(C)(C)C2(O)OO1
|
| 926 |
+
Nc1nc(=O)c2c(ncn2O)[nH]1
|
| 927 |
+
O=C1CC(Oc2ccc(C(=O)C=Cc3ccc(Cl)cc3)cc2)N1
|
| 928 |
+
C=CC1(C)CC2OC(=O)C(=C)C2CC1C(=C)CO
|
| 929 |
+
C=CCC1=CC(=O)C2C(=O)C1(O)CC(O)C2(C)C
|
| 930 |
+
COc1ccc(C2Cc3cccc(O)c3C(=O)O2)cc1OC
|
| 931 |
+
O=S(=O)(O)c1cc(N=Nc2cccc3ccccc23)c(O)c2ncccc12
|
| 932 |
+
CCCCCC=CC(=O)CCc1ccc(O)c(CO)c1
|
| 933 |
+
CCC=CCC=CCC1OC1C(O)C=CCC=CCCCCCC(=O)O
|
| 934 |
+
OC=c1ncc2ccnn12
|
| 935 |
+
CC1CC2CC(C)C(C)(C#N)C3CCC4C(C1CCC4(C)C#N)C23
|
| 936 |
+
COc1ccc(CN2Cc3ccccc3N3CCCC23)cc1
|
| 937 |
+
CC(C)C1CC(O)C(C)(O)C(O)C1O
|
| 938 |
+
C=C(C)C1C(=C)C(OC(=O)C(C)C)C=CC1OC(C)=O
|
| 939 |
+
CCCCCCCCCC(=O)Nc1cc(Cl)ccc1O
|
| 940 |
+
COc1ccc2nc(O)c(CN(C)C(=O)CC(C)C)cc2c1
|
| 941 |
+
CC1=C(CO)C2(C)CCCC(C)(C)C2C(O)C1O
|
| 942 |
+
Oc1[nH]cc2nccnc12
|
| 943 |
+
COc1cc(OC)c2[nH]c3cc(O)c(C=O)cc3c2c1
|
| 944 |
+
CCCN(CCC)CCCCOc1ccc(/C=C/c2nc3ccccc3s2)cc1
|
| 945 |
+
[O-][NH+](O)c1ccc(O)c(Cl)c1
|
| 946 |
+
C=CCC1OC(=O)C(C)(C2CC(CCc3ccccc3)OC(C)(C)O2)C1O
|
| 947 |
+
Brc1ccc(Oc2ccc(Br)c(Br)c2)c(Br)c1
|
| 948 |
+
CC1(C)OC(S)=Nc2ccc(-c3ccc(F)c(F)c3)cc21
|
| 949 |
+
Cn1c(O)c(C(=O)C=Cc2ccccn2)c(=O)c2ccccc21
|
| 950 |
+
CN(C)Cc1cn(O)c2ccccc12
|
| 951 |
+
O=C(O)C1=C(O)C2=COC(CCCCCCCO)CC2=CC1=O
|
| 952 |
+
COc1c(C)c(O)cc2c1CCC(c1ccccc1)O2
|
| 953 |
+
CC(=O)OC1C2=C(C)C(=O)OC2=CC2(O)C(O)CCC(C)C12C
|
| 954 |
+
C=C1CC23CC(C)(C)CC(O)C2(C)CCC1O3
|
| 955 |
+
O=C(C=Cc1cc2ccccc2o1)c1ccccc1OCc1ccccc1
|
| 956 |
+
CCN(CC)C(=O)Oc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
|
| 957 |
+
CN(C)c1ncnc2c1ncn2Cc1cccc(C#N)c1
|
| 958 |
+
N=c1cnn2occcc1-2
|
| 959 |
+
O=C(Nc1ccccc1)Nc1ccc2cnccc2c1
|
| 960 |
+
O=CSCc1ccco1
|
| 961 |
+
COc1ccc(C(=O)CNC(=O)CC2NC(=O)c3ccccc3NC2=O)cc1
|
| 962 |
+
CCNC(=O)OC1COC2C(NC(=S)Nc3ccc(N(C)C)cc3)COC12
|
| 963 |
+
COc1nc(N)nc2[nH]cc(C#N)c12
|
| 964 |
+
CCCC(CCC)C(N)=O
|
| 965 |
+
CC(C)=CC1CC(C)C2=C(O1)C(=O)C(C)=CC2=O
|
| 966 |
+
CCC(C)NC(=O)COc1ccc(OCCNCC(O)COc2ccccc2)cc1
|
| 967 |
+
CCCC1CC2=C(C(OC)O1)C(O)C(O)C(O)C2O
|
| 968 |
+
COc1ccc(-c2c(C)noc2-c2ccc(OC)cc2O)cc1
|
| 969 |
+
CC=CC(OC1OC(C)C(O)C(O)C1O)=C1C(=O)OCC1CO
|
| 970 |
+
CCOC1OCC(CO)C1OCC
|
| 971 |
+
O=C(O)C1OC(OC2C(O)C(O)OC(CO)C2O)C(O)C(O)C1O
|
| 972 |
+
CC(C)=CCC1Oc2cc(C)c(O)cc2C1C
|
| 973 |
+
C=C1C(=O)OC2C1C(O)CC1(C)CCC=C(C)C21
|
| 974 |
+
O=C(CCc1c[nH]c2ccccc12)NCCNC(=O)c1ccc2cc[nH]c2c1
|
| 975 |
+
O=C1C=Cc2cc3ccccc3cc2C1=O
|
| 976 |
+
C=CCn1cc2c3c(cccc31)C(CC(C)C)NC(C(=O)NCC1CC1)C2
|
| 977 |
+
C=C(C)C1CCC(C)=C1CC(C)(C)C=NCc1ccco1
|
| 978 |
+
Cc1coc2c1C(OC(=O)C(C)C)C1(C)C(C)CCC(Cl)C1C2=O
|
| 979 |
+
COc1cc(O)cc(OC)c1C=O
|
| 980 |
+
Cc1cc2c(cc1Br)C1(C)CCC(C)C1(C=O)O2
|
| 981 |
+
N=C(N)Nc1ccc(Cl)cc1
|
| 982 |
+
OCC1=CCCCC1
|
| 983 |
+
CC1=CCCC2(C)OC2CCC(C)=CC(O)C(C(C)(C)O)CC1
|
| 984 |
+
O=C(CC1CC2OC(CNC3CCC3)C(O)C2O1)N1CCOCC1
|
| 985 |
+
CN(C)CCSC1Cc2ccccc2Sc2ccccc21
|
| 986 |
+
O=C(O)C1C2C=CC3(CN(CC4CCCO4)C(=O)C13)O2
|
| 987 |
+
NCCCNCCCCNCCCNCCCN
|
| 988 |
+
CC(C)Cn1ccc2c(NC(=O)c3cccnc3)cccc21
|
| 989 |
+
CCN(CC)CCCOC(=O)C[C@@H](C)CC[C@H]1C(CO)=CC[C@H]2C(C)(C)CCC[C@]12C
|
| 990 |
+
O=c1oc2cc(O)cc(O)c2c2c1CCC2
|
| 991 |
+
CC(C)(O)C1CCC2(C)C(O)CCC(C)(O)C2C1O
|
| 992 |
+
CC(C)CCCCCOc1ccc(C2NC(=O)NC2=O)cc1
|
| 993 |
+
CN(C)C(=O)Oc1cccc([N+](C)(C)C)c1.[Br-]
|
| 994 |
+
O=C(O)c1cc(-c2ccccc2)sc1-n1cccc1
|
| 995 |
+
N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=S
|
| 996 |
+
CCOC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(F)cc3)cc12
|
| 997 |
+
CC1=CCc2c(cc(O)c3c(=O)cc(C)oc23)OC1
|
| 998 |
+
C=CCC1(CC(O)C(C)(C)O)C=C2OCOC2=CC1=O
|
| 999 |
+
COc1cc2c(cc1OC)CN(C(=O)CC#N)CC2
|
| 1000 |
+
CCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC
|
| 1001 |
+
Nc1ncnc2c1ncn2C1OC(CF)C(O)C1O
|
| 1002 |
+
CC(C)(C)OC(=O)NCC(CO)NC(=O)OCc1ccccc1
|
| 1003 |
+
CC(=O)N1CC23C=CC(CC2CC12CCCC2)O3
|
| 1004 |
+
C=C(C)c1cc2c(o1)C(=O)c1c(O)cccc1C2=O
|
| 1005 |
+
CC12CCC(O)C(C)(C)C1CC(=O)c1c2ccc(O)c1O
|
| 1006 |
+
CC(=NNC(N)=O)c1c(O)n(C)c2ccccc2c1=O
|
| 1007 |
+
COC1C=C2CC(C)CCC2C(CCC2CC(O)CC(=O)O2)C1C
|
| 1008 |
+
CC(=O)Nc1ccccc1O
|
| 1009 |
+
COc1ccc(NC(=O)Cc2ccccc2)c(OC)c1
|
| 1010 |
+
COc1ccc(-c2coc3c(C)c(OCCN4CCOCC4)ccc3c2=O)cc1
|
| 1011 |
+
CC1(C)CC2CC(C)(CN2Cc2c(O)occ(CN3CCCCCC3)c2=O)C1
|
| 1012 |
+
CCCCCCCCCCCCc1ccc(C(C)=O)o1
|
| 1013 |
+
COc1ccc(C(=O)c2ccc(Cl)cc2)cc1OC
|
| 1014 |
+
CCC=CCC=CC(OC(C)=O)C(C=CC=CCC=CCCCC(=O)OC)OC(C)=O
|
| 1015 |
+
CCCCCCCCCCCCC(N)=O
|
| 1016 |
+
Cc1cc(=O)c2c(=O)oc3ccccc3c2o1
|
| 1017 |
+
O=C(CCCn1cccn1)Nc1ccc2c(c1)C(=O)N1CCCC1C(=O)N2
|
| 1018 |
+
CCC1CN(S(=O)(=O)c2ccccc2)CCC1CC(=O)O
|
| 1019 |
+
O=C(NC1CCCCC1)c1ccccc1
|
| 1020 |
+
O=C(CCCCCCCCCCCCC1C=CCC1)OCC(O)CO
|
| 1021 |
+
CC1CN(CCCc2ccc(COc3ccccc3)cc2)CCO1
|
| 1022 |
+
CC(C)C1=CC2C(C)(OC(=O)C=Cc3ccccc3)CCC(O)C2(C)CC1
|
| 1023 |
+
O=C(O)CC1CCN(C(=O)C2CCCC2)CC1CCN1CCN(c2ccccn2)CC1
|
| 1024 |
+
CC=C1CN2CCC34C(=C(C=O)C1CC23)Nc1c(O)cccc14
|
| 1025 |
+
CC(=NNS(=O)(=O)c1ccc(C)cc1)c1ccccc1
|
| 1026 |
+
O=C(O)c1ccc(CC2CCC2)nc1O
|
| 1027 |
+
COC1CC(=O)C2C34CCC5CCCCC5C3(C)C(C)(CC4)C12O
|
| 1028 |
+
O=c1c2ccccc2oc2ccc(OCCOC3CCCCO3)cc12
|
| 1029 |
+
C#CC#CCCCCCCCC=C1C(=O)OC(C)C1O
|
| 1030 |
+
COc1ccc(C(=O)C=Cc2ccc(OC)cc2OC)cc1
|
| 1031 |
+
N#Cc1cc(N)ccc1F
|
| 1032 |
+
CCCCCCCCCCCC1=C(O)C(=O)C=C(NCCc2ccccc2)C1=O
|
| 1033 |
+
COC(=O)CCCC#CCCCCCCCCCCCCCc1ccco1
|
| 1034 |
+
CC1CC23NC4CCN2CCCC32C(CC(=O)C42)C1O
|
| 1035 |
+
CC(=O)OCCCCCCCC=CC(=O)O
|
| 1036 |
+
Oc1nccc2c1[nH]c1ccccc12
|
| 1037 |
+
CCCCCCCCCCCCCCCCCC(=O)NC(COC1OC(CO)C(O)C(O)C1O)C(O)CCCCCCCCCCCCCCC
|
| 1038 |
+
CCNC(=O)N1CC2OCC(=O)N(CC3CC3)C2C1
|
| 1039 |
+
CCOC(=O)CC(=O)C(=O)OCC
|
| 1040 |
+
c1ccc(SCc2ccc(CN3CCOCC3)cc2)cc1
|
| 1041 |
+
C=C(C)C1CCC(C)=C1COC(=O)c1ccncc1
|
| 1042 |
+
Cc1cc(CNC(=O)C2CCCC(NC3CCOCC3)CN(C)C(=O)C2)no1
|
| 1043 |
+
COc1ccc(OCC(O)CNC(C)C)c(/C=C/CO)c1
|
| 1044 |
+
CCOC(=O)N=C(C)c1c(O)n(C)c2ccccc2c1=O
|
| 1045 |
+
COc1cc(C(N)=O)cc(OC)c1O
|
| 1046 |
+
O=C(O)CCC1NC(=O)N(Cc2ccc3c(c2)OCO3)C1=O
|
| 1047 |
+
CC1=CC2C(CC=O)C1(C)CCCC2(C)C
|
| 1048 |
+
CC1(C)CCCC2(C)C1CCC13CC(CCC12)C1(CO1)C3=O
|
| 1049 |
+
COC(=O)C(O)=Cc1ccc(O)c(O)c1
|
| 1050 |
+
C#CC(O)C=CCCCCCCCCCCCCCCC=CC(O)C#C
|
| 1051 |
+
NC(=O)NN1c2ccccc2CCc2ccccc21
|
| 1052 |
+
Cc1cc(C)c(C)c(C)c1
|
| 1053 |
+
O=C(Cc1ccon1)c1ccc(O)cc1O
|
| 1054 |
+
CC(=CCC(O)C(C)(Cl)CBr)C(O)CBr
|
| 1055 |
+
CC(=O)Nc1ccc(CC(=O)NCCNCC(O)c2ccccc2)cc1
|
| 1056 |
+
CCCCP(=O)(O)O
|
| 1057 |
+
CCc1cc(C(=O)Cn2cnc3ccccc32)c(O)cc1O
|
| 1058 |
+
COc1ccc(-c2cc(=O)c3ccc4occc4c3o2)cc1
|
| 1059 |
+
CNc1nc2c(c(=O)[nH]c(=O)n2C)n1CCC(C)C
|
| 1060 |
+
CCCCCCCCCCCCCOP1(=O)OCC2COC(=O)C2=C(CCCC)O1
|
| 1061 |
+
COC(=O)C1c2ccoc2CC2(C)C(C)CCC12C
|
| 1062 |
+
COC(=O)Cc1ccc(OCCCOc2cc3c(cc2O)CCO3)cc1
|
| 1063 |
+
CCOP(=O)(c1ccccc1)c1ccccc1O
|
| 1064 |
+
COc1cc(C=CCO)cc(O)c1O
|
| 1065 |
+
CC12CCC(C(CO)C1)C2(C)C
|
| 1066 |
+
COc1cc2c3c(c1OC)C(=O)NC3Cc1ccccc1-2
|
| 1067 |
+
COc1c(O)c(OC)c2occ(-c3ccc(O)cc3)c(=O)c2c1O
|
| 1068 |
+
CC(C)c1ccc(C2c3ccccc3C(=O)c3ccccc32)cc1
|
| 1069 |
+
Cc1c(C)c2ccc(OC(=O)CCCNC(=O)OC(C)(C)C)cc2oc1=O
|
| 1070 |
+
C[C@@H]1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12
|
| 1071 |
+
OCC1OC(OCCc2c[nH]c3ccccc23)C(O)C(O)C1O
|
| 1072 |
+
COc1ccc2c(ccn2CCC(=O)N2CCc3[nH]c4ccc(Cl)cc4c3C2)c1
|
| 1073 |
+
COc1cc(C(C)=CC(=O)O)oc(=O)c1C
|
| 1074 |
+
CN(CCC1CN(C(=O)Nc2ccccc2)CCC1CC(=O)O)c1ccccc1
|
| 1075 |
+
O=C(O)C1=CC(OP(=O)(O)O)C(O)C(O)C1
|
| 1076 |
+
CC(C=CC=C(C)C=CC1(O)C(C)(C)CCCC1(C)O)=CC=CC=C(C)C(=O)O
|
| 1077 |
+
CC(=O)c1ccc(NC(=O)NC2COC3C(OC(=O)Nc4ccccc4)COC23)cc1
|
| 1078 |
+
C#CC#CC=C=CC=CC=CCC(=O)O
|
| 1079 |
+
NC(N)=NCCCC(N)[PH](=O)O
|
| 1080 |
+
CC(C)CCCCCCCCCCCCOC(=O)CC(C)C
|
| 1081 |
+
C=C(C(=O)OC)C(O)CO
|
| 1082 |
+
CC(CCC(=O)O)C1CCC2C3CCC4CC(O)CCC4(C)C3C=CC12C
|
| 1083 |
+
Cc1cc(CCCCCOc2c(Cl)cc(C3=NCCO3)cc2Cl)on1
|
| 1084 |
+
COc1cccc(-c2cc(=O)c3cc(OC)ccc3o2)c1
|
| 1085 |
+
Oc1ccc2c(c1)Cc1ccccc1-2
|
| 1086 |
+
CC1=CC(C)C2COC3(CCCC3)C1C2C
|
| 1087 |
+
COc1c(O)cc2ccc(=O)oc2c1OC
|
| 1088 |
+
Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2
|
| 1089 |
+
CC(NC1=NC(C)(C)Cc2ccccc21)C(=O)O
|
| 1090 |
+
CCCCCCCCCNS(=O)(=O)O
|
| 1091 |
+
C=C1CCCC(C)CCC2=C(C)C3C1CCC3(C)C(O)C2O
|
| 1092 |
+
CC(C)C=Cc1c(O)cc(-c2cc3ccc(O)cc3o2)cc1O
|
| 1093 |
+
CCCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)CCCCCCCCCCCCCCCCCCCCCCC
|
| 1094 |
+
Oc1ccc2c(c1)CCNC2c1ccc(F)cc1
|
| 1095 |
+
O=C(CCl)Nc1ccccc1C(F)(F)F
|
| 1096 |
+
COC(=O)C(C)C1CCC(C)(CCC(=O)C(C)(C)CCCC(C)=O)OO1
|
| 1097 |
+
CC(CC(=O)O)c1ccc(C(=O)O)cc1
|
| 1098 |
+
Cc1ccc2c(c1)C1CN(C)CCC1N2S(=O)(=O)c1cccnc1
|
| 1099 |
+
CCC(O)C=CC=CCCc1cc(=O)c2ccccc2[nH]1
|
| 1100 |
+
CCCCCCCCCCCCCCCCC=CCCC=CCCCCC1OCC(N)C1O
|
| 1101 |
+
N=c1occ2cc[nH]nc1-2
|
| 1102 |
+
CCC(=O)C(C)C(C)=O
|
| 1103 |
+
CCCCC(=O)OCCCC(C)C
|
| 1104 |
+
COC(=O)C=C(C(C)=O)C1CCC(C)(Cl)C(Br)C1
|
| 1105 |
+
COc1cc(C=Cc2cc(OC)c3c(c2)OC(C)(C)C=C3)ccc1O
|
| 1106 |
+
CC(C)=CCCc1cc2c(c(O)c1O)C(=O)c1ccccc1C2=O
|
| 1107 |
+
CC(C)NCC(O)COc1ccc2[nH]cc(CCN)c2c1
|
| 1108 |
+
COC1C=CC(CO)(OC)O1
|
| 1109 |
+
CC1(Cl)CCCCC1O
|
| 1110 |
+
CC(C)C1CC(O)C2(C)C3CC(O)C2(C)C(=O)C13
|
| 1111 |
+
O=C1OC2C(CO)OC(O)C(O)C2(O)C1=Cc1ccc(O)cc1
|
| 1112 |
+
CSC1OC(Cn2cnc3c(=O)[nH]cnc32)C(O)C1O
|
| 1113 |
+
COC1C(C)OC(Oc2cccc3c2NC(=O)CC3)C(O)C1O
|
| 1114 |
+
CC(C)C12CCC(C)(O)C3CCC(C)(O)C3C1O2
|
| 1115 |
+
Cc1cccc(O)c1C(=O)OC1C(=O)C(CO)=CC(O)C1O
|
| 1116 |
+
Cc1cc2c(cc1O)[nH]c1c3c(ccc12)OC(C)(C)C=C3
|
| 1117 |
+
O=[N+]([O-])c1nccn1C1SC(CO)C(O)C(O)C1O
|
| 1118 |
+
CC1C(=O)c2c(O)cc(O)cc2OC1c1ccc(O)c(O)c1
|
| 1119 |
+
Cc1ccc(NC(=O)NCCCCNC(=O)N(O)c2ccccc2)cc1
|
| 1120 |
+
CC1=CC(=O)C(O)C(C)(C)C1CCC(C)O
|
| 1121 |
+
CCN(CC)c1ccccc1
|
| 1122 |
+
O=C1C2CCCCN2C(=O)N1CCCN1CCN(c2ccc(F)cc2)CC1
|
| 1123 |
+
CC1(C)CC23CC(=O)CC2C1CCC3C(=O)O
|
| 1124 |
+
CN1C(=O)CNC(=O)c2c1ncn2C
|
| 1125 |
+
O=C1CCc2cccc3c2N1CC3
|
| 1126 |
+
COc1ccc(C(=N)S)cc1
|
| 1127 |
+
COc1cc2ccc(=O)oc2c(OC)c1OC
|
| 1128 |
+
CC1C(=O)OC(CCc2ccccc2)CC1O
|
| 1129 |
+
COc1ccc2c(c1OC)C(=O)c1ccccc1C2=O
|
| 1130 |
+
CCc1ccc(C2CC3CCC(C2C(C)=O)N3C)cc1
|
| 1131 |
+
COc1cccc2c3c([nH]c12)C(=O)NC(=O)C3=O
|
| 1132 |
+
CC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C
|
| 1133 |
+
CN1CCC(N2CCC(c3nnc(C(C)(C)C)o3)C2)CC1
|
| 1134 |
+
Cc1oc2cc(O)ccc2c(=O)c1-c1nc2ccccc2n1C
|
| 1135 |
+
O=C(O)CCCCCO
|
| 1136 |
+
CC1CN(c2ccncc2)Cc2ccccc21
|
| 1137 |
+
CCC1Cc2cc(OC)c(O)cc2C(c2ccc(O)c(OC)c2)C1CC
|
| 1138 |
+
COc1ccc2[nH]c3c(c2c1)CN(C(=O)C=Cc1cccs1)CC3
|
| 1139 |
+
COc1c(O)cc(-c2ccccc2)cc1CC=C(C)C
|
| 1140 |
+
CC(=O)c1ccc2c(c1)C=CC(C)(CO)O2
|
| 1141 |
+
CCCCC=CCC=O
|
| 1142 |
+
COc1ccccc1-c1coc2c(CN(C)C)c(O)ccc2c1=O
|
| 1143 |
+
CC1(C)CCc2cc(CC(=O)NCC3CCCN4CCCCC34)ccc2O1
|
| 1144 |
+
CCC=CC=CCCC=CCCCCCCCC(=O)O
|
| 1145 |
+
CCN(CC)c1ccc2c(C)cc(=O)oc2c1
|
| 1146 |
+
O=C1OC(O)C(Cc2ccccc2)=C1c1ccccc1
|
| 1147 |
+
O=c1oc2ccc(O)c3c(=O)oc4c(O)ccc1c4c23
|
| 1148 |
+
CCCCCCCC/C=C\CCCCCCCCOC(=O)N1CCOC(CCCCCCCCCCCC)C1
|
| 1149 |
+
CC1=CC23CCC1C(C)(C)C2CCC3C
|
| 1150 |
+
CCCCCCCCCCCCCCCCCCCCCCCCC(=O)CCCCC
|
| 1151 |
+
CC1OCC(=O)C(O)C1=O
|
| 1152 |
+
CC(=O)Nc1cccc2c1ccn2CC(=O)NCC(C)C
|
| 1153 |
+
CCCCC=CC=CC#CC#CC=CCO
|
| 1154 |
+
CC12CCCC3C1C(C)(CCC2)N(C=O)C3(C)CO
|
| 1155 |
+
CC=CC1OC2C(C)OC(=O)C2C(O)C1O
|
| 1156 |
+
Cc1nn(C)c2nc(C(C)C)cc(C(=O)NCC3CCCN4CCCCC34)c12
|
| 1157 |
+
COc1cc(OC)c(C(C)=O)c2c1C=CC(C)(C)O2
|
| 1158 |
+
C(=NNc1ccccc1)c1ccc[nH]1
|
| 1159 |
+
CC(O)C(C)(O)C(C)O
|
| 1160 |
+
CC1=C2CC3(C)C(=CC2OC1=O)C(O)CCC3C
|
| 1161 |
+
CC(C)C(C)CCC(C)C1CCC2C3CCC4CCCCC4(C)C3CCC12C
|
| 1162 |
+
C=CC1CC(C)(C)C(O)C1=C(CO)C1COC(=O)C1
|
| 1163 |
+
CCC(C)CNC(N)=S
|
| 1164 |
+
COc1ccc2c(ccn2CCC(=O)NC(CC(C)C)C(=O)O)c1
|
| 1165 |
+
CCc1ccccc1NCN1C(=O)c2ccccc2C1=O
|
| 1166 |
+
c1cncc(O[C@H]2CCNC2)c1
|
| 1167 |
+
CC1=C(C=O)NC(=C2C=CC=CC2=O)S1
|
| 1168 |
+
CN1C(=O)c2cccnc2OC2CN(Cc3ccccc3)CC21
|
| 1169 |
+
CC(C(=O)O)c1cccc(Oc2ccccc2)c1
|
| 1170 |
+
CC(=O)NC1C(O)OC(C)C(O)C1O
|
| 1171 |
+
O=C(O)C1CSC(c2nc3ccc(O)cc3s2)=N1
|
| 1172 |
+
CC(C)=CC1C(C(=O)NCc2ccccn2)C1(C)C
|
| 1173 |
+
CC1=CCC(C)(CCC(=O)C(C)C)C=CCC(C)=C(C)C(=O)C1
|
| 1174 |
+
CC(C)=CC1OC(=O)C(=CCC(O)C(C)=CCCC(C)=CCO)C1O
|
| 1175 |
+
OCC=CC#CC#CC(O)C=CCCCCO
|
| 1176 |
+
CC1(C)C2CCC(C(=O)O)C1C2
|
| 1177 |
+
C1=C(c2ccccc2)CCN(CCCCc2c[nH]c3ccccc23)C1
|
| 1178 |
+
SC1=NCCCN1
|
| 1179 |
+
Cc1cccc2c1C(=O)NC2=O
|
| 1180 |
+
CC1=CC(CO)C(C)(C)C12CCCC2(C)C
|
| 1181 |
+
COC(=O)c1ccc(OC(=O)c2ccccc2Cl)cc1
|
| 1182 |
+
COC(=O)C(=CCCC(=CCCc1ccoc1)C(=O)OC)CCC=C(C)C
|
| 1183 |
+
CCCCCCCC(O)CCCC1Cc2cc(O)cc(O)c2CO1
|
| 1184 |
+
COc1cc(C)c(Br)c(C)c1
|
| 1185 |
+
CC1CCC(C(C)C)C2CC(C)(N)CC=C12
|
| 1186 |
+
CC(C)NC(=O)NC1C2COC(O2)C(n2cccn2)C1O
|
| 1187 |
+
COCC1CCCN1CC1CN2CCC1CC2CNC(=O)Nc1ccccc1
|
| 1188 |
+
OC12CC3CC(CC(C3)C1)C2
|
| 1189 |
+
CC(=O)c1cc2c(CO)cc(O)cc2oc1=O
|
| 1190 |
+
CCN(CC)CCOC(=O)C(O)(c1ccccc1)c1ccccc1
|
| 1191 |
+
NC1CCCN(P(N)(=O)NS(=O)(=O)O)C1=O
|
| 1192 |
+
N=Cc1coc2cncn12
|
| 1193 |
+
C=C1CCC2C(C)(C)CCC(OC(C)=O)C2(C)C1COC(C)=O
|
| 1194 |
+
CCCCCCCC=CCC=CCC1OC1CC
|
| 1195 |
+
CCNC(=O)c1ccc(COC(COCc2ccc(OC)cc2)Cn2ccnc2)cc1
|
| 1196 |
+
CCCCCCCCC=CCCCCCCCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCCCCCCCCCCC
|
| 1197 |
+
CN1CCN(C(c2ccccc2)c2ccccc2)CC1
|
| 1198 |
+
NC(=O)c1ccccc1[NH+]([O-])O
|
| 1199 |
+
CC(C)=CCCC(C)=CCCC1(C)C=Cc2cc(O)ccc2O1
|
| 1200 |
+
Nc1ncnc2c1ncn2C1OC(CSCC(N)C(=O)O)C(O)C1O
|
| 1201 |
+
Cc1nccc2nonc12
|
| 1202 |
+
S=C=NC=CCCCCCCCCCCCCCCCC=CN=C=S
|
| 1203 |
+
O=C(O)C=CC=CCCCCC(=O)O
|
| 1204 |
+
CCOC(O)=Nc1c([NH+]([O-])O)cc(Cl)cc1[NH+]([O-])O
|
| 1205 |
+
CCC(O)c1cccc(O)c1CN
|
| 1206 |
+
CC1=C2C(=CC1)C(C)(O)CCC1C(C)C(=O)OC21
|
| 1207 |
+
COc1c(O)c(O)cc2c1-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2
|
| 1208 |
+
C=CCCCCCCCCCCC(O)CC(O)CCOC(C)=O
|
| 1209 |
+
C=C(C)CC1CCC(C)C1
|
| 1210 |
+
CCCCCCCCCCCCOc1ccc(N2C(N)=NC(N)=NC2(C)C)cc1
|
| 1211 |
+
C=CC(=C)CCC1C(=C)CCC2C(C)(COC(=O)C=Cc3ccc(O)cc3)CCCC12C
|
| 1212 |
+
NC(CC(O)(Cc1c[nH]c2ccccc12)C(=O)O)C(=O)O
|
| 1213 |
+
C=CCn1ncc2c(C)nc(Cc3ccccc3)n2c1=O
|
| 1214 |
+
CCOCCn1c(N2CCN(CC)CC2)nc2ccccc21
|
| 1215 |
+
COC(=O)c1cccc2nc(C3CCN(C4CCCCC4)C3)oc12
|
| 1216 |
+
O=C(C=Cc1ccccc1Cl)c1ccc(OC(=O)c2ccccc2Cl)cc1
|
| 1217 |
+
NNc1ccc([NH+]([O-])O)cn1
|
| 1218 |
+
COC(=O)c1ccccc1N(C)C
|
| 1219 |
+
O=C1C=CC2CCCN12
|
| 1220 |
+
CC1(C)C=Cc2cc(OC3OC(CO)C(O)C3O)ccc2O1
|
| 1221 |
+
CCCCCC(C)CCCC(C)CCCC(C)=CCOP(=O)(O)O
|
| 1222 |
+
Cc1ccc(-c2cc(=O)c3cc(NC(=O)c4ccccc4Br)ccc3o2)cc1
|
| 1223 |
+
CCOC(=O)Cc1nc(-c2ccc(N=C=S)cc2)no1
|
| 1224 |
+
C=CC(C)=CC=CC(C)=CC=C1C(C)=CCCC1(C)C
|
| 1225 |
+
CCCCCCCCC(O)C(C)C(=O)O
|
| 1226 |
+
Cn1nc(C(C)(C)C)cc1C(=O)NCC1CCCN2CCCCC12
|
| 1227 |
+
CN(C)CCOc1ccc2c(c1)CCCC(c1ccccc1)=C2c1ccc(O)cc1
|
| 1228 |
+
COc1ccc2c(c1OC)C(=O)OC2CC(=O)Nc1ccc(O)c(C(=O)O)c1
|
| 1229 |
+
COC(=O)CCC(=O)CC=CCc1ccccc1
|
| 1230 |
+
C=C(CCC=C(C)CO)C1CCC(C)(O)C1C
|
| 1231 |
+
CCCCCCCC=CC#CC#CCCCC(=O)OC(C)C(=O)O
|
| 1232 |
+
C=C(C)C1CCC(C)(O)C2CCC(C)(O)C2C1
|
| 1233 |
+
CC(=O)NCCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(C)=O
|
| 1234 |
+
CC1(CO)CC2C=C(C(=O)O)C3CCC3(C)C2C1
|
| 1235 |
+
N#CCCn1nc(-c2ccc(OCc3ccccc3)cc2)oc1=S
|
| 1236 |
+
CCC(C)CCCCCCCCCCC(=O)OC
|
| 1237 |
+
CC1=C2CC(C)(CO)CC2CC2(C)CCC12O
|
| 1238 |
+
c1ccc(CCc2nn3c(-c4ccco4)nnc3s2)cc1
|
| 1239 |
+
C=CC1(C)C=C2CCC3C(C)(C)CCCC3(C)C2CC1
|
| 1240 |
+
OC1CCCc2nc3ccccc3c(NCc3cccs3)c21
|
| 1241 |
+
Cn1nc(CC(=O)NCC2CCCN3CCCCC23)c2ccccc2c1=O
|
| 1242 |
+
CC1NC2(CCCCC2)CC(C)(C)c2ccccc21
|
| 1243 |
+
CC(C)=CCc1ccc(O)c(CO)c1-c1ccc(C)o1
|
| 1244 |
+
CC=CC=CC(O)=C1C(=O)C2(C)C(=O)C(C)(O)C1N1CCCC12
|
| 1245 |
+
CC1CCC2(C)CC(O)C3(C)CCC4C3=C2C1CCC4C
|
| 1246 |
+
COc1ccccc1C=CC(=O)Nc1cccc2ncccc12
|
| 1247 |
+
COc1ccc(CN2CC3CN(C(=O)C(C)(C)C)CCN3C(C)(C)C2)cc1
|
| 1248 |
+
CCC(C)C(=O)OCC1OC(Oc2ccc(C(=O)O)cc2)C(O)C(O)C1O
|
| 1249 |
+
O=c1c(-c2ccccc2)nccn1C(CN1CCCC1)c1ccccc1
|
| 1250 |
+
CC1NC(CCCCCCCCCCCC(=O)O)CCC1O
|
| 1251 |
+
COc1cc2c(cc1OC)CCN(C)C(C(=O)c1ccccc1)=C2
|
| 1252 |
+
CCCC=CCOC(=O)CCC
|
| 1253 |
+
C=C1CCCC1(C)C1(C)CC=C(CO)CC1
|
| 1254 |
+
NC(CS(=O)Cc1ccccc1)C(=O)O
|
| 1255 |
+
O=C(O)C1=CN(CCOC(=O)c2cc(Br)c[nH]2)CC=C1
|
| 1256 |
+
Oc1ccc2c(c1)C[C@@H]1c3ccc(O)cc3CC[C@H]21
|
| 1257 |
+
CC(CCCCCCCCCCCCCCCCCCCCC(=O)C(C)C(=O)NCCO)OC1OC(C)C(O)CC1O
|
| 1258 |
+
C=C(C)CC=Cc1ccoc1
|
| 1259 |
+
COCC(=O)NC1C(c2ccccc2)N(Cc2nccs2)CCC1(C)O
|
| 1260 |
+
CC(=O)N1CCN(CC2OCC(NC(=O)NC3CCC3)C2O)CC1
|
| 1261 |
+
CCC(C)C(NC1=NC(C)(C)Cc2ccccc21)C(=O)O
|
| 1262 |
+
O=C(Nc1nccs1)C1CN2CCC1CC2Cn1cc(CO)nn1
|
| 1263 |
+
CCCCCCCCCCCCCCCCCCCC=O
|
| 1264 |
+
CC=CCCCCCCCCCCc1oc(=O)cc2c1C(=O)OC(C)C2
|
| 1265 |
+
CC(C)C(=O)NCCNCC(O)COc1ccccc1
|
| 1266 |
+
O=C(NC1CCC(C(=O)O)CC1)OCc1ccccc1
|
| 1267 |
+
CSCSc1nc(O)c(C)c(Cc2ccccc2)n1
|
| 1268 |
+
CCOC(=O)CC(CCc1ccc(O)cc1)OC(C)=O
|
| 1269 |
+
CC(=O)NC(C)C(=O)CCC(=O)O
|
| 1270 |
+
Cc1cc(O)cc2c1OC(C)(CCCC(C)CCCC(C)CCCC(C)C)CC2
|
| 1271 |
+
CC(C)N1CCC(c2nnc(-c3cncn3C)o2)C1
|
| 1272 |
+
c1coc(-c2nnc(C3CCN(Cc4ncc[nH]4)C3)o2)c1
|
| 1273 |
+
CCCCc1cc(OC)c(CC(C)N)cc1OC
|
| 1274 |
+
C=CC1C(OC2OC(CO)C(O)C(O)C2O)OC=C2C(=O)OCCC21
|
| 1275 |
+
CNCCc1nc(-c2cn(C)c3ccccc23)no1
|
| 1276 |
+
Cc1ncc(-c2ccccc2F)cn1
|
| 1277 |
+
CC(=O)NCC1OC(CC(=O)NCc2ccc(F)cc2)C(O)C1O
|
| 1278 |
+
CC(C)=CCCC(C)=CCc1cc(C(=O)O)ccc1O
|
| 1279 |
+
CC(=O)N(O)CCCCCNC(=O)C(O)(CC(=O)O)CC(=O)NCCCN(O)C(C)=O
|
| 1280 |
+
C=C(C)C1=CC2=C(C=CC2=C)C(C)=CC1
|
| 1281 |
+
CCS(=O)(=O)CCN1CC2CC(C1)c1cccc(=O)n1C2
|
| 1282 |
+
COc1ccc(-c2noc(C(C)NC(=O)c3ccccn3)c2C(=O)O)cc1
|
| 1283 |
+
CCCCCCCCCCCCCCCC(O)CC(O)CO
|
| 1284 |
+
O=C(C=Cc1ccc(O)cc1)Oc1ccc(O)cc1
|
| 1285 |
+
O=C(CNCCc1cc2ccccc2[nH]1)Nc1c2c(nc3ccccc13)CCC2
|
| 1286 |
+
CSCC(=O)NC1CCN2C(=O)c3cc(-c4ccsc4)ccc3NC(=O)C12
|
| 1287 |
+
CC1(C)c2cc(O)c(O)cc2CCN1C(CO)CO
|
| 1288 |
+
COc1ccc(C(=O)CN2CCc3cc(OC)c(OC)cc3C2)cc1
|
| 1289 |
+
Cc1ccc(S(=O)(=O)OCC2C3CCC(C3)C2(C)C)cc1
|
| 1290 |
+
COc1ccc(C2C3CCCCC3(O)CCN2C(=O)C=Cc2ccccc2)c(OC)c1
|
| 1291 |
+
C=Cc1nccc2c1[nH]c1c(OC)cccc12
|
| 1292 |
+
CC(C)=CCCC(C)=CCCC(=CCc1cc(O)ccc1O)C(=O)O
|
| 1293 |
+
Cc1c(C)c2ccc(OC(C)C(=O)NCC(=O)O)cc2oc1=O
|
| 1294 |
+
CC1(C)CCCC2(C)C1CCC1CC3CC12CCC3(O)CO
|
| 1295 |
+
C=C1C=CC(OC)C(C)CC(=O)c2c(C)coc2C1
|
| 1296 |
+
CC1(CN)c2ccccc2Cc2ccccc21
|
| 1297 |
+
CCC=C1CC2C(O)Nc3cc(O)c(OC)cc3C(=O)N2C1
|
| 1298 |
+
COc1c2occc2c(OC)c2c(=O)ccoc12
|
| 1299 |
+
CC1CC2=C(CC1C1OCC(CN3CCCCC3)O1)C(C)(C)CCC2
|
| 1300 |
+
Cc1cc(=O)oc2cc(OCC(=O)NCCCO)ccc12
|
| 1301 |
+
CC=C(C)CSC
|
| 1302 |
+
Nc1ncc(Cc2ccc3c(c2)CCCN3)c(N)n1
|
| 1303 |
+
CC12CC3OCCC3(CCO1)O2
|
| 1304 |
+
CCCCCCCOc1ccc(C(=O)O)cc1CC(=O)C(F)(F)F
|
| 1305 |
+
COc1cccc(Sc2ccc(NC3=NCCN3)cc2)c1
|
| 1306 |
+
O=P(O)(O)c1ccccc1OCCOc1ccccc1P(=O)(O)O
|
| 1307 |
+
CCOC(=O)CCN1CCN(c2ccccn2)CC1
|
| 1308 |
+
CC(=O)OCC(O)C(O)C1OC(O)(C(=O)O)CC(O)C1N
|
| 1309 |
+
[O-][NH+](O)c1ccc(N=Cc2ccccc2O)cc1
|
| 1310 |
+
O=c1cc(-c2cc3ccccc3o2)c2cc3c(cc2o1)CCCC3
|
| 1311 |
+
COC12C3=CC(O)CC1c1cc4c(cc1C[NH+]2CC3)OCO4
|
| 1312 |
+
O=C(CCc1ccc(O)cc1)n1cccc1
|
| 1313 |
+
CN(C)Cc1c(O)ccc2cc(-c3ccc(Cl)cc3)c(=O)oc12
|
| 1314 |
+
O=c1occ(CN2CCc3ccccc3C2)c(O)c1CN1CCOCC1
|
| 1315 |
+
CC1CC2C3=CCCN4CCCC(C(=O)C2O)C34C1
|
| 1316 |
+
C=CC1(C)CC(O)C2(C)C(C1)C(=O)CC1C3(C)CCCC12OC3=O
|
| 1317 |
+
Cc1ccc2c(c1)c1c3n2CCN=C3CCC1
|
| 1318 |
+
Cc1cc(O)cc2c1C(=O)C=C(O)C2=O
|
| 1319 |
+
OCC1NC(O)C(O)C(O)C1O
|
| 1320 |
+
CCCCOC(=O)CCC1OC(=O)C(O)=C1c1ccccc1
|
| 1321 |
+
COc1cc2oc(=O)ccc2cc1C(O)C(O)C(C)(C)O
|
| 1322 |
+
NC(CCCCNCC(=O)c1ccco1)C(=O)O
|
| 1323 |
+
Brc1cccc2cc(C=NNc3ccccc3)ccc12
|
| 1324 |
+
O=C(O)C1Cc2c([nH]c3ccccc23)C(c2cccc(O)c2)N1
|
| 1325 |
+
OCC=CC#CC#CCCO
|
| 1326 |
+
CCC(C)=CC=CC1CC2OC(C=C=CBr)CC2O1
|
| 1327 |
+
Nc1nc(N)c2ccn(COCCO)c2n1
|
| 1328 |
+
COc1ccc(C=O)cc1CN1CCCCC1c1cccnc1
|
| 1329 |
+
Cc1ccc(C(=O)c2ccc(Cl)cc2)c(O)c1
|
| 1330 |
+
CC(C)C(N)C(=O)NC(C(=O)O)C(C)O
|
| 1331 |
+
O=C(CCCCC1CCSS1)Nc1ccc(N2CCCS2(=O)=O)cc1
|
| 1332 |
+
CC1(C)CC(=O)CC(C)(C)N1Cc1ccccc1
|
| 1333 |
+
CCC(C)C(=O)OCC(C)(OC)c1ccc(C)cc1O
|
| 1334 |
+
CC(C)=C1C=CC(C)C2CCC(C)C2C1
|
| 1335 |
+
CC(C)(C)c1cc(-c2nnc(O)s2)cc(C(C)(C)C)c1O
|
| 1336 |
+
COc1ccc(N=O)c(C=CN(C)C)n1
|
| 1337 |
+
O=C(O)CCC(NC(=O)c1cncc(O)c1)C(=O)O
|
| 1338 |
+
CCOC(=O)C1C(=O)C=C(C=Cc2ccco2)CC1c1ccccc1
|
| 1339 |
+
COC(=O)C1C2C=CC3(CN(Cc4ccco4)C(=O)C13)O2
|
| 1340 |
+
CNC(=N)NC(CCCC(=O)CC(=O)CCc1ccc(O)c2c1CCCO2)CCn1ccnc1
|
| 1341 |
+
Clc1ccc(-c2cc(Cl)c(Cl)c(Cl)c2)cc1Cl
|
| 1342 |
+
C=CC1CC=C(CC)C(C(=O)OC)C2CC(=O)C=CCC12
|
| 1343 |
+
CCN1CCC[C@@H](c2cccc(O)c2)C1
|
| 1344 |
+
COCCN(CC1CCCN2CCCCC12)C(=O)c1ccccc1-n1nnnc1C
|
| 1345 |
+
Cc1cccc(CN2CCN3C(=O)NCC3C2)n1
|
| 1346 |
+
CC12CCC3c4ccc(O)cc4CCC3C1CCC2=O
|
| 1347 |
+
O=c1c(-c2ccccc2)coc2c(CN3CCOCC3)c(O)ccc12
|
| 1348 |
+
CC(C)(C)CNC1CC(Cc2cc(CN3CCOCC3)on2)C1(C)C
|
| 1349 |
+
COc1cc(CC=CC#Cc2ccc(O)cc2)ccc1O
|
| 1350 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
|
| 1351 |
+
CCCCCC1C=CC(CCCCCC(CO)C(=O)O)C(O)C1
|
| 1352 |
+
CC(CNc1ccc(C(=O)O)cc1O)C(=O)O
|
| 1353 |
+
COc1cc(OC)c2cc(C(=O)O)n(C)c2c1
|
| 1354 |
+
CC=CC1C=CC(=O)C(O)C1C(=O)C(C)O
|
| 1355 |
+
C=C1C(O)CC2C3(CO3)CC3OC(=O)C(C)C3CC12O
|
| 1356 |
+
CCCCCCC(C)CC(C)(C)C
|
| 1357 |
+
CC(=O)N=C(N)Nc1nc(C)c2cc(C)ccc2n1
|
| 1358 |
+
CCOC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1ccccn1
|
| 1359 |
+
N#CC(C(=O)O)C1C(=O)Nc2ccccc21
|
| 1360 |
+
CC(=O)C=Cc1ccc(O)c(O)c1
|
| 1361 |
+
N#Cc1ccccc1NC(=O)Nc1ccccc1O
|
| 1362 |
+
CCCCCCC=CC#CCCCCCCC1CC(CO)OC1=O
|
| 1363 |
+
CC(O)CCCC(O)C=CC1C(O)CC(=O)C1CC=CCCCC(=O)O
|
| 1364 |
+
Cn1c(O)c(C(=O)O)c(=O)c2ccccc21
|
| 1365 |
+
COc1c2c(cc3c1C(CC(=O)C=CC1=C(C)CCCC1(C)C)N(C)CC3)OCO2
|
| 1366 |
+
Cn1ccc2c(NC(=O)CC3NC(=O)N(CCc4ccccc4)C3=O)cccc21
|
| 1367 |
+
CC(O)c1c(-c2ccc(Cl)cc2)noc1C(=O)N1CCCC1
|
| 1368 |
+
C=C1CCC2COC(C3CCC=C(C)C3)C1C2
|
| 1369 |
+
CC(=O)C1CC(=O)C(C)C(Cc2ccccc2)C1
|
| 1370 |
+
O=C(Nc1ccc2c(c1)OCCO2)NC(Cc1c[nH]c2ccccc12)C(=O)O
|
| 1371 |
+
C=CC1(C)C=C2C(O)C3OC(=O)C4(CCCC(C)(C)C34)C2(O)CC1O
|
| 1372 |
+
CCCCSSCC
|
| 1373 |
+
NC(=O)[C@H]1CCCN1C(=O)[C@H](CCC(=O)O)NC(=O)[C@H]1CCC(=O)N1
|
| 1374 |
+
C=CCCCCC=CC=CC#CC=CC(=O)C=C
|
| 1375 |
+
CCCCCCCCCCCCCCCCCCCC(=O)CCCCCCCCCC(=O)O
|
| 1376 |
+
CC(=O)OC1CC(C)=C2C(C=C(C)C(=O)O)CCC(C)C21
|
| 1377 |
+
OC1C(NCc2ccncc2)C2COC(O2)C1N1CCOCC1
|
| 1378 |
+
CC(C)=CC(=O)C(C)c1ccc(C)c(O)c1
|
| 1379 |
+
CCCCCCN(CCCCCC)C(=O)NC(=O)C(F)(F)F
|
| 1380 |
+
COc1cc(N)c(Cl)cc1NC(=O)C1CCN(CC2CCC2)CC1
|
| 1381 |
+
CC1c2ccoc2CC2C1CCC1C(C)(C(=O)O)CCCC21C
|
| 1382 |
+
CCc1cnc(C)s1
|
| 1383 |
+
O=C1CN=C(c2ccccc2Cl)c2cc([NH+]([O-])O)ccc2N1
|
| 1384 |
+
CC(=O)N1CC2COCCN2C2(CN(CC(C)C)C2)C1
|
| 1385 |
+
CCCCCC1CCCC1
|
| 1386 |
+
Cc1ccc(C(=O)c2ccc3n2CCC3C(=O)O)cc1
|
| 1387 |
+
CN(C)CCN(C)CCC1CN(C(=O)Cc2cccs2)CCC1CC(=O)O
|
| 1388 |
+
Cc1ccc(O)c(C(O)Cc2ccccc2)c1
|
| 1389 |
+
COc1ccc(C(Cc2ccc(N)cc2)n2ccnc2)cc1
|
| 1390 |
+
O=C1CCc2ccc(O)c(OC3OC(CO)C(O)C(O)C3O)c2O1
|
| 1391 |
+
O=S(=O)([O-])Nc1ccc(-c2nc3ccccc3s2)cc1I.[Na+]
|
| 1392 |
+
NC(Cc1cnc[nH]1)C(=O)NC(CO)C(=O)O
|
| 1393 |
+
Cc1c(O)cc(O)c2c1C(C)(O)C(C)OC2=O
|
| 1394 |
+
CC1C(=O)OC2C1CCC(C)(O)C2O
|
| 1395 |
+
O=C1NC(Cc2c[nH]c3ccccc23)C(=O)N1CCc1ccccc1
|
| 1396 |
+
CCC=CC=CC=CC=O
|
| 1397 |
+
O=c1cc(-c2ccccc2)c2ccc(O)c(CN3CCCCC3)c2o1
|
| 1398 |
+
O=C(CCNC(=O)N1CCc2c([nH]c3ccccc23)C1)NC1CC1
|
| 1399 |
+
CCC(C)C1SCSS1
|
| 1400 |
+
Cc1cccc(Nc2c3ccccc3nc3ccccc23)c1
|
| 1401 |
+
CC1CC(O)C2C(C=O)=COC(OC3OC(CO)C(O)C(O)C3O)C12
|
| 1402 |
+
C[C@@H](NC(=O)c1ccccc1)C(=O)O
|
| 1403 |
+
CCCCOC(=O)CC(O)(CC(=O)OCCCC)C(=O)OCCCC
|
| 1404 |
+
O=C(O)C(=O)/C=C(\O)c1cccc(Br)c1
|
| 1405 |
+
COc1cc2c(cc1OC)CN(C(=O)NCCC(=O)O)CC2
|
| 1406 |
+
S=C=Nc1ccc(-c2noc(C3CCCCC3)n2)cc1
|
| 1407 |
+
CC(=O)NC[C@H]1CN(c2ccn(-c3ccc(F)cc3)c2)C(=O)O1
|
| 1408 |
+
CCCCCCCCCCCCC(O)C(O)CCC=CCCC=CCCC(O)CCCCCC(O)CC1=CC(C)OC1=O
|
| 1409 |
+
CCCCC1C(=O)C(C)=C(C)C1(O)CCCCCCC(=O)OC
|
| 1410 |
+
CC(=O)OC1CC(C)=CCCC(C)=CC2OC(=O)C(O)(CO)C21
|
| 1411 |
+
O=c1oc2cc(O)ccc2c2c(O)cc(O)cc12
|
| 1412 |
+
C=C(OC1C=CC(C(=O)CCC(=O)O)C(C(=O)O)C1O)C(=O)O
|
| 1413 |
+
CCc1cc(O)c(O)c(-c2cc(O)c(O)cc2CC)c1
|
| 1414 |
+
C=C1CC2C=CC3(CC(C)CC13)C1C2C1(C)C(N)C=O
|
| 1415 |
+
CCCC1NCCc2c1[nH]c1ccc(O)cc21
|
| 1416 |
+
COc1cc(C=CC=O)cc(CC2OC2(C)C)c1O
|
| 1417 |
+
CC1=CCC2C(C1)CN1CCc3c([nH]c4ccccc34)C21
|
| 1418 |
+
N=Cc1c[nH]c2ocnc12
|
| 1419 |
+
Oc1c(Cl)cc(I)cc1Cl
|
| 1420 |
+
CCCCCC=CCC(O)C(O)C=CC1CC1C1CCCC(=O)O1
|
| 1421 |
+
CCCCCCC=CCCC=CCCCCCCCCCCCCC=CCCC=CCCCC(=O)O
|
| 1422 |
+
CCCN(C)C(=O)c1c(-c2ccc(F)cc2)noc1C(C)O
|
| 1423 |
+
COC(=O)CCCCCCCCC(C)=O
|
| 1424 |
+
CC(CCC(=O)NC(CC(=O)O)C(=O)O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C
|
| 1425 |
+
COc1ccc(-c2nnc(C3CCN(Cc4ccccc4)C3)o2)cc1
|
| 1426 |
+
CCCCCCCCC=CCCCC1CCCC(C)N1
|
| 1427 |
+
COC(=O)CN1C(=O)N2CCc3c([nH]c4ccccc34)C2(C)C1=O
|
| 1428 |
+
CCCC1C(c2ccc(F)cc2)CC2C[C@@H](F)C1N2C
|
| 1429 |
+
CC12CCC3c4ccc(O)cc4CCC3C1CC(=NO)C2=O
|
| 1430 |
+
C1CCCC(C2CCCCCC2)CC1
|
| 1431 |
+
CCCCCCCCCCCCCCCCCCCCCC(O)CCO
|
| 1432 |
+
COC(=O)C(Cc1cccc(I)c1)NC(=O)OC(C)(C)C
|
| 1433 |
+
N/C(=N/CCCC(N)[PH](=O)O)N[N+](=O)[O-]
|
| 1434 |
+
O=C(NCC1(COc2cccnc2)CC(O)C(O)C1)c1cnccn1
|
| 1435 |
+
CC1=CC2OC3CC(OC(=O)C=CC=CC(O)C(C)O)C(C)(C2(C)CC1)C31CO1
|
| 1436 |
+
CCCCC1(CCCC)C(=O)N=C(Nc2cccc(OC)c2)N=C1O
|
| 1437 |
+
CCCCCC1SSC(C)S1
|
| 1438 |
+
COc1ccc2[nH]c3c(c2c1)CN(C(=O)C1CCCO1)CC3
|
| 1439 |
+
COc1ccc(C=CCOC2OC(CO)C(O)C(O)C2O)c(OC)c1OC
|
| 1440 |
+
O=C(O)c1cccc(C(=O)CO[NH+]([O-])O)c1
|
| 1441 |
+
O=S1(=O)CCC(Br)C1
|
| 1442 |
+
Cc1ccc(NC(=O)c2oc3ccccc3c2C)cc1
|
| 1443 |
+
C#CC=CCCCCCCCCCCCCCC=CCCCCC=CCCCCC#CC(O)C#CCCCCCCC=CC(O)C#C
|
| 1444 |
+
CCOC(=O)Cc1cc(O)cc(O)c1C(=O)CCCCCC(C)=O
|
| 1445 |
+
COC1(C)Oc2cc(O)cc3cc(C)nc(c23)C1=O
|
| 1446 |
+
OC1CN=C2C=CC=CN2C1
|
| 1447 |
+
CC1CCC2C3CCC4CC(O)CCC4(C)C3CCC2(C)C1OS(=O)(=O)O
|
| 1448 |
+
CCCCCCC=CCCCC=CCCC(=O)O
|
| 1449 |
+
CC1=C(CCC2=CC=CC(O)=COC2)C2(C)CCCC(C)(C)C2CC1
|
| 1450 |
+
Cc1cc(C)c(CC(O)c2cc3ccccc3o2)c(C)c1
|
| 1451 |
+
CCCn1cnc2c1c(=O)n(CCCCC(C)=O)c(=O)n2C
|
| 1452 |
+
C=C(C)c1ccc(O)c(OC)c1
|
| 1453 |
+
CCCCCCC=CCC=CCCCCCCCCC(=O)O
|
| 1454 |
+
Cn1c(NC(=O)C(F)(F)F)cc(=O)n(C)c1=O
|
| 1455 |
+
CCCCCCCCSCC(O)CN1CC2CC(C1)c1cccc(=O)n1C2
|
| 1456 |
+
COc1cc(O)c2c(c1C)C(C)(O)C(CO)OC2=O
|
| 1457 |
+
CC1(C)CCCC2(C)C1C(OC(=O)C=CC(=O)O)C=C1COC(=O)C12O
|
| 1458 |
+
NC(Cc1ccc(O)c(Br)c1)C(=O)O
|
| 1459 |
+
O=C(COC(=O)c1ccccc1)c1ccccc1
|
| 1460 |
+
CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1
|
| 1461 |
+
CC12CCC3c4ccc(OC#N)cc4CCC3C1CCC2=O
|
| 1462 |
+
CC1(C)CC2C1CC1OC1(C)C(=O)CCC2(O)CCl
|
| 1463 |
+
CCCCCCCCCCCCCCCCCCCC1Oc2c(C)c(C)c(O)c(C)c2S1
|
| 1464 |
+
CCCCCCCCC=CCCCc1cc(=O)c2ccccc2n1C
|
| 1465 |
+
CNC1CCC23CC24CCC2(C)C(C(C)NC(C)=O)CCC2(C)C4CCC3C1
|
| 1466 |
+
O=C(Nc1ccccc1C(=O)NC(Cc1ccc(O)cc1)C(=O)O)c1ccccc1
|
| 1467 |
+
OCC1OC(c2ccc(O)cc2)C=C1c1ccc(O)cc1
|
| 1468 |
+
Fc1ccc(-c2cc(NCCCCN3CCCC3)c3ccccc3n2)cc1
|
| 1469 |
+
CCOC(=O)C(NCC(O)COc1ccc(CCC(=O)OC)cc1)C(=O)OCC
|
| 1470 |
+
CCOC(=O)C1C2C=CC3(CN(CC4CCCO4)C(=O)C13)O2
|
| 1471 |
+
COc1ccc(C=C(C#N)C(=O)OC(C)C)c(OC)c1OC
|
| 1472 |
+
NC(CCCNO)C(=O)O
|
| 1473 |
+
COc1ccc(C=CCc2ccc(O)c(OC)c2)c(O)c1
|
| 1474 |
+
O=C1OCC(Cc2ccc3c(c2)OCO3)C1=Cc1cccc2c1OCO2
|
| 1475 |
+
CC(CCCC(=O)O)c1ccccc1
|
| 1476 |
+
CC1(C)CCCC2(C)C3CCC4(C)C(CC=C5COC(O)C54)C3=CCC12
|
| 1477 |
+
CCC(C)C(=O)OCC1CC(=O)OC(C)C1C
|
| 1478 |
+
CC(=O)OCCN(CCOC(C)=O)CC(c1ccccc1)c1ccccc1
|
| 1479 |
+
CCOC(=O)COc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
|
| 1480 |
+
S=C=NCCCCc1ccccc1
|
| 1481 |
+
COC(=O)CC(=O)OCC1=CCCC2C1(C)CC(O)C(C)C2(C)CCC1COC(OC)C1
|
| 1482 |
+
COc1cc(O)cc2oc(C)c(C)c(=O)c12
|
| 1483 |
+
COC(=O)C(NC(=O)c1ccccn1)C(C)O
|
| 1484 |
+
COc1cc(OC)cc(-c2cc(=O)c3c(OC)cccc3o2)c1
|
| 1485 |
+
NC(=O)CCNC(=O)C1=CC(N)C(O)C(O)C1
|
| 1486 |
+
Cc1ccccc1COc1ccc2oc(C)c(C(=O)O)c2c1
|
| 1487 |
+
CCCNC(=O)Nc1cccc2c1CN(C)CC2c1ccccc1
|
| 1488 |
+
C[C@H](N)Cn1ccc2cc(F)c(F)cc21
|
| 1489 |
+
CC(=O)C1C(O)CCC23OC2C(O)CC(C)C13C
|
| 1490 |
+
O=c1ccc2cc3c(-c4ccc5c(c4)OCCCO5)coc3cc2o1
|
| 1491 |
+
O=C1Cc2cc([NH+]([O-])O)ccc21
|
| 1492 |
+
COc1c2occc2cc2c(OCC=C(C)C)cc(=O)oc12
|
| 1493 |
+
CNC(=O)C(C1CC1)N1CCCC1C(=O)NC(C)C
|
| 1494 |
+
CN(C)C(=O)Oc1ccc2c(c1)OC(=Cc1cccc(Cl)c1)C2=O
|
| 1495 |
+
COc1cc(-n2cc(C(C)=O)c3ccccc32)ccc1C(N)=O
|
| 1496 |
+
CC1(C)CCc2cc(CC(=O)Nc3ccc4[nH]ccc4c3)ccc2O1
|
| 1497 |
+
CC1(C)CCc2c(c(O)cc3oc4cc(O)cc(O)c4c(=O)c23)O1
|
| 1498 |
+
N#Cc1cccc(CN2CC(O)CC2c2nc(C3CC3)no2)c1
|
| 1499 |
+
CC1CC=CC2(C)c3occc3CCC12C
|
| 1500 |
+
CCCc1cc(O)cc(OC2OC(CO)C(O)C(O)C2O)c1C(=O)O
|
| 1501 |
+
CCOc1ccccc1OC(c1ccccc1)C1CNCCO1
|
| 1502 |
+
N=Cc1cc2oncn2c1
|
| 1503 |
+
O=S1(=O)NC2CN(Cc3ccncc3Cl)CC2Oc2ncccc21
|
| 1504 |
+
CC(C)(CC(=O)O)Cc1nc2ccccc2n1Cc1ccc(O)cc1
|
| 1505 |
+
O=C(NCc1cccc(F)c1)n1ccnc1
|
| 1506 |
+
O=c1cc(-c2ccccc2)oc2cc(O)cc(OC3OCC(O)C(O)C3O)c12
|
| 1507 |
+
Cl.NC(N)=NC(=O)c1ccc2c(c1)C(O)c1c(Cl)cccc1-2
|
| 1508 |
+
CCC=CC(CC)CC(C)=CC1(CC)CC(C=CC(=O)O)(CC)OO1
|
| 1509 |
+
CC1(CO)CN(Cc2ccccc2)CC2CN(C(=O)C3CCCCC3)CCN21
|
| 1510 |
+
O=C(CCCc1nc(-c2cccnc2)no1)NCCc1c[nH]c2ccccc12
|
| 1511 |
+
CC(=O)OC1CCC2(C)C(CCC3C4CCC(=O)C4(C)CC(O)C32)C1
|
| 1512 |
+
COc1cc2oc(-c3ccccc3)cc(N)c-2c(=O)c1OC
|
| 1513 |
+
COC(=O)C1Cc2c([nH]c3ccccc23)C(c2ccccc2)N1
|
| 1514 |
+
CCC1=CC(O)CCC1=O
|
| 1515 |
+
Cc1c(CC(=O)N(C)CCc2ccccn2)c(=O)oc2cc(O)ccc12
|
| 1516 |
+
O=C(O)C=CC(=O)Nc1c(Cl)cc(Cl)cc1Cl
|
| 1517 |
+
C=Nc1noc2[nH]ccc12
|
| 1518 |
+
CC1CCCC=CCCC(=O)CCCC=CC=CC(O)CC=CC=CC(=O)O1
|
| 1519 |
+
CC1=CCC2CC1C2(C)C
|
| 1520 |
+
CCOC(=O)c1[nH]c2ccc3nc[nH]c3c2c1CCN1C(=O)c2ccccc2C1=O
|
| 1521 |
+
O=C(NC1COC2C(O)COC12)c1ccco1
|
| 1522 |
+
COc1oc(CCCCCC(C)O)c(C)c(=O)c1C
|
| 1523 |
+
COc1c(OC)c(OC)c(C(C)=O)c(OC)c1OC
|
| 1524 |
+
CC1OC(=O)Cc2c3c(c(O)c(O)c21)OC(C)(C)CC3
|
| 1525 |
+
CC(NC(=O)C1CCCCC1)c1c(-c2ccc(F)cc2)noc1C(=O)O
|
| 1526 |
+
CCCCCC=CCCCCCCCCc1cccc(O)c1O
|
| 1527 |
+
CC(=O)C1CC2C(C)(N=C=S)CCC(C(C)C)C2(O)C1O
|
| 1528 |
+
CCCc1nc(C)c(C)o1
|
| 1529 |
+
CC(C)=CCc1c(O)ccc2c1C(=O)C(O)=C(C)C2=O
|
| 1530 |
+
CC(=O)CC1CC1C1OC2=C(Br)CC(C=C=CBr)OC21
|
| 1531 |
+
CCCCC(=O)c1cc(O)cc(OC)c1
|
| 1532 |
+
O=C(COc1ccc(C(Cc2ccccc2)=NO)c(O)c1)OCc1ccccc1
|
| 1533 |
+
CC(O)C1NC(=O)CNC1=O
|
| 1534 |
+
O=C1CCc2c1c(CCCO)cn(CCO)c2=O
|
| 1535 |
+
O=C(Nc1cc(F)cc(F)c1)[C@H]1CCCC[C@H]1C(=O)O
|
| 1536 |
+
CCCCCCCCCCCCc1ccccc1C(SCCC(=O)O)[S+]([O-])CCC(=O)O
|
| 1537 |
+
CC#CC#CC#CC=CC(=O)CCCCO
|
| 1538 |
+
C=C1C(=O)OC2CC(C)C3C(O)CC(O)C3(C)C(OC(=O)C=C(C)C)C12
|
| 1539 |
+
CC1c2ccccc2CCN1c1ccncc1
|
| 1540 |
+
C=CC(C)(O)CCC1=C(C)C(O)CC(Br)C1(C)C
|
| 1541 |
+
C=CC(C)(O)CCC=C(C)C(O)C(=O)C=C(C)C
|
| 1542 |
+
COc1ccccc1C(=O)NCC1Cn2cc(-c3ccccc3)nc2CO1
|
| 1543 |
+
COc1cc(OC)c(C2COc3cc(O)cc(O)c3C2=O)cc1OC
|
| 1544 |
+
CC(=O)C=CC1CCc2c(cc(CC=C(C)C)c(O)c2C=O)O1
|
| 1545 |
+
Cc1nccc2onnc12
|
| 1546 |
+
O=C1C(O)=C(O)OC1C(O)CO
|
| 1547 |
+
CC1C(=O)Nc2cc(O)cc(c2O)C(Cc2ccccc2C(=O)O)CC=CC=CC1O
|
| 1548 |
+
O=C(NCc1ccc2c(c1)OCO2)c1c(O)c2cccc3c2n(c1=O)CCC3
|
| 1549 |
+
CNC1=CC(=O)C2(C)CCC1C2(C)C
|
| 1550 |
+
CC(O)C1(C)NC(=O)c2ccccc2N1
|
| 1551 |
+
COCOc1ccccc1[PH](=O)O
|
| 1552 |
+
CC1CCC2=C(COC2=O)C2(O)CC(C)(C)CC12
|
| 1553 |
+
CCCCCCCCCCCCC(C)CCC(=O)O
|
| 1554 |
+
O=C(c1cc(Cl)c(Cl)[nH]1)c1ccccc1O
|
| 1555 |
+
CC12COc3cc(CCc4ccccc4)cc(O)c3C1C2
|
| 1556 |
+
O=C(O)CC(=O)CBr
|
| 1557 |
+
COc1cc2c(c(O)c1O)C(=O)C(=O)C=C2
|
| 1558 |
+
Cc1ocnc2cnnc1-2
|
| 1559 |
+
CC1CCN(C2C(CNC(=O)N3CCOCC3)OC(CO)C2O)CC1
|
| 1560 |
+
O=P(O)(O)c1ccccc1OCCO
|
| 1561 |
+
CCC1CCCCN1Cc1c(O)ccc2c(=O)c(-c3nc4ccccc4s3)coc12
|
| 1562 |
+
OC1C=CC23c4cc5c(cc4CN(CC2O)C3C1)OCO5
|
| 1563 |
+
F[C]1[CH][CH][CH][C](C[NH+]2[CH][CH][CH][CH][CH]2)[CH]1
|
| 1564 |
+
CCCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC
|
| 1565 |
+
CC(=O)OCC(C)=CCC(O)C(C)=CCCC(C)=CCO
|
| 1566 |
+
COc1ccccc1OCCCN1CCN(C(=O)c2ccc(=O)[nH]n2)CC1
|
| 1567 |
+
C=C1CCC2(N=C=S)C(C)CCC3C(C)CC(CC(C)(C)NC=O)C1C32
|
| 1568 |
+
CC(C(=O)O)c1ccc(OC2OC(CO)C(O)C(O)C2O)cc1
|
| 1569 |
+
Cn1cnc2c1CCNC2
|
| 1570 |
+
COc1c(O)cc(C(=O)O)cc1CC=C(C)C
|
| 1571 |
+
CC(C)C(N)C(=O)OCCOCn1cnc2c(=O)nc(N)[nH]c21
|
| 1572 |
+
C=CC(C)(O)C=CC=C(C)C(O)CC=C(C)C
|
| 1573 |
+
COc1ccc(C2=CC=C(N)[NH+](CCCC(=O)O)N2)cc1
|
| 1574 |
+
CCCCCc1cc(=O)c2cccc(OC)c2n1C
|
| 1575 |
+
O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C1CC1
|
| 1576 |
+
CCN(CC)CC1OCC(NS(C)(=O)=O)C1O
|
| 1577 |
+
Cc1ccc(Cl)c(C)c1
|
| 1578 |
+
NS(=O)(=O)c1nnc(NC(=O)CNCC(=O)O)s1
|
| 1579 |
+
O=C1Nc2ccc(-c3ccsc3)cc2C(=O)N2CCN(C(=O)C3CCCN3)CC12
|
| 1580 |
+
CC(=O)OCCC(C)=C(Cl)C=CC(C)(Cl)CBr
|
| 1581 |
+
N=Cc1cc2ncon2c1
|
| 1582 |
+
CCC(=O)N(CCC(Cc1ccccc1)c1ccco1)Cc1ccco1
|
| 1583 |
+
CC#Cc1ccc(-c2ccc(C(CCl)OC(C)=O)s2)s1
|
| 1584 |
+
COC(=O)C(C)C1CCC(C)(CCC=C(C)CCC=C(C)CCC=C(C)C)OO1
|
| 1585 |
+
CC(O)C(O)C12C(=O)OC(C=CC1C)C2O
|
| 1586 |
+
COC1CCC(=CC#N)C(OC2OC(CO)C(O)C(O)C2O)C1
|
| 1587 |
+
Nc1ccc2nc(NCCCN3CCOCC3)oc2c1
|
| 1588 |
+
CC#Cc1ccc(C=O)s1
|
| 1589 |
+
O=c1cc(-c2cccc(O)c2)oc2c1ccc1ccccc12
|
| 1590 |
+
CC1(c2ccncc2)CCC(=O)NC1=O
|
| 1591 |
+
O=P(NCc1ccccc1)(c1ccccc1)c1ccccc1
|
| 1592 |
+
O=C(NC1CCN2C(=O)c3ccccc3NC(=O)C2C1)c1cccs1
|
| 1593 |
+
O=C(O)Cc1cccc2nc3ccccc3nc12
|
| 1594 |
+
CC(C)=CCc1c(C=O)cc(O)c2[nH]c3ccccc3c12
|
| 1595 |
+
[O-][NH+]1CC=CC=C1SSC1=CC=CC[NH+]1[O-]
|
| 1596 |
+
COc1ccc(N2CN(C=O)c3ccc(OC)cc3C2=O)c(CO)c1
|
| 1597 |
+
CCP(=O)(OC)C(=O)C(C)(C)C
|
| 1598 |
+
CCCC(=O)c1ccc(O)cc1O
|
| 1599 |
+
CCC(CC)C(=O)N1CCC(CC(=O)O)C(CC2=NCCc3ccc(C)cc32)C1
|
| 1600 |
+
CC(C)=CCCC(C)(O)C1CC(=O)C(C)=CC1O
|
| 1601 |
+
COc1cccc2oc3ccc(O)cc3c(=O)c12
|
| 1602 |
+
Cc1nc(C(=O)O)sc1CCOP(=O)(O)O
|
| 1603 |
+
O=C1CC2C(O)C=C(CO)C2(CO)O1
|
| 1604 |
+
COc1ccc2c(ccn2CCC(=O)NC(Cc2ccccc2)C(=O)O)c1
|
| 1605 |
+
CCCCCCCCCCCCCCCCCCCC1CC(O)CC(=O)O1
|
| 1606 |
+
C=Cc1ccc(O)c(C(=O)C=C(C)C)c1
|
| 1607 |
+
COCC(=O)N1CCN2C(=O)N(CC(N)=O)C(=O)C2C1
|
| 1608 |
+
CCOC(=O)c1ccc([NH+]([O-])O)cc1NC(=O)c1ccccc1
|
| 1609 |
+
CC(C)(C)CC(=O)NC(Cn1cc(F)c(=O)[nH]c1=O)C(=O)O
|
| 1610 |
+
O=P(N1CC1)(N1CC1)N1CC1
|
| 1611 |
+
Cc1[nH]cnc1CSCC/N=C(\N)NCCSCc1nccs1
|
| 1612 |
+
CC1CC(=O)c2c(O)cc(O)cc2O1
|
| 1613 |
+
CC1=CC(O)CC(C)=CC(O)CC(C)(O)C=CC(C(C)C)CC1
|
| 1614 |
+
CC(=O)CCc1oc2ccc(C)cc2c1-c1ccccc1
|
| 1615 |
+
Cc1cccc(OC2COCCN(C(=O)c3cnccn3)C2)n1
|
| 1616 |
+
O=C(c1ccccc1)N1CCc2[nH]c3c(Br)cccc3c2C1
|
| 1617 |
+
CC(=O)Nc1ccc2c(c1)Cc1cc(NC(C)=O)ccc1-2
|
| 1618 |
+
NC(=O)CCNC(=O)C1=CC(NC(=O)c2cccc(F)c2)C(O)C(O)C1
|
| 1619 |
+
CC1=C2CC3C(C)(C=CC(O)C34CO4)CC2OC1=O
|
| 1620 |
+
COc1ccc(CO)c2c1Nc1c(C(=O)O)cccc1N2C(=O)CO
|
| 1621 |
+
COc1ccc(C=CC(=O)c2c(O)cc(OC)c(O)c2OC)cc1
|
| 1622 |
+
O=C(O)COc1ccc2c(c1)OC(=Cc1cccc(Br)c1)C2=O
|
| 1623 |
+
CN1C(=O)COCC1C(O)c1ccc(NC(=O)c2ccco2)cc1
|
| 1624 |
+
CCOP(=O)(CCN)OCC
|
| 1625 |
+
Cc1cc2oc(=O)cc(CN3CCCCC3C)c2cc1O
|
| 1626 |
+
CC1(C)CCCC2(C)C1CCC(C)(O)C2CC(=O)c1ccoc1
|
| 1627 |
+
CCN1C(=O)C(Cc2c[nH]c3ccccc23)NC1=S
|
| 1628 |
+
CCCCCCC1CCCCCCCCCCC(=O)OC2C(O1)OC(CO)C(O)C2O
|
benchmark/data/train_smiles.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
benchmark/data/val_smiles.txt
ADDED
|
@@ -0,0 +1,1627 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
C=CCON=C(CCC)C1C(=O)CC(C)(C)C(C(=O)OC)C1=O
|
| 2 |
+
CC(=O)OC1c2c(C)coc2CC2C(=O)CCC(C)C21C
|
| 3 |
+
COc1ccccc1-c1nnc(N)[nH]1
|
| 4 |
+
CCN(CC)CCOC(=O)C(C)Oc1ccc(Cl)cc1
|
| 5 |
+
COc1ccc(CCNC(=O)c2ccc3ccn(C)c3c2)cc1OC
|
| 6 |
+
Cc1ccc(C2CC(O)C(O)C2NCC(C)(C)O)cc1
|
| 7 |
+
COc1ccc(OC)c(-c2oc3ccccc3c(=O)c2O)c1
|
| 8 |
+
CC1CC(=O)OCC1O
|
| 9 |
+
NC(=O)C1(O)CC(O)C2OC21
|
| 10 |
+
C#CCCCCCCCCCC=C1C(=O)OC(=C)C1OC
|
| 11 |
+
C#CC1CN2CCC1CC2CNC(=O)N1CCOCC1
|
| 12 |
+
COc1ccc(CNCC(CO)C2(c3ccccc3)CCOC(C)(C)C2)cc1
|
| 13 |
+
OCC1OC(n2cnc3c(NC4CCCC4O)ncnc32)C(O)C1O
|
| 14 |
+
Ic1ccc(NCn2nnc3ccccc32)cc1
|
| 15 |
+
CCC(C)CCc1oc(=O)c(C)c(O)c1C
|
| 16 |
+
C=C(C)C1CCC(C)(O)C1C
|
| 17 |
+
c1ccc(C2=NN(c3ccccc3)C2)cc1
|
| 18 |
+
Cc1ccc(OP(=O)(Oc2ccc(C)cc2)Oc2ccc(C)cc2)cc1
|
| 19 |
+
Cc1c(O)cc(O)c(C=O)c1C
|
| 20 |
+
O=C(NCC1CCCO1)c1c(O)c2ccccc2[nH]c1=O
|
| 21 |
+
COC1CC(=O)OC(C)CCCCCC(O)C1=O
|
| 22 |
+
COc1cc2c(c(O)c1OC)C(C)N(C)CC2O
|
| 23 |
+
COc1ccc2[nH]c3c(NN)nncc3c2c1
|
| 24 |
+
O=C(O)CCc1ccc(O)cc1O
|
| 25 |
+
COc1ccccc1C=CC(=O)c1ccc(NC(C)=O)cc1O
|
| 26 |
+
C#CCN(C)Cc1nc(C2(O)CCN(C(=O)C3CCCCC3)CC2)cs1
|
| 27 |
+
CCC1CN2CCC3(C(=O)Nc4cc(OC)ccc43)C2CC1CCO
|
| 28 |
+
COC(=O)c1c(O)cc(O)c(CC=C(C)CCC=C(C)CCC=C(C)C)c1C
|
| 29 |
+
CCCCCC[N+](C)(C)CCO
|
| 30 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCCCCCCCCC
|
| 31 |
+
CCNC(=O)/C=C(C)/C=C/CC(C)CCCC(C)(C)OC
|
| 32 |
+
O=C(NCCc1ccccc1)c1cc2ccc(O)c(O)c2cn1
|
| 33 |
+
CC1CCOC1=O
|
| 34 |
+
O=C(OCCO)c1cc(O)c(CO)cc1C(=O)c1ccccc1O
|
| 35 |
+
CCCC(CC(CCc1ccc(OC)c(O)c1)OC(C)=O)OC(C)=O
|
| 36 |
+
COc1c(C)c(O)cc2cc(CC(C)O)oc(=O)c12
|
| 37 |
+
C=C1CCCC(C)(C)C1CCC(C)=CCO
|
| 38 |
+
CNCC(c1ccccc1)c1ccccc1
|
| 39 |
+
CCCCCCCCc1cc2cn(C3CCC(CO)O3)c(=O)nc2o1
|
| 40 |
+
CC(C)CCn1c(N)nc2c1c(=O)n(C)c(=O)n2C
|
| 41 |
+
O=C(Cc1ccc(Cl)cc1)NC1C(c2cncnc2)CC(O)C1O
|
| 42 |
+
CC(C)=CCCC(C)=CCOc1cc(O)c(C(=O)c2ccccc2)c(O)c1
|
| 43 |
+
COc1cc(C=CC(=O)NC2=C(O)CCC2=O)ccc1O
|
| 44 |
+
CCCCCCCCCCCCCCCC(=O)OCCC
|
| 45 |
+
Oc1ccc(-c2ccc(-c3ccc(O)nn3)cc2)nn1
|
| 46 |
+
CC(=O)C1CCC(C)C1c1occc1C(C)C
|
| 47 |
+
CCCCCC=CCC1OC(C=C=CBr)CC1OC(C)=O
|
| 48 |
+
C=C1C(=O)OC2CC(=C)C3C(OC(C)=O)CC(C)(O)C3CC12
|
| 49 |
+
CNC1C(O)CC(N)C(OC2OC(CN)=CCC2N)C1O
|
| 50 |
+
O=C(CC(CO)C(=O)c1ccc2c(c1)OCO2)c1ccc2c(c1)OCO2
|
| 51 |
+
CC(C)C(N)C(=O)NC(C(=O)O)C1CC(O)C(O)CN1
|
| 52 |
+
COc1cc(C=CC(=O)OCC2CCCN3CCCCC23)ccc1O
|
| 53 |
+
CC1CCC(C)(COC(=O)CCc2ccccc2)C1(C)C
|
| 54 |
+
CCCOC(=O)C(Cl)(C(F)(F)F)C(F)(F)F
|
| 55 |
+
COc1c(C)c(O)c(C=O)c2c1C(=O)OC2
|
| 56 |
+
Nc1ccc(F)c(CO)c1
|
| 57 |
+
O=C(/C=C/c1ccc(OCc2ccccc2)cc1)N(O)CCc1ccccc1
|
| 58 |
+
CC=Cc1ccc(OC)cc1
|
| 59 |
+
CCC=CCCCCO
|
| 60 |
+
CC(C)C1CCC2(CO)CCC3(C)CC(O)C4(C)CC4C3C12
|
| 61 |
+
COc1ccc(-c2coc3cc(O)c(OC)cc3c2=O)c(O)c1
|
| 62 |
+
C=C(COCC(O)COC(=O)CCCC=CCC=CCC=CCC=CCCCCC)C(=O)O
|
| 63 |
+
C=CC1(C)CC2(O)OC(=O)C(C)=C2CC1C(=C)C(=O)OC
|
| 64 |
+
CCCCCn1cnc2c(S)nc(N)nc21
|
| 65 |
+
COc1c(C)c(O)c2c(c1C(=O)O)C(O)OC2
|
| 66 |
+
CNC(Cc1c[nH]c2cccc([NH+]([O-])O)c12)C(=O)O
|
| 67 |
+
O=c1ccn(C2OC(CO)C(O)C2O)c(=O)[nH]1
|
| 68 |
+
COc1ccc(-n2cc(-c3ccccn3)nn2)cc1N
|
| 69 |
+
O=Cc1ccc(O)c(Br)c1
|
| 70 |
+
CC(=CCO)CCCC(C)(C)O
|
| 71 |
+
O=C1CC2CCN(Cc3ccccc3)CC2CCN1C1CCCCC1
|
| 72 |
+
CC(C)=CCCC(=CCCC(=CCCC(=CCO)CO)CO)CO
|
| 73 |
+
S=CC=NCc1ccccc1
|
| 74 |
+
CCC(C)CCC1=C(C)C(=O)C(=O)c2c1[nH]c1ccccc21
|
| 75 |
+
CC=CC=CC=CC(=O)CC=C1CC=C(O)C(OC)C1
|
| 76 |
+
COc1ccc2c(c1)OC1C3C=CC(OC)C=C3OCC21O
|
| 77 |
+
CC(O)C(=O)OC1C=CC(CC(=O)C(=O)O)(C(=O)O)C=C1
|
| 78 |
+
CCCC(C)CCCCC(C)CC
|
| 79 |
+
COc1ccc2c3c1OC1C(=O)CCC4C(C2)N(C)CCC314
|
| 80 |
+
CCCCN(C)Cc1cc(=O)oc2ccc3ccccc3c12
|
| 81 |
+
COC1CC2C(=O)N3CCN(C(=O)NC(C)C)CC3C(=O)N2C1
|
| 82 |
+
CCCCCCCC(=O)c1c(O)cccc1O
|
| 83 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCC1OC(=O)COC1=O
|
| 84 |
+
CC(C)[C@H](NC(=O)[C@@H](S)Cc1ccccc1)C(=O)N1CCCC1C(=O)O
|
| 85 |
+
CCCCOCC(C)OCC(C)OCC(C)OCC(C)OCC(C)O
|
| 86 |
+
CCc1ccc(C)c(O)c1
|
| 87 |
+
CC(=O)Nc1ccc(-c2ccc(N=Nc3ccccc3)cc2)cc1
|
| 88 |
+
CCCCCC=CCC=CC=CC(O)C(O)C=CCCCC(=O)O
|
| 89 |
+
Cc1ccccc1CN1CCN2C(=O)N(Cc3ccccn3)CC2C1
|
| 90 |
+
CC(C)CC(NC(=O)C(CC(=N)O)NC(=O)CN)C(=O)O
|
| 91 |
+
CC=Cc1cc(OC)c2oc(-c3ccc(O)cc3O)cc2c1
|
| 92 |
+
O=C1CCCCCCCC=CCCCCCCC1
|
| 93 |
+
CC12CCC3C(C)(CO)C(O)CCC3(C)C1CC(C1=CCOC1=O)O2
|
| 94 |
+
CCCCCC(O)C=CC(=O)CCCCCCCCC(=O)O
|
| 95 |
+
COC1C=CC2(C(=O)OCc3cc4c(cc32)OCO4)C(N(C)C=O)C1
|
| 96 |
+
O=C(O)C=Cc1ccc2ccccc2n1
|
| 97 |
+
CC(=O)c1ncco1
|
| 98 |
+
CC(C)N(C(=O)CS(=O)(=O)O)c1ccccc1
|
| 99 |
+
C=CC(C)(O)CCC1(C)C(C)CCC2(C)C(C)=CCC(C)C21
|
| 100 |
+
CO[C]1[CH][CH][C]([C]2[NH2+][CH][CH][CH][C]2[O-])[CH][C]1N
|
| 101 |
+
Fc1ccc2[nH]c(C3CCN(Cc4ccccn4)C3)nc2c1
|
| 102 |
+
C=C1CCC2C3(C)COC3CCC2(C)C1CC=C1C(=O)OCC1O
|
| 103 |
+
Cc1cc(C)nc(NC(Cc2ccccc2)C(=O)O)n1
|
| 104 |
+
CCOc1ccc(Cc2ccc(NC3=NCCN3)cc2)cc1
|
| 105 |
+
COC1CCC(OC2CCC(O)C(C)O2)C(C)O1
|
| 106 |
+
COc1ccc2c(O)c(C(=O)O)cnc2c1
|
| 107 |
+
CC[C]1[CH][CH][CH][NH+](C[C]2[CH][CH][C](Cl)[CH][CH]2)[CH]1
|
| 108 |
+
CCSC(SCC)C(CCC(O)CNC(C)=O)NC(C)=O
|
| 109 |
+
CC1Oc2c(O)cc3ccc(=O)oc3c2C1(C)C
|
| 110 |
+
C=CCC1(Cl)C(O)=C(Cl)C(=NCCCC(=O)OC)C1(OC)OC
|
| 111 |
+
Clc1cn2ccsc2n1
|
| 112 |
+
CN(C)Cc1ccccc1Sc1ccc(Br)cc1N
|
| 113 |
+
CC(C)CCCC(=O)CCCCCCC(=O)CCN(O)C(=O)C(N)CO
|
| 114 |
+
C=C1C(=O)OC2C=C(CO)CCC=C(C)CC(OCC(C)C)C12
|
| 115 |
+
CC(=O)NC(CCC(O)=CNN)C(=O)O
|
| 116 |
+
CCCCCCCCC=CCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCC=CCCCCCCCC
|
| 117 |
+
C=CC(C)(CCC1C2(C)CCC(O2)C1(C)CCC=C(C)C)OC(C)=O
|
| 118 |
+
COC(=O)C1Cc2c([nH]c3ccccc23)C(C)N1
|
| 119 |
+
c1coc(-c2ccc(C3=Nc4cccc5cccc(c45)N3)cc2)c1
|
| 120 |
+
COc1cc2c(cc1O)C1CCc3cc(OC)c(O)cc3N1CC2
|
| 121 |
+
C#CC#CCCCC=CCCCC(=O)NCC(C)C
|
| 122 |
+
CCCCCCCCC(C)CCCCCCCC(=O)OC1C(O)C(O)C(O)C(O)C1OC1OCC(O)C(O)C1O
|
| 123 |
+
Nc1nc(O)c2ncn(CCC(CO)CO)c2n1
|
| 124 |
+
CCOC(=O)C1(CCCc2ccc(Cl)cc2)OC12CCCCC2
|
| 125 |
+
COc1ccccc1N1CCN(CCN2C(=O)CC3(CCCC3)CC2=O)CC1
|
| 126 |
+
CC(=O)OCC(C)CCCC(C)C1CCC(C)C12CC=C(C)C(O)C2
|
| 127 |
+
CC1=C(C(=O)O)C2(C)CCCC(C)(C(=O)O)C2CC1O
|
| 128 |
+
CC1=CCC(C)(C)C2CCC(C)C2(O)CC1
|
| 129 |
+
CC1(C)CCCC2(C)CC=C(C=O)C3CC312
|
| 130 |
+
CCOc1cc(OC)c(CC(C)N)cc1OC
|
| 131 |
+
CC1CC2OC2(C)CCC(=O)C2CC(C)(C)C12
|
| 132 |
+
O=Cc1cc(O)c2[nH]c3ccccc3c2c1
|
| 133 |
+
CC(C)Oc1ccc(CNCC2CCOC(C)(C)C2)cc1
|
| 134 |
+
O=C(O)c1ccc(Nc2ncc(F)c(Nc3ccccc3F)n2)cc1
|
| 135 |
+
CCc1[nH]c(O)nc1C(=O)NCCCn1ccnc1
|
| 136 |
+
Cc1cc(-c2ccc(N)c(C)c2)ccc1N
|
| 137 |
+
CCOC(=O)c1c(O)cc(O)cc1CCCCCCCC(C)O
|
| 138 |
+
CCCCCCCCCCC(=O)CC(=O)N[C@H]1CCOC1=O
|
| 139 |
+
CC12CCC3C(C(=O)CC4CC(=O)CCC43C)C1CCC2=O
|
| 140 |
+
Nc1ccc([N+](=O)[O-])cc1C(=O)O
|
| 141 |
+
C#Cc1cc(O)nc(O)n1
|
| 142 |
+
NC(Cc1ccc(S(=O)(=O)O)cc1)C(=O)O
|
| 143 |
+
COC(=O)C=CC(O)=C1C(=O)Oc2cc(O)c(O)cc21
|
| 144 |
+
COC(=O)C(C)=CCCC(C)=CCC1(CC(=O)O)CC(=O)CCC1=O
|
| 145 |
+
O=C(O)CNC(=O)c1ccc(C(O)c2ccccc2)cn1
|
| 146 |
+
CCCC(=O)C(CC)Sc1ccoc1C
|
| 147 |
+
O=c1cc2c3c(ccn2CCCO)cnc3c1O
|
| 148 |
+
COc1ccc(CCNC(=O)CCCc2c[nH]c3ccccc23)cc1
|
| 149 |
+
COc1ccc2oc(C)c(C(=O)Nc3ccccc3C)c2c1
|
| 150 |
+
CC=C(C)C(=O)OC1c2occ(C)c2CC2(C)C(C)CCCC12
|
| 151 |
+
CCCCN(C(C)=O)C(CC)C(=O)NCc1ccccc1
|
| 152 |
+
Cc1cc(O)c2c(c1)C1C(C(C)C)CCC1(C)C(O)C(O)C2=O
|
| 153 |
+
CCCCCC(CC)OC1OC(COC(=O)CC(C)(O)CC(=O)O)C(O)C(O)C1O
|
| 154 |
+
CC1C(=O)OC2C=C(CO)CCC=C(C=O)CC(O)C21
|
| 155 |
+
CCC1(c2ccc(N)cc2)CCC(=O)NC1=O
|
| 156 |
+
C[C]1[CH][CH][NH+](C[C]2[CH][CH][CH][CH][C]2F)[C](C)[CH]1
|
| 157 |
+
NC(N)=NCCCCC(N)C(=O)O
|
| 158 |
+
CC(C)=C1CCC2=CC(=O)CC(C)C2(C)C1
|
| 159 |
+
C=CC1(C)CC(O)C2C(O)(CCC3C(C)(C)CCCC32C)C1
|
| 160 |
+
CCCCCCC=CCCCCCCCc1cc(O)cc(OC(C)=O)c1
|
| 161 |
+
C=C1C=CC(C(=C)C)CC1
|
| 162 |
+
CN(C)/N=N/c1ccccc1C(N)=O
|
| 163 |
+
CC(C)(C)NC(=O)CC1CCNCC1Cc1cc(C(C)(C)C)on1
|
| 164 |
+
CC(=O)NC1C(O)C=C(CO)C(O)C1O
|
| 165 |
+
CC(C)=CCc1ccc2[nH]ccc2c1
|
| 166 |
+
CC1CCCC1C
|
| 167 |
+
CC1=CCC2C(C)(C)CC(O)CC2(C)C1CCC(C)CCO
|
| 168 |
+
CC1CC2OC(=O)C3=CCCC(C1(C)CC(O)C1=CC(=O)OC1O)C32C
|
| 169 |
+
Cc1cccc(Nc2cc(Cl)nc(SCC(=O)O)n2)c1C
|
| 170 |
+
CCCCCCCCCCCCCCCCc1ccc(CC(=O)O)o1
|
| 171 |
+
CC=C(C)C(=O)OCC1=CCN2CCC(OC(=O)C=C(C)CO)C12
|
| 172 |
+
COc1cc2c(c(OC)c1OC)C(=O)C(Cc1ccc(O)cc1)CO2
|
| 173 |
+
Oc1ccc(C=Nc2ccccc2)cc1
|
| 174 |
+
O=C(O)CCCCCNc1ccc(C(=O)O)cc1
|
| 175 |
+
CCCCCCCCCCc1ccc2c(c1)N(C)[C@@H](C(C)C)C(=O)N[C@H](CO)C2
|
| 176 |
+
C=CCCCCC=CC#CCCCCCCCCC1CC(CO)OC1=O
|
| 177 |
+
C=C1OC(=O)C(C(C=CC)C=CC(=O)C=O)C1=O
|
| 178 |
+
CC(=O)c1c(O)cc(O)cc1CC1Cc2cc(O)cc(O)c2C(=O)O1
|
| 179 |
+
CCC(C)C(NC(=O)NCC(C)C)C(=O)O
|
| 180 |
+
COc1cc(O)c2c(c1)CCCCCCCCCCCCCC(C)OC2=O
|
| 181 |
+
NC(=O)c1ccccc1NC(=O)c1ncn2c(=O)n(CCCl)nnc12
|
| 182 |
+
CC1Cc2cc(O)cc(O)c2C(=O)O1
|
| 183 |
+
COc1c(Br)cc(C=CC(=O)NCCCNCCCCNCCCN)cc1Br
|
| 184 |
+
CC1Cc2c(O)c(O)cc(O)c2C(=O)O1
|
| 185 |
+
CCCCCCCCC=C=CCCCCCCCCCCCCCCCCCCCC
|
| 186 |
+
CC(C)=CCC1C(=O)C(C(=O)CCC(C)C)=C(O)C1O
|
| 187 |
+
COc1c(C)c(O)c2c(=O)c(O)c(-c3ccccc3)oc2c1C
|
| 188 |
+
CC#CC#CC#CC=CC1OCCCC1O
|
| 189 |
+
CN(C)Cc1c[nH]c2ccccc12
|
| 190 |
+
Oc1ccc(O)nc1
|
| 191 |
+
CCCCOCCn1c(N2CCCN(C)CC2)nc2ccccc21
|
| 192 |
+
CCCCCCCCCCCCCCCC(=O)OCC1(O)OCC(O)C(O)C1O
|
| 193 |
+
CC(C)=CCCC(C)=CCc1cc2c(cc1O)oc(=O)c1c3ccc(O)cc3oc21
|
| 194 |
+
CCCCCC1=C(C(=O)OCC)C(c2cccc(C)c2)NC(=O)N1
|
| 195 |
+
CC(C)CC(=O)OC1CC2CC(O)C(C1)N2C
|
| 196 |
+
CC(O)C(O)C(O)CO
|
| 197 |
+
CCC(C)C=Cc1cc2cc(O)c(C)c(O)c2c(=O)o1
|
| 198 |
+
CCC(C)=CCC(O)CCCCCCCCCCCCCCCOC(C)=O
|
| 199 |
+
CCCCCCCCCCCCCCCC(=O)OCC(C=COC(C)=O)=CCC1C(C)=CCCC1(C)C
|
| 200 |
+
NCCCCCCCCCCC(=O)O
|
| 201 |
+
C=C1CCC2OC2(C)C(O)CC2C(C=O)=CC(C=C(C)C)C(O)C12
|
| 202 |
+
Cc1cc(C)c(C#N)c(SCc2cc(=O)oc3cc(O)c(O)cc23)n1
|
| 203 |
+
O=Cc1ccc(CO)n1CCc1ccc(O)cc1
|
| 204 |
+
CC(CC1=C(CO)COC1=O)C1=CC(C)(C)CC1
|
| 205 |
+
COc1ccc2c(=O)c(-c3ccccc3)c(C)oc2c1
|
| 206 |
+
C[n+]1ccn(COCCCS(C)(=O)=O)c1/C=N/O.[Cl-]
|
| 207 |
+
COc1ccc(-c2oc3cc(OC)ccc3c(=O)c2O)cc1
|
| 208 |
+
C=C1CCC2C(C)(C)C(O)CCC2(C)C1CCC(C)CCO
|
| 209 |
+
C=CC=CC1OC(C)CC1O
|
| 210 |
+
Cc1cccc(O)c1-c1nc2c(C(=O)O)cccc2o1
|
| 211 |
+
COc1cccc(CO)c1O
|
| 212 |
+
OCC1CCC(O)CN1
|
| 213 |
+
O=C1c2ccccc2CC2C=CC=CC12
|
| 214 |
+
CC(C)NCC(O)COC(=O)c1ccccc1Cl
|
| 215 |
+
O=C1c2ccccc2-c2c1c1ccccc1c(=O)n2CCCn1ccnc1
|
| 216 |
+
CCCCN1CCC[C@H]1CNC(=O)c1cc(SC)cc2c1OCCN2C
|
| 217 |
+
COc1cc(O)cc2c1CCN(C)C2
|
| 218 |
+
O=C(Cc1ccccc1)NNc1ccc([NH+]([O-])O)cc1
|
| 219 |
+
COC(=O)c1ccc2c(c1)-c1cc(O)ccc1OC2(C)C
|
| 220 |
+
CC(=O)CC(C[NH+]([O-])O)c1ccccc1
|
| 221 |
+
COc1ccc2ccc3oc([N+](=O)[O-])cc3c2c1OC
|
| 222 |
+
NCCN(C(=O)c1ccc(Cl)cc1Cl)c1ccc(F)cc1
|
| 223 |
+
CC1=CCCC2(C)OC2CC(=C(C)CO)C(=O)C1
|
| 224 |
+
CC=C(C)C(=O)Oc1ccc(OC(=O)C(C)C)cc1C1OC1C
|
| 225 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(C)=O
|
| 226 |
+
C=CCc1ccc(O)c(-c2cc(OC(O)C=C)ccc2O)c1
|
| 227 |
+
CCc1c(C)c(O)cc2c1C=CC1C(C)(C)C(=O)CCC21C
|
| 228 |
+
CCCCCCCCCCCCCCCC1OC(COP(=O)([O-])O)CS1.[Na+]
|
| 229 |
+
CN(C)Cc1ccccc1-c1ccc2n(c1=O)CC1CC2CN(C(=O)CSCC(=O)O)C1
|
| 230 |
+
CC(C)=CCCC(C)=CCCC(C)CC(=O)O
|
| 231 |
+
CCN1CCCC1CNC(=O)COc1cc(O)c2c(c1)OC(C)(C)CC2=O
|
| 232 |
+
CC1(C)CCC2(CCC(C)(C)O2)O1
|
| 233 |
+
C=CC(C1=CC(O)C(OC)=CC1OC)c1ccccc1
|
| 234 |
+
C#CCn1ccc2cc(C(=O)OC)ccc21
|
| 235 |
+
CC(=O)Oc1cc(O)c2c(c1)OC(c1ccccc1)CC2=O
|
| 236 |
+
Nc1ncc(-c2ccc(O)cc2)nc1Cc1ccccc1
|
| 237 |
+
COc1ccccc1CCN1CC(C(=O)NCC2CCCN3CCCCC23)CC1=O
|
| 238 |
+
CCCCc1cc(OC)c(OC)cc1OC
|
| 239 |
+
COc1cc(OC)c2c(C)c(CC(=O)NCCC(=O)O)c(=O)oc2c1
|
| 240 |
+
Cc1ccc2c(c1)C(=O)CCC2C
|
| 241 |
+
Cc1cc(N)ccc1C(=O)OCC(O)CNC(C)(C)C
|
| 242 |
+
CCN(CC)C(=O)C=C(C)C(F)(F)F
|
| 243 |
+
CC(=O)OC1OC2OC=C(C(=O)CC=C(C)C)C3CCC1C23
|
| 244 |
+
CCCCC(=O)N1CSC[C@H]1C(=O)N1CCCC1
|
| 245 |
+
COc1c(C=CC=O)cc2c(c1OC)OCO2
|
| 246 |
+
O=C(NCC1CC2CCN1CC2CN1CCOCC1)c1ccco1
|
| 247 |
+
COC(=O)c1ccc(CCCCCCCCCCC(C)=O)cn1
|
| 248 |
+
CNc1ccccc1C(=O)CC(NC(C)=O)C(=O)OC
|
| 249 |
+
O=C(O)CSCC(=O)N1CC2CC(C1)c1ccc(-c3cccnc3)c(=O)n1C2
|
| 250 |
+
C=C1C(=O)OC2CC(=C)C3CCC4(C)OC34CC12
|
| 251 |
+
CC(=O)Nc1ccccc1-c1onc(-c2ccccc2)c1-c1ccccc1
|
| 252 |
+
CCC(=O)C(C)C
|
| 253 |
+
CCn1nnnc1Cc1ccc2[nH]cc(CCN(C)C)c2c1
|
| 254 |
+
COC1COCCN(C(=O)c2cccc(F)c2)C1
|
| 255 |
+
COc1ccc2c(C)c(CC(=O)NC(C)C)c(=O)oc2c1OC
|
| 256 |
+
COc1cc(OC)c(C(=O)CC(C)=O)c(OC)c1
|
| 257 |
+
CC(=O)OCC=C(C)CCC1C(C)=CCC2C(C)(C)C(OC(C)=O)CCC12C
|
| 258 |
+
CC1=C(C)C(=O)C(CCC(C)(O)CCC(=O)O)=C(C)C1=O
|
| 259 |
+
Cc1cc2c(c(C)c1CCO)CC(C)(CO)C2
|
| 260 |
+
C=C(C(=O)OC1C2C(CC(C)C3C=CC(=O)C31C)OC(=O)C2C)C(C)O
|
| 261 |
+
CC=CC#CC#CCCCOC(=O)CC(C)C
|
| 262 |
+
COc1cc(C=Cc2ccc3c(c2)OCO3)oc(=O)c1
|
| 263 |
+
CC(=CC(=O)c1ccccc1)NCCCC(=O)O
|
| 264 |
+
COc1cccc(C2=CCN(C)CC2)c1
|
| 265 |
+
COc1ccc(CN2CC3CN(Cc4ccccn4)CCN3C2=O)cc1
|
| 266 |
+
CC1CC23C4=CCCN2CCCC3C(=O)CC4C1O
|
| 267 |
+
Cc1cc2c(cc1Br)C1(C)CCC(C)(O2)C1C
|
| 268 |
+
CN(CCc1ccccc1)C1C(CNC(=O)c2cccc(F)c2)OC(CO)C1O
|
| 269 |
+
CC[C]1[CH][CH][CH][NH+](CC(=O)[C]2[CH][CH][CH][CH][CH]2)[CH]1
|
| 270 |
+
CCCC=CCOC(=O)CCCCCCC
|
| 271 |
+
CNC(=S)N(O)c1ccccc1
|
| 272 |
+
CC1=CCc2c(cc(CCc3ccc(O)cc3)c(C(=O)O)c2O)OC1
|
| 273 |
+
CC(=CCC(Br)C(C)=CC(Cl)Cl)CBr
|
| 274 |
+
O=C(O)c1ccccc1C1c2ccc(O)cc2Oc2cc(O)ccc21
|
| 275 |
+
CC(=O)CCC(=O)c1ccoc1
|
| 276 |
+
CN1CCC(c2c[nH]c3ccc(-n4cnnc4)cc23)CC1
|
| 277 |
+
C=CC=CC=CCC
|
| 278 |
+
O=C(O)c1cc(C(=O)O)c([NH+]([O-])O)cc1[NH+]([O-])O
|
| 279 |
+
Cc1nnn(C2CCN(Cc3ccccc3)CC2)c1-c1ccccc1
|
| 280 |
+
O=c1[nH]c2ccccc2o1
|
| 281 |
+
CC(=O)OC1CC2(C)C3CC1C(C)(C)C2(O)CCC3C
|
| 282 |
+
Cc1ccc(-c2cc(=O)c3cc(Cl)cc(Cl)c3o2)cc1
|
| 283 |
+
NC(=O)Nc1ccc(C2=NNC(=O)CC2)cc1
|
| 284 |
+
CC(C)(C)C1(C)CCNC1=O
|
| 285 |
+
C=C1CCC2OC2(C)CCC2C(C)CC12
|
| 286 |
+
CN(Cc1ccccc1)C(=O)CC1C(O)CCC2C(C)(CO)C(O)CCC12C
|
| 287 |
+
COc1cc(C=CC(=O)O)c(O)cc1O
|
| 288 |
+
CC(C)=CCCC(C)C1CC=C(C)C2CCC(C)=CC21
|
| 289 |
+
CC1(C)C2C=CCC1C2
|
| 290 |
+
O=C(O)C(=Cc1ccccc1[NH+]([O-])O)c1ccccc1
|
| 291 |
+
[O-][NH+](O)c1cccc(NC2OC(CO)C(O)C2O)c1
|
| 292 |
+
O=C(NCC1OC(CO)C(O)C1N1CCc2ccccc2C1)c1ccccc1
|
| 293 |
+
CCCCCCC=CCCCCC=CCCCCCC(=O)O
|
| 294 |
+
CCCCC12CN3CC(C)(CN(C1)C3C(O)C(O)C(O)CO)C2=O
|
| 295 |
+
NC(=O)OCC(N)C(=O)O
|
| 296 |
+
Cc1ccn(C(N)=O)n1
|
| 297 |
+
CC1(C)C=C(n2ccccc2=O)c2cc(C#N)ccc2O1
|
| 298 |
+
CC(=CC(=O)O)C=C(C)CC(C)CCCCC(O)CCO
|
| 299 |
+
COC(=O)C1CCC=C(C)C(=O)C2CC(C)(C)CC12
|
| 300 |
+
COC(=O)c1cocc2c(C)ccc1-2
|
| 301 |
+
CC(=O)CCC1=C(C)CCC(C)C1(C)C
|
| 302 |
+
CC(O)c1c(-c2ccc(F)cc2)noc1C(=O)NC1CCCCC1
|
| 303 |
+
CCn1c(O)c(C(=O)NCCN(C)C)c(=O)c2ccccc21
|
| 304 |
+
C=C(C)C1CCC2(C)OC3=C(CC12)C(=O)C1(O)COC3C1
|
| 305 |
+
CCCCCCCCCCCCCCCCCCCCCC(=O)NC(CO)C(O)CCCCCCCCCCCCCCC
|
| 306 |
+
COc1ccc(CCc2cc(=O)c3cc(OC)c(O)cc3o2)cc1O
|
| 307 |
+
CCCCCCC(=O)CC(=O)NC1CCOC1=O
|
| 308 |
+
COc1ccc(CCN(C)C)c2cc(-c3ccc(O)cc3)oc12
|
| 309 |
+
CC[C@]1(CCCc2ccccc2)CN(c2ccc(OC)cc2)C1=O
|
| 310 |
+
CC=C(C)CC1C(=O)Nc2ccccc21
|
| 311 |
+
C=CC1(C)CCC2(C)C(=C)C1CCC2C
|
| 312 |
+
COc1cc(O)ccc1C=CC(=O)c1ccc(O)cc1
|
| 313 |
+
CC1=C2CC3C(C)(O)C4OC4C(O)C3(C)C=C2OC1=O
|
| 314 |
+
OC(c1ccccc1)c1ccc(Br)cc1
|
| 315 |
+
COc1ccc(OCC(=O)NCc2c3c(c(OC)c4c2OCO4)CN(C)CC3)cc1
|
| 316 |
+
CC1=C(c2ccc(O)cc2)C(O)OC1=O
|
| 317 |
+
CCCCCC=CCC=CCCCCCCCC(C)=O
|
| 318 |
+
OCc1cc(Br)c(O)c(O)c1Br
|
| 319 |
+
Fc1ccc(-c2c[nH]c(C3COCCN3Cc3ccccc3)n2)cc1
|
| 320 |
+
CC=CC=CCCC(=O)CC(O)CCO
|
| 321 |
+
COc1ccccc1C(=O)Oc1ccc(Cl)cc1C(=O)C=Cc1cccs1
|
| 322 |
+
COC1CC(NC(C)=O)C(OC(C)=O)C(C)O1
|
| 323 |
+
CC(C)(C)C1CC(O)C(C(O)C=Cc2ccccc2)C(=O)O1
|
| 324 |
+
COc1cc2c(cc1O)CCN(C)C2Cc1ccc(O)cc1
|
| 325 |
+
CN1C(=O)CCC1(O)c1cccnc1
|
| 326 |
+
C=c1oc(=O)cc(OC)c1=CC(O)CC
|
| 327 |
+
CC(C)=CC1CC(C)C2(CC=C(C)CC2)O1
|
| 328 |
+
O=CC=Cc1ccccc1
|
| 329 |
+
CCCCCCCC(CCc1ccc(O)c(OC)c1)OC(C)=O
|
| 330 |
+
O=C(NCCc1ccccc1)c1c(O)c2cccc3c2n(c1=O)CC3
|
| 331 |
+
CC=C(CO)C(=O)OCC1=CCN2CCC(OC(=O)C=C(C)CO)C12
|
| 332 |
+
Nc1ccc(C(=O)c2ccc(N)c(N)c2)cc1N
|
| 333 |
+
CCCCCC(O)CC(=O)CCc1ccc(OC)c(OC)c1
|
| 334 |
+
COc1ccc(C2=C(c3ccc(OC)cc3)OCCC2)cc1
|
| 335 |
+
CCCCC1(CCCC)C(=O)NC(=Nc2ccc(OC)cc2)NC1=O
|
| 336 |
+
O=C(NCCOC(=O)N(O)c1ccc(Br)cc1)Oc1ccc(Cl)cc1Cl
|
| 337 |
+
[O-][NH+]1C=CC=CC1c1ccccn1
|
| 338 |
+
CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCC
|
| 339 |
+
COc1ccc2c(C)c(CC(=O)O)c(=O)oc2c1C
|
| 340 |
+
Cc1cc2ocnc2nn1
|
| 341 |
+
CNCCCCCNCCCCCNC
|
| 342 |
+
CC(C)CCCCCCCCCC(C)C(=O)OCC(O)CO
|
| 343 |
+
CC(=O)C1C(c2ccc3c(c2)OCO3)C=CC2CCCCC21
|
| 344 |
+
Fc1ccccc1-c1nnc(C2CCN(Cc3cccs3)C2)o1
|
| 345 |
+
O=C(NCCCNC(=O)c1ccc2c(O)c(O)ccc2c1)c1ccc2c(O)c(O)ccc2c1
|
| 346 |
+
O=C(CSc1nc2ccccc2s1)NCC1CCCN(CCc2ccccc2)C1
|
| 347 |
+
CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
|
| 348 |
+
CC(C)CCCCCCCCCCC1(O)C=CC2N1CCC[NH+]2CCCCCNC(=N)N
|
| 349 |
+
CC(NC(=O)Cc1ccccc1)C(=O)NC(C(=O)O)C(C)C
|
| 350 |
+
O=C(O)C=CC(=O)Nc1ccc(C=Cc2ccccc2)cc1
|
| 351 |
+
COc1cc(C2COc3cc(O)ccc3C2)ccc1O
|
| 352 |
+
CN[C@@H](C)[C@H](O)c1ccc(O)c(O)c1
|
| 353 |
+
CCC(c1ccc(O)cc1)C(CC)c1cc(I)c(O)c(I)c1
|
| 354 |
+
C=NN(C)C=C(C)N
|
| 355 |
+
CN1CCC(c2nc3ccccc3s2)C1
|
| 356 |
+
Cc1ccc(-c2c[nH]c(C3COCCN3C)n2)cc1
|
| 357 |
+
C=C(CCC1CCCC(=O)C1(C)C)CC(O)C=C(C)CC(=O)NCCc1ccccc1
|
| 358 |
+
COc1ccc(S(=O)(=O)C(CCC2CCCCC2)CC(=O)NO)cc1
|
| 359 |
+
COc1ccc(CCNC(=O)Cc2ccc3c(c2)OCC3)cc1OC
|
| 360 |
+
CC1(C)C[C]2[NH2+][C](N)[C](C(=O)[O-])[CH][C]2CO1
|
| 361 |
+
Cc1cc(=O)c2c(O)cc3c(c2o1)C=CC(C)(CO)O3
|
| 362 |
+
CC(O)c1cnccc1C(=O)c1nccc2c1[nH]c1ccccc12
|
| 363 |
+
CCC(C)(CC)C(C)C
|
| 364 |
+
COc1cc2c(c(O)c1C(C)O)C(=O)CC(O)C2O
|
| 365 |
+
Cc1cc(Br)c2c(C)ccc(C(C)C)cc1-2
|
| 366 |
+
C=C(C)C(O)COc1ccc2c(OC)c3ccoc3nc2c1OC
|
| 367 |
+
C#CC(C)N(C)C(=O)Nc1ccc(Cl)cc1
|
| 368 |
+
CCCCCCCCCCCC(=O)c1cc(C(=O)CCCCCCCCCCC)cc(C(=O)CCCCCCCCCCC)c1
|
| 369 |
+
C=C1CCC=C(CO)C(O)CC2C1CC2(C)COC(C)=O
|
| 370 |
+
O=C1CN2Cc3cc(N4CCCC4)ccc3N=C2N1
|
| 371 |
+
COc1cc2c(cc1OC)CCN(C)C(C(=O)c1cccc(N)c1)=C2
|
| 372 |
+
O=C(COC(=O)c1ccc(Br)cc1)c1cccc([NH+]([O-])O)c1
|
| 373 |
+
CN(C)C(=O)Oc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
|
| 374 |
+
CC(C)c1cc(C=O)cc(C(C)C)c1O
|
| 375 |
+
COC1=CC(=O)c2cc[nH]c2C1=O
|
| 376 |
+
CC(=O)OC(C)C(C=C1C=C(C)C2=NCCC3OC123)OC(=O)C(C)C
|
| 377 |
+
CC(C)C1=CC2CC1C1CCCC2C1O
|
| 378 |
+
CC(C)COP(C)(=S)OCC1CCCN2CCCCC12
|
| 379 |
+
CCOC(C)OO
|
| 380 |
+
CC(C)C(N)CC(=O)O
|
| 381 |
+
C=CC(=C)CCCC(C)CCCC(C)CCCC(C)C
|
| 382 |
+
CC1C(O)c2cocc2C(O)C2CC(C)(C)CC12
|
| 383 |
+
Nc1c(CC(=O)[O-])cccc1C(=O)c1ccc(Cl)cc1Cl.[Na+]
|
| 384 |
+
CC12CCC(N)CC1=CCC1C2CCC2(C)C1CCC2C(O)[SH](=O)=O
|
| 385 |
+
CC(C)=CCCC(C)(N)C1(C)CC=C(C)CC1
|
| 386 |
+
CC(=O)CCCCCCCCCCCCCCC=Cc1ccc2c(c1)OCO2
|
| 387 |
+
CCCC=CCC1(O)C2=C3C(CCC3N=C(N)N2)CC1C
|
| 388 |
+
CC(NC(=O)C(N)CC(O)C(=O)O)C(O)C12CC1CC=CC2=O
|
| 389 |
+
CC(=CC(=O)OC1C(C)=CC=C(C=O)C1(C)C)CO
|
| 390 |
+
CCCCCCCCCC(C)OC(C)=O
|
| 391 |
+
NC(=O)CCC(=O)O
|
| 392 |
+
CC1CCC2(C)C(CCC(O)C2(C)O)C12COC(=O)C2
|
| 393 |
+
O=C(NCCCN1CCC(Oc2ccccc2)CC1)C(c1ccccc1)c1ccccc1
|
| 394 |
+
CC1Oc2c(c(=O)[nH]c3ccccc23)C1(C)C
|
| 395 |
+
COc1ccc2oc3cc(O)c(O)c(O)c3c(=O)c2c1OC
|
| 396 |
+
OC(CNc1ccccc1I)CON=C(C1CC1)C1CC1
|
| 397 |
+
O=C1c2ccccc2C(=O)c2cc(CO)c(O)cc21
|
| 398 |
+
CC1=CCCC(C)=CC(C)(C)C=CCC1
|
| 399 |
+
CCCCCC=CCC=CCC=CCCCCC(=O)OC(CO)COC(=O)CCCCCCCC=CCC=CCCCCC
|
| 400 |
+
COc1ccc2cc(C(C)C(=O)OCC(=O)O)ccc2c1
|
| 401 |
+
Cc1c(CCC(=O)O)c(=O)oc2cc(OCC(=O)O)ccc12
|
| 402 |
+
C=C(C)C1CCC2=CC(OC2=O)C2=C(C)CC(O)(C2=O)C(C)C(=O)C1O
|
| 403 |
+
CCCCC(COCCOCCOCCO)CC(=O)O
|
| 404 |
+
C=CC1(C)CC(OC(C)=O)C2C(=C)C(=O)OC2C1C(=C)C
|
| 405 |
+
CCCCCCCCCCCCCC(=O)NC(COC1OC(CO)C(O)C(O)C1O)C(O)C=CCCC=C(C)CCCCCCCCC
|
| 406 |
+
Cn1c(N)nc2c3nccnc3ccc21
|
| 407 |
+
Cc1ccc2c(c1O)C(=O)c1c(O)cc(O)cc1C2=O
|
| 408 |
+
COC(=O)CC(c1ccsc1)c1oc2ccccc2c(=O)c1O
|
| 409 |
+
CC1(C)CCCC2(C)C(CO)C(CO)=CCC12
|
| 410 |
+
COC1=CC(=O)c2c(O)cc(CC(C)=O)c(O)c2C1=O
|
| 411 |
+
CC(O)C(OC1OC(C(=O)O)C(O)C(O)C1O)C(O)C(O)C=O
|
| 412 |
+
CC(=O)OCC12CCC1(OC(C)=O)C1C=C(C)C2CC(C)(C)C1
|
| 413 |
+
CCN(CC)c1ccc2c(COC(C)=O)cc(=O)oc2c1
|
| 414 |
+
CC(=O)C12OC1C1(C)C(=CC2=O)CCC(O)C1C
|
| 415 |
+
Cc1ccc2c(n1)CC(C(=O)CO)CCC2C
|
| 416 |
+
CC(C)(C)OC(=O)NCCNC(=O)C(N)CCCCNCCC(=O)NCCS
|
| 417 |
+
Brc1c(OCCCN2CCCCC2c2cccnc2)ccc2ccccc12
|
| 418 |
+
COc1ccccc1CNCCCCCCCCCCNCCSSCCNCCCCCCCCCCNCc1ccccc1OC
|
| 419 |
+
Oc1ccc(Oc2ccc(Br)cc2Br)cc1Br
|
| 420 |
+
CCCN(CCC)[C@@H]1CCc2cccc(C(=O)c3ccccc3)c2[C@@H]1C
|
| 421 |
+
CCC=CCC=CCC=CCC=CCC=CCCCC(=O)OC(CO)CO
|
| 422 |
+
CC(C)C(=O)OCC(=O)c1cc2c(cc1O)OC(C)(C)C=C2
|
| 423 |
+
CC1COC2=C1C(=O)C(=O)c1c2ccc2c1C(O)CCC2(C)C
|
| 424 |
+
CC(O)c1cc(OCCCC(C)(C)C(=O)O)ccc1OCCCC(C)(C)C(=O)O
|
| 425 |
+
Cc1ccc(C(C)C)c2c1CCC(C)(O)C2O
|
| 426 |
+
CN(C)c1ccc2nc(N)oc2c1
|
| 427 |
+
COC(=O)c1ccc(NC(=O)N2CCc3nc[nH]c3C2c2ccccn2)cc1
|
| 428 |
+
O=C(OCC1OC(Oc2ccc(CO)cc2O)C(O)C(O)C1O)c1ccccc1
|
| 429 |
+
C=C1C(=O)OC2C1C(OC(=O)C=C(C)C)C(O)C1(C)C(O)CCC(=C)C21
|
| 430 |
+
CC(CCc1ccccc1)NC(=O)CC1NC(=O)c2ccccc2NC1=O
|
| 431 |
+
CCCCC(N)CCCCCCC(C=CC(=O)O)Nc1ccc[nH]1
|
| 432 |
+
COc1ccccc1C(=O)NCC1COCc2nc3cc(C)ccc3n21
|
| 433 |
+
Cc1cc2ncoc2nn1
|
| 434 |
+
C[C@@H]1CCC[C@@H]2C[C@@H](NC(=O)c3cc(Cl)ccc3O)CCN21
|
| 435 |
+
C=C1CCC2C(C=C(C=O)CCC1O)C2(C)CO
|
| 436 |
+
CC(C)CCCCCCCCCCCC=CC(=O)NC=Cc1ccc(OC2OC(C)C(O)C(O)C2C)cc1
|
| 437 |
+
COc1ccc2c(c1)C1CNC2Cc2ccc(Cl)cc21
|
| 438 |
+
C=CC(C)(C)C1C(=O)C=C(c2ccccc2)OC1=O
|
| 439 |
+
Nc1ncnc2c1ncn2NC(=O)COCP(=O)(O)O
|
| 440 |
+
O=c1c(-c2ccc(O)c(O)c2)coc2cc(O)cc(O)c12
|
| 441 |
+
CCCC(=CCCC=CCCCCOC(C)=O)CCC
|
| 442 |
+
COC(=O)c1cc(C)c[nH]1
|
| 443 |
+
COc1c(C(C)=O)ccc2c1C=CC(C)(C)O2
|
| 444 |
+
CCCCCCCCCCCCCCCCCC(=O)OC(C)C(=O)OC(C)C(=O)O
|
| 445 |
+
CC#CC#CC#CC=CC=CC(CCO)OC(C)=O
|
| 446 |
+
C=CCC(CC=C)(CC=C)c1ccc(CCCCCCCCC)cc1O
|
| 447 |
+
O=C(NCC1OCC(NCc2cccs2)C1O)c1ccccc1
|
| 448 |
+
COc1cc(NC(C)CCCN)c2ncccc2c1Oc1ccc(Cl)cc1
|
| 449 |
+
C[C]1[CH][CH][C](N)[NH+](C[C]2[CH][CH][C](F)[CH][CH]2)[CH]1
|
| 450 |
+
C#CC(C)(C#C)c1ccccc1
|
| 451 |
+
NC(Cc1cc(O)c(O)cc1O)C(=O)O
|
| 452 |
+
COc1ccc(NC(=O)OC2COC3C(NC(=S)NC4CCCCC4)COC23)cc1
|
| 453 |
+
Cc1ccc(O)c(-c2cc3cc(C(=N)N)ccc3[nH]2)c1
|
| 454 |
+
CC(C)CC(N)C(=O)NC(CCC(N)=O)C(=O)N1CCCC1C(=O)O
|
| 455 |
+
CCCCCCCCCCCCCCCCNc1ccc(C(=O)NC(=O)c2ccccc2)cc1
|
| 456 |
+
COc1ccc(C=CC(=O)NC(=S)N2CCCCC2c2cccnc2)cc1
|
| 457 |
+
CC(=O)OC1CCC2(NC(=O)c3ccccc3)CSC1C2OC(C)=O
|
| 458 |
+
CNc1ccc2ncnc(Nc3cccc(Br)c3)c2n1
|
| 459 |
+
O=C(O)COc1cc(OCC(=O)O)c2c3c(c(=O)oc2c1)CCC3
|
| 460 |
+
ClCC1Cc2ccccc2CN1
|
| 461 |
+
COC(=O)CCC1C(=C(C)C)CCC(C)C1(C)Cc1c[nH]c2ccccc12
|
| 462 |
+
CCc1c(C)nc2ccc(OC)cc2c1O
|
| 463 |
+
C#CC=CC(O)C(C)O
|
| 464 |
+
CCCCCCCCCCCCCCCCCCCCCOCC(O)CO
|
| 465 |
+
COC(=O)CC(c1ccc(O)cc1)c1cc2c(cc1O)OCO2
|
| 466 |
+
O=C(CCn1cnc2ccccc2c1=O)N1CC2CC(C1)C1CCCC(=O)N1C2
|
| 467 |
+
O=S(=O)(c1ccc(F)cc1)N1CCCCC1c1cccnc1
|
| 468 |
+
CCOc1ccc(C=CC(=O)c2cc(NC(C)=O)ccc2O)cc1
|
| 469 |
+
CC(=O)OC1CCN2CC(N)CCC12
|
| 470 |
+
Cn1c(C(C#N)=NNc2cccc(Cl)c2)nc2ccccc21
|
| 471 |
+
O=C(O)C1CCCNC1
|
| 472 |
+
COc1c(O)cc2c3c1-c1cc(O)ccc1CC3NC2=O
|
| 473 |
+
O=C(CBr)Nc1ccc(Oc2ccc(C(=O)O)cc2)cc1
|
| 474 |
+
COc1cc2c(cc1O)CNCC2c1ccc(O)c(C)c1
|
| 475 |
+
CC(=O)Nc1ccc2c(c1)nc1n2C(CNCc2ccc(C(=O)O)cc2)COC1
|
| 476 |
+
C=C1OC(=O)c2c(O)cc(OC)c(C)c2C1(C)O
|
| 477 |
+
CC(NCc1c(O)ccc2c3c(c(=O)oc12)CCCC3)C(=O)O
|
| 478 |
+
COCc1cn(C2COC3C(NC(=O)NC4CCCCC4)COC32)nn1
|
| 479 |
+
CCCCCCCCCCCCCCCCCCCCCCCC(C)=CCCOC(=O)CCCC(C)O
|
| 480 |
+
C=CC1(C)CCC2(C)C3=CC(=O)CC(C)(CO)C3CCC2C1
|
| 481 |
+
NC(CSCCS(=O)(=O)O)C(=O)O
|
| 482 |
+
COC(=O)c1ncn2c(=O)n(CCCl)nnc12
|
| 483 |
+
CC(=O)OCC(C)C#CC(CO)=C1C=CC(=O)O1
|
| 484 |
+
N#Cc1nc(C(=O)O)c(O)cc1-c1cccs1
|
| 485 |
+
Cc1nc2c(=O)n(C)c(=O)n(C)c2[nH]c1=O
|
| 486 |
+
Fc1ccc(CN2CCOCC(Oc3cccnc3)C2)cc1
|
| 487 |
+
C1CNCCSCCSCCN1
|
| 488 |
+
CNC(=O)C(Cc1ccccc1)N1CCCC1C(=O)NCCOC
|
| 489 |
+
Nc1ccc(NC(=O)OCCCc2c[nH]cn2)cc1
|
| 490 |
+
O=c1cccc2n1CC1C=NCC2C1
|
| 491 |
+
Cc1ccc2c(CCl)cc(=O)oc2c1
|
| 492 |
+
c1ccc(CCN2CCc3ccccc3C2)cc1
|
| 493 |
+
CCCCCC=CC1C=CCCC1C(C)=O
|
| 494 |
+
CC(C)(C)C(=O)Oc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
|
| 495 |
+
O=C(O)C(CCCCn1cnc2c1NC=NCC2O)Cc1cccc(Br)c1
|
| 496 |
+
COc1ccc(C=CC(=O)C2(C)CO2)cc1OC
|
| 497 |
+
CCCCCC=CCC=CCC=CCC=CCC=CCCC(=O)O
|
| 498 |
+
CCCSSCc1ccco1
|
| 499 |
+
N#Cc1ccc(OS(N)(=O)=O)cc1
|
| 500 |
+
COc1cccc(CN2CCC(c3nc4ccccc4[nH]3)C2)c1
|
| 501 |
+
CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)C(O)CCC3(C)C
|
| 502 |
+
CC(=O)OC1(C(C)=O)CCC2C3C=CC4=CC(=O)CCC4(C)C3CCC21C
|
| 503 |
+
CC(=O)OC1C=CC(O)C2(CC(=O)OC2C=C(C)CCC=C(C)C)C1
|
| 504 |
+
O=C(Cc1c[nH]c2ccccc12)OC1C(O)C(O)C(O)C(O)C1O
|
| 505 |
+
COC(=O)c1c[nH]c(=O)c(C(CC(N)=O)c2cccnc2)c1O
|
| 506 |
+
CCCCCCCCCC(C)CCCC
|
| 507 |
+
O=C(C=Cc1ccccc1)c1cc2occc2cc1O
|
| 508 |
+
CC(C)Oc1ccc(CNCCC(c2ccco2)C(C)C)cc1
|
| 509 |
+
CC(C)=CCc1c(OCC(O)C(C)(C)O)ccc2ccc(=O)oc12
|
| 510 |
+
COC(=O)c1cnn2c1NC1=C(C(=O)CCC1)C2c1ccc(OC)cc1
|
| 511 |
+
C=CCC=CCC=CCCCCCCCc1cccc(O)c1
|
| 512 |
+
CC(=O)N1CC2CN(CC(C)C)CCN2C(C)(CO)C1
|
| 513 |
+
CC1OC(C)OC(C)O1
|
| 514 |
+
COC(=O)C1CC(NC(=O)c2ccc(OC)cc2)CN1C(C)=O
|
| 515 |
+
O=C(O)c1cc2ccccc2c(O)n1
|
| 516 |
+
CCC(C)OC(=O)NC(CC)(C(F)(F)F)C(F)(F)F
|
| 517 |
+
COC1=CC(=C(c2ccccc2)c2ccccc2)C=CC1=O
|
| 518 |
+
O=C(NC=CNC(=O)c1ccccc1)c1ccccc1
|
| 519 |
+
C=CC(C=Cc1ccc(O)cc1)c1ccc(O)c(OC)c1
|
| 520 |
+
O=c1cc(CCc2ccccc2)oc2c1C(O)C(O)C(O)C2O
|
| 521 |
+
O=C1/C(=C/c2ccccc2)Cc2ccccc21
|
| 522 |
+
C=CC1CN(C(C)=O)CCC1CCCc1ccnc2ccccc12
|
| 523 |
+
C=CC(C)(C)c1ccc2c(c1)CCC(C)(C)O2
|
| 524 |
+
O=c1nccc2[nH][nH]cc1-2
|
| 525 |
+
Cn1cc(C2CC(=O)Oc3c2c(=O)oc2ccccc32)cn1
|
| 526 |
+
Cc1oc2c(C)c3oc(=O)c(CCC(=O)O)c(C)c3cc2c1C
|
| 527 |
+
CN1CC2CN(C(=O)N3CCOCC3)CCN2C(C)(C)C1
|
| 528 |
+
CCCCCC=CC=CC(=O)NCC(C)CC
|
| 529 |
+
O=C(O)CCCCCCCCCCCCC1CCCC1
|
| 530 |
+
CC12OC(=O)C1(C(O)C1C=CCCC1)NC(=O)C2CCCl
|
| 531 |
+
CCCCCCCCCCCCCCCCCCNC(=O)OCC1(COC(=O)N(Cc2cccc[n+]2CC)C(C)=O)CCCCCC1.[I-]
|
| 532 |
+
CC(C)=CCCC(C)(O)C1C(O)CC(C)(O)C1C
|
| 533 |
+
COc1cccc(NC(=O)NC2COC(CN3CCOCC3)C2O)c1
|
| 534 |
+
CCCNC(=O)NC1CC(COC)C(O)C1O
|
| 535 |
+
O=C(O)C1CC(O)C=N1
|
| 536 |
+
CCCCCC(=O)N1CCC(CC(=O)O)C(Cc2nc3ccc(C)cc3[nH]2)C1
|
| 537 |
+
O=C(C1CCCCC1)N1CCC(c2nnc(-c3cccnc3)o2)C1
|
| 538 |
+
COc1cc(O)c(C(C)=O)c2c1CC(C(C)(C)O)O2
|
| 539 |
+
CC(=O)c1c(C)cccc1O
|
| 540 |
+
C=CC(C)(C)C(=O)C(=O)CC(=O)c1ccccc1
|
| 541 |
+
CCOC(=O)C1=CCCN(C)C1
|
| 542 |
+
COc1ccc(-c2n[nH]cc2C(=O)NCC2CCCN3CCCCC23)cc1
|
| 543 |
+
CC(=O)OC1CC(O)C23C(=O)C(C)C1C2(C)C(C)CCC3O
|
| 544 |
+
C=C(C)C1CCC1C(=C)C
|
| 545 |
+
CCOC(=O)CN1C(=O)C2CCCCN2C(=O)c2ccccc21
|
| 546 |
+
CC(C)(C)OC(=O)N(CCCCCOCc1ccccc1)OCc1ccccc1
|
| 547 |
+
C=C1OC(=O)C(=CCCCCCCCCCCCCCCCC)C1O
|
| 548 |
+
CCCc1cc(C(=O)O)n(C)n1
|
| 549 |
+
C=CC=CCCCCCCCC#CC#CCO
|
| 550 |
+
CCc1cc(C)cc(C)n1
|
| 551 |
+
COc1ccc(C=C2C(=O)OC(=O)c3ccccc32)cc1
|
| 552 |
+
CC1OC(n2ccc(NC(=O)Cc3ccccc3)nc2=O)CCC1O
|
| 553 |
+
CC1=CCc2oc3ccc(C)cc3c2-c2oc(C)cc21
|
| 554 |
+
COC(=O)C(C)Oc1ccc2cc(-c3ccc(OC)cc3OC)c(=O)oc2c1
|
| 555 |
+
COC1(OC)CCC2(C)C(CCC3C4CCC(O)C4(C)CCC32)C1
|
| 556 |
+
O=C1C(=CCCO)C2OCC=CC2=C1O
|
| 557 |
+
CCCCCCCCC/C=C/CC/C=C/[C@@H](O)[C@H](CO)NC(=O)CCCCCCCCCCCCCCC
|
| 558 |
+
C=C1C=CC(C(C)C)CC1O
|
| 559 |
+
O=C(C1CCCC1)N1CCOCC(Oc2cnccn2)C1
|
| 560 |
+
CC1=CC2CC3(C=CC(=O)O3)C(C)(C)C2CC1
|
| 561 |
+
CCOC(=O)C=C(Br)Br
|
| 562 |
+
CCc1c(OC)cc2c(c1O)C(=O)C=CC2=O
|
| 563 |
+
COC(=O)C1(O)CC(O)C(OC(=O)C=Cc2ccc(O)cc2)C(O)C1
|
| 564 |
+
O=C(c1ccco1)c1coc2ccc(O)c(CN3CCCCC3)c12
|
| 565 |
+
CCCN(CCC)[C@@H]1Cc2cccc(O)c2C[C@H]1C
|
| 566 |
+
Nc1c(C=O)cc(C=O)c(N)c1N=O
|
| 567 |
+
CC1CC2CC(=O)C3CCCN4CCC(O)C2C34C1
|
| 568 |
+
Cc1cc2c(c3c1OCO3)C(C)CCC2C(C)CC(=O)CC(C)C
|
| 569 |
+
CC(C)(C)c1cc(C(=O)c2cccs2)cc(C(C)(C)C)c1O
|
| 570 |
+
CN(Cc1cccc(O)c1)C(=O)c1cc[nH]n1
|
| 571 |
+
O=c1c(-c2cc(O)cc(O)c2)coc2cc(O)cc(O)c12
|
| 572 |
+
Oc1cc2cnnc-2c[nH]1
|
| 573 |
+
CC=CC=CC=CCCC=CC=CC(=O)NCC(C)(C)O
|
| 574 |
+
C=C1C(=O)OC2CC3C(=CC12O)CCC1C(C)(C)CCCC31C
|
| 575 |
+
CO[C]1[CH][CH][C](C(O)[C]2[NH2+][CH][CH]N2C)[CH][CH]1
|
| 576 |
+
COc1ccc2c(=O)cc(-c3cc(OC)c(OC)cc3OC)oc2c1
|
| 577 |
+
COCCNC(=O)C1CN2CCC1CC2Cn1cc(CN(C)C)nn1
|
| 578 |
+
CC(C(=O)OC1C(O)C2CC(O)CC1N2C)C(O)c1ccccc1
|
| 579 |
+
COC1=Nc2ccc(NC(=O)CCCCCCC(=O)NO)cc2C(C)(C)C1
|
| 580 |
+
Cc1cnoc2ncnc1-2
|
| 581 |
+
CCOC(=O)c1cc(C)n(-c2ccccn2)n1
|
| 582 |
+
CC(C)[C@@H](NC(=O)[C@H](CO)NC(=O)CCC[C@H](N)C(=O)O)C(=O)O
|
| 583 |
+
Clc1ccc(CNCCC(c2ccccc2)c2ccc3c(c2)OCO3)cc1
|
| 584 |
+
CC1=CC(=O)C(C)(C2(C)CC(CO)=CC2=O)CC1
|
| 585 |
+
CCC(C)CN=C(O)C=CCCCCC=Cc1ccc2c(c1)OCO2
|
| 586 |
+
CC(=O)C1CC2(O)C3(C)COC2(C)CC1(O)C3
|
| 587 |
+
c1ccc(OC(CC2CNC2)c2ccccc2)cc1
|
| 588 |
+
Clc1ccc(-c2cn3cc(I)ccc3n2)cc1
|
| 589 |
+
CC1(C)CN(S(C)(=O)=O)CC2CN(C3COC3)CCN21
|
| 590 |
+
O=C1OC(O)C2C(CO)=CCC2C1CO
|
| 591 |
+
C=C1OC(=O)C2(C(C=CC=CCCC)C=CC(O)C2O)C1O
|
| 592 |
+
COc1cccc(-n2c(C)nc3ccc(OCC(C)=O)cc3c2=O)c1
|
| 593 |
+
NC(CCC(=O)O)C(=O)O
|
| 594 |
+
CCCCCCCCCCCCCCCCCCCCCCCC(O)CCCCO
|
| 595 |
+
C=C(C)C1=CCC2(C)C(OC(=O)C(C)=CC)CCC(C)(O)C2C1
|
| 596 |
+
CCCc1c(C)c2cc3c4c(c2oc1=O)CCCN4CCC3
|
| 597 |
+
CCCCc1oc(CCc2ccc(O)c(OC3CCCC3)c2)cc1CO
|
| 598 |
+
NS(=O)(=O)c1cccnc1[N+](=O)[O-]
|
| 599 |
+
CCC=CC#CCCCCCCCCCCOC(C)=O
|
| 600 |
+
CCCCCCCCC(C)CCC
|
| 601 |
+
COc1cc(O)cc2c1-c1ccc(O)cc1CC2
|
| 602 |
+
O=S1(=O)Cc2ncc3ccccc3c2C1
|
| 603 |
+
Cn1sc(=O)n(-c2ccc(F)cc2)c1=O
|
| 604 |
+
CC(=O)NC1=C2CCCN3CCCC4C(C1)CC(C)CC243
|
| 605 |
+
COC(=O)C1(C(C)OC)CC(C)C(C)(OC(C)=O)C(=O)O1
|
| 606 |
+
N=Nn1cc2cccc-2o1
|
| 607 |
+
Cc1nc(C)c(-c2ccnc(N)n2)s1
|
| 608 |
+
CC(=CC=CC=O)CO
|
| 609 |
+
CCCCCCCCOC(=O)c1ccccc1C(=O)OC
|
| 610 |
+
CC(=O)OC1CCC(C)(CCC=C(C)C)C2CC=C3COC(O)C3C12C
|
| 611 |
+
CCCOCCN(C(=O)CCl)c1c(CC)cccc1CC
|
| 612 |
+
O=C(C=Cc1ccc(O)cc1)CCCCc1ccc(O)cc1
|
| 613 |
+
Nc1nc(NCC(O)CO)c(Cl)nc1[N+](=O)[O-]
|
| 614 |
+
CC1CC(C)C2c3c(ccn(O)c3=O)OC(C)(C1)C2C
|
| 615 |
+
c1cc(-c2c[nH]c(C3COCCN3C3CCC3)n2)ccn1
|
| 616 |
+
O=C(CC1NC(=O)c2ccccc2NC1=O)NCc1ccco1
|
| 617 |
+
COc1ccc(CCNC(=O)NC(C(=O)O)C(C)C)cc1
|
| 618 |
+
C=CCCC(=O)C=CC1C(C)=CCCC1(C)C
|
| 619 |
+
CC1=C2C(=O)C=C(C(=O)O)C2C2OC(=O)C(C)C2CC1
|
| 620 |
+
CC(O)CCc1ccc(C(=O)O)nc1
|
| 621 |
+
CC(=CCCC(C)=CCC=C(C)C1CC=C(C)CC1)CO
|
| 622 |
+
COc1cc(CC(C)=O)c2c(=O)cc(C(=O)CCO)oc2c1
|
| 623 |
+
C=CCNC(=O)CC1CCN(C(=O)c2ccccc2)CC1CC
|
| 624 |
+
CC(=O)OCc1cc(O)c2c(c1)C(=O)c1cc(O)cc(O)c1C2=O
|
| 625 |
+
COc1cc(CCC(O)CCCCc2ccccc2)ccc1O
|
| 626 |
+
CC1=C(CC2C(C=O)=C(C)C3CC32)C(=O)OC1=O
|
| 627 |
+
C[S+](C)(=O)CCO
|
| 628 |
+
CC(C#N)C(C)OC1OC(CO)C(O)C(O)C1O
|
| 629 |
+
CCCCCC1(O)C(=O)C(C)(C)C(=O)C(C)(C)C1O
|
| 630 |
+
CC(NC(=O)C1CC1)c1onc(-c2ccc(F)cc2)c1C(=O)O
|
| 631 |
+
CC1(C)SC2C(NC(=O)C3(N)CCCCC3)C(=O)N2C1C(=O)O
|
| 632 |
+
Cn1c(=O)c2c(O)cc(=O)oc2c2ccccc21
|
| 633 |
+
COc1ccc(CNC2(Cc3cc(CC(C)C)on3)COC2)cc1
|
| 634 |
+
N=c1cccc2oncn12
|
| 635 |
+
C=C(C)C1Cc2nc(N)nc(C)c2C1
|
| 636 |
+
C=CCN1C(=O)C(C(=O)Nc2ccccc2)C2CC1(C)Oc1ccccc12
|
| 637 |
+
COc1c(OC)c(OC(C)=O)c2cc(C)ccc2c1OC(C)=O
|
| 638 |
+
Cn1c(=O)c2nc(O)[nH]c2n(C)c1=O
|
| 639 |
+
Cc1cccc2c1CCc1cc(C(C)C)ccc1-2
|
| 640 |
+
CCCCCCc1c(C)c2cc3c(C(C)(C)C)coc3c(C)c2oc1=O
|
| 641 |
+
Nc1cccc(NC(=S)Nc2cccc(N)c2)c1
|
| 642 |
+
C#CC[C@@H](N)C(=O)O
|
| 643 |
+
O=C(NC1CCCCC1)OC1COC2C(NC(=S)Nc3ccc(F)cc3)COC12
|
| 644 |
+
c1ccc2[nH]nnc2c1
|
| 645 |
+
COc1c2ccccc2nc2oc(C(C)(O)COC(=O)c3ccccc3)cc12
|
| 646 |
+
Cc1cc(C)c(CC(=O)c2cc3ccccc3o2)c(C)c1
|
| 647 |
+
CN1CCc2cn(C)c3c2C1CC(=O)C3=O
|
| 648 |
+
CC(=CCCc1ccoc1)CCC=C(C)CCCc1ccoc1
|
| 649 |
+
CN1CCC23c4c5ccc(O)c4OC2C(=O)CCC3C1C5
|
| 650 |
+
O=C1c2ccccc2C(=O)N1N1C(=O)c2ccccc2C1=O
|
| 651 |
+
CC(=O)c1cnc(C)cn1
|
| 652 |
+
CC(=O)OC1C=C2COC(O)C2(O)C2(C)CCC(O)C(C)(C)C12
|
| 653 |
+
CCOc1ccc2c(C)c(-c3cccc(Cl)c3)c(=O)oc2c1
|
| 654 |
+
CC1=C(Cn2c3ccc(Br)c(=O)c-3nc3ccccc32)C(O)CC(C)(C)C1
|
| 655 |
+
Cc1ccccc1CC(C)C=O
|
| 656 |
+
COc1cccc(CNC2CC(COc3cccc(C)n3)C(O)C2O)c1
|
| 657 |
+
CC(C)COC(=O)Cc1cc(O)cc2c1C(=O)CC(CC(C)O)O2
|
| 658 |
+
CCCCCCCCCCCCS(=O)(=O)N(C)[C@@H]1CCN2CCc3ccccc3[C@@H]2C1
|
| 659 |
+
COc1cccc(CN2CC(F)C(OCc3nc4ccncc4[nH]3)C2)c1
|
| 660 |
+
OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO
|
| 661 |
+
O=C(Nc1ccc(Cl)cc1Cl)C1CC2CC3CCC1C2C3
|
| 662 |
+
CCCC(C)OC(=O)C(C)CCC
|
| 663 |
+
COc1ccc(CCN2C(=O)NC(CC(=O)NCCO)C2=O)cc1
|
| 664 |
+
CCOC(=O)c1c(-c2ccc(OC)cc2)oc2ccc(OC)cc12
|
| 665 |
+
ON=Cc1ccc(O)c(O)c1
|
| 666 |
+
CC1=CCCC(C)(O)C=CC(C(C)C)CC1
|
| 667 |
+
COC(C)(CO)C1CC=C(/C=N/O)CC1
|
| 668 |
+
CC(C)CCC(CCNCc1ccc(OC(C)C)cc1)C1CCOC(C)(C)C1
|
| 669 |
+
CCCC=Cc1cc2c(O)ccc(O)c2c(=O)o1
|
| 670 |
+
CC1CC(=O)Oc2cc(O)ccc21
|
| 671 |
+
CC(=O)NCC1OC(CC(=O)NCCN2CCOCC2)C(O)C1O
|
| 672 |
+
CCOC(=O)Cc1c(C)c2ccc(OC)c(OC)c2oc1=O
|
| 673 |
+
CC(=O)c1cc(CC2CNCCC2CC(=O)N(C)Cc2ccccc2)no1
|
| 674 |
+
CC12CCC3(CC1Cl)C(=CC(=O)CC3(C)C)C(=O)O2
|
| 675 |
+
O=C(O)C1OC(OCC(Cl)(Cl)Cl)C(O)C(O)C1O
|
| 676 |
+
CC1C=CCCC=CC=CCC=CCCC(=O)O1
|
| 677 |
+
COC(=O)c1ccccc1NC(=O)Cc1ccccc1C(=O)O
|
| 678 |
+
C=CCCCCC=CC#CC#CC=CC#CC=O
|
| 679 |
+
Nc1cnn(-c2ccccc2)c(=O)c1Cl
|
| 680 |
+
CC=C(C)C(O)C(C)C=C(C)C=CCC(C)=CCC(=O)NC(C)CC(=O)O
|
| 681 |
+
CCCN(CCC)C(=O)C1CC(=O)OC12CCOC(C)(C)C2
|
| 682 |
+
Cc1ccc(C(=O)c2cc(N)ccc2N2CCC(C)CC2)c(C)c1
|
| 683 |
+
O=C(NC(=O)C(F)(F)F)Nc1ccc(I)cc1
|
| 684 |
+
CCCCCC=CCC=CCCCCCCCC(=O)OC(COCCCCCCCCCCCCCCCCCCCC)COP(=O)(O)OCCN
|
| 685 |
+
C=CC1(O)CC(NC=O)=CC1=O
|
| 686 |
+
COc1ccc(-c2nnc(-c3nsc4ccccc34)o2)cc1
|
| 687 |
+
CCc1cccc(C=O)c1
|
| 688 |
+
c1ccc(Cc2nnc(C3CCN(C4CCC4)C3)o2)cc1
|
| 689 |
+
Cn1c(=O)ccc2c(NC/C=C/C#CC(C)(C)C)cccc21
|
| 690 |
+
CCCCCCCCCC(=O)CC(=O)NC1CCOC1=O
|
| 691 |
+
Cc1nc(N)ncc1CNC(=O)Cn1c(C)cnc(NCCc2ccccc2)c1=O
|
| 692 |
+
CNC1=NC(=Cc2c[nH]c3ccccc23)C(=O)N1C
|
| 693 |
+
COCCOc1ncccc1-c1noc(C2CCCN2C)n1
|
| 694 |
+
NCCCCOc1ccccc1CCc1ccccc1
|
| 695 |
+
CN(CCc1ccccc1)c1cnc2nc(N)nc(N)c2c1
|
| 696 |
+
C=CC(C)=CCC1(C)C(C(=O)O)=CCCC1C
|
| 697 |
+
CN(C)c1nc(N)nc2c1ncn2CC(=O)O
|
| 698 |
+
CCCCCC(O)CC(=O)CCCCCC(O)CCO
|
| 699 |
+
COc1cccc2c(=O)c(=O)c12
|
| 700 |
+
O=C(O)Cc1ccccc1Sc1c(Cl)c(Cl)cc(Cl)c1Cl
|
| 701 |
+
CCCCCCCCCCCCOc1cc(C(N)=O)cc(C(N)=O)c1
|
| 702 |
+
CC(C)CC(=O)OCC1(CO)CC(=C(C(C)C)C(C)C)C(=O)O1
|
| 703 |
+
O=C1CCC2=C1C1=CCCOC1OC2
|
| 704 |
+
CCNC(=O)NC1CC(Cc2cc(C(C)(C)C)on2)C1(C)C
|
| 705 |
+
Brc1ccccc1Nc1nc2ccccc2n2ccnc12
|
| 706 |
+
C=CC1(CO)CCC(C(C)(C)O)CC1C(=C)C
|
| 707 |
+
COc1ccccc1S
|
| 708 |
+
CC1[C@H]2Cc3ccc(NC=O)cc3[C@]1(C)CCN2CC1CC1
|
| 709 |
+
CN1CCSCCN(C)CCSCC1
|
| 710 |
+
Cc1c2c(cc3c1C(=O)CC3)C(=O)OCC2
|
| 711 |
+
COc1ccc(C2(C(=O)NC(CO)C(C)C)CCOCC2)cc1
|
| 712 |
+
CC1CCC23CC(=CCCC2C1(C)CCc1ccoc1)C(=O)O3
|
| 713 |
+
CC(=NNC(N)=S)C(=S)Nc1ccccc1
|
| 714 |
+
C=CCCCCCCCO
|
| 715 |
+
CC1CC(=O)C(CC(=O)O)C1C[NH+]([O-])O
|
| 716 |
+
CC1Oc2ccccc2C=C1C=O
|
| 717 |
+
COC(=O)C(Cc1ccc(O)cc1)NC(=O)c1ccc(OCC(C)C)cc1
|
| 718 |
+
O=C1N=C(c2c[nH]c3ccccc23)C(=S)N1
|
| 719 |
+
Cc1cccc2cc[nH]c12
|
| 720 |
+
CC1(O)CCC2C(C=CC3(C)OCCC(O)C23C)C1
|
| 721 |
+
CCN(CCCCCCO)C1CCc2cc(OC)ccc2C1
|
| 722 |
+
CC1(C)CC(C(=O)N2CCc3[nH]c4ccc(Cl)cc4c3C2)CCO1
|
| 723 |
+
CCCC(NC(=O)c1ccccc1)c1nc2ccccc2[nH]1
|
| 724 |
+
CC(NCc1c(O)ccc2c3c(c(=O)oc12)CCC3)C(=O)O
|
| 725 |
+
CCCCCCCCCCc1ccccc1S(=O)(=O)O
|
| 726 |
+
CCCCCC(O)C=CC1C(O)CC(=O)C1CC(=O)CCCCC(=O)O
|
| 727 |
+
CCCC(=O)C1C(=O)OCC1CO
|
| 728 |
+
NC(=O)C(c1ccccc1)(c1ccc(F)cc1)c1ccccc1F
|
| 729 |
+
CSC=CC(=O)N(C)CCc1ccccc1
|
| 730 |
+
CC(C)C(C)C(C)(C)C
|
| 731 |
+
CN(C/C=C/c1ccccc1)Cc1ccc2c(c1)OCCO2
|
| 732 |
+
CCCCC=C(C)C=CC1=C(C)C(=O)CCC1(C)C(=O)O
|
| 733 |
+
O=C(NCC1OCC(NCc2ncc[nH]2)C1O)C1CCCC1
|
| 734 |
+
CCCCCC=CCCCCCCCCCCCCCCCC(O)C(O)C(CO)NC(=O)C(O)CCCCCCCCCCCCCC
|
| 735 |
+
COCCNC1C2OCC(O2)C(NCc2ccccn2)C1O
|
| 736 |
+
CC(N)C(O)C=Cc1ccccc1
|
| 737 |
+
CC(=O)NC(CCCC#CC=CCl)CCCC=CC=C(Cl)Cl
|
| 738 |
+
O=C(CCC(=O)OCC(F)(F)F)NC(=S)Nc1cc(Cl)ccc1Cl
|
| 739 |
+
COc1cc2c(cc1O)C1Cc3cccc(O)c3CN1CC2
|
| 740 |
+
CCCCCCCCCCCCCCCCC(C)CO
|
| 741 |
+
COc1ccc2c(c1)CCc1ccc(OC)c(OC)c1-2
|
| 742 |
+
CC(C)CC1NC(C)SC(C)S1
|
| 743 |
+
C=C1CCCC2(C)CCC3(O)OC12C(O)C3(C)CCC(=O)C(C)C
|
| 744 |
+
COc1cc2c(c(O)c1CC=C(C)CCC=C(C)C)C(=O)N(CCc1ccccc1)C2
|
| 745 |
+
CC1C=CCC2(C)C=CC(C(C)(C)O)CC12
|
| 746 |
+
COc1cccc2c1C(=O)C(O)=CC2=O
|
| 747 |
+
COC(=O)C(=CNCCc1c[nH]c2ccccc12)[NH+]([O-])O
|
| 748 |
+
N#CC(=NNc1ccc(Cl)cc1)c1nc2ccccc2s1
|
| 749 |
+
O=C(O)CCC(=O)NC1OCC=C1CO
|
| 750 |
+
CC1CC(O)CC(C)(C)C1CCC(O)CO
|
| 751 |
+
C=C1CCC2C(C=C(C)C(O)CC1O)OC(=O)C2C
|
| 752 |
+
CC1(C)Cc2cc(C(=O)c3ccccc3)ccc2OC1=O
|
| 753 |
+
CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCCO
|
| 754 |
+
O=C(NCCCc1ccccc1)[C@@H]1CCCN1S(=O)(=O)Cc1ccccc1
|
| 755 |
+
C=CC[C@@H](CC/C(C)=C/C=C/CCCO)OC
|
| 756 |
+
COc1ccc(NC(C)=O)cc1
|
| 757 |
+
CC1=CC(O)C(C(C)C)C(OC(=O)C=Cc2ccc(O)cc2)CC(C)=CCC1
|
| 758 |
+
COc1cc(OCC(O)CO)ccc1O
|
| 759 |
+
CC(=CC1CCC(C)C2C1=C(C)CC2O)C(=O)O
|
| 760 |
+
CC1=CCC(C(C)C)c2cc(C)ccc21
|
| 761 |
+
CC1=CC(=O)C(=C(O)C=Cc2ccccc2)C1=O
|
| 762 |
+
CC1CCC2C(C)(C)CCCC23Oc2ccccc2CC13C
|
| 763 |
+
CC(=O)NCCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CNC(=O)CCC(=O)N(O)CCC(F)CN
|
| 764 |
+
CCC=CCC=CCC=CCC=CCC=CCC=CC(=O)O
|
| 765 |
+
O=C(O)Cn1c(=O)c2ccc(F)cc2n(Cc2ccc(Br)cc2F)c1=O
|
| 766 |
+
CC(=O)OCc1ccc(C(C)(O)COC(=O)C(C)C)c(O)c1
|
| 767 |
+
CC(C)CC=CC1(C)OCC23CCC4C(CCC5CC(=O)C=CC54C)C2CCC13
|
| 768 |
+
COCCC(=O)N1Cc2c(ncn2Cc2ccccc2)CC1C(=O)OC
|
| 769 |
+
CNC(=O)C(C)(C)N1CCCC1C(=O)N1CCCCCC1
|
| 770 |
+
CC(C=CC1=C(C)CCCC1(C)C)=COC1C=C(C)C(=O)O1
|
| 771 |
+
CNCC(C)(O)c1ccccc1
|
| 772 |
+
CCCCCCCC=CC(=O)NCCc1ccccc1
|
| 773 |
+
COC(=O)CCCCCCc1ccc(OCCOCCO)cc1
|
| 774 |
+
COc1ccc(-c2cc(=O)c3c(O)c(C)c(O)c(C)c3o2)cc1
|
| 775 |
+
COc1cccc2c1CCCC2CCCCN1CCN(C2=NCCCC2)CC1
|
| 776 |
+
CC(C)NCC(O)c1cc(O)cc(O)c1
|
| 777 |
+
CC(=O)CC1OC(C)C(C)c2c(C)c(O)cc(O)c21
|
| 778 |
+
CC(CCc1ccccc1)NCC(O)CNC(C)C1COc2ccccc2O1
|
| 779 |
+
CCCCOC(=O)CC1(O)C(=O)OC1C(=O)OCCCC
|
| 780 |
+
CC(C)CC12NC(=O)C3(O)C(=O)CC(Cc4ccccc4)(OC13)O2
|
| 781 |
+
COc1c(-c2ccccc2)oc2cc3occc3cc2c1=O
|
| 782 |
+
C#CC=CCC(OC(C)=O)C1CC=CCC(Br)C(CC)C1
|
| 783 |
+
COC(=O)C(CCSC)NC(=O)c1cc(C(C)C)nc2ccccc12
|
| 784 |
+
C=C(OC1C=C(C(=O)OC)C(O)C(O)C1O)C(=O)OC
|
| 785 |
+
O=C(O)C1CCN2CC(O)CC12
|
| 786 |
+
CN(CC#CCCC1SCCCS1)Cc1cccc2ccccc12
|
| 787 |
+
CCCCCCCCCCC1CCCCC1
|
| 788 |
+
Cc1ccc(N)c(S(=O)(=O)O)c1
|
| 789 |
+
O=C(Oc1ccc2c(=O)c(-c3ccccc3)coc2c1)c1ccco1
|
| 790 |
+
COc1ccc(NC(=O)N2CCc3c([nH]c4ccccc34)C2)cc1
|
| 791 |
+
COc1ccc(C(=O)NC2COC3C(OC(=O)Nc4ccccc4)COC23)cc1
|
| 792 |
+
Nc1ccc(C(=O)OCCCOC(=O)c2ccc(N)cc2)cc1
|
| 793 |
+
CC(=O)C12OC1(C)CC1C3CC=C4CC(O)CCC4(C)C3CCC12C
|
| 794 |
+
CC(=O)NC(CCCC#CC=CCl)CCCC#CC=C(Cl)Cl
|
| 795 |
+
CC(C(O)c1ccccc1)N(C)CCO
|
| 796 |
+
CCCCCC(O)CCCCCC(=O)O
|
| 797 |
+
CC=C(C)C(=O)OC1CCC2CC3OC(=O)C(C)=C3C3OCC1C23C
|
| 798 |
+
C=C1CCC2CCC3C(O)=NC(=CC(C)CC(O)C1)C3(O)C2(C)C
|
| 799 |
+
COC1OC(OC2C(O)C(O)C(O)C(O)C2O)C(O)C(O)C1O
|
| 800 |
+
CC(CCC(C=O)(OO)C(C)C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
|
| 801 |
+
Cc1ccc(C)c(NC2=CC(=O)CC(C)(C)C2)c1C#N
|
| 802 |
+
CC1(CO)CCCC23COC4OCC(=CCC12)C43
|
| 803 |
+
CC1=CCCC2(C)OC2C(O)C(C(C)C)C(=O)C1
|
| 804 |
+
CC(C)(Oc1cccc(CCCCCCCCO)c1)C(=O)O
|
| 805 |
+
CC#Cc1ccc(-c2ccc(C(=O)C(O)C(C)=O)s2)s1
|
| 806 |
+
CC(N)C(=O)N1CCCC1C(=O)OCc1ccccc1
|
| 807 |
+
COC(=O)c1ccccc1N1C(=O)c2ccccc2C1=O
|
| 808 |
+
CC1(C)CCc2cc(CC(=O)Cc3ccc(O)cc3)ccc2O1
|
| 809 |
+
O=C(O)CC(O)(Cc1ccccc1)Cc1ccccc1
|
| 810 |
+
CC(=O)C=CC1C(C)=CC(O)CC1(C)C
|
| 811 |
+
C=C(C)C(O)Cc1c(O)ccc(C(=O)C=Cc2ccc(O)c(CC=C(C)C)c2)c1O
|
| 812 |
+
CCCCC1C(C)CCC2C1(C)CCC1C(C)(C)CCCC12C
|
| 813 |
+
O=C(C=Cc1ccccc1)c1cc2cc(Br)ccc2oc1=O
|
| 814 |
+
O=C(O)CCCNC(=O)OCC1c2ccccc2-c2ccccc21
|
| 815 |
+
CCC(C)CCC1C(COC2OC(C(O)CO)C(O)C2O)CCC2C(C)(C)CCCC12C
|
| 816 |
+
NC1=CC(=O)c2ncccc2C1=O
|
| 817 |
+
C=CC(C)(O)CCC1(C)C2=CCCC(C)(C)C2CCC1C
|
| 818 |
+
O=C(O)CN1C(=O)[C@@H](NC(=O)[C@@H](CS)Cc2ccccc2)CCc2ccccc21
|
| 819 |
+
CCC(C)C(O)C(=O)O
|
| 820 |
+
CN(C)Cc1cn(CC2CC3CCN2CC3C(=O)N(C)C)nn1
|
| 821 |
+
C=CC(C)=CCC1C(=C)CCC2C(C)(C(=O)O)CC(O)CC12C
|
| 822 |
+
O=C(O)c1ccc(COc2ccc3ccc(=O)oc3c2)o1
|
| 823 |
+
CNC(=S)N(C)CCc1cc2c(c(OC)c1C=NO)OCO2
|
| 824 |
+
O=C1CC(O)Cc2cc(O)cc(O)c21
|
| 825 |
+
OCC(NCC1NCC(O)C1O)c1ccccc1
|
| 826 |
+
CC(C)(O)C=CC1=CC(O)C(O)C(O)C1O
|
| 827 |
+
CC(C)(C)NC(=O)CCl
|
| 828 |
+
COc1ccc(Cn2cccc2/C=C/C(=O)CC(=O)C(=O)O)cc1
|
| 829 |
+
CC(=O)Oc1cc(C)c(O)cc1CC=C(C)CCCC(C)=CC(=O)CC(C)C
|
| 830 |
+
CC(=O)c1cc(O)c2c(c1)OC(c1ccc(O)cc1)CC2=O
|
| 831 |
+
CCCCCCCCCCCCCCCNC(=O)C1CCCCC1
|
| 832 |
+
COc1ccc(CCNC(=O)Nc2cccc3[nH]ccc23)cc1OC
|
| 833 |
+
CCN(CC)C1C2OCC(O2)C(NCc2ccc(CO)o2)C1O
|
| 834 |
+
OC(CCC(c1ccc(F)cn1)N1CCNCC1)c1ccc(F)cc1
|
| 835 |
+
C=C(CCC=C(C)CO)C1CCC(C)(O)CC1
|
| 836 |
+
CC1OC(=O)C(O)C1OC(=O)C=Cc1ccc(O)c(O)c1
|
| 837 |
+
CCCCC1Cc2ccccc21
|
| 838 |
+
Cn1ccnc1CN1CC(F)C(OCc2nc3ccncc3[nH]2)C1
|
| 839 |
+
CC1=CCCC(C(=O)O)=CCc2cc(ccc2O)OC(C)(C)C(O)CC1
|
| 840 |
+
Cc1cc2c(c(CO)c1CCCl)CC(C)(C)C2O
|
| 841 |
+
OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCO
|
| 842 |
+
C=Cc1cc2c(cn1)C(=O)OCC2
|
| 843 |
+
C=CCNCc1ccccc1
|
| 844 |
+
CCC(C)C(=O)OC1C(O)C2CC(O)CC1N2C
|
| 845 |
+
CC(=O)OC1CCC2(C)C3CCc4ccoc4C3(C)CCC2C1(C)C
|
| 846 |
+
CC(C)(O)C1Cc2c(ccc(C(=O)CCc3ccccc3)c2O)O1
|
| 847 |
+
O=CCc1ccc(O)c(O)c1
|
| 848 |
+
CCCCCCCCCCCCC(Br)CBr
|
| 849 |
+
COC(=O)COc1cccc(OCCNCC(O)COc2ccccc2F)c1
|
| 850 |
+
O=c1cc2cc[nH]n2cn1
|
| 851 |
+
NCCC[Se](=O)O
|
| 852 |
+
O=C(Nc1cccnc1)N1CCc2ccccc21
|
| 853 |
+
O=C(O)CCNC(=O)CCn1ccc2c(Br)cccc21
|
| 854 |
+
CCCCCC(=O)CC(O)CCc1ccc(OC)c(OC)c1
|
| 855 |
+
CC(C)=CCCC(C)=C1C=C2C(C)CCCC2(C)CC1
|
| 856 |
+
COc1ccc2c(c1)C(=O)c1c-2ncc(OC)c1C
|
| 857 |
+
CC(C)(C)CN1CCC(c2nnc(-c3ccccc3)o2)C1
|
| 858 |
+
O=C1c2c(O)cc(O)cc2OC(c2cc(O)ccc2O)C1O
|
| 859 |
+
OCc1cc(O)c(O)c(Br)c1
|
| 860 |
+
CC=C1C(=O)CC2C3CC=C4C(O)C(O)CCC4C3CCC12C
|
| 861 |
+
COC(=O)Cc1nc2cc3ccccc3cc2[nH]c1=O
|
| 862 |
+
CC(=O)Oc1ccc2oc(=O)c(-c3ccccc3)c(C)c2c1
|
| 863 |
+
CCCCCC=CCC(O)C=CC(O)C1CC1C1CCCCCC(=O)O1
|
| 864 |
+
Cc1cc2c(C(C)C)cc(O)c3c2c(c1O)OC3=O
|
| 865 |
+
CCCCCC(O)CCCC(=O)OCC(O)CO
|
| 866 |
+
CCCCCCCC/C=C\C/C=C\C=C\SCCCC(=O)O
|
| 867 |
+
CC1=CCC(C(C)C)C2C=C(CO)CCC12
|
| 868 |
+
CC(C)(O)CCc1c(-c2ccc(O)cc2O)oc2cc(O)cc(O)c2c1=O
|
| 869 |
+
O=c1c2cccc(O)c2nc2n1-c1ccccc1C(O)=NC2
|
| 870 |
+
Cc1cccc(-c2nn3c(-c4ccco4)nnc3s2)c1
|
| 871 |
+
COc1ccc2[nH]c3c(c2c1)CN(C(=O)CCC(=O)NCc1ccccc1)CC3
|
| 872 |
+
CCCNC(=O)N1CC2NS(=O)(=O)c3ccccc3OC2C1
|
| 873 |
+
COc1ccc(-c2ccc3cccc4c3c2C=C(O)C4=O)cc1
|
| 874 |
+
O=C(O)CCNC(=O)C(=Cc1ccc(O)cc1)NC(=O)c1ccccc1
|
| 875 |
+
COc1cc(C=CC(=O)NCCCCN=C(N)N)cc(OC)c1O
|
| 876 |
+
C=C1CCC(=O)C2(C)CCC(O)(C(C)C)CC12O
|
| 877 |
+
CCCCCCCCC(=O)CC(=O)NC1CCOC1=O
|
| 878 |
+
CC1c2[nH]c3ccccc3c2C(=O)C(O)C1C
|
| 879 |
+
CSCC(C=O)=Cc1ccccc1
|
| 880 |
+
C=C1CC(O)C=C(C)CCC(C(C)C)C(O)C1OO
|
| 881 |
+
COc1cc(CCC(=O)CCCCc2cccc(O)c2)ccc1O
|
| 882 |
+
CC1=C2CC34OC3(C)CC3OC34C(C)CC2OC1=O
|
| 883 |
+
CCCCCCSc1cc(C(N)=O)cc(SCCCCC)n1
|
| 884 |
+
CC(NC(=O)Cn1ccc2c(Br)cccc21)C(=O)O
|
| 885 |
+
COC(=O)c1ccc(OC)cc1O
|
| 886 |
+
COC(C#N)C(Oc1ccccc1)c1ccccc1
|
| 887 |
+
CC(C)NC(=O)NC1CCN(C(=O)CC(C)(C)C)C1C(=O)N(C)CC(N)=O
|
| 888 |
+
CC1=CC2C(=C(C)C)C(=O)CC(C)(O)C2CC1
|
| 889 |
+
c1ccc(N(Cn2nnc3ccccc32)c2ccccc2)cc1
|
| 890 |
+
CC(C)=CCc1c(CCCc2ccc(O)cc2O)cc(O)c2c1C=CC(C)(C)O2
|
| 891 |
+
COc1ccc2c3c1OC1C(=O)C[C@H](C)C4C(C2)N(C)CCC314
|
| 892 |
+
Cc1c(-c2cccc(Br)c2)c(=O)oc2ccc(Br)cc12
|
| 893 |
+
Cc1cc(C)c(CCP(=O)(O)O)c(C[C@H](N)C(=O)O)c1
|
| 894 |
+
CNCC(O)c1ccc(O)c2c1CCC(C)(C)C2
|
| 895 |
+
C=CC(C)(C)c1ccc(OC)c(C=CC(=O)c2ccc(O)cc2)c1
|
| 896 |
+
CC(=O)OCC=C(C)CCC1C(C)=CCC2C(C)(C)C(O)C(=O)CC12C
|
| 897 |
+
COC(=O)CC1CCC(=O)Oc2ccc(C(=O)O)cc2N1
|
| 898 |
+
COc1cc2c(c(O)c1OC)COC(C)C2
|
| 899 |
+
CCCCCC=CC1=C(CO)C2OC(=O)OC2C(O)C1O
|
| 900 |
+
COC(=O)CC(O)C1OC(c2ccccc2)C(O)C1O
|
| 901 |
+
CC(C)(C)c1ccc(-c2cc(CC3CNCCC3CC(=O)NC3CCCCC3)no2)cc1
|
| 902 |
+
CC1=CC=C2C(C)=CCC(C(C)C(=O)O)C=C12
|
| 903 |
+
COc1ccc2c(=O)c3ccoc3n(CCC(C)(C)O)c2c1
|
| 904 |
+
NCC(O)c1cccc2c1Cc1ccccc1-2
|
| 905 |
+
C=CCCCCCCCCCC#CCCCCC(=O)OC
|
| 906 |
+
Cc1ccc2nc(C3CCN(C)C3)[nH]c2c1
|
| 907 |
+
CC(C)=CCCC(C)C1CCC(C)(O)C2CC=C(C)C2C1O
|
| 908 |
+
COc1ccc(-c2coc3cc(O)c(O)c(O)c3c2=O)cc1O
|
| 909 |
+
CCc1nc(C)c(C)s1
|
| 910 |
+
Cc1cc(C)nc(NC(=O)c2cc(Cl)ccc2O)n1
|
| 911 |
+
CSc1cccc(N=C=S)c1
|
| 912 |
+
CCCCCC=CCC=CCC=CCCCCC(=O)OC(CO)COC(=O)CCCCCCCC=CCCCCCCCC
|
| 913 |
+
C=C1CCC2C(CN3CCOCC3)C(=O)OC2C2C1CC1OC12C
|
| 914 |
+
O=C(O)CCC(=O)C1COc2ccccc2O1
|
| 915 |
+
CC1C=C2CCC3C(C)(C(=O)O)CCCC3(C)C2CC1
|
| 916 |
+
C=C1CCN2CCCC12
|
| 917 |
+
COc1cc(C)c(OC)c2c1OCO2
|
| 918 |
+
C=CCCCC(C)C1CCC(C)=CC1=O
|
| 919 |
+
Cc1nc(NC(=O)c2ccc(C(C)(C)C)cc2)c(C)c(C)c1O
|
| 920 |
+
CC(C)c1ccc(NC(=O)OC2COC3C(NCC4CCCCC4)COC23)cc1
|
| 921 |
+
COc1ccc(CCNC(=O)NC(CC(C)C)C(=O)O)cc1OC
|
| 922 |
+
COc1ccccc1-c1nnc2n1NC(c1ccccc1)S2
|
| 923 |
+
COc1cc(C(CC(C)C)NC(C)=O)oc(=O)c1
|
| 924 |
+
C=CC1=CC(O)([NH2+][CH2-])C(=O)C1
|
| 925 |
+
CN(CC/C=C/c1ccccc1)Cc1cccc2ccccc12
|
| 926 |
+
CC(=O)C=C1CC(=O)Nc2ccccc2N1
|
| 927 |
+
C#CC1Cc2sccc2C(N)=N1
|
| 928 |
+
CC(C)C1CCC2(C)CC3OOC12C(O)C=C3C(=O)O
|
| 929 |
+
CC(=CCOc1cc2oc(=O)ccc2cc1O)CO
|
| 930 |
+
COc1ccc2[nH]cc(CCNC(=O)c3cccc4c3ccn4C)c2c1
|
| 931 |
+
CN(C)C=CC(=O)C1C(=O)CCC1=O
|
| 932 |
+
OCC#CCSc1nc2ccccc2o1
|
| 933 |
+
O=C(O)c1cc(O)c(Cc2c(O)cc(C(=O)O)cc2O)c(O)c1
|
| 934 |
+
CC(C)(O)C1CC=C(CO)CC1
|
| 935 |
+
CCC1CN(C(C)=O)CCC1CC(=O)NCc1cccc2ccccc12
|
| 936 |
+
CCC(NC(=O)OC)(C(F)(F)F)C(F)(F)F
|
| 937 |
+
Cc1nnc(NC(=O)c2cc(Br)cc(Br)c2O)s1
|
| 938 |
+
Cc1cc(=O)c2c(O)cc3oc(=O)c4cnccc4c3c2o1
|
| 939 |
+
COc1ccccc1N1CCN(CCCCNC(=O)C(C)c2ccccc2)CC1
|
| 940 |
+
Cc1cc(C)c2oc(=O)cc(O)c2c1
|
| 941 |
+
C=C1CCCC2C1(C)CCC(C)C2(C)CC1=C(O)C(=O)C=C(O)C1=O
|
| 942 |
+
CC(O)C1=Nc2c(nc(N)[nH]c2=O)NC1C
|
| 943 |
+
CNC(=O)C(C)(C)N1CCCC1C(=O)N(C)Cc1ccccc1
|
| 944 |
+
C=C1CC(O)CC2(C)CC3OC(=O)C(C)(O)C3CC12
|
| 945 |
+
CC1=C(C=O)CC2C1(C)CCC1C(C(=O)O)C(C)CCC12C
|
| 946 |
+
CC(CCc1ccc(O)cc1)NCCc1ccc(O)c(O)c1
|
| 947 |
+
O=C(O)CCCC(O)C=CC=CCCC(O)CC=CCCCCC(O)O
|
| 948 |
+
CCN(CC)CCNC(=O)c1cc(Cl)c(N)cc1OCC(OC)OC
|
| 949 |
+
CC(C)=CCOc1ccc2ccc(=O)oc2c1CC=C(C)C
|
| 950 |
+
CC(=O)C(O)Cc1ccccc1
|
| 951 |
+
O=c1cc(-c2ccccc2)oc2ccc(OCCCCCCN3CCCCC3)cc12
|
| 952 |
+
CN1C(C(=O)O)CSC1C1CSC(c2ccccc2O)=N1
|
| 953 |
+
CN1CC2(C)CN(CC(N)=O)CC(C)(C1)C2
|
| 954 |
+
COc1ccc(-c2c(O)cc3c(c2OC)OCO3)cc1
|
| 955 |
+
CC1=C(Cn2c3ccc(Br)c(=O)c-3nc3ccccc32)CCC(C)(C)C1
|
| 956 |
+
c1cc(-c2ccno2)[nH]n1
|
| 957 |
+
C=C(C)CC(C)=O
|
| 958 |
+
COC(=O)C1=CCC(C)CC1
|
| 959 |
+
CC1CC=CC2(C)C1C=C(CCCO)C1CCCCC12
|
| 960 |
+
COc1cccc(CN[C@@H](Cc2ccccc2)C(=O)OC(C)(C)C)c1
|
| 961 |
+
COc1ccc2c(=O)c(-c3ccc(Cl)cc3)coc2c1C
|
| 962 |
+
C=C1CCCC(C)(C)C1CCC(C)=CC(=O)OCC(O)CO
|
| 963 |
+
Cc1ccc(NS(=O)(=O)c2ccc(N)cc2)nn1
|
| 964 |
+
C=C1CCCC2(C)C1CCC13CC(CCC12)C(C(=O)O)C3
|
| 965 |
+
CC(N)C(O)=NC(CC(Cl)C1(O)CN=C1O)C(=O)O
|
| 966 |
+
CC1CC2OC(=O)C(C)C2C(O)C2(C)C(=O)C=CC12
|
| 967 |
+
CC(C)c1ccccc1C(C)(C)C
|
| 968 |
+
O=C(CC1OC(CNCc2ccccn2)C(O)C1O)N1CCN(c2ccccc2)CC1
|
| 969 |
+
COBN1CCC(Cl)C1COCc1ccc(C(=O)OC)s1
|
| 970 |
+
CC(=O)OCC1CC(O)C2(C)C(CO)=CCCC2C1(C)CCC(C)CCO
|
| 971 |
+
CC(O)c1c(-c2ccc(Cl)cc2)noc1C(=O)NC1CCCC1
|
| 972 |
+
O=C(O)CCCNc1ccccc1C(=O)O
|
| 973 |
+
CCC(=O)c1ccc(OC)c(OC)c1
|
| 974 |
+
CCCCCCCCC1(c2ccncc2)CCC(=O)NC1=O
|
| 975 |
+
Oc1[nH]cc2ncncc12
|
| 976 |
+
CNC(=O)[C@@H](NC(=O)[C@H](CCCc1ccccc1)CC(=O)NO)C(C)C
|
| 977 |
+
CC(=O)CC(C)=O
|
| 978 |
+
CC=CCC=CCCCCCCCCOC(C)=O
|
| 979 |
+
C[C@H]1C(=O)CCC2C1CCC1(C)C(O)CCC21
|
| 980 |
+
c1ccc2c(c1)c1[nH]ccc3cc[nH]c2c31
|
| 981 |
+
O=Cc1ccc(NCc2ccc(F)cc2)cn1
|
| 982 |
+
CCOC(=O)C1C(=O)C(=O)Nc2ccccc21
|
| 983 |
+
COCCOC(=O)c1c(C)oc2ccc(OCC=C(C)C)cc12
|
| 984 |
+
CCC=C(C)c1ccc(C(C)O)c(=O)o1
|
| 985 |
+
CC(C)C(C)CCC(C)C1CCC2C3CCC4C(C)CCCC4(C)C3CCC12C
|
| 986 |
+
O=C(O)COc1ccc2c(c1)OC(=Cc1ccccc1Br)C2=O
|
| 987 |
+
O=C(O)C(Cc1c[nH]c2ccccc12)NS(=O)(=O)c1ccccc1
|
| 988 |
+
CC(=O)N1c2ccccc2C2C1CC1CCC3C(C)(C)C(O)CCC3(C)C12C
|
| 989 |
+
CC(C)CCN1CC2Oc3ccccc3C(=O)N(C)C2C1
|
| 990 |
+
CC1(C)CCCC2(C)C1CC(O)C13C=CC(CC21)C(O)(CO)C3
|
| 991 |
+
Cc1ccc(CCNc2ccccc2)cn1
|
| 992 |
+
CC(=O)C(C)Cc1ccc2c(c1)OCO2
|
| 993 |
+
COc1ccc(OC)c(-c2cc3ccc(OC)cc3oc2=O)c1
|
| 994 |
+
COc1ccc(-c2oc3c(=O)cc(C)oc3c2CC(=O)NCCn2ccnc2)cc1
|
| 995 |
+
CCOC(=O)C(=NNc1cccc([NH+]([O-])O)c1)C(C)=O
|
| 996 |
+
N#Cc1cccc(CNC2C(c3cccnc3)CC(O)C2O)c1
|
| 997 |
+
COCCOC(=O)c1c(C)oc2ccc(OCC(N)=O)cc12
|
| 998 |
+
Nc1ncnc2[nH]cc(Br)c12
|
| 999 |
+
CC(C)(O)C(O)Cc1ccc2c(c1O)COC2
|
| 1000 |
+
Clc1ccc(-c2c(Cl)cc(Cl)c(Cl)c2Cl)cc1Cl
|
| 1001 |
+
c1ccc(Sc2ccccc2)cc1
|
| 1002 |
+
COC(=O)C(O)C(N)C(=O)O
|
| 1003 |
+
O=c1[nH]c2sccc2c(=O)n1CCN1CCN(c2ccccc2)CC1
|
| 1004 |
+
CCCCNc1nc(NC(C)(C)C)nc(NC(C)(C)C)n1
|
| 1005 |
+
CC1=C2CC3C(CC2(C)CCC1)OC(=O)C3CN1CCSCC1
|
| 1006 |
+
C=C(CCC(OO)C(=C)C)C1CC=C(C)CC1
|
| 1007 |
+
O=C(CCc1nc2cccnc2[nH]1)Nc1ccc(O)c(C(=O)O)c1
|
| 1008 |
+
Cc1cc(O)c2c(c1)Cc1cc(O)cc(O)c1OC2=O
|
| 1009 |
+
COc1ccc2c(c1)c(=O)oc1c(C)c3occ(C)c3cc12
|
| 1010 |
+
CC1=CCC2OC1CC1=CCC(C(C)C)C1(C)CC(=O)C2(C)O
|
| 1011 |
+
Nc1nc(O)c2c(n1)C(=O)C=CC2=O
|
| 1012 |
+
CCCCCCCC/C=C\C/C=C\C=C\Sc1cccc(C(=O)OC)c1
|
| 1013 |
+
O=C1CCC(=O)NCCCCCCN(O)C(=O)CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCCN1
|
| 1014 |
+
CSCCC(NC(=O)Cn1ccc2c(Br)cccc21)C(=O)O
|
| 1015 |
+
CC(C)(C)c1cc(CC2CC(NC(=O)C3CC3)C2(C)C)no1
|
| 1016 |
+
CCCOc1ccc(NCCC(=O)c2ccc3c(c2)OCO3)cc1
|
| 1017 |
+
COc1ccc2c(c1CCCC(=O)O)OC(c1ccc(O)cc1O)CC2
|
| 1018 |
+
NCCC(O)(P(=O)(O)O)P(=O)(O)O
|
| 1019 |
+
CC(C)CCCCCCCCCCC(=O)O
|
| 1020 |
+
CC=CC1=C(C=CC)C(=O)C2(C1)CC(O)C(=O)O2
|
| 1021 |
+
CC12CCC3C(C(=O)CC4C(O)CCCC43C)C1CCC2O
|
| 1022 |
+
O=C(OCC1CC2c3ccccc3C1c1ccccc12)C(Cl)Cl
|
| 1023 |
+
C=C(C)COc1ccc2c(C)c(CC(=O)NCCc3ccncc3)c(=O)oc2c1
|
| 1024 |
+
C#CC=CCCCCCCCCCCC=CCCCCCCCCCCCCC#CC#CCO
|
| 1025 |
+
CCCSSCCC
|
| 1026 |
+
COC(=O)c1ccc2c(=O)n(CC(N)=O)cnc2c1
|
| 1027 |
+
O=C(O)COc1ccc2c(c1)OC(=Cc1cccc(F)c1)C2=O
|
| 1028 |
+
COC(=O)/C=C/CNC(=O)[C@@H](CCSC)NC(C)=O
|
| 1029 |
+
CCCc1cc(-c2onc(C)c2-c2ccc(C(=O)OCC)o2)c(O)cc1OC
|
| 1030 |
+
C1=CC=CC=CC=CC=CC=CC=CC=C1
|
| 1031 |
+
C=c1ccn2c(=N)onc12
|
| 1032 |
+
Cc1cc(C)c(C)c(N)c1
|
| 1033 |
+
CC(=O)NC(Cc1ccc(F)cc1)C(=O)O
|
| 1034 |
+
CN1C(=O)NC(=Cc2ccccc2)C1=O
|
| 1035 |
+
CC(=O)c1cc2c(c(O)c1C)C(=O)C(=O)c1c(O)cccc1-2
|
| 1036 |
+
CN1CCN(C(=O)CC2CCNCC2Cc2cc(CN3CCCCC3)on2)CC1
|
| 1037 |
+
COCCOCC1(CNC(=O)c2nc3ccccc3[nH]2)CC(O)C(O)C1
|
| 1038 |
+
CC1CCC(C(C)C)C(OC(=O)c2ccccc2)C1
|
| 1039 |
+
C=C1CCC=C(C)CC2OC(CC2C(=C)C)C(C)(O)CCC1O
|
| 1040 |
+
CNC1CC(c2ccc(Cl)cc2)c2ccccc21
|
| 1041 |
+
CC(=O)OC(C)(C)C1CC=C(C)CCC=C(C)CC(O)C=C(C)CC1
|
| 1042 |
+
COC(=O)c1c[nH]c2cc(Br)ccc12
|
| 1043 |
+
C=CCCC(CC=C(C)CCOC(C)=O)C(=C)C
|
| 1044 |
+
OCC1=CCN2CC(O)C(O)C12
|
| 1045 |
+
CCCCSSC(CC)SC
|
| 1046 |
+
CC1=CC2C(C(C)O)CCC(C)(C)C2CC1
|
| 1047 |
+
Cc1c(Cl)cnc(N=C(N)N)c1Cl
|
| 1048 |
+
O=C1Nc2cc(Cl)ccc2C1=Cc1c[nH]c2ncccc12
|
| 1049 |
+
CCCCCCCCCC[S+]([O-])CCC(=O)NC(CO)(CO)CO
|
| 1050 |
+
COc1cc2c3c(cc4ccccc4c3c1O)NC2=O
|
| 1051 |
+
Cc1cc2oc(=O)cc(C)c2c(O)c1CN1CCCC1
|
| 1052 |
+
CC(=O)N[C@@H](CS)C(=O)N[C@H](C(N)=O)C(C)C
|
| 1053 |
+
CC(C)=CCCC1COC2OC(O)C3=CCC=C(C)CCC1C32
|
| 1054 |
+
C=C1CC23CCC4C(C)(C(=O)OC)CCCC4(C)C2(O)CCC1C3
|
| 1055 |
+
CC1(CO)CCCC2(C)c3ccccc3C(CO)C12
|
| 1056 |
+
CC(=O)NCCOc1cccc2c1N(C(=O)c1ccc3c(c1)OCO3)CCC2
|
| 1057 |
+
CCCCCCCCCCCCCCCCCc1ccc(O)c(O)c1
|
| 1058 |
+
CC(C)CCn1c(N(C)C)nc2c1c(=O)n(C)c(=O)n2C
|
| 1059 |
+
CCCCCC(CC(=O)CCc1ccc(O)c(OC)c1)OC
|
| 1060 |
+
CC(=O)OC1CCC2(C)C(CCC3C4CCCOC4(C)CCC32)C1
|
| 1061 |
+
OCC1CCN2CCC(O)C12
|
| 1062 |
+
CC1CCC2(OC2(C)C)C(=O)C1
|
| 1063 |
+
CC(C#N)c1ccccc1[NH+]([O-])O
|
| 1064 |
+
Cc1c(O)cc(C=Cc2ccccc2)cc1O
|
| 1065 |
+
COc1cc(=O)oc(C)c1C=CC(C)O
|
| 1066 |
+
CC(C)CCCCCCCCCCC=COCC(COP(=O)(O)OCCN)OC(=O)CCCCCCCCCCCC(C)C
|
| 1067 |
+
CCCCC(O)CCCCCCCCCCC(=O)O
|
| 1068 |
+
CCCCCCCC=CCCCCCC(=O)O
|
| 1069 |
+
COc1cc2c(c(O)c1C)C(=O)CCC2O
|
| 1070 |
+
C=C1OC2CC(C(=O)O)C=CC2NC1=O
|
| 1071 |
+
O=C(NNC(=O)c1cc([N+](=O)[O-])c[nH]1)Nc1ccccc1
|
| 1072 |
+
COC(=O)c1cccc(Nc2cc(C)nc3ccc(OC)cc23)c1
|
| 1073 |
+
COc1ccc(CCN2C(=O)NC(CC(=O)N3CCCC3)C2=O)cc1
|
| 1074 |
+
CC(C)c1cc(C(=O)N2CC[C@@H](N)C2)nn1C(C)(C)C
|
| 1075 |
+
CN(C)C(=O)Oc1ccc2cc(-c3ccc(Cl)cc3)c(=O)oc2c1
|
| 1076 |
+
CC(=O)Oc1ccc(C2Oc3cc(OC(C)=O)ccc3CC2OC(C)=O)cc1
|
| 1077 |
+
CCC(=O)OCC(C)=CCCC1(C)OC2(C)C=CC1CC2
|
| 1078 |
+
CC(=O)OC1c2c(C)coc2C(=O)C2CCCC(C)C21C
|
| 1079 |
+
CCC1CN(C)CC2C(C(=O)OC)c3c([nH]c4ccccc34)C(=O)CC12
|
| 1080 |
+
CC1(CO)CCCC2(C)C1CC(O)C13C=C(CC=O)C(CCC21)C3
|
| 1081 |
+
O=C(NC1CC2C(=O)NCC(CCC(=O)N3CCOCC3)N2C1)C1CC1
|
| 1082 |
+
Cc1cccc(OCC2(CNC(=O)NC(C)C)CC(O)C(O)C2)n1
|
| 1083 |
+
COP(=O)(O)ON1C(N)=NCC1CN(C)C
|
| 1084 |
+
CC(O)=NC(CCC(=O)O)C(=O)OP(=O)(O)O
|
| 1085 |
+
NCCCCC(=O)O
|
| 1086 |
+
CC(=O)OCC(=CCCC(=CCO)CO)CCC=C(C)C(O)CC=C(C)C
|
| 1087 |
+
Cc1ccc(C(C)(C)O)c(O)c1
|
| 1088 |
+
CC12CCC3C(CCC4CC(O)CCC43C)C1CCC2CCO
|
| 1089 |
+
c1ccc(-c2cc3ccncc3cn2)cc1
|
| 1090 |
+
CCCCCCCCCCCCCCC(O)CO
|
| 1091 |
+
NC(=O)NCCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
|
| 1092 |
+
CC(CO)CCC(=O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
|
| 1093 |
+
COc1ccc(Cn2cnc3c([nH]c4cc(OC)ccc43)c2=O)cc1
|
| 1094 |
+
C=C1CCCC2(C)CC3OC(=O)C(CNC4CCCCC4Cc4ccco4)C3CC12
|
| 1095 |
+
NC(Cc1c[nH]c2ccc(O)cc12)C(=O)O
|
| 1096 |
+
CC1=CC(=O)C2=C(CO)CCC3C(C)C(=O)OC3C12
|
| 1097 |
+
CNCCCCCN
|
| 1098 |
+
NCCCC(N)C(=O)NCC(=O)O
|
| 1099 |
+
COc1ccc(SCc2cnc3nc(N)nc(N)c3n2)cc1OC
|
| 1100 |
+
COc1cc(O)c2c(=O)c3c(C)cc(O)cc3oc2c1
|
| 1101 |
+
CC(=O)NC(CSNN=C1N=CN=C1C(N)=O)C(=O)O
|
| 1102 |
+
N=c1cc2[nH]nccc-2o1
|
| 1103 |
+
CSc1cc(=O)n2c3ccccc3c3ccnc1c32
|
| 1104 |
+
C=CC(C)=CC(C)C(=O)C=CCC1CC(=O)NC(=O)C1
|
| 1105 |
+
COc1ccc(C(=O)CCc2ccccc2)c(OC)c1CC=C(C)C
|
| 1106 |
+
CC1CCCC(C)(C)C1=NO
|
| 1107 |
+
COC1OC(=O)C2=C1C1(C)CCCC(C)(C)C1CC2O
|
| 1108 |
+
COc1ccc(C2NCCc3[nH]cnc32)cc1
|
| 1109 |
+
CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OC(CO)COC(=O)CCCCCCCCCCCC=CCCCCCCCC
|
| 1110 |
+
Cc1[nH]c2ccccc2c1CCNC(=O)c1c(F)cccc1F
|
| 1111 |
+
C=C(C(=O)OC)C1CCC2(C)C(=O)C=CC(C)(O)C2C1O
|
| 1112 |
+
CCOC(=O)CC(c1ccc2nccnc2c1)c1oc(CO)cc(=O)c1O
|
| 1113 |
+
CCOC(=O)CC1CC(C(C)C)C(CNC(=O)C(C)(C)C)C=C1C
|
| 1114 |
+
CCN(CC)C(=O)c1ccccc1
|
| 1115 |
+
CC1CCCCCCCC(=O)O1
|
| 1116 |
+
COc1ccc2nc(CC3CN(Cc4cnc[nH]4)CCC3CC(=O)O)[nH]c2c1
|
| 1117 |
+
CC(C=C1CN2CCCC2C(C)(O)C1)CO
|
| 1118 |
+
COc1ccc(C(=O)Nc2c(Cl)cncc2Cl)cc1OC
|
| 1119 |
+
CCC=CSC
|
| 1120 |
+
C=CC=CCC1=C(C)C(O)CC1=O
|
| 1121 |
+
CCOc1ccc(NC2=CC(=O)c3ncsc3C2=O)cc1
|
| 1122 |
+
CC1=CCCC2(C)OC2C(O)C(C(C)C)CC1
|
| 1123 |
+
CCCCCC(O)CC(=O)CCCC(=O)CC(O)CCCCCCCC(O)CC(=O)CCCC(=O)CC(O)CCCCC
|
| 1124 |
+
CCCOc1ccc(C(=O)Oc2cc(=O)oc3ccccc23)cc1
|
| 1125 |
+
CC(=O)OC1CC2C(C)(C)CCC(O)C2(C)C2CCC3CC12C1OC31C
|
| 1126 |
+
CCOC(=O)C(C#N)=Cc1ccc(N(C)C)cc1
|
| 1127 |
+
Cc1c[nH]c(=O)[nH]ccc(=O)n(Br)c1=O
|
| 1128 |
+
COCC(COC(C)=O)OC(C)=O
|
| 1129 |
+
CC(=O)Nc1cccc(Cl)c1Cl
|
| 1130 |
+
CC(NC(=O)c1ccccc1)c1c(-c2ccc(F)cc2)noc1C(=O)O
|
| 1131 |
+
CCCCCC=CCCCCCCC=CC=CC(=O)NCC(C)C
|
| 1132 |
+
CCCCC=CC=CC=CCCCCCCCC(=O)OCC
|
| 1133 |
+
CC(C)CC(N)C(=O)NC(CCC(=O)O)C(=O)NC(CCCCN)C(=O)O
|
| 1134 |
+
CCCCCCCCCC(=O)CC1C2=COC(CC(C)O)=CC2=CC(=O)C1(C)O
|
| 1135 |
+
c1ccc(C2=NOC(c3ccccc3)C2)cc1
|
| 1136 |
+
COc1ccccc1CCn1cnc2c([nH]c3ccccc32)c1=O
|
| 1137 |
+
Cc1coc2c1c(C)cc1oc(=O)c(CC(=O)NCCN(C)C)c(C)c12
|
| 1138 |
+
CC(=O)c1c[nH]c2ccccc12
|
| 1139 |
+
N[C@H](C(=O)O)c1ccc(C(=O)O)c(O)c1
|
| 1140 |
+
CS(=O)(=O)NCC1OCC(NC2COC2)C1O
|
| 1141 |
+
COc1ccc(C=CCC(CO)C(O)c2ccc(OC)c(OC)c2)cc1
|
| 1142 |
+
COc1ccc(CNC(=S)NCc2ccccc2)cc1
|
| 1143 |
+
COc1ccccc1-c1c(C)oc2cc(OCC(N)=O)ccc2c1=O
|
| 1144 |
+
O=C(O)Cc1ccc(O)c(O)c1
|
| 1145 |
+
CCCc1oc2c(c(=O)c1CC)C(=O)CC(C)(C)C2
|
| 1146 |
+
CC1=CCOC1=O
|
| 1147 |
+
CC(=NCCc1ccccn1)c1c(O)n(C)c2ccccc2c1=O
|
| 1148 |
+
C1CSCCOCCSCCO1
|
| 1149 |
+
Cc1cc2c(c(=O)o1)C(c1ccc3c(c1)OCCO3)CC(=O)O2
|
| 1150 |
+
COc1cc(O)c(-c2cc(=O)c3c(O)cc(O)c(OC)c3o2)cc1O
|
| 1151 |
+
COC(=O)C1C2CCC(CC1OC(=O)c1ccccc1)N2
|
| 1152 |
+
CC(C)=CCC=C(C)C(C)c1c(O)c2c(C)cccc2oc1=O
|
| 1153 |
+
OCC=CC#CC#Cc1ccccc1
|
| 1154 |
+
CC1(C)C=Cc2c(ccc(-c3cc4ccc(O)cc4o3)c2O)O1
|
| 1155 |
+
COC(=O)c1csnn1
|
| 1156 |
+
CCCCCC=CCC=CCC=CCCCCCCC(=O)NCCO
|
| 1157 |
+
Nc1c(OS(=O)(=O)O)cccc1C(=O)CC(N)C(=O)O
|
| 1158 |
+
CC(=O)OC1c2c(C)coc2C(=O)C2C(O)CCC(C)C12C
|
| 1159 |
+
COC(=O)CC(c1oc(CSc2ccc(OC)cc2)cc(=O)c1O)C(C)C
|
| 1160 |
+
O=C(Cn1ccc2cc(OCc3ccccc3)ccc21)NCC1CCCN2CCCCC12
|
| 1161 |
+
CC(C)=CCCC(C)=CCc1c(O)oc2cccc(C=O)c2c1=O
|
| 1162 |
+
C=C(CC)C(=O)O
|
| 1163 |
+
N#CCCCCC=CCC=CCCCCCCCCCCCCCCc1ccc(C=O)[nH]1
|
| 1164 |
+
CC1OC(=O)C=CC(O)C=CC(Cl)C1O
|
| 1165 |
+
O=C(O)c1c(O)c(Cl)cc(Cl)c1Cl
|
| 1166 |
+
CC(=O)C=CC12OC1(C)CC(O)C(O)C2(C)C
|
| 1167 |
+
CC1(C)Cc2ccccc2-c2nnc(-c3cccc(O)c3)n21
|
| 1168 |
+
CC(=O)SC(C)SC(C)=O
|
| 1169 |
+
COc1ccc2c(c1)nc(CC(C)(C)CC(=O)O)n2Cc1ccc(Cl)cc1
|
| 1170 |
+
CC1(C)C2CCC3(C)C(C2)C(C(=O)O)CCC13O
|
| 1171 |
+
CC(C)=CC1C(C)CCC2C(C)CCC12O
|
| 1172 |
+
Cc1ccc2c(c1)C(=O)c1c(O)cccc1C2=O
|
| 1173 |
+
CCCCCCCCCCCC(O)C(Cc1ccccc1)NC(C)=O
|
| 1174 |
+
CCOC(=O)C(=O)c1ccc(O)cc1
|
| 1175 |
+
CCCCCc1ccc(C(=O)OCC2CCCN3CCCCC23)c(=O)o1
|
| 1176 |
+
Cc1ccc(N)c(C)c1
|
| 1177 |
+
CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OC(CO)COC(=O)CCCCCC=CCC=CCC=CCC=CCCCCC
|
| 1178 |
+
CCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCC
|
| 1179 |
+
COC1(C)CCC2(CC1Br)C(C)=CC(=O)CC2(C)C
|
| 1180 |
+
O=C1C=CC2C3c4ccccc4C(c4ccccc43)C2C=C1
|
| 1181 |
+
CC=Cc1ccc(OC(=O)Cc2ccccc2)c(OC)c1
|
| 1182 |
+
O=C(Cc1ccccc1)NNc1ccccc1
|
| 1183 |
+
CN(c1ccccc1)c1nc(Cl)nc(Cl)n1
|
| 1184 |
+
O=C(NCCCOc1ccc2nc(O)ccc2c1)N(C1CCCCCCC1)[C@H]1CCCC[C@@H]1O
|
| 1185 |
+
CCCCCCC[C@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)NC
|
| 1186 |
+
CC(=O)NC(COC(C)=O)Cc1ccccc1
|
| 1187 |
+
CC(=O)Nc1ccc(OC(=O)c2ccc(Cl)cc2)c(C(C)=O)c1
|
| 1188 |
+
COc1ccc(OC)c2c1cc(C(=O)NCc1ccc3c(c1)OCO3)n2C
|
| 1189 |
+
CC1=NN(c2ccccc2)C(=O)C1=Cc1ccccc1O
|
| 1190 |
+
COC(=O)CC1C(C)=CC(=O)CC1(C)C
|
| 1191 |
+
CC1=CCCC(C)CC(=O)C2(C)CC(=O)C(C(C)C)C2CC1
|
| 1192 |
+
CCCC[NH+]1CCc2cc(OC)c(OC)cc2C1C
|
| 1193 |
+
Cc1ccc(NC=C2C(=O)OC(C)(C)OC2=O)cc1
|
| 1194 |
+
Cc1ccc2c(c1)C(=O)c1ccc(O)c(O)c1C2=O
|
| 1195 |
+
O=C(O)c1cccc(-c2ccc(F)cc2)c1
|
| 1196 |
+
O=C(NC(=O)C(F)(F)F)Nc1cc(Cl)ccc1Cl
|
| 1197 |
+
Cc1cc2c3ccccc3[nH]c2c2c[n+](C)ccc12.[I-]
|
| 1198 |
+
COc1c(C)c(O)c2c(c1C)COC2=O
|
| 1199 |
+
OC1COCCN(c2nc3ccccc3o2)C1
|
| 1200 |
+
O=C(C=Cc1ccc(Cl)cc1)c1cc(F)ccc1O
|
| 1201 |
+
CCOc1ccc(-c2cc(=O)c3cc(O)ccc3o2)cc1
|
| 1202 |
+
CN1C(=O)NC(=Cc2ccc([NH+]([O-])O)cc2)C1=O
|
| 1203 |
+
CCCCCCCC(O)C(O)CC#CC#CC(=O)CC
|
| 1204 |
+
CC(C)=C1C(=O)C=C2CCC(O)C(C)C2(C)C1O
|
| 1205 |
+
CCC(=NNC(N)=O)C1CC2(C)CCC1C2(C)C
|
| 1206 |
+
O=C1CCCCC1(O)CCO
|
| 1207 |
+
O=C(Cn1cnc2ccccc2c1=O)NC(Cc1ccc(O)cc1)C(=O)O
|
| 1208 |
+
C=C(C)C(O)COc1ccc2ccc(=O)oc2c1
|
| 1209 |
+
[O-][NH+](O)c1cc(-c2cn3ccccc3n2)ccc1I
|
| 1210 |
+
CCCCCC=CCC=CCCCCCCCC(=O)OCCCCCCCCCCCCCCCCCCCCO
|
| 1211 |
+
CCC=CCC=CCCCOS(=O)(=O)O
|
| 1212 |
+
CCCCC(C)C=C(C)C(=O)CC
|
| 1213 |
+
CCCCCCCCCCCCCCCCCCCC(=O)OCC(O)COP(=O)(O)OCCN
|
| 1214 |
+
CC(=O)C1(CCC(C)CCO)C(O)CC2C(C)(C)C(O)CCC21C
|
| 1215 |
+
CC(C)=CCCC(C)=CCOc1ccc(C=C2C(=O)N(O)C(C)C(=O)N2O)cc1
|
| 1216 |
+
C=C1C(=O)OC2C1CC=C(C)C1CC=C(C)C12
|
| 1217 |
+
CC(NC(=O)Cc1ccccc1)c1onc(-c2ccc(F)cc2)c1C(=O)O
|
| 1218 |
+
O=C(c1ccc(O)cc1)c1oc2cc(O)ccc2c1-c1ccc(O)cc1
|
| 1219 |
+
O=C1N[C@@H](Cc2ccccc2)C(O)[C@H](Cc2ccccc2)N1
|
| 1220 |
+
NCCC1=CC(C(C=O)CCCCCC(CC=CCCC(=O)O)Cc2ccc(O)cc2)CC1
|
| 1221 |
+
O=C(NNC(=O)c1ccc([NH+]([O-])O)cc1)Nc1ccccc1
|
| 1222 |
+
CCC1CN(C(=O)c2ccccc2)CCC1CC(=O)N1CCC(O)CC1
|
| 1223 |
+
c1ccc(-c2ccnc(NC3COC4C(NCC5CCCCC5)COC34)n2)cc1
|
| 1224 |
+
O=c1[nH]c(=O)n(C2CC(O)C(CO)O2)cc1F
|
| 1225 |
+
CC(O)C1CCC2C3CC=C4CC(N)CCC4(C)C3CCC12C
|
| 1226 |
+
CC1(C)COC(c2ccccc2Br)=N1
|
| 1227 |
+
Nc1c2c3c(cccc3[nH]c1=O)C(=O)c1ccccc1-2
|
| 1228 |
+
CCCCCCCCCCCCCCCCCCC(C)CCCCCCCC(C)C(C)=O
|
| 1229 |
+
CC(O)CCOC(=O)c1ccccc1C(=O)O
|
| 1230 |
+
CCC(C(=O)O)C(C)C
|
| 1231 |
+
CCCCC12CN3CC(C)(CN(C1)C3c1ccco1)C2=O
|
| 1232 |
+
O=C1C(CO)=CC(=O)c2c1cc1ccccc1c2O
|
| 1233 |
+
O=C(O)c1cc(Cc2c(O)ccc3ccccc23)ccc1O
|
| 1234 |
+
CCC(C)C1OC1(C)C1(O)C(C)=CC2CC(CO)=CCC2C1C=CC=CC=CC(=O)O
|
| 1235 |
+
O=C1c2ccc(O)cc2OCC1(O)c1ccc(O)cc1
|
| 1236 |
+
COC(=O)c1cc2ccccc2c(O)c1C(=O)OC
|
| 1237 |
+
CC(C)NC(=O)NCC1OC(CO)C(O)C1N(C)CCN(C)C
|
| 1238 |
+
CCCCCCCCCCCCC=CC(O)C(COC1OC(CO)C(O)C(O)C1O)NC(=O)CCCCCCCCCCCCCCCCCCCC
|
| 1239 |
+
COc1ccc(CNC(=O)CC2CCNCC2Cc2cc(CN(C)C)on2)cc1
|
| 1240 |
+
COc1ccc2c([nH]c3cc(O)c(C=O)cc32)c1CC=C(C)CCC=C(C)C
|
| 1241 |
+
Cc1nc2ncccc2c(=O)n1CCNC(=O)c1ccc2c(c1)OCO2
|
| 1242 |
+
C=CC(C)(O)CC(O)C=C(C)CC(O)C=C(C)C
|
| 1243 |
+
CC(=CC(O)C(=O)O)C1CC(=O)OC1(C)C
|
| 1244 |
+
C=C(C(=O)OC)C1CCC(C)(OCC)C2CC(O)C(C)=CC12
|
| 1245 |
+
CCCCCC(O)C=CC1CCC(=O)C1CCCCCCC(=O)NCCCN(C)C
|
| 1246 |
+
CS(=O)(=O)N1CC(F)C(OCc2nc3ccccc3o2)C1
|
| 1247 |
+
O=S(=O)(c1cccc([NH+]([O-])O)c1)N1CCCCC1
|
| 1248 |
+
COc1ccc2c(c1O)-c1ccc(O)cc1CC2
|
| 1249 |
+
CNCCCC(=O)c1ccc(O)nc1
|
| 1250 |
+
c1ccc2c(c1)-c1nc3ccccc3nc1-2
|
| 1251 |
+
CC(=O)OCC(=CCCC(C)=CCO)CCC=C(CCC=C(C)C)C(=O)O
|
| 1252 |
+
CC(=CC(=O)O)CCC1(C)C(C)CCC2(C)C(C)=C(C=O)CC21
|
| 1253 |
+
CCOC(=O)C1COC(=O)C2C1CCC2(C)O
|
| 1254 |
+
Cc1ccc(Br)c(N)c1
|
| 1255 |
+
CC(=O)OC1CC(C)(O)C2=CCC(C)=C2C2OC(=O)C(C)C12
|
| 1256 |
+
CCC1(C)CC(CCNCc2ccccc2)(C(C)C)CCO1
|
| 1257 |
+
COc1cccc2c3c(c(=O)n(C)c12)CC(C(C)C)O3
|
| 1258 |
+
O=C(NC1COC2C(Nc3nccc(C4CCCC4)n3)COC12)N1CCOCC1
|
| 1259 |
+
COc1cc(C2Cc3cc(O)cc(O)c3C(=O)O2)ccc1O
|
| 1260 |
+
CC=C(C)C=C(C)C(O)C(C)C(=O)CCC
|
| 1261 |
+
CCCCC=CC=CC=CCCCCCCCC(=O)OCC(O)CO
|
| 1262 |
+
CC1=CC(=O)CC(C)(C)C1CO
|
| 1263 |
+
COc1ccc(-c2[nH]nc(C)c2-c2ccc(OC)c(O)c2)c(O)c1
|
| 1264 |
+
COC(=O)C1=CC2C(O)C(C)(C)CC23C1COC(=O)C31CO1
|
| 1265 |
+
CC(CO)Cc1cc(O)c(O)c(Br)c1Br
|
| 1266 |
+
Cc1ccc(C(C)C)o1
|
| 1267 |
+
CCCCCCCC1CCCC(=O)NCCCN(O)CCCCNCCCN1
|
| 1268 |
+
CCCCC=CCCCC=CCCCCC=O
|
| 1269 |
+
O=C(CCC(O)Cc1ccccc1)OC1OC(C(=O)O)C(O)C(O)C1O
|
| 1270 |
+
O=C(O)C1C2C=CC3(CN(c4ccccc4)C(=O)C13)O2
|
| 1271 |
+
CCCCCCCCCCCCCC=CC(O)=C(O)C(=O)O
|
| 1272 |
+
CC(=CC(=O)O)CCC=C(C)CCC(=O)O
|
| 1273 |
+
CC(=O)N(CCC(Cc1ccccc1)c1ccco1)C(C)c1ccccc1
|
| 1274 |
+
O=c1ncncn1C1OC(CO)C(O)C1O
|
| 1275 |
+
COC1=CC(=O)C2=C(CCc3cccc(O)c32)C1=O
|
| 1276 |
+
Clc1ccc(-c2cn3cc(I)ccc3n2)cc1Cl
|
| 1277 |
+
CCCCCCCCCCCCCCCC(OC(C)=O)C(CO)NC(C)=O
|
| 1278 |
+
NC1COC2C(OC(=O)Nc3ccccc3)COC12
|
| 1279 |
+
CCCCCC(O)CC(CC(=O)O)OC1OC(CO)C(O)C(O)C1O
|
| 1280 |
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)NCCc1c[nH]c2ccccc12
|
| 1281 |
+
COc1ccc(-c2noc(C3CC(NCC4CC4)CN3C)n2)cc1
|
| 1282 |
+
CC=CC1=CC(O)C(O)C1=O
|
| 1283 |
+
CCCCCC(O)CC(CCc1ccc(O)c2c1CCC1CCCC1O2)OC(C)=O
|
| 1284 |
+
S=C(Nc1ccccc1)NN1c2ccccc2CCc2ccccc21
|
| 1285 |
+
CC1=CCCC2(C)OC2CCC(C)=CC2OC(=O)C(C)=C2CC1
|
| 1286 |
+
CNCCc1cc(O)c(O)c(O)c1
|
| 1287 |
+
CC1(C)CC2CC(C)(O)c3cocc3C(O)C2C1
|
| 1288 |
+
O=C(CC1(n2cccc2)CCOCC1)NCC1CCCN2CCCCC12
|
| 1289 |
+
CCCCCCC(=O)C=CC=CC(O)=NCC(C)C
|
| 1290 |
+
CCCCCCCCCCCCCCCC(=O)NCC
|
| 1291 |
+
CCCCC#CC#CC#CCCCCCCCCCC(=O)O
|
| 1292 |
+
O=C(Cl)CCc1ccccc1
|
| 1293 |
+
O=C1CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCN(O)C(=O)CCC(=O)NCCCCCN1
|
| 1294 |
+
O=C(O)C1CC(O)CN1C(=O)C1CCCN1
|
| 1295 |
+
CC(=O)OCCCC=CC=CC#Cc1cccs1
|
| 1296 |
+
COc1ccc(NCCNC(C)=O)c2c1C(=O)c1ccccc1C2=O
|
| 1297 |
+
COc1ccc(C=CCOC(=O)CCCCCCCC=CCC=CCCCCCO)cc1OC
|
| 1298 |
+
COc1cc(O)c(Cc2ccc(O)cc2)c(CCc2ccccc2O)c1
|
| 1299 |
+
COc1c2c(cc3c1C1C(OC3OC)C(O)C=C3CCN(C)C31)OCO2
|
| 1300 |
+
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)OCc1ccco1
|
| 1301 |
+
COC(COC(C)=O)C(OC)C(O)C(CO)OC(C)=O
|
| 1302 |
+
CCCCCCCCCCC=CCCC(=O)O
|
| 1303 |
+
CC(C)=CCCC(C)=CCCC1(C)OC1Cc1c[nH]c([NH+]([O-])O)c1
|
| 1304 |
+
CCCCCC(=O)CC(=O)C=Cc1cc(CCO)c(O)c(OC)c1
|
| 1305 |
+
CC(C)=CCO
|
| 1306 |
+
COc1ccccc1CCO
|
| 1307 |
+
COc1ccc(CC(C)C(C)Cc2cc(O)c(OC)c(O)c2)cc1OC
|
| 1308 |
+
CC(=O)N1C(=O)C2CCCN2C(=O)c2ccccc21
|
| 1309 |
+
CN1Cc2sc(Br)cc2C(c2ccccc2)C1
|
| 1310 |
+
CCCCC1(CCCC)C(=O)NC(=O)N(C)C1=S
|
| 1311 |
+
COc1ccc(NCC(=O)CC(c2ccccc2)C2CCOC(C)(C)C2)cc1
|
| 1312 |
+
CC(C=CC1(O)C(C)=CC(O)CC1(C)CO)=CC(=O)O
|
| 1313 |
+
COC1COCCN(Cc2cn(C)c3ccccc23)C1
|
| 1314 |
+
CSCCC(NC(=O)CCn1ccc2cc(OCc3ccccc3)ccc21)C(=O)O
|
| 1315 |
+
CCCNC1CCc2nc(N)sc2C1=O
|
| 1316 |
+
C=C(C)C1CCC2(C)CC(=O)C=C(C)C2C1
|
| 1317 |
+
COC1C=C(C)CC(=O)C2(O)C(CC2(C)C)C(CO)=C1
|
| 1318 |
+
NC1C2COC(O2)C(Sc2ccccc2)C1O
|
| 1319 |
+
CCCOC(NC(C)c1ccc(OC)c(OC)c1)OCCC
|
| 1320 |
+
COc1cc(C(C)=CC(C)=CC(=O)O)oc(=O)c1CO
|
| 1321 |
+
CCCCc1oc2ccccc2c1C(=O)c1ccc(OCCC[N+](C)(C)C)cc1
|
| 1322 |
+
C=C(C)C12C=C3CCC4C(C)(C(=O)O)CCCC4(C)C3(CC1)OO2
|
| 1323 |
+
C#CC(O)C=CCCCCCCCCCCCCC=CCCCCCCCCCCCCCCC(O)C#CC#CC#CCO
|
| 1324 |
+
CCCCCCCCCOc1ccc(C=Nc2ccc(CCCC)cc2)cc1
|
| 1325 |
+
COc1cc(O)c2c(-c3cc(O)ccc3O)cc(=O)oc2c1
|
| 1326 |
+
O=C(NCCNC(=O)c1c[nH]c2ccccc12)c1c[nH]c2ccccc12
|
| 1327 |
+
CC(C)=CCc1c(O)c(O)c2c(c1CC=C(C)C)C(=O)C1C(O)=CC=C(O)C1O2
|
| 1328 |
+
CC=CC=CC(=O)C=CC1OC(=O)C(C)C1O
|
| 1329 |
+
C=CCNc1nnc(SCC(=O)N2CCN(c3ccccc3Cl)CC2)s1
|
| 1330 |
+
CC(C)NC(=O)CC1CC2OC(CNC(=O)N3CCOCC3)C(O)C2O1
|
| 1331 |
+
CC(C)C(CC(=N)O)c1ccco1
|
| 1332 |
+
CCCS(=O)(=O)N1CCC(C(=O)N(C)C)CC1
|
| 1333 |
+
CC1=CCC(C(C)(N)CC=CC(C)(C)O)CC1
|
| 1334 |
+
Cc1c(Br)cc2c(c1Br)OC1(CO)CCC2(C)C1C
|
| 1335 |
+
O=C(CC1OC(CNC(=O)c2ccccc2)C(O)C1O)NCCN1CCOCC1
|
| 1336 |
+
C=C1C(OC(C)=O)CC(C(C)CCC=C(C)C)C2OC12
|
| 1337 |
+
CC(C)=CCOc1cc(O)c2c(c1)OC(c1ccc(O)cc1)C(O)C2=O
|
| 1338 |
+
CCCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)NCCCCCCCCCCCN
|
| 1339 |
+
C=C1CCC2(C)CCC(O)C(C)(C)C23CC13
|
| 1340 |
+
Cc1ccc2c(c1)OC(c1ccc3c(c1)OCCO3)CC2=O
|
| 1341 |
+
CCCC(C)c1cccc(CC)c1O
|
| 1342 |
+
CC1(C)CC2C(O)C(O)C3=CCC3(C)C2C1
|
| 1343 |
+
CCC12C=CCN(CCc3c([nH]c4ccccc34)C(C(=O)OC)C1)C2
|
| 1344 |
+
COc1cccc2c1[NH+]([O-])c1cccc(O)c1[NH+]2[O-]
|
| 1345 |
+
CC(=O)OC1(C(C)=O)CCCCC1
|
| 1346 |
+
O=C(NCCS(=O)(=O)O)C(=O)c1c[nH]c2ccccc12
|
| 1347 |
+
COC(=O)C1=C(C)NC(=O)NC1c1ccc(Cl)cc1
|
| 1348 |
+
CC1=CC=CC(C)(C)C=C1
|
| 1349 |
+
CCCCCc1cc(O)cc(O)c1C(=O)OCC
|
| 1350 |
+
CC(=O)c1ccc(OCC(=O)NC(c2ccccc2)c2cccs2)cc1
|
| 1351 |
+
COc1cc(O)c(C)c2cc(C)ncc12
|
| 1352 |
+
CC=CCCC1Cc2cc(O)c(CC=CC)c(O)c2C(=O)O1
|
| 1353 |
+
Cc1ccc(NC(=O)C2CCC(=O)N2C2OC(=O)c3ccccc32)c(C)c1
|
| 1354 |
+
CC1(C)CCCC2(C)C3=C(CCC12)COC3=O
|
| 1355 |
+
COc1c(C)cc2c(C(C)C)cc(=O)oc2c1O
|
| 1356 |
+
COc1c2c(c(Br)c3c1C(=O)N(C)CC3)OCO2
|
| 1357 |
+
CC1CC2CC(=O)C3(O)CCCN4CCCC2(O)C43C1
|
| 1358 |
+
COc1ccc(-c2cc(C=CC=O)ccc2O)cc1C=CC=O
|
| 1359 |
+
CC=CC=CC#CC=CCOC(C)=O
|
| 1360 |
+
CCCCCCCCn1sccc1=O
|
| 1361 |
+
C#CC=CC(O)CCCCCCCCCCCCCC=CCCC=CC#CC(O)C#CCCCCC=CCCCCCCC=CC(O)C#C
|
| 1362 |
+
O=C(O)CCCCCCCCCCCCCCCCCCCCCCCO
|
| 1363 |
+
COC(=O)c1occ2c1C(C)(O)C1CC(C)(C)CC1C2O
|
| 1364 |
+
CC(C)n1cc(CCC(=O)Nc2ccc(C(N)=O)cc2)c2ccccc21
|
| 1365 |
+
CC12CN3CC(C)(CN(C1)C3c1cccc3[nH]ccc13)C2=O
|
| 1366 |
+
CCCCCCCCCCC(=O)C1(O)C(OC(C)=O)C=CC1OC(C)=O
|
| 1367 |
+
CNCC(C)Cc1ccc2c(c1)OCO2
|
| 1368 |
+
CC(C)(C)NC(=O)NC1CC(CO)C(O)C1O
|
| 1369 |
+
Nc1c(Cl)cccc1-c1c[nH]c(C(=O)O)c1
|
| 1370 |
+
COc1ccc2c(c1)CN(C(=O)NC(C)C)CCC1COC(=O)N21
|
| 1371 |
+
CC(C(O)CC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C
|
| 1372 |
+
CCC1c2ccc(C)n2CCN1C(=O)Nc1cc(OC)ccc1OC
|
| 1373 |
+
O=C(O)CC(NC(=O)CP(=O)(O)O)C(=O)O
|
| 1374 |
+
CC(C)C(NC(=O)Nc1ccc(OC(F)(F)F)cc1)C(=O)O
|
| 1375 |
+
CCCCCCC(=O)CCCCCCCC1CCC(=O)O1
|
| 1376 |
+
Cc1coc2c1C(OC(=O)CC(C)C)C1(C)C(C)CCCC1C2=O
|
| 1377 |
+
NC(=O)c1ccccc1NC(=O)CC(CN1CCCC1)C(=O)O
|
| 1378 |
+
CC(C)CCCCCCC#CC=COCC(O)CO
|
| 1379 |
+
O=C1C(=Cc2ccccc2)Oc2c1ccc(O)c2CN1CCCCC1
|
| 1380 |
+
CCCCC(Br)=CC(=O)O
|
| 1381 |
+
O=CC(O)CS(=O)(=O)O
|
| 1382 |
+
C=C1CCC2C(C)(C(=O)O)CCCC2(C)C1CCC(C)(O)CC(=O)O
|
| 1383 |
+
C=CC1(C)CCC2(C)C(CCC3(O)C2CCCC3(C)COC(C)=O)C1
|
| 1384 |
+
CO[C@@H]1CCOP(=O)(NCCCl)N1CCCl
|
| 1385 |
+
CC(O)C(=O)CCC(=O)OCCc1ccccc1
|
| 1386 |
+
CC=CC#CC#CC=CC=CCCCCC(=O)OC
|
| 1387 |
+
CCCCCCCCCc1ccc(Oc2ccc(C)cc2CC(=O)O)c(Cl)c1
|
| 1388 |
+
C[C]1[CH][CH][C](C(=O)C[NH+]2[CH][C](C)[CH][CH][C]2N)[CH][CH]1
|
| 1389 |
+
CC(=O)OC1C2CC(CC(=O)O2)OC1c1ccccc1
|
| 1390 |
+
CN(C)CCOc1cc2c(c3ccccc13)-c1ccc(O)cc1C2=O
|
| 1391 |
+
Cc1c(C)c2c(cc(C)c3c(C)coc32)oc1=O
|
| 1392 |
+
CC1=C2C(O)CC2(C)C2CC(C)(CO)C=C2C1=O
|
| 1393 |
+
COc1cc2oc(=O)ccc2c2c1C=CC(C)(C)O2
|
| 1394 |
+
COC(CN(C)CC1C(=O)OC2CC3(C)CCCC4(CO4)C3CC21)OC
|
| 1395 |
+
CC1=CC(=C(c2ccc(N)cc2)c2ccc(N)cc2)C=CC1=N
|
| 1396 |
+
CC=C(C)C=C(C)c1oc(=O)cc(OC)c1C
|
| 1397 |
+
COC(C)=C1C(=O)C=CC1=O
|
| 1398 |
+
C=C1C(=O)OC2C1C(OC(=O)C(C)=CC)CC(CO)C1CC=C(C)C12
|
| 1399 |
+
CC=Cc1cc(O)c(C(O)C(O)C(C)O)c(=O)o1
|
| 1400 |
+
CCCCCC/N=c1\ccn(Cc2ccccc2)c2c(OC)cccc12
|
| 1401 |
+
CCCCCCCCC=CCCCCCCCC(=O)NCc1ccccc1
|
| 1402 |
+
COc1cc2c(cc1OC)C1C(CC=C3CCN(C)C31)OC2
|
| 1403 |
+
O=C(C=Cc1ccccc1)OC1C(O)OC(CO)C(O)C1O
|
| 1404 |
+
CN(C)CCCCN(C)C
|
| 1405 |
+
CCc1cc2c(=O)c(-c3ccc(Cl)cc3)coc2cc1O
|
| 1406 |
+
COc1ccc2nc3n(c2c1)C(CNC(C)C)COC3
|
| 1407 |
+
CC(C)NCC(O)COc1cccc2ccccc12
|
| 1408 |
+
Cc1ccc(OCCNC(=O)C(=O)NCCC(C)C)cc1
|
| 1409 |
+
O=c1cc2[nH]ccn2cn1
|
| 1410 |
+
CC(CCCCCCCCCCCCCC=CC(=O)O)OC1OC(C)C(O)CC1O
|
| 1411 |
+
Oc1[nH]cc2nnccc12
|
| 1412 |
+
CC#CC#CC#CC=CC=CCCCC=O
|
| 1413 |
+
CCC1CCCCC1C
|
| 1414 |
+
CC1=CCCC(C=O)=CC2CC2(C)C(O)CC1
|
| 1415 |
+
Oc1ccc(-c2nnc3n2NC(c2ccco2)S3)cc1
|
| 1416 |
+
CC1=CCCC2=CC(CC(CO)=CC=C(C(C)C)CC1)OC2=O
|
| 1417 |
+
CC=C(C)C(=O)CC
|
| 1418 |
+
COC1=CC(=O)C(C)=C(OC)C1=O
|
| 1419 |
+
CCCCCC=CCC=CCC=CCCCCC(=O)OCC(O)COP(=O)(O)OCCN
|
| 1420 |
+
COC(=O)c1c(C)oc2cc(Br)c(OCc3ccccc3Br)cc12
|
| 1421 |
+
CC=CC#CC#CC=CCOC(C)=O
|
| 1422 |
+
O=C(O)CC(NC(=O)c1ccccc1NC(=O)c1ccccc1)C(=O)O
|
| 1423 |
+
CC(C)(C)CC(C)(C)c1ccc(O)c(Cc2ccc(Cl)cc2Cl)c1
|
| 1424 |
+
CN(CC1CCCN2CCCCC12)C(=O)CCCc1nc(-c2cccnc2)no1
|
| 1425 |
+
COc1ccc(Oc2oc3cc(O)cc(O)c3c(=O)c2O)cc1
|
| 1426 |
+
CC(C)=CCc1c(O)cccc1C(=O)c1c(O)cc(O)c(CC=C(C)C)c1O
|
| 1427 |
+
O=C(O)Nc1ccc(Cl)cc1
|
| 1428 |
+
COc1ccc(O)c(-c2cc(=O)c3c(O)cc(O)cc3o2)c1
|
| 1429 |
+
O=C(OCC1CCCN2CCCCC12)c1ccc2c(c1)OCO2
|
| 1430 |
+
COC(=O)C1C[C]2[NH2+][CH]N(C)[C]2CN1C(=O)CCC(=O)[O-]
|
| 1431 |
+
C=C1C(O)CCC(CC(=O)c2cc(O)ccc2O)(C(=O)O)C1O
|
| 1432 |
+
COc1ccccc1N1CC(C(=O)Nc2ccc3oc(=O)ccc3c2)CC1=O
|
| 1433 |
+
CCC(O)C=CC1C(O)CC(=O)C1CC=CCC=CCC=CCCC(=O)O
|
| 1434 |
+
CC1=CC2OC3C(O)C(O)C(C)(C2(CO)CC1O)C31CO1
|
| 1435 |
+
CC(C)C(N)C(=O)N1CC(O)CC1C(O)C(=O)O
|
| 1436 |
+
CCOC(=O)Cc1nc(-c2ccc(Cl)cc2)oc1-c1ccc(Br)o1
|
| 1437 |
+
Oc1ccc2cccc(N3CCNCC3)c2c1
|
| 1438 |
+
CC(C)CC1c2cc(O)c(O)cc2C=C2c3ccc(O)c(O)c3CCC21
|
| 1439 |
+
CN(C=Cc1ccccc1)C(=O)C=Cc1ccccc1
|
| 1440 |
+
O=C(O)C(Cl)=CCl
|
| 1441 |
+
CC12CCC(C(=CN(O)c3ccccc3)C1=O)C2(C)C
|
| 1442 |
+
O=C1CC2(CCCCC2)Oc2ccc(O)cc21
|
| 1443 |
+
O=C(O)CCCNC(=O)OCc1ccccc1
|
| 1444 |
+
CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCC1(CC(=O)O)CC(=O)C=CC1=O
|
| 1445 |
+
Cc1cccc(OCC2CC(NC(=O)Nc3ccccc3)C(O)C2O)n1
|
| 1446 |
+
CC1(O)Cc2cccc(F)c21
|
| 1447 |
+
CC1C(OC2OC(CO)C(O)C(O)C2O)CC2(C(C)C)CC12
|
| 1448 |
+
CN1CCCC(n2nc(Cc3ccc(Cl)cc3)c3ccccc3c2=O)CC1
|
| 1449 |
+
O=C1CN(C(=O)NC(Cc2ccccc2)C(=O)NC(CO)C(=O)O)c2ccccc2N1
|
| 1450 |
+
CCOC(=O)c1c(C)n(C)c2c1cc(O)c1occ(C)c12
|
| 1451 |
+
COc1ccc2oc(S(N)(=O)=O)cc2c1
|
| 1452 |
+
NC(=O)NC(=O)Nc1ccccc1Cl
|
| 1453 |
+
C=C(c1ccccc1)c1ccccc1
|
| 1454 |
+
C#CCNC(=O)C1CCCN1C(C(=O)NC)c1ccccc1
|
| 1455 |
+
CNCC(C)CN1c2ccccc2Sc2ccccc21
|
| 1456 |
+
C=CC1(CO)CCC2(C)C(CCC3(O)C2CCCC3(C)C)C1
|
| 1457 |
+
C=C1OC(=O)C(CCCCCCCCCCCCCc2ccc3c(c2)OCO3)C1=O
|
| 1458 |
+
CNC(=O)OCc1c(COC(=O)NC)c(C)n(-c2ccccc2)c1C
|
| 1459 |
+
CCCOc1ccccc1-c1nc2nc[nH]c2c(=O)[nH]1
|
| 1460 |
+
CCC1CN(C(=O)c2ccc(F)cc2)CCC1CC(=O)NCCO
|
| 1461 |
+
NC(=O)NC=C(C=O)c1ccccc1
|
| 1462 |
+
COc1cccc(CN2CCN3C(=O)N(c4ccccc4)CC3C2)n1
|
| 1463 |
+
COc1ccc2c3c1OC1CC(O)C=CC31CCNC2
|
| 1464 |
+
CC(C)CCCCCCCCCCCCCCCCCCCC(O)CO
|
| 1465 |
+
COC(=O)C1=CC2CC(C)(C)CCC(C)(O)C2CC1
|
| 1466 |
+
CC(=O)N1CC2CN(C(C)C)CCN2C(C)(CO)C1
|
| 1467 |
+
CC(C)NC(=O)C1CN2CCC1CC2Cn1cc(CO)nn1
|
| 1468 |
+
CON(C)CCCCCCCCCCC=CC#CCCCCC1=CC=C[NH+](C)C1
|
| 1469 |
+
C=NC1(C)CCC2C(C)CC3CC(C)(C)CC4=C3C2C1CC4
|
| 1470 |
+
CCc1cn(C2OCC(O)C2CO)c(=O)nc1O
|
| 1471 |
+
CCC=CC#CC#CC=CCCC(=O)CCOC1OC(CO)C(O)C(O)C1O
|
| 1472 |
+
NCCCCCNC(=O)C(Cc1ccc(O)cc1)NC(=O)C1OC1C(=O)O
|
| 1473 |
+
COc1cccc(I)c1O
|
| 1474 |
+
CC#Cc1cn(C2OC(CO)C(O)C2O)c(=O)[nH]c1=O
|
| 1475 |
+
CCCCCCCCCCC(=O)CCCC(=O)OC
|
| 1476 |
+
O=C(NN1CCC=C(CCCO)C1)c1ccccc1
|
| 1477 |
+
CC=CC#CC#CC=CCOC(=O)C(C)=CC
|
| 1478 |
+
COc1cc2c(cc1OC)C(=O)C(=Cc1cccnc1)C2
|
| 1479 |
+
NC(=S)NN=Cc1ccccc1O
|
| 1480 |
+
CC1(C)C2CCC3(C)C(C=CCC13O)C2
|
| 1481 |
+
CCOC(=O)C=Cc1ccc(OC2OC(CO)C(O)C(O)C2O)c(OC)c1
|
| 1482 |
+
Nc1cc([NH+]([O-])O)ccc1Cl
|
| 1483 |
+
COc1ccccc1C
|
| 1484 |
+
CC1=CC(=O)C2(C)CC(=O)C(C(C)C)C2CCC(C)=CCC1
|
| 1485 |
+
CC1CC2C=CC3(O)COC(=O)C3(C)C2CC1O
|
| 1486 |
+
CCC=CCC=CC=CC(O)CCCCCCCC(=O)OC
|
| 1487 |
+
CCC1(C)CC(CCNCc2ccc(OC)c(OC)c2)(CCC(C)C)CCO1
|
| 1488 |
+
CCCCCCCC(O)CCCCCCCCCCCCCCCCCCCCCCCC(C)CC
|
| 1489 |
+
Oc1cc2c(cc1O)C(c1ccco1)=NCC2
|
| 1490 |
+
COc1cc(CC(C)NCC#N)c(OC)cc1I
|
| 1491 |
+
CC(C)NC(C)C(O)COc1ccc(Cl)c(Cl)c1
|
| 1492 |
+
O=C(O)C(=O)CC(=O)c1ccc(Cc2ccc(Cl)c(Cl)c2)cc1
|
| 1493 |
+
CC(=O)OCCC(C)OC(C)=O
|
| 1494 |
+
CC1(C)CCCC2(C)C(CO)=CC1C2CCO
|
| 1495 |
+
COc1cc(O)c2c(c1C)C(C)(O)C(C)OC2=O
|
| 1496 |
+
CC(C)(O)C(NC(=O)C(CS)NC(=O)CCCC(N)C(=O)O)C(=O)NCC(=O)O
|
| 1497 |
+
CC1CC2OC(=O)C(C)C2CC2(C)C(O)OCCC12
|
| 1498 |
+
CCCCCCCCCCCCCCCC(=O)N1CCc2c([nH]c3ccccc23)C1C
|
| 1499 |
+
CC(C)=CCCC(C)(O)C1CC=C(COC(=O)c2ccco2)CC1
|
| 1500 |
+
CCn1c(C(=O)NCCc2c[nH]c3ccccc23)cc2sccc21
|
| 1501 |
+
COc1ccc(NC(=O)N2CCC(CO)N(C)c3cccnc3C2)cc1
|
| 1502 |
+
C#CC(O)C=CCCCCCCC#Cc1ccc(C#CCCCCCCC=CC(O)C#C)o1
|
| 1503 |
+
Cc1csc(C)c1
|
| 1504 |
+
COc1cc(C=CCO)ccc1Oc1ccc(C(O)C(O)CO)cc1OC
|
| 1505 |
+
COC(=O)CNC(=O)c1ccccc1
|
| 1506 |
+
CCCCCCC(=O)C=CC(=O)CCCCCCCC(=O)O
|
| 1507 |
+
Cc1cc(C)c2oc(=O)cc(CN3CCCCC3)c2c1
|
| 1508 |
+
COc1cc2c(cc1C(=O)COC(C)=O)C=CC(C)(C)O2
|
| 1509 |
+
CN1CCc2cc3c(c(OCC(N)=O)c2C1=O)OCO3
|
| 1510 |
+
CC(=O)OCC1(C)CCCC2(C)C1CCC1(C)OC(CO)(C(O)CO)CCC12
|
| 1511 |
+
CC(C)=CC(=O)OCC=CC=CCCC=CC(=O)NCC(C)C
|
| 1512 |
+
OC1CCCOC1C=CC#Cc1cccs1
|
| 1513 |
+
CC=C1CC2CCC(C)C1(C)C2(C)C
|
| 1514 |
+
NC(=O)C(=Cc1ccc(Cl)cc1Cl)c1nc2ccccc2s1
|
| 1515 |
+
O=C(O)c1ccccc1C1C(=O)c2ccccc2C1=O
|
| 1516 |
+
CCCCCCCCCCCc1cc(=O)c2c(O)cccc2o1
|
| 1517 |
+
CC(O)C#CC1(C)CCCC(C)(C)O1
|
| 1518 |
+
CC(C)CC/C=C(\NC(=O)C1CC1(C)C)C(=O)O
|
| 1519 |
+
CC(C=CC(=O)O)C1CCC2C3C=CC4=CC(=O)CCC4(C)C3CCC12C
|
| 1520 |
+
CCc1cn(CC(NC(=O)c2ccco2)C(=O)O)c(=O)[nH]c1=O
|
| 1521 |
+
CC=CC(=O)OC1C=CC(C)OC(=O)CCC(OC(C)=O)C1O
|
| 1522 |
+
CC(C)CCCOC(=O)CCC(C)C
|
| 1523 |
+
CC1(C)C2C=CC(O2)C(Cl)C1O
|
| 1524 |
+
NCC(=O)Oc1c(-c2ccc(O)cc2)oc2cc(O)cc(O)c2c1=O
|
| 1525 |
+
CC=CC(N)=S
|
| 1526 |
+
C=Cc1c(N)ccc2cnccc12
|
| 1527 |
+
COC(=O)c1ccccc1NC(=O)NCCC(C)C
|
| 1528 |
+
CC(=O)C(O)Cc1ccc(O)cc1
|
| 1529 |
+
CC(=O)OC1(C(C)C)CC=C(C)C2CCC(C)=CC21
|
| 1530 |
+
NCC(O)P(=O)(O)O
|
| 1531 |
+
O=C1C(=Cc2ccc(O)cc2)C(=O)c2ccccc21
|
| 1532 |
+
COc1cc(-c2cc(=O)c3ccccc3o2)cc(I)c1OC
|
| 1533 |
+
N#Cc1ccc(Cn2ccnc2)cc1OCCc1ccc(-c2ccccc2)cc1
|
| 1534 |
+
COc1ccc(C(=O)OCc2cc3cc(OC)ccc3nc2O)cc1
|
| 1535 |
+
NC(=O)[C@H](Cc1ccccc1)NC(=O)Nc1nnc(S)s1
|
| 1536 |
+
CN(C)CCN(C)CC1CN2CCC1CC2CNC(=O)c1ccccc1
|
| 1537 |
+
CCCCCCCCCCCCCCCC(O)CCO
|
| 1538 |
+
COCC(=O)NC1C(c2cccs2)N(CCC(C)(C)C)CCC1(C)O
|
| 1539 |
+
C=C(C)C(=O)OCCOP(=O)(O)Oc1ccccc1
|
| 1540 |
+
CC(=O)NC[C@@H]1OC(=O)N2c3ccc(-c4ccccc4)cc3C[C@@H]12
|
| 1541 |
+
COc1c2c(cc3c1C(Nc1ccncc1)N(C)CC3)OCO2
|
| 1542 |
+
C=CCOc1ccc2c(C)cc(=O)oc2c1OCC=C
|
| 1543 |
+
C#CC(O)C=CCCCCCCCCCCCC#CCCCCCCCCCCCC(O)C#CC#CCO
|
| 1544 |
+
CNCC(O)CN1CCc2c(Br)cccc2C1
|
| 1545 |
+
COc1ccc(-n2cc(C(=O)O)c3c2C(c2cccnc2)CC(=O)N3)cc1
|
| 1546 |
+
CC(NC(=O)C1CC1)c1c(-c2ccc(Cl)cc2)noc1C(=O)O
|
| 1547 |
+
CCNc1c(C=O)c(=O)oc2ccccc12
|
| 1548 |
+
COc1ccc(NC2OC(=O)c3c2ccc(OC)c3OC)c(OC)c1
|
| 1549 |
+
O=C(Cl)CCC(=O)Cl
|
| 1550 |
+
O=C(O)C=CCCCCCCCCCC=C(Br)Br
|
| 1551 |
+
O=C(O)Cc1cc(O)cc(O)c1O
|
| 1552 |
+
CC1=CCC(C(C)CCC(O)C(C)(C)O)CC1=O
|
| 1553 |
+
CCOC(CNC(=O)c1cc(Br)c(Br)[nH]1)CC(=O)OC
|
| 1554 |
+
COc1c(O)ccc(CCC(=O)CC(O)CC(C)CCCO)c1-c1ccc(CN)cc1
|
| 1555 |
+
Cc1[nH]c2ccccc2c(=O)c1C(=O)C=Cc1ccccc1
|
| 1556 |
+
C/C=C(\NC(=O)CCCCCC)C(=O)O
|
| 1557 |
+
COc1c(O)cc2c(c1O)C(=O)c1ccccc1C2=O
|
| 1558 |
+
COc1ccc(C(=O)c2nccc3cc(OC)c(OC)cc23)cc1
|
| 1559 |
+
CC1=C2COCC(C)C2CC1O
|
| 1560 |
+
CCC(C)C(C)C(O)CN
|
| 1561 |
+
C=C(C)C1CCC2(C)CC(Br)CC(C)C2(O)C1
|
| 1562 |
+
C[C]1[CH][C](C)[C]2CCCC[C]2[NH2+]1
|
| 1563 |
+
COc1ccc(Cl)cc1C(=O)C=Cc1ccc(F)cc1
|
| 1564 |
+
O=C(OCC1CCCN2CCCCC12)c1cc2ccccc2oc1=O
|
| 1565 |
+
CCCNC(=O)NC1CC(O)(C(=O)NCC(N)=O)CC(O)C1O
|
| 1566 |
+
Cc1cc(C=C2C=CC(=O)O2)oc1C
|
| 1567 |
+
CC(=O)OCC1=C(C(=O)O)N2C(=O)C(NC(=O)C=Cc3ccccc3)C2SC1
|
| 1568 |
+
COC(=O)C1SCC(NC(=O)c2ccccc2)C1OC(C)=O
|
| 1569 |
+
NS(=O)(=O)c1ccc(NC(=O)CNCC(=O)O)c(Br)c1
|
| 1570 |
+
CC1CCCC2CCCCC12
|
| 1571 |
+
CCC(=O)c1cn(-c2ccc(F)cc2)c2ccc(Cl)cc12
|
| 1572 |
+
COc1cc(OC)c2c(c1)C(=O)c1cccc(O)c1C2=O
|
| 1573 |
+
COC(=O)c1cccc(Nc2nc(NCCO)c3ccccc3n2)c1
|
| 1574 |
+
O=S(=O)(O)OC1C(Oc2ccc(O)cc2)OC(CO)C(O)C1O
|
| 1575 |
+
CCN(CC)CCN=C(O)COc1ccc(OC)cc1
|
| 1576 |
+
CC1=CC(=O)C(C(C)CCC(O)C(C)(C)O)CC1
|
| 1577 |
+
O=C(Nc1ccc(OCCN2CCCC2)cc1)C1(c2ccccc2)CCOCC1
|
| 1578 |
+
CN1C(=O)c2ccccc2NC(=O)C12OC2c1ccccc1
|
| 1579 |
+
CC(=O)N[C@@H](CC(C)C)C(=O)O
|
| 1580 |
+
COc1ccc2oc3cccc(O)c3c(=O)c2c1
|
| 1581 |
+
O=C(Nc1cccc(C(=O)O)c1)NC(Cc1ccccc1)C(=O)O
|
| 1582 |
+
CN(C)CCc1c[nH]c2ccsc12
|
| 1583 |
+
C=C1C(O)CC2C(C)(C)CCCC2(C)C1CCC(C)=O
|
| 1584 |
+
COC(=O)c1ccccc1NC(=O)N1CCc2nc[nH]c2C1c1ccc(OC)cc1
|
| 1585 |
+
O=C(O)c1ccc2c(c1)OCO2
|
| 1586 |
+
O=C(NCCOC(=O)Nc1ccccc1)Nc1ccc(Cl)c(Cl)c1
|
| 1587 |
+
COc1c(O)cc2cc3c(c(O)c2c1C)C(=O)CC(O)C3
|
| 1588 |
+
NC(C(=O)O)c1ccc(C(=O)O)cc1
|
| 1589 |
+
CCCCCCCCC1C2C=CC3(O)C(C(=O)O)=CC(O)C4CC1C2C43
|
| 1590 |
+
O=c1cccc2n1CC1CCCN3CCCC2(O)C13
|
| 1591 |
+
CCCCCCCCCCCCCCCCC(C)CCCCCC
|
| 1592 |
+
COC(OC)C1(C)CCCC2(C)c3cc(O)c(C(C)C)cc3CCC21
|
| 1593 |
+
Cc1ccc(C(=O)NCC2OC(CC(=O)N3CCC(C)CC3)C(O)C2O)cc1
|
| 1594 |
+
CC1=C(O)C(=O)C23C(=O)C1C(C)(C)C2CCC3C
|
| 1595 |
+
O=C(O)C1OC(OCC2OC(O)C(O)C(O)C2O)C(O)C(O)C1O
|
| 1596 |
+
COc1ccc(CN2C(=O)NC(CC(=O)NC(CO)CCSC)C2=O)cc1
|
| 1597 |
+
c1ccc(-c2cc3c(cn2)CCCC3)cc1
|
| 1598 |
+
CC1=CCC2=C(C)CC(OC(=O)c3ccc(O)cc3)C(C(C)C)CC12
|
| 1599 |
+
CCC1CC2(CC)OC(=O)C3(CC)OC(=O)C1C23
|
| 1600 |
+
CC(=O)OCC1=C(C(=O)O)N2C(=O)C(NC(=O)Cc3ccccc3)C2SC1
|
| 1601 |
+
Oc1cccc2c1ccc1c3ccccc3ccc21
|
| 1602 |
+
CCCCCCCCCCC(=O)NCC(=O)O
|
| 1603 |
+
O=C(CCCCC1CCSS1)N1CCCC1c1nnc2ccccn12
|
| 1604 |
+
ClCCOC1CCCN(Cc2ccccc2)CO1
|
| 1605 |
+
COc1c(O)ccc2c1CC(O)C(c1ccccc1)O2
|
| 1606 |
+
Nc1nc(N)c2c3c(oc2n1)CN(Cc1ccccc1)CC3
|
| 1607 |
+
CCC12CCCN3CCC4(c5ccccc5NC4C(C(=O)O)C1)C32
|
| 1608 |
+
CN1c2ccccc2C(C)(C)C1O
|
| 1609 |
+
COC(CCC=CCCCCC(=O)CCCCCCC(=O)CCC=CCCCCCCO)C(=O)NCCc1ccc(O)cc1
|
| 1610 |
+
CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCC=CCCCCCCCC
|
| 1611 |
+
CCOC(=O)c1c(C)oc2ccc(OC(=O)c3ccc(Br)cc3)cc12
|
| 1612 |
+
CC(=O)CC1=C(C)C(=O)c2c(O)cc(O)cc2C1=O
|
| 1613 |
+
COc1cc(CCCCCCCCC(=O)c2c(O)cccc2O)ccc1O
|
| 1614 |
+
COc1cc2c(cc1OC)CC(=O)N(CC(=O)NCCn1ccc3ccccc31)C=C2
|
| 1615 |
+
CCCc1cc(OC)cc(O)c1C(=O)O
|
| 1616 |
+
C=C1CCCC2C1(C)CCC(C)C2(C)CC1=CC(=O)C=C(N)C1=O
|
| 1617 |
+
COc1cccc2c1CC(Cc1ccncc1)C2
|
| 1618 |
+
COC(=O)Cc1c(C)c2ccc(OCC=C(C)C)cc2oc1=O
|
| 1619 |
+
C=CC=CC=CC(O)C(C)O
|
| 1620 |
+
CCC1C(=O)Nc2cc3[nH]c(-c4cccnc4)nc3cc21
|
| 1621 |
+
C=C1C=C(CCC(=O)O)C(=O)OC1C=CC=CC
|
| 1622 |
+
O=C(NC1C(c2ccccc2)CC(O)C1O)c1cccnc1
|
| 1623 |
+
Nc1cccc2c1ccc1ccccc12
|
| 1624 |
+
Cc1ccc(C)c(Cl)c1
|
| 1625 |
+
CNCc1cn(CC2OC(CC(=O)N3CCCC3)C(O)C2O)nn1
|
| 1626 |
+
COC(=O)C(C)(C)CCCOc1ccc(C)c(OCCCC(C)(C)C(=O)OC)c1
|
| 1627 |
+
CC(C)(O)c1cc(CC2(N)COC2)no1
|
benchmark/latent_visualization_legacy.py
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Latent Space Visualization for Molecule VAE Models
|
| 4 |
+
Integrated with existing benchmark pipeline structure
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
import random
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import matplotlib.pyplot as plt
|
| 13 |
+
import seaborn as sns
|
| 14 |
+
from matplotlib.colors import ListedColormap
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
import torch.nn.functional as F
|
| 20 |
+
from torch.utils.data import DataLoader, Dataset
|
| 21 |
+
|
| 22 |
+
from sklearn.manifold import TSNE
|
| 23 |
+
from sklearn.decomposition import PCA
|
| 24 |
+
from tqdm import tqdm
|
| 25 |
+
from rdkit import Chem
|
| 26 |
+
from rdkit import RDLogger
|
| 27 |
+
RDLogger.DisableLog('rdApp.*')
|
| 28 |
+
|
| 29 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 30 |
+
|
| 31 |
+
# Import from existing benchmark code
|
| 32 |
+
from transformers import AutoTokenizer
|
| 33 |
+
try:
|
| 34 |
+
from FastChemTokenizer import FastChemTokenizer
|
| 35 |
+
except ImportError:
|
| 36 |
+
print("FastChemTokenizer not found. Please ensure it's in your PYTHONPATH.")
|
| 37 |
+
FastChemTokenizer = None
|
| 38 |
+
|
| 39 |
+
# Set seeds for reproducibility
|
| 40 |
+
def set_seed(seed=42):
    """Seed every RNG source (torch CPU/CUDA, numpy, random, hashing) for reproducibility."""
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Trade cuDNN autotuning speed for deterministic kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
|
| 48 |
+
|
| 49 |
+
set_seed(42)
|
| 50 |
+
|
| 51 |
+
# Device setup
|
| 52 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 53 |
+
|
| 54 |
+
class TokenizerWrapper:
    """Uniform facade over either a HuggingFace tokenizer or a FastChemTokenizer.

    Exposes a single encode/decode/vocab interface so downstream code does
    not have to branch on tokenizer type.
    """

    def __init__(self, tokenizer, name, bos_token="<s>", eos_token="</s>", pad_token="<pad>", unk_token="<unk>"):
        self.tokenizer = tokenizer
        self.name = name
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token

        # HF-style tokenizers can register specials themselves; others can't.
        if hasattr(tokenizer, 'add_special_tokens'):
            special_map = {
                'bos_token': bos_token,
                'eos_token': eos_token,
                'pad_token': pad_token,
                'unk_token': unk_token,
            }
            tokenizer.add_special_tokens(special_map)

    def encode(self, smiles: str, add_special_tokens: bool = True):
        """Tokenize one SMILES string; always yields a dict with 'input_ids'."""
        if not isinstance(self.tokenizer, FastChemTokenizer):
            # HuggingFace call path: let the tokenizer handle specials itself.
            return self.tokenizer(
                smiles,
                add_special_tokens=add_special_tokens,
                return_attention_mask=False,
                return_tensors=None
            )
        # FastChemTokenizer returns raw ids; wrap with BOS/EOS ourselves.
        token_ids = self.tokenizer.encode(smiles)
        if add_special_tokens:
            token_ids = [self.tokenizer.bos_token_id, *token_ids, self.tokenizer.eos_token_id]
        return {'input_ids': token_ids}

    def decode(self, token_ids, skip_special_tokens=True):
        """Map a sequence of ids back to a SMILES string."""
        if not isinstance(self.tokenizer, FastChemTokenizer):
            return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
        unk = self.tokenizer.unk_token
        tokens = [self.tokenizer.id_to_token.get(tid, unk) for tid in token_ids]
        if skip_special_tokens:
            drop = {self.tokenizer.bos_token,
                    self.tokenizer.eos_token,
                    self.tokenizer.pad_token,
                    self.tokenizer.unk_token}
            tokens = [tok for tok in tokens if tok not in drop]
        if hasattr(self.tokenizer, 'detokenize'):
            return self.tokenizer.detokenize(tokens)
        return "".join(tokens)  # chemistry tokens concatenate directly

    def __len__(self):
        if isinstance(self.tokenizer, FastChemTokenizer):
            # FastChemTokenizer exposes .vocab or ._vocab depending on version.
            vocab = getattr(self.tokenizer, 'vocab',
                            getattr(self.tokenizer, '_vocab', self.tokenizer))
            return len(vocab)
        return len(self.tokenizer)

    def get_vocab(self):
        if isinstance(self.tokenizer, FastChemTokenizer):
            return self.tokenizer.vocab
        return self.tokenizer.get_vocab()

    @property
    def bos_token_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def unk_token_id(self):
        return self.tokenizer.unk_token_id
|
| 137 |
+
|
| 138 |
+
def collate_fn(batch, tokenizer, max_length=128):
    """Tokenize and pad a batch of SMILES strings.

    Args:
        batch: list of SMILES strings.
        tokenizer: wrapper exposing .encode() -> {'input_ids': [...]} and
            .tokenizer.pad_token_id.
        max_length: hard cap on sequence length; longer sequences are truncated.

    Returns:
        (input_ids, lengths): LongTensors of shape (B, L) and (B,), where
        lengths holds the number of REAL (pre-padding) tokens per sequence —
        required by pack_padded_sequence in the encoder.
    """
    encodings = [tokenizer.encode(s, add_special_tokens=True) for s in batch]
    input_ids = [e['input_ids'] for e in encodings]

    max_len = min(max(len(ids) for ids in input_ids), max_length)
    padded = []
    lengths = []

    pad_token_id = tokenizer.tokenizer.pad_token_id  # dynamic, not hard-coded

    for ids in input_ids:
        # BUGFIX: record the true token count BEFORE truncating/padding.
        # The old code measured len(ids) after padding, so every recorded
        # length equaled max_len and pack_padded_sequence saw no real lengths.
        true_len = min(len(ids), max_length)
        if len(ids) > max_length:
            ids = ids[:max_length]
        else:
            ids = ids + [pad_token_id] * (max_len - len(ids))
        padded.append(ids)
        lengths.append(true_len)

    return torch.tensor(padded, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class SmilesDataset(Dataset):
    """Minimal Dataset: indexes directly into a list of SMILES strings."""

    def __init__(self, smiles_list):
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        return self.smiles_list[idx]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class MoleculeVAE(nn.Module):
    """Sequence VAE over tokenized SMILES.

    Encoder: bidirectional LSTM; the top layer's final forward/backward
    hidden states are concatenated and projected to (mu, logvar).
    Decoder: unidirectional LSTM whose initial hidden/cell states are linear
    projections of the latent z; tokens are emitted autoregressively from BOS.
    """

    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, latent_dim=128, num_layers=2,
                 pad_token_id=0, bos_token_id=1, eos_token_id=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        self.encoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        # Bidirectional encoder -> hidden_dim * 2 features into the latent heads.
        self.fc_mu = nn.Linear(hidden_dim * 2, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim * 2, latent_dim)

        self.decoder_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

        # Project z into per-layer initial hidden and cell states for the decoder.
        self.latent2hidden = nn.Linear(latent_dim, num_layers * hidden_dim)
        self.latent2cell = nn.Linear(latent_dim, num_layers * hidden_dim)

        self._init_weights()

    def _init_weights(self):
        # Xavier for linear weights, orthogonal for LSTM weights, zero biases.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.zeros_(param)

    def encode(self, x, lengths):
        """Encode padded ids (B, T) with true lengths -> (mu, logvar), each (B, latent_dim).

        NOTE: `lengths` must be the real (unpadded) token counts for
        pack_padded_sequence to mask padding correctly.
        """
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (hidden, _) = self.encoder_lstm(packed)
        # hidden is (num_layers * 2, B, H); the last two rows are the top
        # layer's forward and backward final states.
        h_forward = hidden[-2]
        h_backward = hidden[-1]
        h = torch.cat([h_forward, h_backward], dim=1)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample z ~ N(mu, sigma^2) while training; return mu deterministically at eval."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z, max_length=128, mode="greedy", temperature=1.0):
        """
        Decode latent vector z into a sequence.
        Returns full logits at each step.
        PATCHED: stops generation when EOS is predicted.

        mode: "greedy" (argmax) or "sample" (multinomial; `temperature`
        only applies here — it is ignored in greedy mode).
        """
        batch_size = z.size(0)
        device = z.device

        # Initialize hidden states from latent
        h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
        c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
        hidden = (h0, c0)

        # Start with BOS token — shape: (batch_size, 1)
        input_token = torch.full((batch_size, 1), self.bos_token_id, dtype=torch.long, device=device)
        logits = []
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)  # ← TRACK FINISHED SEQS

        for _ in range(max_length):
            embedded = self.embedding(input_token)  # (batch, 1, embed_dim)
            output, hidden = self.decoder_lstm(embedded, hidden)
            logit = self.fc_out(output)  # (batch, 1, vocab)
            logits.append(logit)

            if mode == "greedy":
                input_token = logit.argmax(dim=-1)  # (batch, 1)
            elif mode == "sample":
                probs = torch.softmax(logit.squeeze(1) / temperature, dim=-1)  # (batch, vocab)
                input_token = torch.multinomial(probs, 1)  # (batch, 1)
            else:
                raise ValueError(f"Unknown decode mode: {mode}")

            # ← EARLY STOPPING AT EOS: once a sequence emits EOS it keeps
            # feeding PAD tokens until every sequence in the batch is done.
            just_finished = (input_token.squeeze(1) == self.eos_token_id)
            finished |= just_finished
            input_token[finished] = self.pad_token_id  # pad finished sequences
            if finished.all():
                break

        return torch.cat(logits, dim=1)  # (batch, seq_len, vocab)

    def forward(self, input_ids, lengths, target_seq=None, teacher_forcing_ratio=0.0, temperature=1.0):
        """Full VAE pass; returns (logits, mu, logvar).

        Teacher-forced path (training + target_seq + ratio > 0) emits logits
        for positions 1..seq_len-1 of target_seq; otherwise logits come from
        free autoregressive decoding via `decode` (greedy).
        """
        mu, logvar = self.encode(input_ids, lengths)
        z = self.reparameterize(mu, logvar)

        if self.training and target_seq is not None and teacher_forcing_ratio > 0:
            # Training with teacher forcing
            batch_size, seq_len = target_seq.size()
            device = target_seq.device

            # Initialize hidden states
            h0 = self.latent2hidden(z).view(self.num_layers, batch_size, self.hidden_dim)
            c0 = self.latent2cell(z).view(self.num_layers, batch_size, self.hidden_dim)
            hidden = (h0, c0)

            logits = []
            input_token = target_seq[:, 0].unsqueeze(1)  # BOS

            for t in range(1, seq_len):
                embedded = self.embedding(input_token)
                output, hidden = self.decoder_lstm(embedded, hidden)
                logit = self.fc_out(output)
                logits.append(logit)

                # Per-step coin flip: feed ground truth vs. own prediction.
                use_teacher = torch.rand(1).item() < teacher_forcing_ratio
                if use_teacher:
                    input_token = target_seq[:, t].unsqueeze(1)
                else:
                    input_token = logit.argmax(dim=-1)

            logits = torch.cat(logits, dim=1)
        else:
            # Inference mode
            max_len = target_seq.size(1) if target_seq is not None else 128
            logits = self.decode(z, max_length=max_len, mode="greedy", temperature=temperature)

        return logits, mu, logvar
|
| 305 |
+
|
| 306 |
+
class LatentSpaceVisualizer:
|
| 307 |
+
    def __init__(self, model_path, tokenizer, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Load a trained MoleculeVAE checkpoint and keep the tokenizer for later encode/decode.

        NOTE(review): the device default expression is evaluated once at
        class-definition time, not per call.
        """
        self.device = device
        self.tokenizer = tokenizer
        self.model = self.load_model(model_path)
|
| 311 |
+
|
| 312 |
+
    def load_model(self, model_path):
        """Load the trained VAE model from a checkpoint file.

        Accepts either a raw state_dict or a dict wrapping it under
        'model_state_dict'. Only vocab_size and pad_token_id are derived
        from the tokenizer; all other hyperparameters use MoleculeVAE's
        defaults, so the checkpoint must match those defaults.
        """
        checkpoint = torch.load(model_path, map_location=self.device)

        # Extract model parameters from checkpoint
        if 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint

        # Get vocab size from tokenizer
        vocab_size = len(self.tokenizer)
        pad_token_id = self.tokenizer.tokenizer.pad_token_id

        # Initialize model with correct parameters
        model = MoleculeVAE(vocab_size=vocab_size, pad_token_id=pad_token_id)
        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()  # inference only: disables dropout and stochastic z

        return model
|
| 333 |
+
|
| 334 |
+
    def encode_molecules(self, smiles_list, batch_size=32):
        """Encode molecules to latent space.

        Returns an (N, latent_dim) numpy array of posterior means (mu) —
        the deterministic latent embedding of each input SMILES, row-aligned
        with smiles_list.
        """
        dataset = SmilesDataset(smiles_list)
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,  # keep output rows aligned with smiles_list order
            collate_fn=lambda batch: collate_fn(batch, self.tokenizer, max_length=128)
        )

        all_mus = []
        with torch.no_grad():
            for input_ids, lengths in tqdm(dataloader, desc="Encoding molecules"):
                input_ids = input_ids.to(self.device)
                lengths = lengths.to(self.device)

                # Only mu is kept; logvar is discarded for visualization.
                mu, logvar = self.model.encode(input_ids, lengths)
                all_mus.append(mu.cpu().numpy())

        return np.concatenate(all_mus, axis=0)
|
| 354 |
+
|
| 355 |
+
def create_grid_latent_points(self, grid_size=100, z_range=4):
|
| 356 |
+
"""Create a grid of points in 2D latent space"""
|
| 357 |
+
x = np.linspace(-z_range, z_range, grid_size)
|
| 358 |
+
y = np.linspace(-z_range, z_range, grid_size)
|
| 359 |
+
xx, yy = np.meshgrid(x, y)
|
| 360 |
+
|
| 361 |
+
# Create circular mask
|
| 362 |
+
center = grid_size // 2
|
| 363 |
+
radius = grid_size // 2
|
| 364 |
+
y_coords, x_coords = np.ogrid[:grid_size, :grid_size]
|
| 365 |
+
mask = (x_coords - center) ** 2 + (y_coords - center) ** 2 <= radius ** 2
|
| 366 |
+
|
| 367 |
+
return xx, yy, mask
|
| 368 |
+
|
| 369 |
+
    def classify_latent_points(self, latent_points, dim1=0, dim2=1, additional_dim=None):
        """
        Classify latent points by chemical validity (RDKit parseable).

        Each 2-D point in `latent_points` is embedded into a full latent
        vector (remaining dims drawn from a small-variance normal) at
        positions dim1/dim2, decoded to SMILES, and checked with RDKit.

        Args:
            latent_points: (N, 2) array of coordinates for dims (dim1, dim2).
            dim1, dim2: latent dimensions the 2-D points map onto.
            additional_dim: optional {dim_index: value} dict of extra fixed dims.

        Returns: numpy array of N ints — 0 for invalid/unparseable molecules,
        1 for valid molecules.
        """
        classifications = []

        with torch.no_grad():
            # Process in batches to avoid memory issues
            batch_size = 32
            for i in range(0, len(latent_points), batch_size):
                batch_points = latent_points[i:i+batch_size]

                # Create full latent vectors (sample from normal for other dimensions)
                full_z = torch.randn(len(batch_points), self.model.latent_dim, device=self.device) * 0.1
                full_z[:, dim1] = torch.FloatTensor(batch_points[:, 0]).to(self.device)
                full_z[:, dim2] = torch.FloatTensor(batch_points[:, 1]).to(self.device)

                # If additional dimension specified (for z2 plots)
                if additional_dim is not None:
                    if isinstance(additional_dim, dict):
                        for dim_idx, dim_val in additional_dim.items():
                            full_z[:, dim_idx] = dim_val

                try:
                    # Decode to SMILES
                    # NOTE(review): decode defaults to mode="greedy", which
                    # ignores the temperature argument passed here.
                    logits = self.model.decode(full_z, max_length=64, temperature=0.8)
                    predictions = torch.argmax(logits, dim=-1)

                    # Check chemical validity for each decoded molecule
                    batch_classes = []
                    for pred in predictions:
                        pred_ids = pred.cpu().tolist()

                        # Remove padding and special tokens
                        pad_id = self.tokenizer.tokenizer.pad_token_id
                        eos_id = self.tokenizer.tokenizer.eos_token_id

                        # Trim at EOS or pad
                        for j, token_id in enumerate(pred_ids):
                            if token_id in [pad_id, eos_id]:
                                pred_ids = pred_ids[:j]
                                break

                        try:
                            decoded_smiles = self.tokenizer.decode(pred_ids, skip_special_tokens=True)
                            # Test chemical validity with RDKit
                            mol = Chem.MolFromSmiles(decoded_smiles)

                            if mol is None:
                                # Invalid/unparseable molecule
                                batch_classes.append(0)
                            else:
                                # Valid, RDKit-parseable molecule
                                batch_classes.append(1)

                        except Exception:
                            # Decoding or parsing failed - invalid
                            batch_classes.append(0)

                    classifications.extend(batch_classes)

                except Exception as e:
                    # If decoding fails, all points in batch are invalid
                    classifications.extend([0] * len(batch_points))

        return np.array(classifications)
|
| 436 |
+
|
| 437 |
+
def plot_latent_space_interpolation(self, grid_size=100, z_range=4, save_path=None):
    """
    Plot chemical-validity maps of the latent space (2x4 panel figure).

    Top row: four 2D slices spanned by latent dimension pairs (0,1), (2,3),
    (4,5) and (6,7). Bottom row: the (z0, z1) slice with z2 fixed at -2, -1,
    1 and 2. Every grid point inside the circular boundary is decoded to
    SMILES and colored green when RDKit can parse the result, red otherwise.

    Args:
        grid_size: number of grid points per axis.
        z_range: half-width of the plotted latent range; panels span
            [-z_range, z_range] on both axes.
        save_path: optional file path; when given the figure is also saved.
    """
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    # Red for invalid molecules, green for valid molecules.
    colors = ['#FF4444', '#44AA44']
    cmap = ListedColormap(colors)

    # Reference circle radii drawn on every panel (only those <= z_range).
    # Hoisted out of the first loop: the original defined this inside the
    # top-row loop and relied on the leaked variable in the bottom-row loop.
    circles = [1, 2, 3, 4]

    plot_idx = 0

    # First row: different dimension pairs.
    for dim1, dim2 in [(0, 1), (2, 3), (4, 5), (6, 7)]:
        valid_points, valid_coords = self._masked_grid_points(grid_size, z_range)

        print(f"Classifying latent space chemical validity for dimensions {dim1}, {dim2}...")
        classifications = self.classify_latent_points(valid_points, dim1, dim2)

        self._draw_validity_panel(
            axes[plot_idx], classifications, valid_coords, grid_size, z_range,
            cmap, circles,
            xlabel=f'Latent dimension z{dim1}',
            ylabel=f'Latent dimension z{dim2}',
            title='SMILES')
        plot_idx += 1

    # Second row: fix z2 at several values and sweep z0/z1.
    for z2_val in [-2, -1, 1, 2]:
        dim1, dim2 = 0, 1  # use z0 and z1 for x, y
        valid_points, valid_coords = self._masked_grid_points(grid_size, z_range)

        print(f"Classifying latent space chemical validity for z0, z1 with z2 = {z2_val}...")
        classifications = self.classify_latent_points(
            valid_points, dim1, dim2, additional_dim={2: z2_val})

        self._draw_validity_panel(
            axes[plot_idx], classifications, valid_coords, grid_size, z_range,
            cmap, circles,
            xlabel='Latent dimension z0',
            ylabel='Latent dimension z1',
            title=f'SMILES; z2 = {z2_val}')
        plot_idx += 1

    plt.suptitle(f'Latent Space Chemical Validity - {self.tokenizer.name}\n(Red: Invalid molecules, Green: Valid molecules)', fontsize=16)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

def _masked_grid_points(self, grid_size, z_range):
    """Return (points, coords): grid points inside the circular mask and their (i, j) indices.

    points is an (N, 2) float array of (x, y) latent coordinates; coords is a
    parallel list of [i, j] grid indices used to scatter classifications back
    onto the image grid.
    """
    xx, yy, mask = self.create_grid_latent_points(grid_size, z_range)
    points = []
    coords = []
    for i in range(grid_size):
        for j in range(grid_size):
            if mask[i, j]:
                points.append([xx[i, j], yy[i, j]])
                coords.append([i, j])
    return np.array(points), coords

def _draw_validity_panel(self, ax, classifications, valid_coords, grid_size, z_range,
                         cmap, circles, xlabel, ylabel, title):
    """Render one validity map onto *ax*: class-grid image plus reference circles."""
    # Cells outside the circular mask stay NaN so imshow leaves them blank.
    class_grid = np.full((grid_size, grid_size), np.nan)
    for point_idx, (i, j) in enumerate(valid_coords):
        class_grid[i, j] = classifications[point_idx]

    ax.imshow(class_grid, extent=[-z_range, z_range, -z_range, z_range],
              origin='lower', cmap=cmap, alpha=0.8, vmin=0, vmax=1)

    # Concentric guide circles for reading off latent radius.
    for radius in circles:
        if radius <= z_range:
            circle = plt.Circle((0, 0), radius, fill=False, color='black',
                                alpha=0.3, linewidth=0.5)
            ax.add_patch(circle)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xlim(-z_range, z_range)
    ax.set_ylim(-z_range, z_range)
    ax.set_aspect('equal')
|
| 566 |
+
|
| 567 |
+
def plot_molecule_embeddings(self, smiles_list, method='tsne', save_path=None):
    """Project molecule embeddings to 2D and scatter-plot them.

    Uses t-SNE when method == 'tsne', PCA otherwise. Points are colored by a
    simple binary label: molecular weight above 200 (unparseable SMILES get 0).
    """
    print(f"Encoding {len(smiles_list)} molecules...")
    embeddings = self.encode_molecules(smiles_list)

    # Binary label per molecule: 1 when MW > 200, else 0.
    labels = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            labels.append(0)
            continue
        labels.append(1 if Chem.Descriptors.MolWt(mol) > 200 else 0)
    labels = np.array(labels)

    # Dimensionality reduction to 2D.
    print(f"Computing {method.upper()} projection...")
    if method == 'tsne':
        reducer = TSNE(n_components=2, random_state=42,
                       perplexity=min(30, len(smiles_list) // 4))
    else:
        reducer = PCA(n_components=2, random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)

    # Scatter plot colored by the MW label.
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                          c=labels, cmap='RdYlGn', alpha=0.7, s=20)
    plt.colorbar(scatter, label='Molecular Weight > 200')
    plt.title(f'{method.upper()} of Molecule Embeddings - {self.tokenizer.name}')
    plt.xlabel(f'{method.upper()} 1')
    plt.ylabel(f'{method.upper()} 2')

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()
|
| 607 |
+
|
| 608 |
+
def load_data_and_tokenizers():
    """Load SMILES data and construct the tokenizer wrappers to benchmark.

    Returns:
        (valid_smiles, tokenizers): a list of RDKit-parseable SMILES strings
        and a list of successfully constructed TokenizerWrapper instances.
        Returns (None, None) when the data file is missing.

    Raises:
        ValueError: if the CSV lacks a 'SMILES' column.
    """
    # NOTE(review): path is relative to the working directory — confirm it
    # points at where sample_all_8k_smi.csv actually lives in this repo.
    data_path = "../data/sample_all_8k_smi.csv"
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        print("Please update the data_path in the script.")
        return None, None

    df = pd.read_csv(data_path)
    if 'SMILES' not in df.columns:
        raise ValueError("Expected column 'SMILES' in CSV")

    smiles_list = df['SMILES'].dropna().tolist()

    # Keep only SMILES that RDKit can parse.
    valid_smiles = [s for s in smiles_list if Chem.MolFromSmiles(s) is not None]
    print(f"Loaded {len(valid_smiles)} valid SMILES")

    # Each tokenizer is optional: a load failure is reported and skipped
    # rather than aborting the whole run.
    tokenizer1 = None
    try:
        tok1_hf = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
        tokenizer1 = TokenizerWrapper(tok1_hf, name="ChemBERTa",
                                      bos_token="<s>", eos_token="</s>",
                                      pad_token="<pad>", unk_token="<unk>")
    except Exception as e:
        print(f"Failed to load ChemBERTa tokenizer: {e}")

    tokenizer2 = None
    try:
        tok2_fast = FastChemTokenizer.from_pretrained("../smitok")
        tokenizer2 = TokenizerWrapper(tok2_fast, name="FastChemTokenizer",
                                      bos_token="[BOS]", eos_token="[EOS]",
                                      pad_token="[PAD]", unk_token="[UNK]")
    except Exception as e:
        print(f"Failed to load FastChemTokenizer: {e}")

    tokenizers = [t for t in (tokenizer1, tokenizer2) if t is not None]
    return valid_smiles, tokenizers
|
| 653 |
+
|
| 654 |
+
def create_latent_visualizations():
    """Main function to create latent space visualizations.

    Loads data and tokenizers, then for each tokenizer with a trained model
    checkpoint produces the latent-interpolation figure and t-SNE/PCA
    embedding plots under 'latent_space_plots/'.
    """
    smiles_list, tokenizers = load_data_and_tokenizers()
    if smiles_list is None or not tokenizers:
        print("Failed to load data or tokenizers. Please check your setup.")
        return

    # A subset keeps the embedding plots fast.
    viz_smiles = smiles_list[:1000]

    # Checkpoint locations keyed by tokenizer name.
    model_paths = {
        'ChemBERTa': './checkpoints/ChemBERTa/best_model_ChemBERTa.pt',
        'FastChemTokenizer': './checkpoints/FastChemTokenizer/best_model_FastChemTokenizer.pt'
    }

    os.makedirs('latent_space_plots', exist_ok=True)

    banner = '=' * 60
    for tokenizer in tokenizers:
        model_path = model_paths.get(tokenizer.name)
        if model_path is None or not os.path.exists(model_path):
            print(f"Model not found for {tokenizer.name}: {model_path}")
            continue

        print(f"\n{banner}")
        print(f"Creating visualizations for {tokenizer.name}")
        print(f"{banner}")

        try:
            visualizer = LatentSpaceVisualizer(model_path, tokenizer, device)

            print("Creating latent space interpolation plots...")
            visualizer.plot_latent_space_interpolation(
                save_path=f'latent_space_plots/{tokenizer.name}_latent_interpolation.png')

            print("Creating t-SNE embedding plot...")
            visualizer.plot_molecule_embeddings(
                viz_smiles, method='tsne',
                save_path=f'latent_space_plots/{tokenizer.name}_embeddings_tsne.png')

            print("Creating PCA embedding plot...")
            visualizer.plot_molecule_embeddings(
                viz_smiles, method='pca',
                save_path=f'latent_space_plots/{tokenizer.name}_embeddings_pca.png')

        except Exception as e:
            # Keep going with the remaining tokenizers on failure.
            print(f"Error processing {tokenizer.name}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{banner}")
    print("Visualization complete! Check the 'latent_space_plots' directory for results.")
    print(f"{banner}")
|
| 712 |
+
|
| 713 |
+
if __name__ == "__main__":
    # Import RDKit descriptors for molecular property calculation.
    # These are optional: on ImportError the names are set to None and the
    # rest of the script falls back to simpler classification.
    try:
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        print("RDKit Descriptors not available. Using simpler classification.")
        # Fallback to simple classification if descriptors not available
        Descriptors = None
        rdMolDescriptors = None

    create_latent_visualizations()
|
benchmark/sample_all_8k_smi.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
latent_space_plots/ChemBERTa_latent_interpolation.png
ADDED
|
Git LFS Details
|
latent_space_plots/FastChemTokenizerHF_latent_interpolation.png
ADDED
|
Git LFS Details
|