Spaces:
Sleeping
Sleeping
File size: 3,539 Bytes
f3b11f9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | import pandas as pd
import os
from sklearn.model_selection import train_test_split
import utils.file as uf
import configuration.config_default as cfgd
import preprocess.property_change_encoder as pce
# Fixed random seed; split_data passes it as random_state to train_test_split
# so that repeated runs produce the same partition.
SEED = 42
# SPLIT_RATIO = 0.8  (kept for reference; split_data receives the ratio as an argument)
def get_smiles_list(file_name):
    """
    Collect the distinct SMILES strings of a transformations file,
    for building the vocabulary.

    :param file_name: path of a comma-separated file containing the columns
        constantSMILES, fromVarSMILES and toVarSMILES
    :return: array of the unique values found across those three columns
    """
    smiles_columns = ['constantSMILES', 'fromVarSMILES', 'toVarSMILES']

    table = pd.read_csv(file_name, sep=",")
    print("Read %s file" % file_name)

    # ravel('K') flattens the 2-D value array into 1-D before de-duplication
    flattened = table[smiles_columns].values.ravel('K')
    unique_smiles = pd.unique(flattened)

    print("Number of SMILES in chemical transformations: %d" % len(unique_smiles))
    return unique_smiles
def split_data(input_transformations_path, SPLIT_RATIO, LOG=None):
    """
    Split the transformations file into training, validation and test sets
    and write each set to its own CSV file.

    The test set takes ``(1 - SPLIT_RATIO) / 2`` of all rows. The validation
    set then takes that same fraction of the rows that remain (not of the
    whole file). Both splits use the module-level SEED as random_state.

    The files train.csv, validation.csv and test.csv are written into the
    directory returned by ``uf.get_parent_dir(input_transformations_path)``.

    :param input_transformations_path: path of the comma-separated input file
    :param SPLIT_RATIO: ratio from which the hold-out fraction
        ``(1 - SPLIT_RATIO) / 2`` is derived
    :param LOG: optional logger; messages are emitted only when it is truthy
    :return: tuple ``(train, validation, test)`` of dataframes
    """
    frame = pd.read_csv(input_transformations_path, sep=",")
    if LOG:
        LOG.info("Read %s file" % input_transformations_path)

    # The same fraction is used for both successive splits.
    holdout_fraction = (1-SPLIT_RATIO)/2
    remainder, test_part = train_test_split(
        frame, test_size=holdout_fraction, random_state=SEED)
    train_part, validation_part = train_test_split(
        remainder, test_size=holdout_fraction, random_state=SEED)

    if LOG:
        LOG.info("Train, Validation, Test: %d, %d, %d" % (len(train_part), len(validation_part), len(test_part)))

    out_dir = uf.get_parent_dir(input_transformations_path)
    outputs = (
        ("train.csv", train_part),
        ("validation.csv", validation_part),
        ("test.csv", test_part),
    )
    for csv_name, part in outputs:
        part.to_csv(os.path.join(out_dir, csv_name), index=False)

    return train_part, validation_part, test_part
def save_df_property_encoded(file_name, property_change_encoder, LOG=None):
    """
    Encode the property-change columns of a CSV file and save the result.

    For every property in cfgd.PROPERTIES that is one of 'pki', 'qed' or
    'sa', each value of the column ``Delta_<property>`` is replaced by
    ``pce.value_in_interval(value, start_map_interval)``.

    :param file_name: path of the comma-separated input file
    :param property_change_encoder: mapping from property name to an
        ``(encoder, start_map_interval)`` pair; only ``start_map_interval``
        is used here
    :param LOG: optional logger; the output path is logged only when given
    :return: path of the written file; a trailing ``.csv`` on ``file_name``
        is replaced by ``_encoded.csv``
    """
    encoded_properties = ('pki', 'qed', 'sa')

    data = pd.read_csv(file_name, sep=",")
    for property_name in cfgd.PROPERTIES:
        if property_name not in encoded_properties:
            continue
        # The pair is still unpacked so a malformed entry fails loudly.
        _encoder, start_map_interval = property_change_encoder[property_name]
        column = 'Delta_{}'.format(property_name)
        # Only the callable is passed to Series.apply. The encoder used to be
        # passed as a second positional argument, where pandas read it as
        # convert_dtype/args; it never reached value_in_interval.
        data[column] = data[column].apply(
            lambda x, interval=start_map_interval: pce.value_in_interval(x, interval))

    # Strip only a trailing ".csv" so a ".csv" earlier in the path (e.g. in a
    # directory name) does not truncate the output location.
    suffix = '.csv'
    if file_name.endswith(suffix):
        stem = file_name[:-len(suffix)]
    else:
        stem = file_name.split(suffix)[0]
    output_file = stem + '_encoded.csv'

    # LOG defaults to None, so guard the call as split_data does.
    if LOG:
        LOG.info("Saving encoded property change to file: {}".format(output_file))
    data.to_csv(output_file, index=False)
    return output_file
def prop_change(source, target, threshold):
    """
    Classify how a property moves relative to a threshold.

    A value counts as "low" when it is ``<= threshold`` and as "high" when it
    is ``> threshold``.

    :param source: property value before the change
    :param target: property value after the change
    :param threshold: boundary between low and high
    :return: "low->high", "high->low" or "no_change"; None when the values
        cannot be ordered against the threshold (e.g. NaN)
    """
    source_low = source <= threshold
    source_high = source > threshold
    target_low = target <= threshold
    target_high = target > threshold

    if source_low and target_high:
        return "low->high"
    if source_high and target_low:
        return "high->low"
    if (source_low and target_low) or (source_high and target_high):
        return "no_change"
    return None
|