import gzip
from pathlib import Path

import pandas as pd
| |
|
def load_csv(file_path) -> pd.DataFrame:
    """Read an uncompressed CSV file into a DataFrame.

    Args:
        file_path: Path (``str`` or ``Path``) of the CSV file to read.

    Returns:
        The parsed table, using pandas' default ``read_csv`` settings.
    """
    return pd.read_csv(file_path)
| |
|
def load_gzip_csv(file_path) -> pd.DataFrame:
    """Read a gzip-compressed CSV file into a DataFrame.

    Args:
        file_path: Path (``str`` or ``Path``) of the ``.csv.gz`` file to read.

    Returns:
        The parsed table, using pandas' default ``read_csv`` settings.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the text wrapper falls
    # back to the locale's preferred encoding, which can differ per machine.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)
| |
|
def save_gzip_csv(df, file_path) -> None:
    """Write a DataFrame to a gzip-compressed CSV file.

    The index is not written, so only the DataFrame's columns appear in the
    output.

    Args:
        df: DataFrame to serialize.
        file_path: Destination path (``str`` or ``Path``) of the ``.csv.gz``
            file.
    """
    df.to_csv(file_path, index=False, compression='gzip')
| |
|
def process_and_save_property(df, property_name, output_folder) -> None:
    """Extract one property column and save it as ``<short_name>.csv.gz``.

    Keeps the ``smiles`` column and ``property_name``, drops rows where either
    is missing, renames the property column to its short name (every
    ``'mol_'`` and ``'plym_'`` substring removed), writes the result with
    ``save_gzip_csv`` and prints a one-line summary.

    Args:
        df: DataFrame containing a ``smiles`` column and ``property_name``.
        property_name: Name of the property column to extract.
        output_folder: Directory (``str`` or ``Path``) that receives the
            output file; it is created if it does not exist.

    Raises:
        KeyError: If ``smiles`` or ``property_name`` is not a column of ``df``.
    """
    # Computed once; it names both the renamed column and the output file.
    short_name = property_name.replace('mol_', '').replace('plym_', '')

    property_df = df[['smiles', property_name]].dropna()
    property_df = property_df.rename(columns={property_name: short_name})

    # Accept plain strings as well as Path objects, and make sure the target
    # directory exists before writing into it.
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    output_path = output_folder / f"{short_name}.csv.gz"
    save_gzip_csv(property_df, output_path)
    print(f"Saved {len(property_df)} rows for {property_name} to {output_path}")
| |
|
def main(pretrain_qa_folder=None, pretrain_path=None, output_folder=None) -> None:
    """Merge the train, test and pretrain tables and export one file per property.

    Loads ``train_df.csv`` and ``test_df.csv`` from ``pretrain_qa_folder`` and
    the gzip-compressed CSV at ``pretrain_path``, concatenates them in that
    order, keeps the first row for each ``smiles`` value, and writes one
    ``.csv.gz`` file per property into ``output_folder``.

    Args:
        pretrain_qa_folder: Directory holding ``train_df.csv`` and
            ``test_df.csv``. Defaults to the original hard-coded location.
        pretrain_path: Path of the gzip-compressed pretrain CSV. Defaults to
            the original hard-coded location.
        output_folder: Directory that receives the per-property files.
            Defaults to the current working directory.
    """
    if pretrain_qa_folder is None:
        pretrain_qa_folder = '/dccstor/graph-design2/liugang/2_model_prepared/step1_graph_dit/data/preprocess/pretrain_qa'
    if pretrain_path is None:
        pretrain_path = '/dccstor/graph-design2/liugang/2_model_prepared/step1_graph_dit/data/raw/pretrain.csv.gz'
    if output_folder is None:
        output_folder = '.'

    pretrain_qa_folder = Path(pretrain_qa_folder)
    pretrain_path = Path(pretrain_path)
    output_folder = Path(output_folder)

    train_df = load_csv(pretrain_qa_folder / 'train_df.csv')
    test_df = load_csv(pretrain_qa_folder / 'test_df.csv')
    pretrain_df = load_gzip_csv(pretrain_path)

    # Order matters: with keep='first' below, train rows win over test rows,
    # which win over pretrain rows, whenever a SMILES string is repeated.
    all_data = pd.concat([train_df, test_df, pretrain_df], ignore_index=True)

    # NOTE(review): de-duplication runs before the per-property dropna, so if
    # the kept row for a SMILES has no value for a property while a later
    # duplicate does, that value is discarded — confirm this is intended.
    all_data = all_data.drop_duplicates(subset='smiles', keep='first')

    properties = ['mol_BBBP', 'mol_HIV', 'mol_BACE', 'plym_CO2', 'plym_N2', 'plym_O2', 'plym_FFV', 'plym_TC']

    for prop in properties:
        process_and_save_property(all_data, prop, output_folder)
| |
|
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()