# %% Cell 1: imports
from rdkit import Chem
import numpy as np  # FIX: was missing — add_extra_data_clean uses np.nan
import pandas as pd
from tqdm import tqdm

# %% Cell 2: load, clean, and extend the competition data
BASE_PATH = '/'
TARGETS = ['FFV']


def get_canonical_smiles(smiles):
    """Return the RDKit-canonical form of `smiles`.

    Falls back to returning the input string unchanged when RDKit cannot
    parse it (best-effort canonicalization, never raises).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:  # FIX: was a bare except; RDKit may raise on exotic input
        pass
    return smiles


print("šŸ“‚ Loading competition data...")
train = pd.read_csv(BASE_PATH + 'train.csv')
test = pd.read_csv(BASE_PATH + 'test.csv')

print(f"   Training samples: {len(train)}")
print(f"   Test samples: {len(test)}")


def clean_and_validate_smiles(smiles):
    """Validate/canonicalize a SMILES string.

    Rejects polymer R-group placeholder notation (e.g. '[R1]') and anything
    RDKit cannot parse. Returns the canonical SMILES on success, or None so
    callers can drop invalid rows with `.notnull()`.
    """
    if not isinstance(smiles, str) or len(smiles) == 0:
        return None

    # Placeholder R-group tokens mark polymer repeat-unit attachment points
    # and are not chemically-valid SMILES — drop these rows outright.
    bad_patterns = [
        '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',
        "[R']", '[R\"]', 'R1', 'R2', 'R3', 'R4', 'R5',
        '([R])', '([R1])', '([R2])',
    ]
    for pattern in bad_patterns:
        if pattern in smiles:
            return None

    # Catch adjacent-bracket fragments that still carry an R-group marker.
    if '][' in smiles and any(x in smiles for x in ['[R', 'R]']):
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
        return None
    except Exception:
        # FIX: was a bare except; also removed the unreachable trailing
        # `return smiles` the original had after this try/except.
        return None


print("šŸ”„ Cleaning and validating SMILES...")
train['SMILES'] = train['SMILES'].apply(clean_and_validate_smiles)
test['SMILES'] = test['SMILES'].apply(clean_and_validate_smiles)

invalid_train = train['SMILES'].isnull().sum()
invalid_test = test['SMILES'].isnull().sum()

print(f"   Removed {invalid_train} invalid SMILES from training data")
print(f"   Removed {invalid_test} invalid SMILES from test data")

train = train[train['SMILES'].notnull()].reset_index(drop=True)
test = test[test['SMILES'].notnull()].reset_index(drop=True)

print(f"   Final training samples: {len(train)}")
print(f"   Final test samples: {len(test)}")


def add_extra_data_clean(df_train, df_extra, target):
    """Merge an external dataset into `df_train` for one target column.

    - Cleans/validates the external SMILES and drops rows missing `target`.
    - Averages duplicate SMILES in the external data.
    - Fills missing `target` values in `df_train` for SMILES present in both.
    - Appends external-only SMILES as new rows (other targets set to NaN).

    Returns the extended training DataFrame.
    """
    n_samples_before = len(df_train[df_train[target].notnull()])

    print(f"   Processing {len(df_extra)} {target} samples...")

    # FIX: work on a copy so the caller's DataFrame is not mutated in place.
    df_extra = df_extra.copy()
    df_extra['SMILES'] = df_extra['SMILES'].apply(clean_and_validate_smiles)

    before_filter = len(df_extra)
    df_extra = df_extra[df_extra['SMILES'].notnull()]
    df_extra = df_extra.dropna(subset=[target])
    after_filter = len(df_extra)

    print(f"   Kept {after_filter}/{before_filter} valid samples")

    if len(df_extra) == 0:
        print(f"   No valid data remaining for {target}")
        return df_train

    # Duplicate SMILES in the external set are averaged into one value.
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()

    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # FIX: vectorized fill replaces the original O(n*m) per-SMILES loop of
    # boolean-mask lookups; behavior is the same (fill only null targets
    # whose SMILES appears in the external data).
    extra_map = df_extra.set_index('SMILES')[target]
    fill_mask = df_train[target].isnull() & df_train['SMILES'].isin(cross_smiles)
    df_train.loc[fill_mask, target] = df_train.loc[fill_mask, 'SMILES'].map(extra_map)
    filled_count = int(fill_mask.sum())  # kept for parity; not printed below

    extra_to_add = df_extra[df_extra['SMILES'].isin(unique_smiles_extra)].copy()
    if len(extra_to_add) > 0:
        # New rows carry NaN for every target the external set lacks.
        for col in TARGETS:
            if col not in extra_to_add.columns:
                extra_to_add[col] = np.nan

        extra_to_add = extra_to_add[['SMILES'] + TARGETS]
        df_train = pd.concat([df_train, extra_to_add], axis=0, ignore_index=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'   {target}: +{n_samples_after-n_samples_before} samples, +{len(unique_smiles_extra)} unique SMILES')
    return df_train


print("\nšŸ“‚ Loading external datasets...")

external_datasets = []


def safe_load_dataset(path, target, processor_func, description):
    """Load one external dataset (csv or xlsx), post-process it, and register
    it in `external_datasets` as (target, data). Returns True on success;
    logs and returns False on any failure (best-effort loading)."""
    try:
        if path.endswith('.xlsx'):
            data = pd.read_excel(path)
        else:
            data = pd.read_csv(path)

        data = processor_func(data)
        external_datasets.append((target, data))
        print(f"   āœ… {description}: {len(data)} samples")
        return True
    except Exception as e:
        print(f"   āš ļø {description} failed: {str(e)[:100]}")
        return False


# Link: https://www.kaggle.com/competitions/neurips-open-polymer-prediction-2025/data
safe_load_dataset(
    '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv',
    'FFV',
    lambda df: df[['SMILES', 'FFV']] if 'FFV' in df.columns else df,
    'NeurIPS FFV data'
)

print("\nšŸ”„ Integrating external data...")
train_extended = train[['SMILES'] + TARGETS].copy()

for target, dataset in external_datasets:
    print(f"   Processing {target} data...")
    train_extended = add_extra_data_clean(train_extended, dataset, target)

print(f"\nšŸ“Š Final training data:")
print(f"   Original samples: {len(train)}")
print(f"   Extended samples: {len(train_extended)}")
print(f"   Gain: +{len(train_extended) - len(train)} samples")

for target in TARGETS:
    count = train_extended[target].notna().sum()
    original_count = train[target].notna().sum() if target in train.columns else 0
    gain = count - original_count
    print(f"   {target}: {count:,} samples (+{gain})")

train_df = train_extended
print(f"\nāœ… Data integration complete with clean SMILES!")
# %% Cell 3 (tail): run the cells above after downloading the datasets,
# or just uncomment the line below to load a pre-built unaugmented dataset.
# train_df = pd.read_csv("FFV_Unaugmented.csv")


# %% Cell 4: SMILES augmentation
def augment_smiles_dataset(train_df, label, num_augments=1):
    """Augment a SMILES dataset with randomized (non-canonical) enumerations.

    For each parseable row, keeps the original SMILES and appends
    `num_augments` random SMILES renderings of the same molecule, all
    sharing the row's `label` value. Unparseable SMILES are skipped and
    counted.

    Parameters
    ----------
    train_df : pd.DataFrame with at least 'SMILES' and `label` columns.
    label : str, name of the target column to carry through.
    num_augments : int, number of random SMILES per molecule (default 1).

    Returns
    -------
    pd.DataFrame with columns ['SMILES', label].
    """
    augmented = {'SMILES': []}
    augmented[label] = []
    failed = 0

    for _, row in tqdm(train_df.iterrows(), desc="šŸ”¬ Data Augmentation", total=len(train_df)):
        smiles = row['SMILES']
        target = row[label]

        # Skip rows RDKit cannot parse rather than crashing the run.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            failed += 1
            continue

        # Always keep the original SMILES/target pair.
        augmented['SMILES'].append(smiles)
        augmented[label].append(target)

        # NOTE(review): doRandom=True is unseeded, so augmentation is not
        # reproducible run-to-run, and a random rendering may duplicate the
        # original — consider seeding RDKit or de-duplicating downstream.
        for _ in range(num_augments):
            rand_smiles = Chem.MolToSmiles(mol, doRandom=True)
            augmented['SMILES'].append(rand_smiles)
            augmented[label].append(target)

    print(f"No. of Failed SMILES: {failed}")
    return pd.DataFrame(augmented)


# %% Cell 5: augment the FFV data and write it out
train_df = augment_smiles_dataset(train_df, 'FFV')

# FIX: removed the no-op `.drop(columns=[])` the original chained in here.
ffv = train_df[train_df['FFV'].notnull()].reset_index(drop=True)
ffv.to_csv('FFV.csv', index=False)