# %% Cell 1: imports
from rdkit import Chem
import numpy as np  # FIX: was missing — add_extra_data_clean uses np.nan
import pandas as pd
from tqdm import tqdm

# %% Cell 2: load, clean, and extend the competition data
BASE_PATH = '/'
TARGETS = ['FFV']


def get_canonical_smiles(smiles):
    """Return the RDKit-canonical form of `smiles`.

    Falls back to returning the input string unchanged when RDKit cannot
    parse it (best-effort canonicalization, never raises).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:  # FIX: was a bare except; RDKit may raise on exotic input
        pass
    return smiles


print("šŸ“‚ Loading competition data...")
train = pd.read_csv(BASE_PATH + 'train.csv')
test = pd.read_csv(BASE_PATH + 'test.csv')

print(f"   Training samples: {len(train)}")
print(f"   Test samples: {len(test)}")


def clean_and_validate_smiles(smiles):
    """Validate/canonicalize a SMILES string.

    Rejects polymer R-group placeholder notation (e.g. '[R1]') and anything
    RDKit cannot parse. Returns the canonical SMILES on success, or None so
    callers can drop invalid rows with `.notnull()`.
    """
    if not isinstance(smiles, str) or len(smiles) == 0:
        return None

    # Placeholder R-group tokens mark polymer repeat-unit attachment points
    # and are not chemically-valid SMILES — drop these rows outright.
    bad_patterns = [
        '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',
        "[R']", '[R\"]', 'R1', 'R2', 'R3', 'R4', 'R5',
        '([R])', '([R1])', '([R2])',
    ]
    for pattern in bad_patterns:
        if pattern in smiles:
            return None

    # Catch adjacent-bracket fragments that still carry an R-group marker.
    if '][' in smiles and any(x in smiles for x in ['[R', 'R]']):
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
        return None
    except Exception:
        # FIX: was a bare except; also removed the unreachable trailing
        # `return smiles` the original had after this try/except.
        return None


print("šŸ”„ Cleaning and validating SMILES...")
train['SMILES'] = train['SMILES'].apply(clean_and_validate_smiles)
test['SMILES'] = test['SMILES'].apply(clean_and_validate_smiles)

invalid_train = train['SMILES'].isnull().sum()
invalid_test = test['SMILES'].isnull().sum()

print(f"   Removed {invalid_train} invalid SMILES from training data")
print(f"   Removed {invalid_test} invalid SMILES from test data")

train = train[train['SMILES'].notnull()].reset_index(drop=True)
test = test[test['SMILES'].notnull()].reset_index(drop=True)

print(f"   Final training samples: {len(train)}")
print(f"   Final test samples: {len(test)}")


def add_extra_data_clean(df_train, df_extra, target):
    """Merge an external dataset into `df_train` for one target column.

    - Cleans/validates the external SMILES and drops rows missing `target`.
    - Averages duplicate SMILES in the external data.
    - Fills missing `target` values in `df_train` for SMILES present in both.
    - Appends external-only SMILES as new rows (other targets set to NaN).

    Returns the extended training DataFrame.
    """
    n_samples_before = len(df_train[df_train[target].notnull()])

    print(f"   Processing {len(df_extra)} {target} samples...")

    # FIX: work on a copy so the caller's DataFrame is not mutated in place.
    df_extra = df_extra.copy()
    df_extra['SMILES'] = df_extra['SMILES'].apply(clean_and_validate_smiles)

    before_filter = len(df_extra)
    df_extra = df_extra[df_extra['SMILES'].notnull()]
    df_extra = df_extra.dropna(subset=[target])
    after_filter = len(df_extra)

    print(f"   Kept {after_filter}/{before_filter} valid samples")

    if len(df_extra) == 0:
        print(f"   No valid data remaining for {target}")
        return df_train

    # Duplicate SMILES in the external set are averaged into one value.
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()

    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # FIX: vectorized fill replaces the original O(n*m) per-SMILES loop of
    # boolean-mask lookups; behavior is the same (fill only null targets
    # whose SMILES appears in the external data).
    extra_map = df_extra.set_index('SMILES')[target]
    fill_mask = df_train[target].isnull() & df_train['SMILES'].isin(cross_smiles)
    df_train.loc[fill_mask, target] = df_train.loc[fill_mask, 'SMILES'].map(extra_map)
    filled_count = int(fill_mask.sum())  # kept for parity; not printed below

    extra_to_add = df_extra[df_extra['SMILES'].isin(unique_smiles_extra)].copy()
    if len(extra_to_add) > 0:
        # New rows carry NaN for every target the external set lacks.
        for col in TARGETS:
            if col not in extra_to_add.columns:
                extra_to_add[col] = np.nan

        extra_to_add = extra_to_add[['SMILES'] + TARGETS]
        df_train = pd.concat([df_train, extra_to_add], axis=0, ignore_index=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'   {target}: +{n_samples_after-n_samples_before} samples, +{len(unique_smiles_extra)} unique SMILES')
    return df_train


print("\nšŸ“‚ Loading external datasets...")

external_datasets = []


def safe_load_dataset(path, target, processor_func, description):
    """Load one external dataset (csv or xlsx), post-process it, and register
    it in `external_datasets` as (target, data). Returns True on success;
    logs and returns False on any failure (best-effort loading)."""
    try:
        if path.endswith('.xlsx'):
            data = pd.read_excel(path)
        else:
            data = pd.read_csv(path)

        data = processor_func(data)
        external_datasets.append((target, data))
        print(f"   āœ… {description}: {len(data)} samples")
        return True
    except Exception as e:
        print(f"   āš ļø {description} failed: {str(e)[:100]}")
        return False


# Link: https://www.kaggle.com/competitions/neurips-open-polymer-prediction-2025/data
safe_load_dataset(
    '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv',
    'FFV',
    lambda df: df[['SMILES', 'FFV']] if 'FFV' in df.columns else df,
    'NeurIPS FFV data'
)

print("\nšŸ”„ Integrating external data...")
train_extended = train[['SMILES'] + TARGETS].copy()

for target, dataset in external_datasets:
    print(f"   Processing {target} data...")
    train_extended = add_extra_data_clean(train_extended, dataset, target)

print(f"\nšŸ“Š Final training data:")
print(f"   Original samples: {len(train)}")
print(f"   Extended samples: {len(train_extended)}")
print(f"   Gain: +{len(train_extended) - len(train)} samples")

for target in TARGETS:
    count = train_extended[target].notna().sum()
    original_count = train[target].notna().sum() if target in train.columns else 0
    gain = count - original_count
    print(f"   {target}: {count:,} samples (+{gain})")

train_df = train_extended
print(f"\nāœ… Data integration complete with clean SMILES!")
# %% Cell 3 (tail): run the cells above after downloading the datasets,
# or just uncomment the line below to load a pre-built unaugmented dataset.
# train_df = pd.read_csv("FFV_Unaugmented.csv")


# %% Cell 4: SMILES augmentation
def augment_smiles_dataset(train_df, label, num_augments=1):
    """Augment a SMILES dataset with randomized (non-canonical) enumerations.

    For each parseable row, keeps the original SMILES and appends
    `num_augments` random SMILES renderings of the same molecule, all
    sharing the row's `label` value. Unparseable SMILES are skipped and
    counted.

    Parameters
    ----------
    train_df : pd.DataFrame with at least 'SMILES' and `label` columns.
    label : str, name of the target column to carry through.
    num_augments : int, number of random SMILES per molecule (default 1).

    Returns
    -------
    pd.DataFrame with columns ['SMILES', label].
    """
    augmented = {'SMILES': []}
    augmented[label] = []
    failed = 0

    for _, row in tqdm(train_df.iterrows(), desc="šŸ”¬ Data Augmentation", total=len(train_df)):
        smiles = row['SMILES']
        target = row[label]

        # Skip rows RDKit cannot parse rather than crashing the run.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            failed += 1
            continue

        # Always keep the original SMILES/target pair.
        augmented['SMILES'].append(smiles)
        augmented[label].append(target)

        # NOTE(review): doRandom=True is unseeded, so augmentation is not
        # reproducible run-to-run, and a random rendering may duplicate the
        # original — consider seeding RDKit or de-duplicating downstream.
        for _ in range(num_augments):
            rand_smiles = Chem.MolToSmiles(mol, doRandom=True)
            augmented['SMILES'].append(rand_smiles)
            augmented[label].append(target)

    print(f"No. of Failed SMILES: {failed}")
    return pd.DataFrame(augmented)


# %% Cell 5: augment the FFV data and write it out
train_df = augment_smiles_dataset(train_df, 'FFV')

# FIX: removed the no-op `.drop(columns=[])` the original chained in here.
ffv = train_df[train_df['FFV'].notnull()].reset_index(drop=True)
ffv.to_csv('FFV.csv', index=False)