{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from transformers import PreTrainedModel, AutoConfig, BertModel, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer\n", "import pandas as pd\n", "import torch\n", "import os\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from tqdm import tqdm\n", "import joblib\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('/home/jovyan/simson_training_bolgov/kaggle_comp/train.csv')\n", "\n", "targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idSMILESTgFFVTcDensityRg
04.215886e+08*C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(Oc4ccc(N5...NaN0.376767NaNNaNNaN
17.984549e+08*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...NaN0.346993NaNNaNNaN
2NaN*CC/C=C(/*)CNaNNaNNaNNaNNaN
3NaN*CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc(OC)cc2)...NaNNaNNaNNaNNaN
4NaN*Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)c...NaNNaNNaNNaNNaN
........................
169582.389975e+08*OC(=O)Oc1ccc(S(=O)(=O)c2ccc(OC(=O)OC3CC4CC(*)...NaN0.339596NaNNaNNaN
16959NaN*c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N5C(=O)c6c...NaNNaNNaNNaNNaN
16960NaN*OC(F)(F)COC(=O)c1cc(OCCCCC)cc(C(=O)OCC(*)(F)F)c1NaNNaNNaNNaNNaN
169611.973417e+09*C=CC1CC(*)C2C(=O)N(c3ccc(F)cc3)C(=O)C12NaN0.374710NaNNaNNaN
16962NaN*/C=C/[Ge](/C=C/[Si](*)(c1ccccc1)c1ccccc1)(c1c...NaNNaNNaNNaNNaN
\n", "

16963 rows × 7 columns

\n", "
" ], "text/plain": [ " id SMILES Tg \\\n", "0 4.215886e+08 *C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(Oc4ccc(N5... NaN \n", "1 7.984549e+08 *c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C... NaN \n", "2 NaN *CC/C=C(/*)C NaN \n", "3 NaN *CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc(OC)cc2)... NaN \n", "4 NaN *Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)c... NaN \n", "... ... ... .. \n", "16958 2.389975e+08 *OC(=O)Oc1ccc(S(=O)(=O)c2ccc(OC(=O)OC3CC4CC(*)... NaN \n", "16959 NaN *c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N5C(=O)c6c... NaN \n", "16960 NaN *OC(F)(F)COC(=O)c1cc(OCCCCC)cc(C(=O)OCC(*)(F)F)c1 NaN \n", "16961 1.973417e+09 *C=CC1CC(*)C2C(=O)N(c3ccc(F)cc3)C(=O)C12 NaN \n", "16962 NaN */C=C/[Ge](/C=C/[Si](*)(c1ccccc1)c1ccccc1)(c1c... NaN \n", "\n", " FFV Tc Density Rg \n", "0 0.376767 NaN NaN NaN \n", "1 0.346993 NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "... ... .. ... .. \n", "16958 0.339596 NaN NaN NaN \n", "16959 NaN NaN NaN NaN \n", "16960 NaN NaN NaN NaN \n", "16961 0.374710 NaN NaN NaN \n", "16962 NaN NaN NaN NaN \n", "\n", "[16963 rows x 7 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(1, 5):\n", " supplement_path = f'/home/jovyan/simson_training_bolgov/kaggle_comp/train_supplement/dataset{i}.csv'\n", " supplement_ds = pd.read_csv(supplement_path)\n", "\n", " if 'TC_mean' in supplement_ds.columns:\n", " supplement_ds = supplement_ds.rename(columns = {'TC_mean': 'Tc'})\n", "\n", " df = pd.concat([df, supplement_ds], axis=0)\n", "\n", "df = df.sample(frac=1).reset_index(drop=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████| 14419/14419 [00:43<00:00, 328.78it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Advanced augmentation completed:\n", "Original size: 14419, Augmented size: 168551\n", "Augmentation factor: 11.69x\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████| 2545/2545 [00:07<00:00, 333.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Advanced augmentation completed:\n", "Original size: 2545, Augmented size: 29716\n", "Augmentation factor: 11.68x\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from rdkit import Chem\n", "import random\n", "from typing import Optional, List, Union\n", "\n", "def augment_smiles_dataset(df: pd.DataFrame,\n", " smiles_column: str = 'SMILES',\n", " augmentation_strategies: List[str] = ['enumeration', 'kekulize', 'stereo_enum'],\n", " n_augmentations: int = 10,\n", " preserve_original: bool = True,\n", " random_seed: Optional[int] = None) -> pd.DataFrame:\n", " \"\"\"\n", " Advanced SMILES augmentation with multiple strategies.\n", " \n", " Parameters:\n", " -----------\n", " augmentation_strategies : List[str]\n", " List of augmentation strategies: 'enumeration', 'kekulize', 'stereo_enum'\n", " \"\"\"\n", " \n", " if random_seed is not None:\n", " random.seed(random_seed)\n", " np.random.seed(random_seed)\n", " \n", " def apply_augmentation_strategy(smiles: str, strategy: str) -> List[str]:\n", " \"\"\"Apply specific augmentation strategy\"\"\"\n", " try:\n", " mol = Chem.MolFromSmiles(smiles)\n", " if mol is None:\n", " return [smiles]\n", " \n", " augmented = []\n", " \n", " if strategy == 'enumeration':\n", " # Standard 
SMILES enumeration\n",
"                for _ in range(n_augmentations):\n",
"                    enum_smiles = Chem.MolToSmiles(mol,\n",
"                                                   canonical=False,\n",
"                                                   doRandom=True,\n",
"                                                   isomericSmiles=True)\n",
"                    augmented.append(enum_smiles)\n",
"            \n",
"            elif strategy == 'kekulize':\n",
"                # Kekulization variant (kept in a try block: Kekulize can fail)\n",
"                try:\n",
"                    Chem.Kekulize(mol)\n",
"                    kek_smiles = Chem.MolToSmiles(mol, kekuleSmiles=True)\n",
"                    augmented.append(kek_smiles)\n",
"                except Exception:\n",
"                    pass\n",
"            \n",
"            elif strategy == 'stereo_enum':\n",
"                # Stereochemistry removal is deterministic, so one variant is enough\n",
"                Chem.RemoveStereochemistry(mol)\n",
"                augmented.append(Chem.MolToSmiles(mol))\n",
"            \n",
"            return list(set(augmented))  # Remove duplicates\n",
"        \n",
"        except Exception as e:\n",
"            print(f\"Error in {strategy} for {smiles}: {e}\")\n",
"            return [smiles]\n",
"    \n",
"    augmented_rows = []\n",
"    \n",
"    for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
"        original_smiles = row[smiles_column]\n",
"        \n",
"        # Add original if requested\n",
"        if preserve_original:\n",
"            original_row = row.to_dict()\n",
"            original_row['augmentation_strategy'] = 'original'\n",
"            original_row['is_augmented'] = False\n",
"            augmented_rows.append(original_row)\n",
"        \n",
"        # Apply each augmentation strategy\n",
"        for strategy in augmentation_strategies:\n",
"            strategy_smiles = apply_augmentation_strategy(original_smiles, strategy)\n",
"            \n",
"            for aug_smiles in strategy_smiles:\n",
"                if aug_smiles != original_smiles:  # Avoid duplicating original\n",
"                    new_row = row.to_dict().copy()\n",
"                    new_row[smiles_column] = aug_smiles\n",
"                    new_row['augmentation_strategy'] = strategy\n",
"                    new_row['is_augmented'] = True\n",
"                    augmented_rows.append(new_row)\n",
"    \n",
"    augmented_df = pd.DataFrame(augmented_rows)\n",
"    augmented_df = augmented_df.reset_index(drop=True)\n",
"    \n",
"    print(\"Advanced augmentation completed:\")\n",
"    print(f\"Original size: {len(df)}, Augmented size: {len(augmented_df)}\")\n",
"    print(f\"Augmentation factor: {len(augmented_df) / len(df):.2f}x\")\n",
"    \n",
"    return augmented_df\n",
"\n",
"def create_splits(df):\n",
"    length = len(df)\n",
"    train_length = int(0.85 * length)\n",
"    # iloc is end-exclusive, so the boundary row ends up in exactly one split\n",
"    # (the previous .loc slices placed row train_length in both train and test)\n",
"    train = df.iloc[:train_length]\n",
"    test = df.iloc[train_length:]\n",
"    return train, test\n",
"\n",
"train, test = create_splits(df)\n",
"\n",
"train = train.reset_index(drop=True)\n",
"test = test.reset_index(drop=True)\n",
"\n",
"# Augment after splitting so enumerated variants of one molecule stay in one split\n",
"train = augment_smiles_dataset(train)\n",
"test = augment_smiles_dataset(test)"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
"scalers = []\n",
"\n",
"for target in targets:\n",
"    target_scaler = StandardScaler()\n",
"    train[target] = target_scaler.fit_transform(train[target].to_numpy().reshape(-1, 1))\n",
"    test[target] = target_scaler.transform(test[target].to_numpy().reshape(-1, 1))\n",
"    \n",
"    scalers.append(target_scaler)\n",
"\n",
"smiles_train = train['SMILES']\n",
"smiles_test = test['SMILES']\n",
"\n",
"labels_train = train[targets].values\n",
"labels_test = test[targets].values"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['target_scalers.pkl']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "joblib.dump(scalers, 'target_scalers.pkl')" ] },
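{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick round-trip sanity check on the persisted scalers (a sketch; it assumes `target_scalers.pkl` was just written by the cell above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import numpy as np\n",
"import joblib\n",
"\n",
"# Reload the per-target scalers and verify transform -> inverse_transform is the identity\n",
"reloaded_scalers = joblib.load('target_scalers.pkl')\n",
"probe = np.array([[0.5], [1.5]])\n",
"for scaler in reloaded_scalers:\n",
"    assert np.allclose(scaler.inverse_transform(scaler.transform(probe)), probe)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For intuition, this is what the `doRandom=True` enumeration in the augmentation cell produces: several distinct but chemically equivalent SMILES strings for one structure. The polystyrene-like repeat unit below is a hypothetical probe, not taken from the dataset:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"from rdkit import Chem\n",
"\n",
"# Hypothetical repeat unit with '*' attachment points, written in the dataset's style\n",
"probe_mol = Chem.MolFromSmiles('*CC(*)c1ccccc1')\n",
"variants = {Chem.MolToSmiles(probe_mol, canonical=False, doRandom=True) for _ in range(10)}\n",
"print(variants)"
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 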
"/tmp/ipykernel_279009/2507782815.py:68: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n" ] } ], "source": [ "from sklearn.metrics import mean_absolute_error\n", "from transformers import AutoTokenizer, BertModel\n", "import torch\n", "from torch import nn\n", "from transformers.activations import ACT2FN\n", "\n", "def global_ap(x):\n", " return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)\n", "\n", "class SimSonEncoder(nn.Module):\n", " def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):\n", " super(SimSonEncoder, self).__init__()\n", " self.config = config\n", " self.max_len = max_len\n", "\n", " self.bert = BertModel(config, add_pooling_layer=False)\n", "\n", " self.linear = nn.Linear(config.hidden_size, max_len)\n", " self.dropout = nn.Dropout(dropout)\n", "\n", " def forward(self, input_ids, attention_mask=None):\n", " if attention_mask is None:\n", " attention_mask = input_ids.ne(0)\n", " \n", " outputs = self.bert(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask\n", " )\n", "\n", " hidden_states = outputs.last_hidden_state\n", " \n", " hidden_states = self.dropout(hidden_states)\n", " \n", " pooled = global_ap(hidden_states)\n", " \n", " out = self.linear(pooled)\n", " \n", " return out\n", "\n", "\n", "class SimSonClassifier(nn.Module):\n", " def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):\n", " super(SimSonClassifier, self).__init__()\n", " self.encoder = encoder\n", " self.clf = nn.Linear(encoder.max_len, num_labels)\n", " self.relu = nn.ReLU()\n", " self.dropout = nn.Dropout(dropout)\n", "\n", " def forward(self, input_ids, attention_mask=None, labels=None):\n", " x = self.encoder(input_ids, attention_mask)\n", " x = self.relu(self.dropout(x))\n", " x = self.clf(x)\n", " return x\n", "\n", "tokenizer_path = 'DeepChem/ChemBERTa-77M-MTR'\n", "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n", "\n", "# Only the hidden size is slightly larger, everything else is the same\n", "config = BertConfig(\n", " vocab_size=tokenizer.vocab_size,\n", " hidden_size=768,\n", " num_hidden_layers=4,\n", " num_attention_heads=12,\n", " intermediate_size=2048,\n", " max_position_embeddings=512\n", " )\n", "\n", "simson_params = torch.load('/home/jovyan/simson_training_bolgov/kaggle_comp/simson_polymer_1m_uncompiled.pth')\n", "\n", "backbone = SimSonEncoder(config=config, max_len=512)\n", "backbone.load_state_dict(simson_params)\n", "\n", "model = SimSonClassifier(encoder=backbone, num_labels=len(targets))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], 
"source": [ "import numpy as np\n", "import torch\n", "from torch.utils.data import Dataset, Sampler, DataLoader\n", "\n", "\n", "class SMILESDataset(Dataset):\n", " def __init__(self, smiles_list, labels, tokenizer, max_length=256):\n", " self.smiles_list = smiles_list\n", " self.labels = labels # Shape: (num_samples, 5) - already scaled\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", " \n", " # Create mask for valid (non-NaN) labels\n", " self.label_masks = ~np.isnan(self.labels) # True where label is valid\n", " \n", " # Replace NaNs with 0 for safe tensor conversion (mask will handle exclusion)\n", " self.labels = np.nan_to_num(self.labels, nan=0.0)\n", " \n", " def __len__(self):\n", " return len(self.smiles_list)\n", " \n", " def __getitem__(self, idx):\n", " smiles = self.tokenizer.cls_token + self.smiles_list[idx]\n", " \n", " # Tokenize the SMILES string\n", " encoding = self.tokenizer(\n", " smiles,\n", " truncation=True,\n", " padding='max_length',\n", " max_length=self.max_length,\n", " return_tensors='pt'\n", " )\n", " \n", " return {\n", " 'input_ids': encoding['input_ids'].flatten(),\n", " 'attention_mask': encoding['attention_mask'].flatten(),\n", " 'labels': torch.tensor(self.labels[idx], dtype=torch.float32),\n", " 'label_mask': torch.tensor(self.label_masks[idx], dtype=torch.float32)\n", " }\n", " \n", " def get_label_statistics(self):\n", " \"\"\"Return statistics about label availability for 5 labels\"\"\"\n", " label_counts = self.label_masks.sum(axis=0)\n", " total_samples = len(self.smiles_list)\n", " \n", " stats = {\n", " 'total_samples': total_samples,\n", " 'label_0_count': label_counts[0],\n", " 'label_1_count': label_counts[1],\n", " 'label_2_count': label_counts[2],\n", " 'label_3_count': label_counts[3],\n", " 'label_4_count': label_counts[4],\n", " 'label_0_ratio': label_counts[0] / total_samples,\n", " 'label_1_ratio': label_counts[1] / total_samples,\n", " 'label_2_ratio': label_counts[2] / total_samples,\n", " 'label_3_ratio': label_counts[3] / total_samples,\n", " 'label_4_ratio': label_counts[4] / total_samples,\n", " 'all_labels_count': (self.label_masks.sum(axis=1) == 5).sum(),\n", " 'partial_labels_count': ((self.label_masks.sum(axis=1) > 0) & (self.label_masks.sum(axis=1) < 5)).sum(),\n", " 'no_labels_count': (self.label_masks.sum(axis=1) == 0).sum()\n", " }\n", " \n", " return stats\n", "\n", "\n", "class UnderrepresentedLabelSampler(Sampler):\n", " \"\"\"\n", " Custom sampler that gives higher sampling probability to samples containing under-represented labels.\n", " This ensures each batch contains a good mix of samples with different label availability patterns.\n", " \"\"\"\n", " def __init__(self, dataset, num_labels=5, underrep_boost=2.0):\n", " \"\"\"\n", " Args:\n", " dataset: SMILESDataset instance\n", " num_labels: Number of labels (5)\n", " underrep_boost: Multiplier to boost probability of under-represented labels\n", " \"\"\"\n", " self.dataset = dataset\n", " self.num_samples = len(dataset)\n", " self.num_labels = num_labels\n", " self.underrep_boost = underrep_boost\n", " \n", " # Calculate label frequencies\n", " label_counts = dataset.label_masks.sum(axis=0) # Count valid samples per label\n", " total_samples = self.num_samples\n", " \n", " # Label frequencies (proportion of samples with each label)\n", " label_freq = label_counts / total_samples\n", " \n", " # Inverse frequency weights (higher for under-represented labels)\n", " # Add small epsilon to avoid division by zero\n", " self.label_weights = 1.0 / 
(label_freq + 1e-6)\n", " \n", " # Apply boost to under-represented labels\n", " # Labels with frequency < median get boosted\n", " median_freq = np.median(label_freq)\n", " underrep_mask = label_freq < median_freq\n", " self.label_weights[underrep_mask] *= self.underrep_boost\n", " \n", " # Calculate sample weights based on which labels are present\n", " sample_weights = []\n", " for i in range(self.num_samples):\n", " mask = dataset.label_masks[i] # Boolean mask for present labels\n", " if mask.sum() > 0:\n", " # Weight is average of present labels' weights\n", " weights = self.label_weights[mask]\n", " sample_weight = weights.mean()\n", " else:\n", " # If no labels present, give minimal weight\n", " sample_weight = 0.1\n", " sample_weights.append(sample_weight)\n", " \n", " self.sample_weights = torch.tensor(sample_weights, dtype=torch.double)\n", " \n", " # Print sampling statistics\n", " print(f\"Label frequencies: {label_freq}\")\n", " print(f\"Label weights: {self.label_weights}\")\n", " print(f\"Under-represented labels (< median freq {median_freq:.3f}): {np.where(underrep_mask)[0]}\")\n", " print(f\"Sample weight range: [{self.sample_weights.min():.3f}, {self.sample_weights.max():.3f}]\")\n", " \n", " def __iter__(self):\n", " # Sample with replacement according to calculated weights\n", " indices = torch.multinomial(self.sample_weights, self.num_samples, replacement=True)\n", " return iter(indices.tolist())\n", " \n", " def __len__(self):\n", " return self.num_samples\n", "\n", "\n", "def calculate_unweighted_loss(predictions, labels, label_mask):\n", " \"\"\"\n", " Calculate simple unweighted MSE loss with masking (no label weights)\n", " \n", " Args:\n", " predictions: Model outputs (batch_size, 5)\n", " labels: Ground truth labels (batch_size, 5)\n", " label_mask: Mask for valid labels (batch_size, 5)\n", " \"\"\"\n", " loss_fn = nn.MSELoss(reduction='none')\n", " \n", " # Calculate per-sample, per-label losses\n", " losses = loss_fn(predictions, labels) # Shape: (batch_size, 5)\n", " \n", " # Apply masking to exclude NaN labels\n", " valid_mask = label_mask.bool()\n", " masked_losses = losses * valid_mask.float()\n", " \n", " # Calculate final loss (only over valid predictions)\n", " total_loss = masked_losses.sum()\n", " total_valid = valid_mask.sum()\n", " \n", " return total_loss / total_valid if total_valid > 0 else torch.tensor(0.0, device=predictions.device, requires_grad=True)\n", "\n", "\n", "def calculate_true_loss(predictions, labels, label_mask, scalers=None):\n", " \"\"\"\n", " Calculate unscaled MAE loss for monitoring using separate scalers for each label\n", " \n", " Args:\n", " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n", " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n", " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n", " scalers: List of scaler objects, one for each label\n", " \n", " Returns:\n", " float: Average MAE across all valid samples\n", " \"\"\"\n", " # Detach tensors from the computation graph and move to CPU\n", " predictions_np = predictions.cpu().detach().numpy()\n", " labels_np = labels.cpu().numpy()\n", " label_mask_np = label_mask.cpu().numpy().astype(bool)\n", " \n", " total_mae = 0\n", " total_samples = 0\n", " \n", " for label_idx in range(5):\n", " # Get valid samples for this label\n", " valid_mask = label_mask_np[:, label_idx]\n", " \n", " if valid_mask.any():\n", " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n", " 
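# sklearn scalers expect 2-D arrays, hence the (n, 1) reshapes\n", "            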
valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n", " \n", " if scalers is not None:\n", " # Unscale using the corresponding scaler for this label\n", " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n", " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n", " else:\n", " unscaled_preds = valid_preds.flatten()\n", " unscaled_labels = valid_labels.flatten()\n", " \n", " # Calculate MAE for this label\n", " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n", " total_mae += mae * len(unscaled_preds)\n", " total_samples += len(unscaled_preds)\n", " \n", " return total_mae / total_samples if total_samples > 0 else 0.0\n", "\n", "\n", "def calculate_individual_label_losses(predictions, labels, label_mask, scalers=None):\n", " \"\"\"\n", " Calculate unscaled MAE loss for each individual label\n", " \n", " Args:\n", " predictions (torch.Tensor): Model outputs of shape (batch_size, 5).\n", " labels (torch.Tensor): Ground truth labels of shape (batch_size, 5).\n", " label_mask (torch.Tensor): Boolean mask for valid labels of shape (batch_size, 5).\n", " scalers: List of scaler objects, one for each label\n", " \n", " Returns:\n", " dict: Dictionary with MAE for each label\n", " \"\"\"\n", " # Detach tensors from the computation graph and move to CPU\n", " predictions_np = predictions.cpu().detach().numpy()\n", " labels_np = labels.cpu().numpy()\n", " label_mask_np = label_mask.cpu().numpy().astype(bool)\n", " \n", " individual_losses = {}\n", " \n", " for label_idx in range(5):\n", " # Get valid samples for this label\n", " valid_mask = label_mask_np[:, label_idx]\n", " \n", " if valid_mask.any():\n", " valid_preds = predictions_np[valid_mask, label_idx].reshape(-1, 1)\n", " valid_labels = labels_np[valid_mask, label_idx].reshape(-1, 1)\n", " \n", " if scalers is not None:\n", " # Unscale using the corresponding scaler for this label\n", " unscaled_preds = scalers[label_idx].inverse_transform(valid_preds).flatten()\n", " unscaled_labels = scalers[label_idx].inverse_transform(valid_labels).flatten()\n", " else:\n", " unscaled_preds = valid_preds.flatten()\n", " unscaled_labels = valid_labels.flatten()\n", " \n", " # Calculate MAE for this label\n", " mae = np.mean(np.abs(unscaled_preds - unscaled_labels))\n", " individual_losses[f'label_{label_idx}'] = mae\n", " else:\n", " individual_losses[f'label_{label_idx}'] = None # No valid samples for this label\n", " \n", " return individual_losses\n", "\n", "\n", "def analyze_batch_composition(dataloader, num_batches=5):\n", " \"\"\"\n", " Analyze the composition of batches to see label distribution\n", " \"\"\"\n", " print(\"Analyzing batch composition:\")\n", " \n", " for batch_idx, batch in enumerate(dataloader):\n", " if batch_idx >= num_batches:\n", " break\n", " \n", " label_mask = batch['label_mask'].numpy()\n", " \n", " # Count samples with each label in this batch\n", " label_counts = label_mask.sum(axis=0)\n", " batch_size = label_mask.shape[0]\n", " \n", " print(f\"Batch {batch_idx + 1}: Size={batch_size}\")\n", " for i in range(5):\n", " print(f\" Label {i}: {label_counts[i]}/{batch_size} ({label_counts[i]/batch_size:.2%})\")\n", " print()\n", "\n", "\n", "def train_model(model, train_dataloader, val_dataloader, \n", " scalers=None, num_epochs=10, learning_rate=2e-5, device='cuda', \n", " patience=3, validation_steps=500):\n", " \"\"\"\n", " Train model with unweighted loss and custom sampler for five labels\n", " \n", " Args:\n", " model: CustomModel instance 
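- in this notebook, the SimSonClassifier defined above 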
(should output 5 labels)\n", " train_dataloader: Training data loader with custom sampler\n", " val_dataloader: Validation data loader \n", " scalers: List of scalers for unscaled loss monitoring\n", " num_epochs: Number of training epochs\n", " learning_rate: Learning rate\n", " device: Training device\n", " patience: Early stopping patience (in validation steps)\n", " validation_steps: Perform validation every N training steps\n", " \"\"\"\n", " model.to(device)\n", " \n", " optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)\n", " total_steps = len(train_dataloader) * num_epochs\n", " scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=total_steps)\n", " \n", " train_losses = []\n", " val_losses = []\n", " \n", " # Early stopping initialization\n", " best_val_loss = float('inf')\n", " steps_no_improve = 0\n", " best_model_state = None\n", " \n", " # Training tracking\n", " global_step = 0\n", " running_train_loss = 0\n", " running_true_train_loss = 0\n", " train_steps_count = 0\n", " \n", " print(f\"Training with custom sampler (no label weights)\")\n", " print(f\"Validation will be performed every {validation_steps} steps\")\n", " \n", " model.train()\n", " \n", " for epoch in range(num_epochs):\n", " print(f\"\\nEpoch {epoch + 1}/{num_epochs}\")\n", " \n", " train_progress = tqdm(train_dataloader, desc=\"Training\", leave=False)\n", " \n", " for batch_idx, batch in enumerate(train_progress):\n", " with torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['labels'].to(device)\n", " label_mask = batch['label_mask'].to(device)\n", " \n", " optimizer.zero_grad()\n", " \n", " # Model forward pass\n", " outputs = model(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask,\n", " )\n", " \n", " # Calculate unweighted loss (sampler handles the balancing)\n", " loss = calculate_unweighted_loss(outputs, labels, label_mask)\n", " \n", " # Calculate true loss for monitoring\n", " true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n", " \n", " # Accumulate losses for averaging\n", " running_train_loss += loss.item()\n", " running_true_train_loss += true_loss\n", " train_steps_count += 1\n", " \n", " loss.backward()\n", " \n", " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n", " \n", " optimizer.step()\n", " scheduler.step()\n", " \n", " global_step += 1\n", " \n", " train_progress.set_postfix({\n", " 'step': global_step,\n", " 'loss': f'{loss.item():.4f}',\n", " 'true_loss': f'{true_loss:.4f}',\n", " 'lr': f'{scheduler.get_last_lr()[0]:.2e}'\n", " })\n", " \n", " # Perform validation every validation_steps\n", " if global_step % validation_steps == 0:\n", " # Calculate average training losses since last validation\n", " avg_train_loss = running_train_loss / train_steps_count\n", " avg_true_train_loss = running_true_train_loss / train_steps_count\n", " \n", " train_losses.append(avg_train_loss)\n", " \n", " # Reset running averages\n", " running_train_loss = 0\n", " running_true_train_loss = 0\n", " train_steps_count = 0\n", " \n", " # Validation\n", " model.eval()\n", " total_val_loss = 0\n", " total_true_val_loss = 0\n", " val_batches = 0\n", " \n", " # Track individual label losses across all validation batches\n", " accumulated_individual_losses = {f'label_{i}': [] for i in range(5)}\n", "\n", " with torch.no_grad():\n", " for val_batch in val_dataloader:\n", " with 
torch.autocast(dtype=torch.float16, device_type=\"cuda\"):\n", " input_ids = val_batch['input_ids'].to(device)\n", " attention_mask = val_batch['attention_mask'].to(device)\n", " labels = val_batch['labels'].to(device)\n", " label_mask = val_batch['label_mask'].to(device)\n", " \n", " outputs = model(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask,\n", " )\n", " \n", " val_loss = calculate_unweighted_loss(outputs, labels, label_mask)\n", " val_true_loss = calculate_true_loss(outputs, labels, label_mask, scalers)\n", " \n", " # Calculate individual label losses for this batch\n", " individual_losses = calculate_individual_label_losses(outputs, labels, label_mask, scalers)\n", " \n", " # Accumulate individual losses\n", " for label_key, loss_value in individual_losses.items():\n", " if loss_value is not None:\n", " accumulated_individual_losses[label_key].append(loss_value)\n", "\n", " total_val_loss += val_loss.item()\n", " total_true_val_loss += val_true_loss\n", " val_batches += 1\n", " \n", " avg_val_loss = total_val_loss / val_batches\n", " avg_val_true_loss = total_true_val_loss / val_batches\n", " val_losses.append(avg_val_loss)\n", " \n", " # Calculate average individual label losses\n", " avg_individual_losses = {}\n", " for label_key, losses in accumulated_individual_losses.items():\n", " if losses:\n", " avg_individual_losses[label_key] = np.mean(losses)\n", " else:\n", " avg_individual_losses[label_key] = None\n", " \n", " # Print validation results with individual label losses\n", " print(f\"\\nStep {global_step} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | True train loss: {avg_true_train_loss:.4f} | True val loss: {avg_val_true_loss:.4f}\")\n", " print(\"Individual label losses (unscaled):\")\n", " for i in range(5):\n", " label_key = f'label_{i}'\n", " if avg_individual_losses[label_key] is not None:\n", " print(f\" Label {i}: {avg_individual_losses[label_key]:.4f}\")\n", " else:\n", " print(f\" Label {i}: No valid samples\")\n", " \n", " # Early stopping check and best model saving\n", " if avg_val_loss < best_val_loss:\n", " best_val_loss = avg_val_loss\n", " steps_no_improve = 0\n", " best_model_state = model.state_dict().copy()\n", " print(f\"New best validation loss: {best_val_loss:.4f}\")\n", " else:\n", " steps_no_improve += 1\n", " if steps_no_improve >= patience:\n", " print(f\"Early stopping triggered after {global_step} steps ({steps_no_improve} validation steps without improvement).\")\n", " # Load best model and return\n", " if best_model_state is not None:\n", " model.load_state_dict(best_model_state)\n", " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n", " return train_losses, val_losses, best_val_loss\n", " \n", " model.train()\n", " \n", " # Handle any remaining training loss that hasn't been validated\n", " if train_steps_count > 0:\n", " avg_train_loss = running_train_loss / train_steps_count\n", " train_losses.append(avg_train_loss)\n", " \n", " # Load the best model state before returning\n", " if best_model_state is not None:\n", " model.load_state_dict(best_model_state)\n", " print(f\"Loaded best model with validation loss: {best_val_loss:.4f}\")\n", " \n", " return train_losses, val_losses, best_val_loss\n", "\n", "\n", "def run_training(smiles_train, smiles_test, labels_train, labels_test, \n", " model, tokenizer, scalers, num_epochs=5, learning_rate=1e-5, \n", " batch_size=256, validation_steps=500, underrep_boost=2.0):\n", " \"\"\"\n", " Complete training pipeline for five labels with 
custom sampler\n",
"    \n",
"    Args:\n",
"        smiles_train, smiles_test: Lists of SMILES strings\n",
"        labels_train, labels_test: numpy arrays of shape (num_samples, 5) - ALREADY SCALED\n",
"        model: SimSonClassifier instance (configured for 5 outputs)\n",
"        tokenizer: Tokenizer instance\n",
"        scalers: List of 5 scalers, one for each label (for inverse transform only)\n",
"        num_epochs: Number of training epochs\n",
"        learning_rate: Learning rate\n",
"        batch_size: Batch size for training\n",
"        validation_steps: Perform validation every N training steps\n",
"        underrep_boost: Boost factor for under-represented labels in sampler\n",
"    \"\"\"\n",
"    \n",
"    print(\"Setting up datasets for five-label training with custom sampler\")\n",
"    \n",
"    # Create datasets - no scaling performed here\n",
"    train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)\n",
"    val_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)\n",
"    \n",
"    # Print dataset statistics\n",
"    train_stats = train_dataset.get_label_statistics()\n",
"    val_stats = val_dataset.get_label_statistics()\n",
"    \n",
"    print(\"Training dataset statistics:\")\n",
"    for key, value in train_stats.items():\n",
"        print(f\"  {key}: {value}\")\n",
"    \n",
"    print(\"Validation dataset statistics:\")\n",
"    for key, value in val_stats.items():\n",
"        print(f\"  {key}: {value}\")\n",
"    \n",
"    # Create custom sampler for balanced training\n",
"    train_sampler = UnderrepresentedLabelSampler(\n",
"        train_dataset, \n",
"        num_labels=5, \n",
"        underrep_boost=underrep_boost\n",
"    )\n",
"    \n",
"    # Create data loaders; the weighted sampler replaces shuffle=True\n",
"    # (passing sampler=None here would silently fall back to sequential order)\n",
"    train_dataloader = DataLoader(\n",
"        train_dataset,\n",
"        batch_size=batch_size,\n",
"        sampler=train_sampler,\n",
"        num_workers=4,\n",
"        pin_memory=True\n",
"    )\n",
"    \n",
"    val_dataloader = DataLoader(\n",
"        val_dataset,\n",
"        batch_size=batch_size,\n",
"        shuffle=False,\n",
"        num_workers=4,\n",
"        pin_memory=True\n",
"    )\n",
"    \n",
"    # Analyze batch composition to verify sampler effectiveness\n",
"    print(\"\\n\" + \"=\"*50)\n",
"    #analyze_batch_composition(train_dataloader, num_batches=3)\n",
"    print(\"=\"*50)\n",
"    \n",
"    # Set device\n",
"    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"    print(f\"Using device: {device}\")\n",
"    print(f\"Training steps per epoch: {len(train_dataloader)}\")\n",
"    print(f\"Total training steps: {len(train_dataloader) * num_epochs}\")\n",
"    \n",
"    # Train the model\n",
"    train_losses, val_losses, best_val_loss = train_model(\n",
"        model=model,\n",
"        train_dataloader=train_dataloader,\n",
"        val_dataloader=val_dataloader,\n",
"        scalers=scalers,\n",
"        num_epochs=num_epochs,\n",
"        learning_rate=learning_rate,\n",
"        device=device,\n",
"        patience=10,\n",
"        validation_steps=validation_steps,\n",
"    )\n",
"    \n",
"    print('Training completed.')\n",
"    print(f'Number of validation checkpoints: {len(val_losses)}')\n",
"    print(f'Final training losses: {train_losses[-5:] if len(train_losses) >= 5 else train_losses}')\n",
"    print(f'Best validation loss: {best_val_loss:.4f}')\n",
"    \n",
"    # Save model\n",
"    torch.save(model.state_dict(), '/home/jovyan/simson_training_bolgov/kaggle_comp/checkpoints/clf_kaggle.bin')\n",
"    print(\"Model saved successfully!\")\n",
"    \n",
"    return train_losses, val_losses, best_val_loss\n"
] }, 
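{ "cell_type": "markdown", "metadata": {}, "source": [ "`UnderrepresentedLabelSampler` draws indices with replacement from a fixed per-sample weight vector via `torch.multinomial`, which mirrors what `torch.utils.data.WeightedRandomSampler` does. A minimal sketch on made-up toy weights (the numbers below are illustrative only):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import torch\n",
"from torch.utils.data import WeightedRandomSampler\n",
"\n",
"# Toy weights: index 2 should dominate the drawn indices\n",
"toy_weights = torch.tensor([0.1, 0.1, 5.0, 0.1], dtype=torch.double)\n",
"\n",
"# What UnderrepresentedLabelSampler.__iter__ does internally ...\n",
"idx_custom = torch.multinomial(toy_weights, 8, replacement=True)\n",
"\n",
"# ... and the built-in equivalent\n",
"idx_builtin = list(WeightedRandomSampler(toy_weights, num_samples=8, replacement=True))\n",
"\n",
"print(idx_custom.tolist())\n",
"print(idx_builtin)"
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting up datasets for five-label training with custom sampler\n", "Training dataset statistics:\n", "  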
total_samples: 168551\n", " label_0_count: 5446\n", " label_1_count: 78850\n", " label_2_count: 14846\n", " label_3_count: 5779\n", " label_4_count: 5782\n", " label_0_ratio: 0.032310695279173664\n", " label_1_ratio: 0.46781092962960763\n", " label_2_ratio: 0.08808016564719284\n", " label_3_ratio: 0.03428635843157264\n", " label_4_ratio: 0.03430415719871137\n", " all_labels_count: 0\n", " partial_labels_count: 96406\n", " no_labels_count: 72145\n", "Validation dataset statistics:\n", " total_samples: 29716\n", " label_0_count: 947\n", " label_1_count: 13878\n", " label_2_count: 2764\n", " label_3_count: 957\n", " label_4_count: 955\n", " label_0_ratio: 0.03186835374882218\n", " label_1_ratio: 0.4670211333961502\n", " label_2_ratio: 0.0930138645847355\n", " label_3_ratio: 0.03220487279580024\n", " label_4_ratio: 0.03213756898640463\n", " all_labels_count: 0\n", " partial_labels_count: 17016\n", " no_labels_count: 12700\n", "Label frequencies: [0.0323107 0.46781093 0.08808017 0.03428636 0.03430416]\n", "Label weights: [61.89709276 2.13761116 11.35316492 58.33053614 29.15013606]\n", "Under-represented labels (< median freq 0.034): [0 3]\n", "Sample weight range: [0.100, 61.897]\n", "\n", "==================================================\n", "==================================================\n", "Using device: cuda\n", "Training steps per epoch: 1317\n", "Total training steps: 26340\n", "Training with custom sampler (no label weights)\n", "Validation will be performed every 1316 steps\n", "\n", "Epoch 1/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 1316 | Train Loss: 0.6250 | Val Loss: 0.4127 | True train loss: 3.9762 | True val loss: 3.8368\n", "Individual label losses (unscaled):\n", " Label 0: 76.7992\n", " Label 1: 0.0127\n", " Label 2: 0.0372\n", " Label 3: 0.0987\n", " Label 4: 3.3515\n", "New best validation loss: 0.4127\n", "\n", "Epoch 2/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 2632 | Train Loss: 0.5464 | Val Loss: 0.4244 | True train loss: 3.5447 | True val loss: 3.4895\n", "Individual label losses (unscaled):\n", " Label 0: 68.7228\n", " Label 1: 0.0130\n", " Label 2: 0.0379\n", " Label 3: 0.0952\n", " Label 4: 3.8732\n", "\n", "Epoch 3/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|█| 1317/1317 [01:22<00:00, 1.88it/s, step=3951, loss=0.6545, tru" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 3948 | Train Loss: 0.5242 | Val Loss: 0.4007 | True train loss: 3.4056 | True val loss: 3.2830\n", "Individual label losses (unscaled):\n", " Label 0: 63.8785\n", " Label 1: 0.0130\n", " Label 2: 0.0362\n", " Label 3: 0.1013\n", " Label 4: 3.4475\n", "New best validation loss: 0.4007\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 4/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1315/1317 [01:22<00:01, 1.87it/s, step=5267, loss=0.3083, tru" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 5264 | Train Loss: 0.5011 | Val Loss: 0.3770 | True train loss: 3.1835 | True val loss: 3.3785\n", "Individual label losses (unscaled):\n", " Label 0: 66.0959\n", " Label 1: 0.0124\n", " Label 2: 0.0382\n", " Label 3: 0.0951\n", " Label 4: 3.3052\n", "New best validation loss: 0.3770\n" ] }, { 
"name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 5/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1315/1317 [01:22<00:01, 1.87it/s, step=6583, loss=0.2640, tru" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 6580 | Train Loss: 0.4860 | Val Loss: 0.3498 | True train loss: 3.2743 | True val loss: 3.4532\n", "Individual label losses (unscaled):\n", " Label 0: 67.9448\n", " Label 1: 0.0116\n", " Label 2: 0.0392\n", " Label 3: 0.0810\n", " Label 4: 3.3704\n", "New best validation loss: 0.3498\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 6/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1313/1317 [01:22<00:02, 1.87it/s, step=7899, loss=0.1156, tru" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 7896 | Train Loss: 0.4671 | Val Loss: 0.3422 | True train loss: 3.1278 | True val loss: 3.3296\n", "Individual label losses (unscaled):\n", " Label 0: 63.2215\n", " Label 1: 0.0117\n", " Label 2: 0.0362\n", " Label 3: 0.0827\n", " Label 4: 3.2292\n", "New best validation loss: 0.3422\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 7/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1313/1317 [01:22<00:02, 1.86it/s, step=9215, loss=0.2901, tru" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 9212 | Train Loss: 0.4557 | Val Loss: 0.3389 | True train loss: 3.0609 | True val loss: 3.2751\n", "Individual label losses (unscaled):\n", " Label 0: 63.4267\n", " Label 1: 0.0114\n", " Label 2: 0.0381\n", " Label 3: 0.0815\n", " Label 4: 2.8806\n", "New best validation loss: 0.3389\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 8/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1311/1317 [01:22<00:03, 1.87it/s, step=10531, loss=0.4604, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 10528 | Train Loss: 0.4474 | Val Loss: 0.3379 | True train loss: 3.0718 | True val loss: 3.2051\n", "Individual label losses (unscaled):\n", " Label 0: 61.2247\n", " Label 1: 0.0113\n", " Label 2: 0.0372\n", " Label 3: 0.0828\n", " Label 4: 2.9602\n", "New best validation loss: 0.3379\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 9/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|▉| 1311/1317 [01:21<00:03, 1.87it/s, step=11847, loss=0.2547, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 11844 | Train Loss: 0.4285 | Val Loss: 0.3416 | True train loss: 3.0075 | True val loss: 3.1697\n", "Individual label losses (unscaled):\n", " Label 0: 61.3822\n", " Label 1: 0.0112\n", " Label 2: 0.0421\n", " Label 3: 0.0847\n", " Label 4: 3.3251\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 10/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 99%|▉| 1309/1317 [01:21<00:04, 1.87it/s, step=13163, loss=0.2791, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"\n", "Step 13160 | Train Loss: 0.4116 | Val Loss: 0.3174 | True train loss: 2.9027 | True val loss: 3.1666\n", "Individual label losses (unscaled):\n", " Label 0: 59.6537\n", " Label 1: 0.0110\n", " Label 2: 0.0365\n", " Label 3: 0.0877\n", " Label 4: 3.1535\n", "New best validation loss: 0.3174\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 11/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 99%|▉| 1309/1317 [01:21<00:04, 1.87it/s, step=14479, loss=0.3915, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 14476 | Train Loss: 0.3983 | Val Loss: 0.3039 | True train loss: 2.8602 | True val loss: 3.1240\n", "Individual label losses (unscaled):\n", " Label 0: 60.6528\n", " Label 1: 0.0107\n", " Label 2: 0.0371\n", " Label 3: 0.0827\n", " Label 4: 3.2043\n", "New best validation loss: 0.3039\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 12/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 99%|▉| 1307/1317 [01:21<00:05, 1.87it/s, step=15795, loss=0.1155, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 15792 | Train Loss: 0.3863 | Val Loss: 0.3050 | True train loss: 2.7796 | True val loss: 3.0697\n", "Individual label losses (unscaled):\n", " Label 0: 59.8002\n", " Label 1: 0.0108\n", " Label 2: 0.0371\n", " Label 3: 0.0815\n", " Label 4: 3.1037\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 13/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 99%|▉| 1307/1317 [01:21<00:05, 1.87it/s, step=17111, loss=0.2704, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 17108 | Train Loss: 0.3779 | Val Loss: 0.2881 | True train loss: 2.7442 | True val loss: 3.1636\n", "Individual label losses (unscaled):\n", " Label 0: 61.2941\n", " Label 1: 0.0102\n", " Label 2: 0.0361\n", " Label 3: 0.0836\n", " Label 4: 3.1077\n", "New best validation loss: 0.2881\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 14/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 99%|▉| 1305/1317 [01:21<00:06, 1.87it/s, step=18427, loss=0.4965, tr" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Step 18424 | Train Loss: 0.3645 | Val Loss: 0.2822 | True train loss: 2.6844 | True val loss: 3.1494\n", "Individual label losses (unscaled):\n", " Label 0: 61.1663\n", " Label 1: 0.0100\n", " Label 2: 0.0365\n", " Label 3: 0.0743\n", " Label 4: 3.2309\n", "New best validation loss: 0.2822\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Epoch 15/20\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[34;01mtqdm\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[32m 8\u001b[39m BATCH_SIZE = \u001b[32m128\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m train_losses, val_losses, best_loss = \u001b[43mrun_training\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43msmiles_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msmiles_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1e-4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msmiles_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m/\u001b[49m\u001b[43m/\u001b[49m\u001b[43m \u001b[49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m)\u001b[49m\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 532\u001b[39m, in \u001b[36mrun_training\u001b[39m\u001b[34m(smiles_train, smiles_test, labels_train, labels_test, model, tokenizer, scalers, num_epochs, learning_rate, batch_size, validation_steps, underrep_boost)\u001b[39m\n\u001b[32m 529\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mTotal training steps: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(train_dataloader)\u001b[38;5;250m \u001b[39m*\u001b[38;5;250m \u001b[39mnum_epochs\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Train the model\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m532\u001b[39m train_losses, val_losses, best_val_loss = \u001b[43mtrain_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 533\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 534\u001b[39m \u001b[43m \u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 535\u001b[39m \u001b[43m \u001b[49m\u001b[43mval_dataloader\u001b[49m\u001b[43m=\u001b[49m\u001b[43mval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 536\u001b[39m \u001b[43m \u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mscalers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 537\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 538\u001b[39m \u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 539\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 540\u001b[39m \u001b[43m \u001b[49m\u001b[43mpatience\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 541\u001b[39m \u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 542\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 544\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mTraining completed.\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 545\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mNumber of validation checkpoints: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(val_losses)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m)\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 351\u001b[39m, in \u001b[36mtrain_model\u001b[39m\u001b[34m(model, train_dataloader, val_dataloader, scalers, num_epochs, learning_rate, device, patience, validation_steps)\u001b[39m\n\u001b[32m 345\u001b[39m scheduler.step()\n\u001b[32m 347\u001b[39m global_step += \u001b[32m1\u001b[39m\n\u001b[32m 349\u001b[39m train_progress.set_postfix({\n\u001b[32m 350\u001b[39m \u001b[33m'\u001b[39m\u001b[33mstep\u001b[39m\u001b[33m'\u001b[39m: global_step,\n\u001b[32m--> \u001b[39m\u001b[32m351\u001b[39m \u001b[33m'\u001b[39m\u001b[33mloss\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.4f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 352\u001b[39m \u001b[33m'\u001b[39m\u001b[33mtrue_loss\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtrue_loss\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.4f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 353\u001b[39m \u001b[33m'\u001b[39m\u001b[33mlr\u001b[39m\u001b[33m'\u001b[39m: \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscheduler.get_last_lr()[\u001b[32m0\u001b[39m]\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.2e\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\n\u001b[32m 354\u001b[39m })\n\u001b[32m 356\u001b[39m \u001b[38;5;66;03m# Perform validation every validation_steps\u001b[39;00m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m global_step % validation_steps == \u001b[32m0\u001b[39m:\n\u001b[32m 358\u001b[39m \u001b[38;5;66;03m# Calculate average training losses since last validation\u001b[39;00m\n", "\u001b[31mKeyboardInterrupt\u001b[39m: " ] } ], "source": [ "import numpy as np\n", "import torch\n", "from torch.optim import AdamW\n", "from torch.optim.lr_scheduler import LinearLR\n", "from torch.utils.data import DataLoader\n", "from tqdm import tqdm\n", "\n", "BATCH_SIZE = 128\n", "\n", "train_losses, val_losses, best_loss = run_training(\n", " smiles_train, smiles_test, labels_train, labels_test, \n", " model, tokenizer, scalers, num_epochs=20, learning_rate=1e-4, batch_size=BATCH_SIZE, validation_steps=len(smiles_train) // BATCH_SIZE,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kaggle": { "accelerator": "gpu", 
"dataSources": [ { "databundleVersionId": 12966160, "sourceId": 74608, "sourceType": "competition" }, { "datasetId": 7678100, "sourceId": 12189904, "sourceType": "datasetVersion" }, { "datasetId": 7690162, "sourceId": 12207625, "sourceType": "datasetVersion" }, { "datasetId": 7716502, "sourceId": 12322957, "sourceType": "datasetVersion" }, { "datasetId": 7801155, "sourceId": 12372847, "sourceType": "datasetVersion" }, { "datasetId": 7809006, "sourceId": 12525286, "sourceType": "datasetVersion" }, { "datasetId": 7912957, "sourceId": 12668147, "sourceType": "datasetVersion" } ], "dockerImageVersionId": 31041, "isGpuEnabled": true, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python [conda env:.mlspace-bolgov_simson_training]", "language": "python", "name": "conda-env-.mlspace-bolgov_simson_training-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 4 }