{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from transformers import PreTrainedModel, AutoConfig, BertModel, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer\n", "import pandas as pd\n", "import torch\n", "import os\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from tqdm import tqdm\n", "import joblib\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Path to the training CSV. Defaults to the original absolute location; set the\n", "# POLYMER_TRAIN_CSV environment variable to run the notebook on another machine.\n", "TRAIN_CSV = os.environ.get('POLYMER_TRAIN_CSV', '/home/jovyan/simson_training_bolgov/kaggle_comp/train.csv')\n", "df = pd.read_csv(TRAIN_CSV)\n", "\n", "# The five property columns shown in the DataFrame preview below (besides id and SMILES).\n", "targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "SMILES | \n", "Tg | \n", "FFV | \n", "Tc | \n", "Density | \n", "Rg | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "4.215886e+08 | \n", "*C(=O)c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(Oc4ccc(N5... | \n", "NaN | \n", "0.376767 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 1 | \n", "7.984549e+08 | \n", "*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C... | \n", "NaN | \n", "0.346993 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 2 | \n", "NaN | \n", "*CC/C=C(/*)C | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "NaN | \n", "*CC(*)(C)C(=O)OCCN(CC)c1ccc(/N=N/c2ccc(OC)cc2)... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 4 | \n", "NaN | \n", "*Oc1cc(OC(=O)c2ccc(OCC)cc2)c(OC(=O)CCCC(*)=O)c... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 16958 | \n", "2.389975e+08 | \n", "*OC(=O)Oc1ccc(S(=O)(=O)c2ccc(OC(=O)OC3CC4CC(*)... | \n", "NaN | \n", "0.339596 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 16959 | \n", "NaN | \n", "*c1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N5C(=O)c6c... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 16960 | \n", "NaN | \n", "*OC(F)(F)COC(=O)c1cc(OCCCCC)cc(C(=O)OCC(*)(F)F)c1 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 16961 | \n", "1.973417e+09 | \n", "*C=CC1CC(*)C2C(=O)N(c3ccc(F)cc3)C(=O)C12 | \n", "NaN | \n", "0.374710 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 16962 | \n", "NaN | \n", "*/C=C/[Ge](/C=C/[Si](*)(c1ccccc1)c1ccccc1)(c1c... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
16963 rows × 7 columns
\n", "