diff --git "a/src/testing.ipynb" "b/src/testing.ipynb"
deleted file mode 100644--- "a/src/testing.ipynb"
+++ /dev/null
@@ -1,2226 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "4e56ea4e",
-   "metadata": {},
-   "source": [
-    "# Training"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "0b96a4f0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from typing import Any\n",
-    "from scipy import sparse\n",
-    "from pathlib import Path\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "\n",
-    "BASE_DIR = Path.cwd().parent\n",
-    "SAVE_DIR = BASE_DIR / \"saved\" # folder to save progress outputs\n",
-    "\n",
-    "# load n-gram outputs\n",
-    "ngram_dir = SAVE_DIR / \"ngram_features\"\n",
-    "ngram_dict_df = {split: pd.read_parquet(ngram_dir / f\"dataframes/{split}_ngram.parquet\") for split in [\"train\", \"validation\", \"test\"]}\n",
-    "\n",
-    "# # load SVD-reduction outputs\n",
-    "# reduced_dir = SAVE_DIR / \"dimensionality_reduction\"\n",
-    "# reduced_dict_df = {split: pd.read_parquet(reduced_dir / f\"dataframes/{split}_reduced.parquet\") for split in [\"train\", \"validation\", \"test\"]}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "df99844e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dataclasses import dataclass\n",
-    "\n",
-    "@dataclass(slots=True)\n",
-    "class Config:\n",
-    "    verbose: bool = True\n",
-    "    \n",
-    "    include_statistical: bool = True\n",
-    "    include_tfidf: bool = True\n",
-    "    include_char_ngrams: bool = True\n",
-    "    include_pos_ngrams: bool = True\n",
-    "    include_readability: bool = True\n",
-    "    \n",
-    "    pairwise_operations: tuple[str, ...] = (\"abs_diff\", \"product\")\n",
-    "    use_sparse_matrices: bool = False\n",
-    "    model_type: str = \"hist_gb\"\n",
-    "    random_state: int = 42\n",
-    "    \n",
-    "    threshold_metric: str = \"accuracy\"\n",
-    "    threshold_grid_step: float = 0.01\n",
-    "    return_pairwise_matrices: bool = False\n",
-    "config = Config()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "32772a9d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def _feature_family_from_suffix(suffix: str) -> str:\n",
-    "    if suffix.startswith(\"tfidf_\"):\n",
-    "        return \"tfidf\"\n",
-    "    if suffix.startswith(\"char\") and \"_tfidf_\" in suffix:\n",
-    "        return \"char_ngrams\"\n",
-    "    if suffix.startswith(\"pos\") and \"_tfidf_\" in suffix:\n",
-    "        return \"pos_ngrams\"\n",
-    "    if suffix.startswith(\"readability_\"):\n",
-    "        return \"readability\"\n",
-    "    return \"statistical\"\n",
-    "\n",
-    "def _include_family(family: str, config: Config = config) -> bool:\n",
-    "    return {\n",
-    "        \"statistical\": config.include_statistical,\n",
-    "        \"tfidf\": config.include_tfidf,\n",
-    "        \"char_ngrams\": config.include_char_ngrams,\n",
-    "        \"pos_ngrams\": config.include_pos_ngrams,\n",
-    "        \"readability\": config.include_readability,\n",
-    "    }[family]\n",
-    "\n",
-    "\n",
-    "suffixes: list[str] = []\n",
-    "for column in ngram_dict_df[\"train\"].columns:\n",
-    "    if not column.startswith(\"text1_\"): \n",
-    "        continue\n",
-    "    suffix = column[len(\"text1_\"):] # getting the suffix\n",
-    "    family = _feature_family_from_suffix(suffix) # categorize suffix\n",
-    "    if _include_family(family):\n",
-    "        suffixes.append(suffix)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "4f4c5f9f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "73810633\n",
-      "16708698\n",
-      "16475773\n"
-     ]
-    }
-   ],
-   "source": [
-    "X_by_split: dict[str, Any] = {}\n",
-    "y_by_split: dict[str, np.ndarray] = {}\n",
-    "for split, df in ngram_dict_df.items():\n",
-    "\n",
-    "    columns: list[sparse.csr_matrix] = []\n",
-    "    feature_names: list[str] = []\n",
-    "    for suffix in suffixes:\n",
-    "        \n",
-    "        # for each feature vector's member, build a sparse column vector\n",
-    "        left = sparse.csr_matrix(df[f\"text1_{suffix}\"].to_numpy(dtype=\"float32\").reshape(-1,1))\n",
-    "        right = sparse.csr_matrix(df[f\"text2_{suffix}\"].to_numpy(dtype=\"float32\").reshape(-1,1))\n",
-    "\n",
-    "        diff = left - right\n",
-    "\n",
-    "        # absolute diff\n",
-    "        columns.append(abs(diff))\n",
-    "        feature_names.append(f\"{suffix}_abs_diff\")\n",
-    "\n",
-    "        # product\n",
-    "        columns.append(left.multiply(right))\n",
-    "        feature_names.append(f\"{suffix}_product\")\n",
-    "\n",
-    "        # # squared diff\n",
-    "        # columns.append(diff.power(2))\n",
-    "        # feature_names.append(f\"{suffix}_sq_diff\")\n",
-    "\n",
-    "    X = sparse.hstack(columns, format=\"csr\", dtype=\"float32\")\n",
-    "    y = df[\"same\"].to_numpy(dtype=np.int8, copy=False) # for binary label\n",
-    "\n",
-    "    X_by_split[split] = X\n",
-    "    y_by_split[split] = y\n",
-    "\n",
-    "    nnz = int(X.nnz)\n",
-    "    print(nnz)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "024ce327",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<Compressed Sparse Row sparse matrix of dtype 'float32'\n",
-       "\twith 73810633 stored elements and shape (32926, 159328)>"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "X_by_split[\"train\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "8fc0b339",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<style>#sk-container-id-2 {\n",
-       "  /* Definition of color scheme common for light and dark mode */\n",
-       "  --sklearn-color-text: #000;\n",
-       "  --sklearn-color-text-muted: #666;\n",
-       "  --sklearn-color-line: gray;\n",
-       "  /* Definition of color scheme for unfitted estimators */\n",
-       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
-       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
-       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
-       "  --sklearn-color-unfitted-level-3: chocolate;\n",
-       "  /* Definition of color scheme for fitted estimators */\n",
-       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
-       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
-       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
-       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2.light {\n",
-       "  /* Specific color for light theme */\n",
-       "  --sklearn-color-text-on-default-background: black;\n",
-       "  --sklearn-color-background: white;\n",
-       "  --sklearn-color-border-box: black;\n",
-       "  --sklearn-color-icon: #696969;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2.dark {\n",
-       "  --sklearn-color-text-on-default-background: white;\n",
-       "  --sklearn-color-background: #111;\n",
-       "  --sklearn-color-border-box: white;\n",
-       "  --sklearn-color-icon: #878787;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 {\n",
-       "  color: var(--sklearn-color-text);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 pre {\n",
-       "  padding: 0;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 input.sk-hidden--visually {\n",
-       "  border: 0;\n",
-       "  clip: rect(1px 1px 1px 1px);\n",
-       "  clip: rect(1px, 1px, 1px, 1px);\n",
-       "  height: 1px;\n",
-       "  margin: -1px;\n",
-       "  overflow: hidden;\n",
-       "  padding: 0;\n",
-       "  position: absolute;\n",
-       "  width: 1px;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-dashed-wrapped {\n",
-       "  border: 1px dashed var(--sklearn-color-line);\n",
-       "  margin: 0 0.4em 0.5em 0.4em;\n",
-       "  box-sizing: border-box;\n",
-       "  padding-bottom: 0.4em;\n",
-       "  background-color: var(--sklearn-color-background);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-container {\n",
-       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
-       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
-       "     so we also need the `!important` here to be able to override the\n",
-       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
-       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
-       "  display: inline-block !important;\n",
-       "  position: relative;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-text-repr-fallback {\n",
-       "  display: none;\n",
-       "}\n",
-       "\n",
-       "div.sk-parallel-item,\n",
-       "div.sk-serial,\n",
-       "div.sk-item {\n",
-       "  /* draw centered vertical line to link estimators */\n",
-       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
-       "  background-size: 2px 100%;\n",
-       "  background-repeat: no-repeat;\n",
-       "  background-position: center center;\n",
-       "}\n",
-       "\n",
-       "/* Parallel-specific style estimator block */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel-item::after {\n",
-       "  content: \"\";\n",
-       "  width: 100%;\n",
-       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
-       "  flex-grow: 1;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel {\n",
-       "  display: flex;\n",
-       "  align-items: stretch;\n",
-       "  justify-content: center;\n",
-       "  background-color: var(--sklearn-color-background);\n",
-       "  position: relative;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel-item {\n",
-       "  display: flex;\n",
-       "  flex-direction: column;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
-       "  align-self: flex-end;\n",
-       "  width: 50%;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
-       "  align-self: flex-start;\n",
-       "  width: 50%;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
-       "  width: 0;\n",
-       "}\n",
-       "\n",
-       "/* Serial-specific style estimator block */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-serial {\n",
-       "  display: flex;\n",
-       "  flex-direction: column;\n",
-       "  align-items: center;\n",
-       "  background-color: var(--sklearn-color-background);\n",
-       "  padding-right: 1em;\n",
-       "  padding-left: 1em;\n",
-       "}\n",
-       "\n",
-       "\n",
-       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
-       "clickable and can be expanded/collapsed.\n",
-       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
-       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
-       "*/\n",
-       "\n",
-       "/* Pipeline and ColumnTransformer style (default) */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-toggleable {\n",
-       "  /* Default theme specific background. It is overwritten whether we have a\n",
-       "  specific estimator or a Pipeline/ColumnTransformer */\n",
-       "  background-color: var(--sklearn-color-background);\n",
-       "}\n",
-       "\n",
-       "/* Toggleable label */\n",
-       "#sk-container-id-2 label.sk-toggleable__label {\n",
-       "  cursor: pointer;\n",
-       "  display: flex;\n",
-       "  width: 100%;\n",
-       "  margin-bottom: 0;\n",
-       "  padding: 0.5em;\n",
-       "  box-sizing: border-box;\n",
-       "  text-align: center;\n",
-       "  align-items: center;\n",
-       "  justify-content: center;\n",
-       "  gap: 0.5em;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 label.sk-toggleable__label .caption {\n",
-       "  font-size: 0.6rem;\n",
-       "  font-weight: lighter;\n",
-       "  color: var(--sklearn-color-text-muted);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
-       "  /* Arrow on the left of the label */\n",
-       "  content: \"▸\";\n",
-       "  float: left;\n",
-       "  margin-right: 0.25em;\n",
-       "  color: var(--sklearn-color-icon);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
-       "  color: var(--sklearn-color-text);\n",
-       "}\n",
-       "\n",
-       "/* Toggleable content - dropdown */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-toggleable__content {\n",
-       "  display: none;\n",
-       "  text-align: left;\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-0);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-toggleable__content pre {\n",
-       "  margin: 0.2em;\n",
-       "  border-radius: 0.25em;\n",
-       "  color: var(--sklearn-color-text);\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-0);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
-       "  /* Expand drop-down */\n",
-       "  display: block;\n",
-       "  width: 100%;\n",
-       "  overflow: visible;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
-       "  content: \"▾\";\n",
-       "}\n",
-       "\n",
-       "/* Pipeline/ColumnTransformer-specific style */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
-       "  color: var(--sklearn-color-text);\n",
-       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
-       "  background-color: var(--sklearn-color-fitted-level-2);\n",
-       "}\n",
-       "\n",
-       "/* Estimator-specific style */\n",
-       "\n",
-       "/* Colorize estimator box */\n",
-       "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-2);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
-       "#sk-container-id-2 div.sk-label label {\n",
-       "  /* The background is the default theme color */\n",
-       "  color: var(--sklearn-color-text-on-default-background);\n",
-       "}\n",
-       "\n",
-       "/* On hover, darken the color of the background */\n",
-       "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
-       "  color: var(--sklearn-color-text);\n",
-       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
-       "}\n",
-       "\n",
-       "/* Label box, darken color on hover, fitted */\n",
-       "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
-       "  color: var(--sklearn-color-text);\n",
-       "  background-color: var(--sklearn-color-fitted-level-2);\n",
-       "}\n",
-       "\n",
-       "/* Estimator label */\n",
-       "\n",
-       "#sk-container-id-2 div.sk-label label {\n",
-       "  font-family: monospace;\n",
-       "  font-weight: bold;\n",
-       "  line-height: 1.2em;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-label-container {\n",
-       "  text-align: center;\n",
-       "}\n",
-       "\n",
-       "/* Estimator-specific */\n",
-       "#sk-container-id-2 div.sk-estimator {\n",
-       "  font-family: monospace;\n",
-       "  border: 1px dotted var(--sklearn-color-border-box);\n",
-       "  border-radius: 0.25em;\n",
-       "  box-sizing: border-box;\n",
-       "  margin-bottom: 0.5em;\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-estimator.fitted {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-0);\n",
-       "}\n",
-       "\n",
-       "/* on hover */\n",
-       "#sk-container-id-2 div.sk-estimator:hover {\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-2);\n",
-       "}\n",
-       "\n",
-       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
-       "\n",
-       "/* Common style for \"i\" and \"?\" */\n",
-       "\n",
-       ".sk-estimator-doc-link,\n",
-       "a:link.sk-estimator-doc-link,\n",
-       "a:visited.sk-estimator-doc-link {\n",
-       "  float: right;\n",
-       "  font-size: smaller;\n",
-       "  line-height: 1em;\n",
-       "  font-family: monospace;\n",
-       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
-       "  border-radius: 1em;\n",
-       "  height: 1em;\n",
-       "  width: 1em;\n",
-       "  text-decoration: none !important;\n",
-       "  margin-left: 0.5em;\n",
-       "  text-align: center;\n",
-       "  /* unfitted */\n",
-       "  border: var(--sklearn-color-unfitted-level-3) 1pt solid;\n",
-       "  color: var(--sklearn-color-unfitted-level-3);\n",
-       "}\n",
-       "\n",
-       ".sk-estimator-doc-link.fitted,\n",
-       "a:link.sk-estimator-doc-link.fitted,\n",
-       "a:visited.sk-estimator-doc-link.fitted {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-0);\n",
-       "  border: var(--sklearn-color-fitted-level-3) 1pt solid;\n",
-       "  color: var(--sklearn-color-fitted-level-3);\n",
-       "}\n",
-       "\n",
-       "/* On hover */\n",
-       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
-       ".sk-estimator-doc-link:hover,\n",
-       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
-       ".sk-estimator-doc-link:hover {\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
-       "  border: var(--sklearn-color-fitted-level-0) 1pt solid;\n",
-       "  color: var(--sklearn-color-unfitted-level-0);\n",
-       "  text-decoration: none;\n",
-       "}\n",
-       "\n",
-       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
-       ".sk-estimator-doc-link.fitted:hover,\n",
-       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
-       ".sk-estimator-doc-link.fitted:hover {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-3);\n",
-       "  border: var(--sklearn-color-fitted-level-0) 1pt solid;\n",
-       "  color: var(--sklearn-color-fitted-level-0);\n",
-       "  text-decoration: none;\n",
-       "}\n",
-       "\n",
-       "/* Span, style for the box shown on hovering the info icon */\n",
-       ".sk-estimator-doc-link span {\n",
-       "  display: none;\n",
-       "  z-index: 9999;\n",
-       "  position: relative;\n",
-       "  font-weight: normal;\n",
-       "  right: .2ex;\n",
-       "  padding: .5ex;\n",
-       "  margin: .5ex;\n",
-       "  width: min-content;\n",
-       "  min-width: 20ex;\n",
-       "  max-width: 50ex;\n",
-       "  color: var(--sklearn-color-text);\n",
-       "  box-shadow: 2pt 2pt 4pt #999;\n",
-       "  /* unfitted */\n",
-       "  background: var(--sklearn-color-unfitted-level-0);\n",
-       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
-       "}\n",
-       "\n",
-       ".sk-estimator-doc-link.fitted span {\n",
-       "  /* fitted */\n",
-       "  background: var(--sklearn-color-fitted-level-0);\n",
-       "  border: var(--sklearn-color-fitted-level-3);\n",
-       "}\n",
-       "\n",
-       ".sk-estimator-doc-link:hover span {\n",
-       "  display: block;\n",
-       "}\n",
-       "\n",
-       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
-       "\n",
-       "#sk-container-id-2 a.estimator_doc_link {\n",
-       "  float: right;\n",
-       "  font-size: 1rem;\n",
-       "  line-height: 1em;\n",
-       "  font-family: monospace;\n",
-       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
-       "  border-radius: 1rem;\n",
-       "  height: 1rem;\n",
-       "  width: 1rem;\n",
-       "  text-decoration: none;\n",
-       "  /* unfitted */\n",
-       "  color: var(--sklearn-color-unfitted-level-1);\n",
-       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-0);\n",
-       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
-       "  color: var(--sklearn-color-fitted-level-1);\n",
-       "}\n",
-       "\n",
-       "/* On hover */\n",
-       "#sk-container-id-2 a.estimator_doc_link:hover {\n",
-       "  /* unfitted */\n",
-       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
-       "  color: var(--sklearn-color-background);\n",
-       "  text-decoration: none;\n",
-       "}\n",
-       "\n",
-       "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
-       "  /* fitted */\n",
-       "  background-color: var(--sklearn-color-fitted-level-3);\n",
-       "}\n",
-       "\n",
-       ".estimator-table {\n",
-       "    font-family: monospace;\n",
-       "}\n",
-       "\n",
-       ".estimator-table summary {\n",
-       "    padding: .5rem;\n",
-       "    cursor: pointer;\n",
-       "}\n",
-       "\n",
-       ".estimator-table summary::marker {\n",
-       "    font-size: 0.7rem;\n",
-       "}\n",
-       "\n",
-       ".estimator-table details[open] {\n",
-       "    padding-left: 0.1rem;\n",
-       "    padding-right: 0.1rem;\n",
-       "    padding-bottom: 0.3rem;\n",
-       "}\n",
-       "\n",
-       ".estimator-table .parameters-table {\n",
-       "    margin-left: auto !important;\n",
-       "    margin-right: auto !important;\n",
-       "    margin-top: 0;\n",
-       "}\n",
-       "\n",
-       ".estimator-table .parameters-table tr:nth-child(odd) {\n",
-       "    background-color: #fff;\n",
-       "}\n",
-       "\n",
-       ".estimator-table .parameters-table tr:nth-child(even) {\n",
-       "    background-color: #f6f6f6;\n",
-       "}\n",
-       "\n",
-       ".estimator-table .parameters-table tr:hover {\n",
-       "    background-color: #e0e0e0;\n",
-       "}\n",
-       "\n",
-       ".estimator-table table td {\n",
-       "    border: 1px solid rgba(106, 105, 104, 0.232);\n",
-       "}\n",
-       "\n",
-       "/*\n",
-       "    `table td`is set in notebook with right text-align.\n",
-       "    We need to overwrite it.\n",
-       "*/\n",
-       ".estimator-table table td.param {\n",
-       "    text-align: left;\n",
-       "    position: relative;\n",
-       "    padding: 0;\n",
-       "}\n",
-       "\n",
-       ".user-set td {\n",
-       "    color:rgb(255, 94, 0);\n",
-       "    text-align: left !important;\n",
-       "}\n",
-       "\n",
-       ".user-set td.value {\n",
-       "    color:rgb(255, 94, 0);\n",
-       "    background-color: transparent;\n",
-       "}\n",
-       "\n",
-       ".default td {\n",
-       "    color: black;\n",
-       "    text-align: left !important;\n",
-       "}\n",
-       "\n",
-       ".user-set td i,\n",
-       ".default td i {\n",
-       "    color: black;\n",
-       "}\n",
-       "\n",
-       "/*\n",
-       "    Styles for parameter documentation links\n",
-       "    We need styling for visited so jupyter doesn't overwrite it\n",
-       "*/\n",
-       "a.param-doc-link,\n",
-       "a.param-doc-link:link,\n",
-       "a.param-doc-link:visited {\n",
-       "    text-decoration: underline dashed;\n",
-       "    text-underline-offset: .3em;\n",
-       "    color: inherit;\n",
-       "    display: block;\n",
-       "    padding: .5em;\n",
-       "}\n",
-       "\n",
-       "/* \"hack\" to make the entire area of the cell containing the link clickable */\n",
-       "a.param-doc-link::before {\n",
-       "    position: absolute;\n",
-       "    content: \"\";\n",
-       "    inset: 0;\n",
-       "}\n",
-       "\n",
-       ".param-doc-description {\n",
-       "    display: none;\n",
-       "    position: absolute;\n",
-       "    z-index: 9999;\n",
-       "    left: 0;\n",
-       "    padding: .5ex;\n",
-       "    margin-left: 1.5em;\n",
-       "    color: var(--sklearn-color-text);\n",
-       "    box-shadow: .3em .3em .4em #999;\n",
-       "    width: max-content;\n",
-       "    text-align: left;\n",
-       "    max-height: 10em;\n",
-       "    overflow-y: auto;\n",
-       "\n",
-       "    /* unfitted */\n",
-       "    background: var(--sklearn-color-unfitted-level-0);\n",
-       "    border: thin solid var(--sklearn-color-unfitted-level-3);\n",
-       "}\n",
-       "\n",
-       "/* Fitted state for parameter tooltips */\n",
-       ".fitted .param-doc-description {\n",
-       "    /* fitted */\n",
-       "    background: var(--sklearn-color-fitted-level-0);\n",
-       "    border: thin solid var(--sklearn-color-fitted-level-3);\n",
-       "}\n",
-       "\n",
-       ".param-doc-link:hover .param-doc-description {\n",
-       "    display: block;\n",
-       "}\n",
-       "\n",
-       ".copy-paste-icon {\n",
-       "    background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0NDggNTEyIj48IS0tIUZvbnQgQXdlc29tZSBGcmVlIDYuNy4yIGJ5IEBmb250YXdlc29tZSAtIGh0dHBzOi8vZm9udGF3ZXNvbWUuY29tIExpY2Vuc2UgLSBodHRwczovL2ZvbnRhd2Vzb21lLmNvbS9saWNlbnNlL2ZyZWUgQ29weXJpZ2h0IDIwMjUgRm9udGljb25zLCBJbmMuLS0+PHBhdGggZD0iTTIwOCAwTDMzMi4xIDBjMTIuNyAwIDI0LjkgNS4xIDMzLjkgMTQuMWw2Ny45IDY3LjljOSA5IDE0LjEgMjEuMiAxNC4xIDMzLjlMNDQ4IDMzNmMwIDI2LjUtMjEuNSA0OC00OCA0OGwtMTkyIDBjLTI2LjUgMC00OC0yMS41LTQ4LTQ4bDAtMjg4YzAtMjYuNSAyMS41LTQ4IDQ4LTQ4ek00OCAxMjhsODAgMCAwIDY0LTY0IDAgMCAyNTYgMTkyIDAgMC0zMiA2NCAwIDAgNDhjMCAyNi41LTIxLjUgNDgtNDggNDhMNDggNTEyYy0yNi41IDAtNDgtMjEuNS00OC00OEwwIDE3NmMwLTI2LjUgMjEuNS00OCA0OC00OHoiLz48L3N2Zz4=);\n",
-       "    background-repeat: no-repeat;\n",
-       "    background-size: 14px 14px;\n",
-       "    background-position: 0;\n",
-       "    display: inline-block;\n",
-       "    width: 14px;\n",
-       "    height: 14px;\n",
-       "    cursor: pointer;\n",
-       "}\n",
-       "</style><body><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
-       "              colsample_bylevel=None, colsample_bynode=None,\n",
-       "              colsample_bytree=0.3, device=None, early_stopping_rounds=None,\n",
-       "              enable_categorical=False, eval_metric=&#x27;logloss&#x27;,\n",
-       "              feature_types=None, feature_weights=None, gamma=None,\n",
-       "              grow_policy=None, importance_type=None,\n",
-       "              interaction_constraints=None, learning_rate=0.05, max_bin=None,\n",
-       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
-       "              max_delta_step=None, max_depth=4, max_leaves=None,\n",
-       "              min_child_weight=3, missing=nan, monotone_constraints=None,\n",
-       "              multi_strategy=None, n_estimators=500, n_jobs=2,\n",
-       "              num_parallel_tree=None, ...)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>XGBClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier\">?<span>Documentation for XGBClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\" data-param-prefix=\"\">\n",
-       "        <div class=\"estimator-table\">\n",
-       "            <details>\n",
-       "                <summary>Parameters</summary>\n",
-       "                <table class=\"parameters-table\">\n",
-       "                  <tbody>\n",
-       "                    \n",
-       "        <tr class=\"default\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('objective',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=objective,-typing.Union%5Bstr%2C%20xgboost.sklearn._SklObjWProto%2C%20typing.Callable%5B%5Btyping.Any%2C%20typing.Any%5D%2C%20typing.Tuple%5Bnumpy.ndarray%2C%20numpy.ndarray%5D%5D%2C%20NoneType%5D\">\n",
-       "            objective\n",
-       "            <span class=\"param-doc-description\">objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]<br><br>Specify the learning task and the corresponding learning objective or a custom<br>objective function to be used.<br><br>For custom objective, see :doc:`/tutorials/custom_metric_obj` and<br>:ref:`custom-obj-metric` for more information, along with the end note for<br>function signatures.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">&#x27;binary:logistic&#x27;</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('base_score',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=base_score,-typing.Union%5Bfloat%2C%20typing.List%5Bfloat%5D%2C%20NoneType%5D\">\n",
-       "            base_score\n",
-       "            <span class=\"param-doc-description\">base_score: typing.Union[float, typing.List[float], NoneType]<br><br>The initial prediction score of all instances, global bias.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('booster',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">booster</td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('callbacks',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=callbacks,-typing.Optional%5Btyping.List%5Bxgboost.callback.TrainingCallback%5D%5D\">\n",
-       "            callbacks\n",
-       "            <span class=\"param-doc-description\">callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]]<br><br>List of callback functions that are applied at end of each iteration.<br>It is possible to use predefined callbacks by using<br>:ref:`Callback API <callback_api>`.<br><br>.. note::<br><br>   States in callback are not preserved during training, which means callback<br>   objects can not be reused for multiple training sessions without<br>   reinitialization or deepcopy.<br><br>.. code-block:: python<br><br>    for params in parameters_grid:<br>        # be sure to (re)initialize the callbacks before each run<br>        callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]<br>        reg = xgboost.XGBRegressor(**params, callbacks=callbacks)<br>        reg.fit(X, y)</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('colsample_bylevel',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=colsample_bylevel,-typing.Optional%5Bfloat%5D\">\n",
-       "            colsample_bylevel\n",
-       "            <span class=\"param-doc-description\">colsample_bylevel: typing.Optional[float]<br><br>Subsample ratio of columns for each level.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('colsample_bynode',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=colsample_bynode,-typing.Optional%5Bfloat%5D\">\n",
-       "            colsample_bynode\n",
-       "            <span class=\"param-doc-description\">colsample_bynode: typing.Optional[float]<br><br>Subsample ratio of columns for each split.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('colsample_bytree',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=colsample_bytree,-typing.Optional%5Bfloat%5D\">\n",
-       "            colsample_bytree\n",
-       "            <span class=\"param-doc-description\">colsample_bytree: typing.Optional[float]<br><br>Subsample ratio of columns when constructing each tree.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">0.3</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('device',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=device,-typing.Optional%5Bstr%5D\">\n",
-       "            device\n",
-       "            <span class=\"param-doc-description\">device: typing.Optional[str]<br><br>.. versionadded:: 2.0.0<br><br>Device ordinal, available options are `cpu`, `cuda`, and `gpu`.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('early_stopping_rounds',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=early_stopping_rounds,-typing.Optional%5Bint%5D\">\n",
-       "            early_stopping_rounds\n",
-       "            <span class=\"param-doc-description\">early_stopping_rounds: typing.Optional[int]<br><br>.. versionadded:: 1.6.0<br><br>- Activates early stopping. Validation metric needs to improve at least once in<br>  every **early_stopping_rounds** round(s) to continue training.  Requires at<br>  least one item in **eval_set** in :py:meth:`fit`.<br><br>- If early stopping occurs, the model will have two additional attributes:<br>  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the<br>  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal<br>  number of trees during inference. If users want to access the full model<br>  (including trees built after early stopping), they can specify the<br>  `iteration_range` in these inference methods. In addition, other utilities<br>  like model plotting can also use the entire model.<br><br>- If you prefer to discard the trees after `best_iteration`, consider using the<br>  callback function :py:class:`xgboost.callback.EarlyStopping`.<br><br>- If there's more than one item in **eval_set**, the last entry will be used for<br>  early stopping.  If there's more than one metric in **eval_metric**, the last<br>  metric will be used for early stopping.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('enable_categorical',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=enable_categorical,-bool\">\n",
-       "            enable_categorical\n",
-       "            <span class=\"param-doc-description\">enable_categorical: bool<br><br>See the same parameter of :py:class:`DMatrix` for details.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">False</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('eval_metric',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=eval_metric,-typing.Union%5Bstr%2C%20typing.List%5Btyping.Union%5Bstr%2C%20typing.Callable%5D%5D%2C%20typing.Callable%2C%20NoneType%5D\">\n",
-       "            eval_metric\n",
-       "            <span class=\"param-doc-description\">eval_metric: typing.Union[str, typing.List[typing.Union[str, typing.Callable]], typing.Callable, NoneType]<br><br>.. versionadded:: 1.6.0<br><br>Metric used for monitoring the training result and early stopping.  It can be a<br>string or list of strings as names of predefined metric in XGBoost (See<br>:doc:`/parameter`), one of the metrics in :py:mod:`sklearn.metrics`, or any<br>other user defined metric that looks like `sklearn.metrics`.<br><br>If custom objective is also provided, then custom metric should implement the<br>corresponding reverse link function.<br><br>Unlike the `scoring` parameter commonly used in scikit-learn, when a callable<br>object is provided, it's assumed to be a cost function and by default XGBoost<br>will minimize the result during early stopping.<br><br>For advanced usage on Early stopping like directly choosing to maximize instead<br>of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.<br><br>See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more<br>information.<br><br>.. code-block:: python<br><br>    from sklearn.datasets import load_diabetes<br>    from sklearn.metrics import mean_absolute_error<br>    X, y = load_diabetes(return_X_y=True)<br>    reg = xgb.XGBRegressor(<br>        tree_method=\"hist\",<br>        eval_metric=mean_absolute_error,<br>    )<br>    reg.fit(X, y, eval_set=[(X, y)])</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">&#x27;logloss&#x27;</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('feature_types',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=feature_types,-typing.Optional%5Btyping.Sequence%5Bstr%5D%5D\">\n",
-       "            feature_types\n",
-       "            <span class=\"param-doc-description\">feature_types: typing.Optional[typing.Sequence[str]]<br><br>.. versionadded:: 1.7.0<br><br>Used for specifying feature types without constructing a dataframe. See<br>the :py:class:`DMatrix` for details.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('feature_weights',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=feature_weights,-Optional%5BArrayLike%5D\">\n",
-       "            feature_weights\n",
-       "            <span class=\"param-doc-description\">feature_weights: Optional[ArrayLike]<br><br>Weight for each feature, defines the probability of each feature being selected<br>when colsample is being used.  All values must be greater than 0, otherwise a<br>`ValueError` is thrown.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('gamma',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=gamma,-typing.Optional%5Bfloat%5D\">\n",
-       "            gamma\n",
-       "            <span class=\"param-doc-description\">gamma: typing.Optional[float]<br><br>(min_split_loss) Minimum loss reduction required to make a further partition on<br>a leaf node of the tree.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('grow_policy',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=grow_policy,-typing.Optional%5Bstr%5D\">\n",
-       "            grow_policy\n",
-       "            <span class=\"param-doc-description\">grow_policy: typing.Optional[str]<br><br>Tree growing policy.<br><br>- depthwise: Favors splitting at nodes closest to the node,<br>- lossguide: Favors splitting at nodes with highest loss change.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('importance_type',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">importance_type</td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('interaction_constraints',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=interaction_constraints,-typing.Union%5Bstr%2C%20typing.List%5Btyping.Tuple%5Bstr%5D%5D%2C%20NoneType%5D\">\n",
-       "            interaction_constraints\n",
-       "            <span class=\"param-doc-description\">interaction_constraints: typing.Union[str, typing.List[typing.Tuple[str]], NoneType]<br><br>Constraints for interaction representing permitted interactions.  The<br>constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,<br>3, 4]]``, where each inner list is a group of indices of features that are<br>allowed to interact with each other.  See :doc:`tutorial<br></tutorials/feature_interaction_constraint>` for more information</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('learning_rate',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=learning_rate,-typing.Optional%5Bfloat%5D\">\n",
-       "            learning_rate\n",
-       "            <span class=\"param-doc-description\">learning_rate: typing.Optional[float]<br><br>Boosting learning rate (xgb's \"eta\")</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">0.05</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_bin',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_bin,-typing.Optional%5Bint%5D\">\n",
-       "            max_bin\n",
-       "            <span class=\"param-doc-description\">max_bin: typing.Optional[int]<br><br>If using histogram-based algorithm, maximum number of bins per feature</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_cat_threshold',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_cat_threshold,-typing.Optional%5Bint%5D\">\n",
-       "            max_cat_threshold\n",
-       "            <span class=\"param-doc-description\">max_cat_threshold: typing.Optional[int]<br><br>.. versionadded:: 1.7.0<br><br>.. note:: This parameter is experimental<br><br>Maximum number of categories considered for each split. Used only by<br>partition-based splits for preventing over-fitting. Also, `enable_categorical`<br>needs to be set to have categorical feature support. See :doc:`Categorical Data<br></tutorials/categorical>` and :ref:`cat-param` for details.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_cat_to_onehot',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_cat_to_onehot,-Optional%5Bint%5D\">\n",
-       "            max_cat_to_onehot\n",
-       "            <span class=\"param-doc-description\">max_cat_to_onehot: Optional[int]<br><br>.. versionadded:: 1.6.0<br><br>.. note:: This parameter is experimental<br><br>A threshold for deciding whether XGBoost should use one-hot encoding based split<br>for categorical data.  When number of categories is lesser than the threshold<br>then one-hot encoding is chosen, otherwise the categories will be partitioned<br>into children nodes. Also, `enable_categorical` needs to be set to have<br>categorical feature support. See :doc:`Categorical Data<br></tutorials/categorical>` and :ref:`cat-param` for details.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_delta_step',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_delta_step,-typing.Optional%5Bfloat%5D\">\n",
-       "            max_delta_step\n",
-       "            <span class=\"param-doc-description\">max_delta_step: typing.Optional[float]<br><br>Maximum delta step we allow each tree's weight estimation to be.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_depth',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_depth,-%20typing.Optional%5Bint%5D\">\n",
-       "            max_depth\n",
-       "            <span class=\"param-doc-description\">max_depth:  typing.Optional[int]<br><br>Maximum tree depth for base learners.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">4</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('max_leaves',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=max_leaves,-typing.Optional%5Bint%5D\">\n",
-       "            max_leaves\n",
-       "            <span class=\"param-doc-description\">max_leaves: typing.Optional[int]<br><br>Maximum number of leaves; 0 indicates no limit.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('min_child_weight',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=min_child_weight,-typing.Optional%5Bfloat%5D\">\n",
-       "            min_child_weight\n",
-       "            <span class=\"param-doc-description\">min_child_weight: typing.Optional[float]<br><br>Minimum sum of instance weight(hessian) needed in a child.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">3</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('missing',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=missing,-float\">\n",
-       "            missing\n",
-       "            <span class=\"param-doc-description\">missing: float<br><br>Value in the data which needs to be present as a missing value. Default to<br>:py:data:`numpy.nan`.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">nan</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('monotone_constraints',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=monotone_constraints,-typing.Union%5Btyping.Dict%5Bstr%2C%20int%5D%2C%20str%2C%20NoneType%5D\">\n",
-       "            monotone_constraints\n",
-       "            <span class=\"param-doc-description\">monotone_constraints: typing.Union[typing.Dict[str, int], str, NoneType]<br><br>Constraint of variable monotonicity.  See :doc:`tutorial </tutorials/monotonic>`<br>for more information.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('multi_strategy',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=multi_strategy,-typing.Optional%5Bstr%5D\">\n",
-       "            multi_strategy\n",
-       "            <span class=\"param-doc-description\">multi_strategy: typing.Optional[str]<br><br>.. versionadded:: 2.0.0<br><br>.. note:: This parameter is working-in-progress.<br><br>The strategy used for training multi-target models, including multi-target<br>regression and multi-class classification. See :doc:`/tutorials/multioutput` for<br>more information.<br><br>- ``one_output_per_tree``: One model for each target.<br>- ``multi_output_tree``:  Use multi-target trees.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('n_estimators',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=n_estimators,-Optional%5Bint%5D\">\n",
-       "            n_estimators\n",
-       "            <span class=\"param-doc-description\">n_estimators: Optional[int]<br><br>Number of boosting rounds.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">500</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('n_jobs',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=n_jobs,-typing.Optional%5Bint%5D\">\n",
-       "            n_jobs\n",
-       "            <span class=\"param-doc-description\">n_jobs: typing.Optional[int]<br><br>Number of parallel threads used to run xgboost.  When used with other<br>Scikit-Learn algorithms like grid search, you may choose which algorithm to<br>parallelize and balance the threads.  Creating thread contention will<br>significantly slow down both algorithms.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">2</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('num_parallel_tree',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">num_parallel_tree</td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('random_state',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=random_state,-typing.Union%5Bnumpy.random.mtrand.RandomState%2C%20numpy.random._generator.Generator%2C%20int%2C%20NoneType%5D\">\n",
-       "            random_state\n",
-       "            <span class=\"param-doc-description\">random_state: typing.Union[numpy.random.mtrand.RandomState, numpy.random._generator.Generator, int, NoneType]<br><br>Random number seed.<br><br>.. note::<br><br>   Using gblinear booster with shotgun updater is nondeterministic as<br>   it uses Hogwild algorithm.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">42</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('reg_alpha',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=reg_alpha,-typing.Optional%5Bfloat%5D\">\n",
-       "            reg_alpha\n",
-       "            <span class=\"param-doc-description\">reg_alpha: typing.Optional[float]<br><br>L1 regularization term on weights (xgb's alpha).</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">1.0</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('reg_lambda',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=reg_lambda,-typing.Optional%5Bfloat%5D\">\n",
-       "            reg_lambda\n",
-       "            <span class=\"param-doc-description\">reg_lambda: typing.Optional[float]<br><br>L2 regularization term on weights (xgb's lambda).</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">5.0</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('sampling_method',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=sampling_method,-typing.Optional%5Bstr%5D\">\n",
-       "            sampling_method\n",
-       "            <span class=\"param-doc-description\">sampling_method: typing.Optional[str]<br><br>Sampling method. Used only by the GPU version of ``hist`` tree method.<br><br>- ``uniform``: Select random training instances uniformly.<br>- ``gradient_based``: Select random training instances with higher probability<br>    when the gradient and hessian are larger. (cf. CatBoost)</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('scale_pos_weight',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=scale_pos_weight,-typing.Optional%5Bfloat%5D\">\n",
-       "            scale_pos_weight\n",
-       "            <span class=\"param-doc-description\">scale_pos_weight: typing.Optional[float]<br><br>Balancing of positive and negative weights.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('subsample',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=subsample,-typing.Optional%5Bfloat%5D\">\n",
-       "            subsample\n",
-       "            <span class=\"param-doc-description\">subsample: typing.Optional[float]<br><br>Subsample ratio of the training instance.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">0.8</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('tree_method',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=tree_method,-typing.Optional%5Bstr%5D\">\n",
-       "            tree_method\n",
-       "            <span class=\"param-doc-description\">tree_method: typing.Optional[str]<br><br>Specify which tree method to use.  Default to auto.  If this parameter is set to<br>default, XGBoost will choose the most conservative option available.  It's<br>recommended to study this option from the parameters document :doc:`tree method<br></treemethod>`</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">&#x27;hist&#x27;</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('validate_parameters',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=validate_parameters,-typing.Optional%5Bbool%5D\">\n",
-       "            validate_parameters\n",
-       "            <span class=\"param-doc-description\">validate_parameters: typing.Optional[bool]<br><br>Give warnings for unknown parameter.</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "\n",
-       "        <tr class=\"user-set\">\n",
-       "            <td><i class=\"copy-paste-icon\"\n",
-       "                 onclick=\"copyToClipboard('verbosity',\n",
-       "                          this.parentElement.nextElementSibling)\"\n",
-       "            ></i></td>\n",
-       "            <td class=\"param\">\n",
-       "        <a class=\"param-doc-link\"\n",
-       "            rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier#:~:text=verbosity,-typing.Optional%5Bint%5D\">\n",
-       "            verbosity\n",
-       "            <span class=\"param-doc-description\">verbosity: typing.Optional[int]<br><br>The degree of verbosity. Valid values are 0 (silent) - 3 (debug).</span>\n",
-       "        </a>\n",
-       "    </td>\n",
-       "            <td class=\"value\">None</td>\n",
-       "        </tr>\n",
-       "    \n",
-       "                  </tbody>\n",
-       "                </table>\n",
-       "            </details>\n",
-       "        </div>\n",
-       "    </div></div></div></div></div><script>function copyToClipboard(text, element) {\n",
-       "    // Get the parameter prefix from the closest toggleable content\n",
-       "    const toggleableContent = element.closest('.sk-toggleable__content');\n",
-       "    const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
-       "    const fullParamName = paramPrefix ? `${paramPrefix}${text}` : text;\n",
-       "\n",
-       "    const originalStyle = element.style;\n",
-       "    const computedStyle = window.getComputedStyle(element);\n",
-       "    const originalWidth = computedStyle.width;\n",
-       "    const originalHTML = element.innerHTML.replace('Copied!', '');\n",
-       "\n",
-       "    navigator.clipboard.writeText(fullParamName)\n",
-       "        .then(() => {\n",
-       "            element.style.width = originalWidth;\n",
-       "            element.style.color = 'green';\n",
-       "            element.innerHTML = \"Copied!\";\n",
-       "\n",
-       "            setTimeout(() => {\n",
-       "                element.innerHTML = originalHTML;\n",
-       "                element.style = originalStyle;\n",
-       "            }, 2000);\n",
-       "        })\n",
-       "        .catch(err => {\n",
-       "            console.error('Failed to copy:', err);\n",
-       "            element.style.color = 'red';\n",
-       "            element.innerHTML = \"Failed!\";\n",
-       "            setTimeout(() => {\n",
-       "                element.innerHTML = originalHTML;\n",
-       "                element.style = originalStyle;\n",
-       "            }, 2000);\n",
-       "        });\n",
-       "    return false;\n",
-       "}\n",
-       "\n",
-       "document.querySelectorAll('.copy-paste-icon').forEach(function(element) {\n",
-       "    const toggleableContent = element.closest('.sk-toggleable__content');\n",
-       "    const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
-       "    const paramName = element.parentElement.nextElementSibling\n",
-       "        .textContent.trim().split(' ')[0];\n",
-       "    const fullParamName = paramPrefix ? `${paramPrefix}${paramName}` : paramName;\n",
-       "\n",
-       "    element.setAttribute('title', fullParamName);\n",
-       "});\n",
-       "\n",
-       "\n",
-       "/**\n",
-       " * Adapted from Skrub\n",
-       " * https://github.com/skrub-data/skrub/blob/403466d1d5d4dc76a7ef569b3f8228db59a31dc3/skrub/_reporting/_data/templates/report.js#L789\n",
-       " * @returns \"light\" or \"dark\"\n",
-       " */\n",
-       "function detectTheme(element) {\n",
-       "    const body = document.querySelector('body');\n",
-       "\n",
-       "    // Check VSCode theme\n",
-       "    const themeKindAttr = body.getAttribute('data-vscode-theme-kind');\n",
-       "    const themeNameAttr = body.getAttribute('data-vscode-theme-name');\n",
-       "\n",
-       "    if (themeKindAttr && themeNameAttr) {\n",
-       "        const themeKind = themeKindAttr.toLowerCase();\n",
-       "        const themeName = themeNameAttr.toLowerCase();\n",
-       "\n",
-       "        if (themeKind.includes(\"dark\") || themeName.includes(\"dark\")) {\n",
-       "            return \"dark\";\n",
-       "        }\n",
-       "        if (themeKind.includes(\"light\") || themeName.includes(\"light\")) {\n",
-       "            return \"light\";\n",
-       "        }\n",
-       "    }\n",
-       "\n",
-       "    // Check Jupyter theme\n",
-       "    if (body.getAttribute('data-jp-theme-light') === 'false') {\n",
-       "        return 'dark';\n",
-       "    } else if (body.getAttribute('data-jp-theme-light') === 'true') {\n",
-       "        return 'light';\n",
-       "    }\n",
-       "\n",
-       "    // Guess based on a parent element's color\n",
-       "    const color = window.getComputedStyle(element.parentNode, null).getPropertyValue('color');\n",
-       "    const match = color.match(/^rgb\\s*\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)\\s*$/i);\n",
-       "    if (match) {\n",
-       "        const [r, g, b] = [\n",
-       "            parseFloat(match[1]),\n",
-       "            parseFloat(match[2]),\n",
-       "            parseFloat(match[3])\n",
-       "        ];\n",
-       "\n",
-       "        // https://en.wikipedia.org/wiki/HSL_and_HSV#Lightness\n",
-       "        const luma = 0.299 * r + 0.587 * g + 0.114 * b;\n",
-       "\n",
-       "        if (luma > 180) {\n",
-       "            // If the text is very bright we have a dark theme\n",
-       "            return 'dark';\n",
-       "        }\n",
-       "        if (luma < 75) {\n",
-       "            // If the text is very dark we have a light theme\n",
-       "            return 'light';\n",
-       "        }\n",
-       "        // Otherwise fall back to the next heuristic.\n",
-       "    }\n",
-       "\n",
-       "    // Fallback to system preference\n",
-       "    return window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';\n",
-       "}\n",
-       "\n",
-       "\n",
-       "function forceTheme(elementId) {\n",
-       "    const estimatorElement = document.querySelector(`#${elementId}`);\n",
-       "    if (estimatorElement === null) {\n",
-       "        console.error(`Element with id ${elementId} not found.`);\n",
-       "    } else {\n",
-       "        const theme = detectTheme(estimatorElement);\n",
-       "        estimatorElement.classList.add(theme);\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "forceTheme('sk-container-id-2');</script></body>"
-      ],
-      "text/plain": [
-       "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
-       "              colsample_bylevel=None, colsample_bynode=None,\n",
-       "              colsample_bytree=0.3, device=None, early_stopping_rounds=None,\n",
-       "              enable_categorical=False, eval_metric='logloss',\n",
-       "              feature_types=None, feature_weights=None, gamma=None,\n",
-       "              grow_policy=None, importance_type=None,\n",
-       "              interaction_constraints=None, learning_rate=0.05, max_bin=None,\n",
-       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
-       "              max_delta_step=None, max_depth=4, max_leaves=None,\n",
-       "              min_child_weight=3, missing=nan, monotone_constraints=None,\n",
-       "              multi_strategy=None, n_estimators=500, n_jobs=2,\n",
-       "              num_parallel_tree=None, ...)"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from xgboost import XGBClassifier\n",
-    "\n",
-    "model = XGBClassifier(\n",
-    "    objective=\"binary:logistic\",\n",
-    "    eval_metric=\"logloss\",\n",
-    "    n_estimators=500,\n",
-    "    max_depth=4,\n",
-    "    learning_rate=0.05,\n",
-    "    subsample=0.8,\n",
-    "    colsample_bytree=0.3,\n",
-    "    min_child_weight=3,\n",
-    "    reg_lambda=5.0,\n",
-    "    reg_alpha=1.0,\n",
-    "    random_state=42,\n",
-    "    n_jobs=2,\n",
-    "    tree_method=\"hist\",\n",
-    ")\n",
-    "model.fit(X_by_split[\"train\"], y_by_split[\"train\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "84f539a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Column 1 of predict_proba is taken as the positive-class probability for each validation row.\n",
-    "# Fail loudly when the model cannot produce probabilities: silently skipping the assignment\n",
-    "# would let later cells reuse a stale `validation_proba` from an earlier run (or hit a NameError).\n",
-    "if not hasattr(model, \"predict_proba\"):\n",
-    "    raise AttributeError(f\"{type(model).__name__} does not implement predict_proba.\")\n",
-    "validation_proba = model.predict_proba(X_by_split[\"validation\"])[:, 1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bbc43eb9",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "17ec9212",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.metrics import (\n",
-    "    accuracy_score,\n",
-    "    balanced_accuracy_score,\n",
-    "    confusion_matrix,\n",
-    "    f1_score,\n",
-    "    precision_score,\n",
-    "    recall_score,\n",
-    "    roc_auc_score,\n",
-    ")\n",
-    "\n",
-    "def compute_metrics(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> dict[str, Any]:\n",
-    "    \"\"\"Score thresholded probabilities against binary ground-truth labels.\n",
-    "\n",
-    "    A row is predicted as class 1 when its probability is >= ``threshold``.\n",
-    "    Returns a dict of scores rounded to six decimals plus the four\n",
-    "    confusion-matrix counts (``tn``, ``fp``, ``fn``, ``tp``) as plain ints.\n",
-    "    ``roc_auc`` is computed from the raw probabilities, not the thresholded labels.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    def _six(value):\n",
-    "        return round(value, 6)\n",
-    "\n",
-    "    predicted = (y_proba >= threshold).astype(int)\n",
-    "    # labels=[0, 1] pins the matrix to 2x2 so it always unpacks into four counts\n",
-    "    counts = confusion_matrix(y_true, predicted, labels=[0, 1])\n",
-    "    true_neg, false_pos, false_neg, true_pos = counts.ravel()\n",
-    "\n",
-    "    actual_negatives = true_neg + false_pos\n",
-    "    if actual_negatives > 0:\n",
-    "        specificity = true_neg / actual_negatives\n",
-    "    else:\n",
-    "        # no negative rows at all: report 0.0 rather than dividing by zero\n",
-    "        specificity = 0.0\n",
-    "    recall = recall_score(y_true, predicted, zero_division=0)\n",
-    "    balanced = balanced_accuracy_score(y_true, predicted)\n",
-    "\n",
-    "    return {\n",
-    "        \"threshold\": _six(threshold),\n",
-    "        \"accuracy\": _six(accuracy_score(y_true, predicted)),\n",
-    "        \"precision\": _six(precision_score(y_true, predicted, zero_division=0)),\n",
-    "        \"recall\": _six(recall),\n",
-    "        \"f1\": _six(f1_score(y_true, predicted, zero_division=0)),\n",
-    "        \"balanced_accuracy\": _six(balanced),\n",
-    "        \"specificity\": _six(specificity),\n",
-    "        \"youden_j\": _six(recall + specificity - 1.0),\n",
-    "        \"roc_auc\": _six(roc_auc_score(y_true, y_proba)),\n",
-    "        \"tn\": int(true_neg),\n",
-    "        \"fp\": int(false_pos),\n",
-    "        \"fn\": int(false_neg),\n",
-    "        \"tp\": int(true_pos),\n",
-    "    }\n",
-    "\n",
-    "def find_best_threshold(\n",
-    "    y_true: np.ndarray,\n",
-    "    y_proba: np.ndarray,\n",
-    "    config: Config = config,\n",
-    ") -> tuple[float, dict[str, Any]]:\n",
-    "    \"\"\"Pick the decision threshold that maximises ``config.threshold_metric``.\n",
-    "\n",
-    "    Candidates are ``0, step, 2*step, ...`` below 1.0, with\n",
-    "    ``step = config.threshold_grid_step``. The search starts with 0.5 as the\n",
-    "    incumbent and a candidate replaces it only when its score is strictly\n",
-    "    higher, so ties resolve to 0.5 first and otherwise to the smallest tied\n",
-    "    candidate. A non-positive step yields no candidates, leaving 0.5.\n",
-    "\n",
-    "    Returns:\n",
-    "        ``(best_threshold, best_metrics)`` where ``best_metrics`` is the\n",
-    "        ``compute_metrics`` dict evaluated at ``best_threshold``.\n",
-    "    \"\"\"\n",
-    "    step = float(config.threshold_grid_step)\n",
-    "    if step > 0.0:\n",
-    "        # Build the grid in float64 and round away accumulated binary noise. The\n",
-    "        # earlier float32 grid returned values such as 0.5099999904632568 for 0.51.\n",
-    "        thresholds = np.round(np.arange(0.0, 1.0, step, dtype=np.float64), 12)\n",
-    "    else:\n",
-    "        # np.arange rejects a zero step and returns nothing for a negative one\n",
-    "        thresholds = np.array([], dtype=np.float64)\n",
-    "    if thresholds.size == 0:\n",
-    "        thresholds = np.array([0.5], dtype=np.float64)\n",
-    "\n",
-    "    best_threshold = 0.5\n",
-    "    best_metrics = compute_metrics(y_true, y_proba, threshold=best_threshold)\n",
-    "    best_score = float(best_metrics[config.threshold_metric])\n",
-    "\n",
-    "    for threshold in thresholds.tolist():  # plain Python floats, ascending\n",
-    "        metrics = compute_metrics(y_true, y_proba, threshold=threshold)\n",
-    "        score = float(metrics[config.threshold_metric])\n",
-    "\n",
-    "        # strict comparison keeps the earliest threshold on ties\n",
-    "        if score > best_score:\n",
-    "            best_threshold = threshold\n",
-    "            best_metrics = metrics\n",
-    "            best_score = score\n",
-    "\n",
-    "    return best_threshold, best_metrics"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "4c9082ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "best_threshold, validation_metrics = find_best_threshold(y_by_split[\"validation\"], validation_proba, config=config)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "8ab81ba2",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.5099999904632568"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "best_threshold"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "644474d1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Column 1 of predict_proba is taken as the positive-class probability for each test row.\n",
-    "# Fail loudly when the model cannot produce probabilities: silently skipping the assignment\n",
-    "# would let later cells reuse a stale `test_proba` from an earlier run (or hit a NameError).\n",
-    "if not hasattr(model, \"predict_proba\"):\n",
-    "    raise AttributeError(f\"{type(model).__name__} does not implement predict_proba.\")\n",
-    "test_proba = model.predict_proba(X_by_split[\"test\"])[:, 1]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "5e32cecb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "test_metrics = compute_metrics(y_by_split[\"test\"], test_proba, threshold=best_threshold)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "352f80b7",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>split</th>\n",
-       "      <th>threshold</th>\n",
-       "      <th>accuracy</th>\n",
-       "      <th>precision</th>\n",
-       "      <th>recall</th>\n",
-       "      <th>f1</th>\n",
-       "      <th>balanced_accuracy</th>\n",
-       "      <th>specificity</th>\n",
-       "      <th>youden_j</th>\n",
-       "      <th>roc_auc</th>\n",
-       "      <th>tn</th>\n",
-       "      <th>fp</th>\n",
-       "      <th>fn</th>\n",
-       "      <th>tp</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>validation</td>\n",
-       "      <td>0.51</td>\n",
-       "      <td>0.758463</td>\n",
-       "      <td>0.767159</td>\n",
-       "      <td>0.796782</td>\n",
-       "      <td>0.781690</td>\n",
-       "      <td>0.754883</td>\n",
-       "      <td>0.712985</td>\n",
-       "      <td>0.509767</td>\n",
-       "      <td>0.846681</td>\n",
-       "      <td>2427</td>\n",
-       "      <td>977</td>\n",
-       "      <td>821</td>\n",
-       "      <td>3219</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>test</td>\n",
-       "      <td>0.51</td>\n",
-       "      <td>0.760764</td>\n",
-       "      <td>0.756104</td>\n",
-       "      <td>0.801294</td>\n",
-       "      <td>0.778043</td>\n",
-       "      <td>0.758784</td>\n",
-       "      <td>0.716274</td>\n",
-       "      <td>0.517567</td>\n",
-       "      <td>0.849923</td>\n",
-       "      <td>2522</td>\n",
-       "      <td>999</td>\n",
-       "      <td>768</td>\n",
-       "      <td>3097</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        split  threshold  accuracy  precision    recall        f1  \\\n",
-       "0  validation       0.51  0.758463   0.767159  0.796782  0.781690   \n",
-       "1        test       0.51  0.760764   0.756104  0.801294  0.778043   \n",
-       "\n",
-       "   balanced_accuracy  specificity  youden_j   roc_auc    tn   fp   fn    tp  \n",
-       "0           0.754883     0.712985  0.509767  0.846681  2427  977  821  3219  \n",
-       "1           0.758784     0.716274  0.517567  0.849923  2522  999  768  3097  "
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "metrics_summary_df = pd.DataFrame([\n",
-    "        {\"split\": \"validation\", **validation_metrics},\n",
-    "        {\"split\": \"test\", **test_metrics},\n",
-    "    ])\n",
-    "metrics_summary_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3fda5c18",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5eff666b",
-   "metadata": {},
-   "source": [
-    "# Inference"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 100,
-   "id": "2b71800a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "import pickle\n",
-    "from pathlib import Path\n",
-    "\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "from scipy import sparse\n",
-    "from textstat import textstat\n",
-    "\n",
-    "from normalization import normalize_text, Config as NormalizationConfig\n",
-    "from masking_regex import mask_split as regex_mask_split\n",
-    "from masking_spacy import (\n",
-    "    load_nlp_model,\n",
-    "    _apply_ner_mask,\n",
-    "    _build_linguistic_record,\n",
-    "    Config as SpacyMaskingConfig,\n",
-    ")\n",
-    "\n",
-    "from statistical_features import extract_split_statistics, Config as StatisticalConfig\n",
-    "from tfidf_features import record_to_tfidf_text, Config as TFIDFConfig\n",
-    "from ngram_features import (\n",
-    "    build_space_free_char_ngrams,\n",
-    "    record_to_pos_sequence,\n",
-    "    Config as NGramConfig,\n",
-    ")\n",
-    "\n",
-    "BASE_DIR = Path.cwd().parent\n",
-    "SAVE_DIR = BASE_DIR / \"saved\"\n",
-    "\n",
-    "# load configs/artifacts fitted during training\n",
-    "with open(SAVE_DIR / \"normalization\" / \"normalization_config.json\", \"r\", encoding=\"utf-8\") as f:\n",
-    "    normalization_config = NormalizationConfig(**json.load(f))\n",
-    "\n",
-    "with open(SAVE_DIR / \"masking\" / \"spacy_config.json\", \"r\", encoding=\"utf-8\") as f:\n",
-    "    spacy_cfg_dict = json.load(f)\n",
-    "spacy_cfg_dict[\"verbose\"] = False\n",
-    "spacy_config = SpacyMaskingConfig(**spacy_cfg_dict)\n",
-    "\n",
-    "with open(SAVE_DIR / \"masking\" / \"statistical_config.json\", \"r\", encoding=\"utf-8\") as f:\n",
-    "    statistical_config = StatisticalConfig(**json.load(f))\n",
-    "\n",
-    "with open(SAVE_DIR / \"tfidf_features\" / \"tfidf_config.json\", \"r\", encoding=\"utf-8\") as f:\n",
-    "    tfidf_config = TFIDFConfig(**json.load(f))\n",
-    "with open(SAVE_DIR / \"tfidf_features\" / \"vectorizer.pkl\", \"rb\") as f:\n",
-    "    tfidf_vectorizer = pickle.load(f)\n",
-    "\n",
-    "with open(SAVE_DIR / \"ngram_features\" / \"ngram_config.json\", \"r\", encoding=\"utf-8\") as f:\n",
-    "    ngram_config = NGramConfig(**json.load(f))\n",
-    "with open(SAVE_DIR / \"ngram_features\" / \"char_vectorizer.pkl\", \"rb\") as f:\n",
-    "    char_vectorizer = pickle.load(f)\n",
-    "with open(SAVE_DIR / \"ngram_features\" / \"pos_vectorizer.pkl\", \"rb\") as f:\n",
-    "    pos_vectorizer = pickle.load(f)\n",
-    "\n",
-    "nlp = load_nlp_model(config=spacy_config)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "id": "993e8444",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Stat features [inference:text1]: 100%|██████████| 1/1 [00:00<00:00, 7884.03it/s]\n",
-      "Stat features [inference:text2]: 100%|██████████| 1/1 [00:00<00:00, 5343.06it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                   step  seconds  percent\n",
-      "                  total   0.3072 100.0000\n",
-      "7_build_pairwise_matrix   0.2333  75.9616\n",
-      " 6a_char_ngram_features   0.0370  12.0416\n",
-      "       5_tfidf_features   0.0155   5.0444\n",
-      "  6b_pos_ngram_features   0.0059   1.9164\n",
-      " 4_statistical_features   0.0056   1.8356\n",
-      "         3b_spacy_text1   0.0038   1.2389\n",
-      "         3c_spacy_text2   0.0031   1.0036\n",
-      "6c_readability_features   0.0011   0.3651\n",
-      "      8_model_inference   0.0007   0.2325\n",
-      "        2_regex_masking   0.0004   0.1422\n",
-      "        1_normalization   0.0004   0.1413\n",
-      "   3d_spacy_postprocess   0.0002   0.0719\n",
-      "    3a_spacy_load_model   0.0000   0.0008\n",
-      "probability_same: 0.9880850911140442\n",
-      "predicted_label: 1\n",
-      "threshold: 0.5099999904632568\n",
-      "masked_text1: <ORDINAL> text here.\n",
-      "masked_text2: <ORDINAL> text here.\n"
-     ]
-    }
-   ],
-   "source": [
-    "import time\n",
-    "\n",
-    "def _predict_positive_proba(model, X):\n",
-    "    if hasattr(model, \"predict_proba\"):\n",
-    "        return float(model.predict_proba(X)[0, 1])\n",
-    "    if hasattr(model, \"decision_function\"):\n",
-    "        score = float(model.decision_function(X)[0])\n",
-    "        return 1.0 / (1.0 + np.exp(-score))\n",
-    "    raise ValueError(\"Model must support predict_proba or decision_function.\")\n",
-    "\n",
-    "pairwise_column_pairs = [(f\"text1_{suffix}\", f\"text2_{suffix}\") for suffix in suffixes]\n",
-    "\n",
-    "\n",
-    "def predict_similarity(text1: str, text2: str, model, suffixes, threshold=None, show_timing=True):\n",
-    "    threshold = 0.5 if threshold is None else float(threshold)\n",
-    "    timings = {}\n",
-    "\n",
-    "    def tic():\n",
-    "        return time.perf_counter()\n",
-    "\n",
-    "    def toc(label, start):\n",
-    "        timings[label] = time.perf_counter() - start\n",
-    "\n",
-    "    total_start = tic()\n",
-    "\n",
-    "    # 1. normalization\n",
-    "    t = tic()\n",
-    "    pair_df = pd.DataFrame(\n",
-    "        [{\n",
-    "            \"text1\": normalize_text(text1, config=normalization_config),\n",
-    "            \"text2\": normalize_text(text2, config=normalization_config),\n",
-    "            \"same\": 0,\n",
-    "        }]\n",
-    "    )\n",
-    "    toc(\"1_normalization\", t)\n",
-    "\n",
-    "    # 2. regex masking\n",
-    "    t = tic()\n",
-    "    regex_masked_df, _ = regex_mask_split(pair_df)\n",
-    "    toc(\"2_regex_masking\", t)\n",
-    "\n",
-    "    # 3a. spaCy setup: only sets nlp_n_process = 1; no model is loaded here (`nlp` must already exist)\n",
-    "    t = tic()\n",
-    "    spacy_config.nlp_n_process = 1\n",
-    "    toc(\"3a_spacy_load_model\", t)\n",
-    "\n",
-    "    # 3b. spaCy inference text1\n",
-    "    def spacy_mask_one_text(text, nlp):\n",
-    "        doc = nlp(text)\n",
-    "        masked_text, _ = _apply_ner_mask(text, doc)\n",
-    "        record = _build_linguistic_record(doc)\n",
-    "        return masked_text, record\n",
-    "\n",
-    "    t = tic()\n",
-    "    masked_text1, record1 = spacy_mask_one_text(regex_masked_df.iloc[0][\"text1\"], nlp)\n",
-    "    toc(\"3b_spacy_text1\", t)\n",
-    "\n",
-    "    # 3c. spaCy inference text2\n",
-    "    t = tic()\n",
-    "    masked_text2, record2 = spacy_mask_one_text(regex_masked_df.iloc[0][\"text2\"], nlp)\n",
-    "    toc(\"3c_spacy_text2\", t)\n",
-    "\n",
-    "    # 3d. assemble masked outputs\n",
-    "    t = tic()\n",
-    "    masked_df = regex_masked_df.copy()\n",
-    "    masked_df.at[0, \"text1\"] = masked_text1\n",
-    "    masked_df.at[0, \"text2\"] = masked_text2\n",
-    "\n",
-    "    split_cache = {\n",
-    "        \"text1\": [record1],\n",
-    "        \"text2\": [record2],\n",
-    "    }\n",
-    "    toc(\"3d_spacy_postprocess\", t)\n",
-    "\n",
-    "    # 4. statistical features\n",
-    "    t = tic()\n",
-    "    feature_df = extract_split_statistics(\n",
-    "        masked_df,\n",
-    "        split_cache=split_cache,\n",
-    "        split_name=\"inference\",\n",
-    "        config=statistical_config,\n",
-    "    )\n",
-    "    toc(\"4_statistical_features\", t)\n",
-    "\n",
-    "    # 5. TF-IDF features\n",
-    "    t = tic()\n",
-    "    for column in [\"text1\", \"text2\"]:\n",
-    "        docs = [record_to_tfidf_text(record, config=tfidf_config) for record in split_cache[column]]\n",
-    "        tfidf_matrix = tfidf_vectorizer.transform(docs).toarray()\n",
-    "        tfidf_cols = [f\"{column}_tfidf_{i:05d}\" for i in range(tfidf_matrix.shape[1])]\n",
-    "        tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_cols)\n",
-    "        feature_df = pd.concat([feature_df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)\n",
-    "    toc(\"5_tfidf_features\", t)\n",
-    "\n",
-    "    # 6a. char n-gram features\n",
-    "    t = tic()\n",
-    "    for column in [\"text1\", \"text2\"]:\n",
-    "        char_docs = [\n",
-    "            \" \".join(build_space_free_char_ngrams(text, n=ngram_config.char_ngram_n))\n",
-    "            for text in masked_df[column].tolist()\n",
-    "        ]\n",
-    "        char_matrix = char_vectorizer.transform(char_docs).toarray()\n",
-    "        char_cols = [f\"{column}_char{ngram_config.char_ngram_n}_tfidf_{i:05d}\" for i in range(char_matrix.shape[1])]\n",
-    "        char_df = pd.DataFrame(char_matrix, columns=char_cols)\n",
-    "        feature_df = pd.concat([feature_df.reset_index(drop=True), char_df.reset_index(drop=True)], axis=1)\n",
-    "    toc(\"6a_char_ngram_features\", t)\n",
-    "\n",
-    "    # 6b. POS n-gram features\n",
-    "    t = tic()\n",
-    "    for column in [\"text1\", \"text2\"]:\n",
-    "        pos_docs = [\n",
-    "            \" \".join(record_to_pos_sequence(record))\n",
-    "            for record in split_cache[column]\n",
-    "        ]\n",
-    "        pos_matrix = pos_vectorizer.transform(pos_docs).toarray()\n",
-    "        pos_cols = [f\"{column}_pos{ngram_config.pos_ngram_range}_tfidf_{i:05d}\" for i in range(pos_matrix.shape[1])]\n",
-    "        pos_df = pd.DataFrame(pos_matrix, columns=pos_cols)\n",
-    "        feature_df = pd.concat([feature_df.reset_index(drop=True), pos_df.reset_index(drop=True)], axis=1)\n",
-    "    toc(\"6b_pos_ngram_features\", t)\n",
-    "\n",
-    "    # 6c. readability features\n",
-    "    t = tic()\n",
-    "    readability_df = pd.DataFrame([{\n",
-    "        \"text1_readability_flesch_kincaid_grade\": round(textstat.flesch_kincaid_grade(masked_df.iloc[0][\"text1\"]), 5),\n",
-    "        \"text1_readability_gunning_fog\": round(textstat.gunning_fog(masked_df.iloc[0][\"text1\"]), 5),\n",
-    "        \"text1_readability_smog\": round(textstat.smog_index(masked_df.iloc[0][\"text1\"]), 5),\n",
-    "        \"text1_readability_coleman_liau\": round(textstat.coleman_liau_index(masked_df.iloc[0][\"text1\"]), 5),\n",
-    "        \"text2_readability_flesch_kincaid_grade\": round(textstat.flesch_kincaid_grade(masked_df.iloc[0][\"text2\"]), 5),\n",
-    "        \"text2_readability_gunning_fog\": round(textstat.gunning_fog(masked_df.iloc[0][\"text2\"]), 5),\n",
-    "        \"text2_readability_smog\": round(textstat.smog_index(masked_df.iloc[0][\"text2\"]), 5),\n",
-    "        \"text2_readability_coleman_liau\": round(textstat.coleman_liau_index(masked_df.iloc[0][\"text2\"]), 5),\n",
-    "    }])\n",
-    "    feature_df = pd.concat([feature_df.reset_index(drop=True), readability_df.reset_index(drop=True)], axis=1)\n",
-    "    toc(\"6c_readability_features\", t)\n",
-    "\n",
-    "    # 7. build the single-row pairwise feature matrix (abs_diff and product per column pair)\n",
-    "    t = tic()\n",
-    "\n",
-    "    row_values = feature_df.iloc[0].to_dict()\n",
-    "    X_pair = np.empty((1, 2 * len(pairwise_column_pairs)), dtype=np.float32)\n",
-    "\n",
-    "    j = 0\n",
-    "    for left_col, right_col in pairwise_column_pairs:\n",
-    "        left = np.float32(row_values.get(left_col, 0.0))\n",
-    "        right = np.float32(row_values.get(right_col, 0.0))\n",
-    "        diff = left - right\n",
-    "\n",
-    "        X_pair[0, j] = abs(diff)      # abs_diff\n",
-    "        X_pair[0, j + 1] = left * right  # product\n",
-    "        j += 2\n",
-    "\n",
-    "    toc(\"7_build_pairwise_matrix\", t)\n",
-    "\n",
-    "\n",
-    "    # 8. inference\n",
-    "    t = tic()\n",
-    "    probability_same = _predict_positive_proba(model, X_pair)\n",
-    "    predicted_label = int(probability_same >= threshold)\n",
-    "    toc(\"8_model_inference\", t)\n",
-    "\n",
-    "    toc(\"total\", total_start)\n",
-    "\n",
-    "    if show_timing:\n",
-    "        timing_df = (\n",
-    "            pd.DataFrame(\n",
-    "                [{\"step\": step, \"seconds\": seconds} for step, seconds in timings.items()]\n",
-    "            )\n",
-    "            .sort_values(\"seconds\", ascending=False, ignore_index=True)\n",
-    "        )\n",
-    "        timing_df[\"percent\"] = 100 * timing_df[\"seconds\"] / timings[\"total\"]\n",
-    "        print(timing_df.to_string(index=False, float_format=lambda x: f\"{x:.4f}\"))\n",
-    "\n",
-    "    return {\n",
-    "        \"probability_same\": probability_same,\n",
-    "        \"predicted_label\": predicted_label,\n",
-    "        \"threshold\": threshold,\n",
-    "        \"normalized_text1\": pair_df.iloc[0][\"text1\"],\n",
-    "        \"normalized_text2\": pair_df.iloc[0][\"text2\"],\n",
-    "        \"masked_text1\": masked_df.iloc[0][\"text1\"],\n",
-    "        \"masked_text2\": masked_df.iloc[0][\"text2\"],\n",
-    "        \"timings\": timings,\n",
-    "    }\n",
-    "\n",
-    "\n",
-    "# example\n",
-    "text1 = \"First text here.\"\n",
-    "text2 = \"Second text here.\"\n",
-    "\n",
-    "result = predict_similarity(\n",
-    "    text1=text1,\n",
-    "    text2=text2,\n",
-    "    model=model,\n",
-    "    suffixes=suffixes,\n",
-    "    threshold=best_threshold,\n",
-    ")\n",
-    "\n",
-    "print(\"probability_same:\", result[\"probability_same\"])\n",
-    "print(\"predicted_label:\", result[\"predicted_label\"])\n",
-    "print(\"threshold:\", result[\"threshold\"])\n",
-    "print(\"masked_text1:\", result[\"masked_text1\"])\n",
-    "print(\"masked_text2:\", result[\"masked_text2\"])\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 92,
-   "id": "e541c13a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'verbose': True,\n",
-       " 'char_ngram_n': 4,\n",
-       " 'char_tfidf_min_df': 2,\n",
-       " 'char_tfidf_max_df': 0.95,\n",
-       " 'char_tfidf_max_features': 50000,\n",
-       " 'pos_ngram_range': [2, 3],\n",
-       " 'pos_tfidf_min_df': 2,\n",
-       " 'pos_tfidf_max_df': 0.95,\n",
-       " 'pos_tfidf_max_features': 5000,\n",
-       " 'sublinear_tf': True,\n",
-       " 'norm': 'l2',\n",
-       " 'include_readability': True,\n",
-       " 'dense_output': True}"
-      ]
-     },
-     "execution_count": 92,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ngram_config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47e54c1d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "99cb99cb",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "814d10a3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n",
-      "1\n",
-      "2\n",
-      "3\n",
-      "4\n",
-      "5\n",
-      "6\n",
-      "7\n",
-      "8\n",
-      "9\n",
-      "10\n",
-      "11\n",
-      "12\n",
-      "13\n",
-      "14\n",
-      "15\n",
-      "16\n",
-      "17\n",
-      "18\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i in range(19):\n",
-    "    print(i)\n",
-    "    continue"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "910822e2",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}