# %% [markdown]
# # Training

# %% imports, paths and input data
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from scipy import sparse

BASE_DIR = Path.cwd().parent
SAVE_DIR = BASE_DIR / "saved"  # folder to save progress outputs

# load n-gram outputs (one parquet frame per split)
ngram_dir = SAVE_DIR / "ngram_features"
ngram_dict_df = {
    split: pd.read_parquet(ngram_dir / f"dataframes/{split}_ngram.parquet")
    for split in ["train", "validation", "test"]
}

# # load SVD-reduction outputs
# reduced_dir = SAVE_DIR / "dimensionality_reduction"
# reduced_dict_df = {split: pd.read_parquet(reduced_dir / f"dataframes/{split}_reduced.parquet") for split in ["train", "validation", "test"]}


# %% configuration
@dataclass(slots=True)
class Config:
    """Options for feature selection, pairwise features and threshold search."""

    # NOTE(review): not read anywhere in this notebook.
    verbose: bool = True

    # Feature families to keep; consumed by `_include_family`.
    include_statistical: bool = True
    include_tfidf: bool = True
    include_char_ngrams: bool = True
    include_pos_ngrams: bool = True
    include_readability: bool = True

    # Element-wise operations applied to every (text1, text2) feature pair,
    # in this order. Supported names: "abs_diff", "product", "sq_diff".
    pairwise_operations: tuple[str, ...] = ("abs_diff", "product")
    # NOTE(review): the next three fields are not read anywhere in this notebook.
    use_sparse_matrices: bool = False
    model_type: str = "hist_gb"
    random_state: int = 42

    # Metric maximised by `find_best_threshold` and the spacing of its grid.
    threshold_metric: str = "accuracy"
    threshold_grid_step: float = 0.01
    # NOTE(review): not read anywhere in this notebook.
    return_pairwise_matrices: bool = False


config = Config()


# %% feature-family selection
def _feature_family_from_suffix(suffix: str) -> str:
    """Map a column suffix (the name without `text1_`/`text2_`) to its family.

    Any suffix that matches none of the known prefixes counts as "statistical".
    """
    if suffix.startswith("tfidf_"):
        return "tfidf"
    if suffix.startswith("char") and "_tfidf_" in suffix:
        return "char_ngrams"
    if suffix.startswith("pos") and "_tfidf_" in suffix:
        return "pos_ngrams"
    if suffix.startswith("readability_"):
        return "readability"
    return "statistical"


def _include_family(family: str, config: Config = config) -> bool:
    """Return whether `family` is switched on in `config` (KeyError if unknown)."""
    return {
        "statistical": config.include_statistical,
        "tfidf": config.include_tfidf,
        "char_ngrams": config.include_char_ngrams,
        "pos_ngrams": config.include_pos_ngrams,
        "readability": config.include_readability,
    }[family]


# Suffixes of every enabled `text1_*` column, in the train frame's column order.
suffixes: list[str] = [
    column[len("text1_"):]
    for column in ngram_dict_df["train"].columns
    if column.startswith("text1_")
    and _include_family(_feature_family_from_suffix(column[len("text1_"):]))
]


# %% pairwise feature matrices
_PAIRWISE_OPERATIONS = {
    "abs_diff": lambda left, right: np.abs(left - right),
    "product": lambda left, right: left * right,
    "sq_diff": lambda left, right: (left - right) ** 2,
}


def build_pairwise_matrix(
    df: pd.DataFrame,
    suffixes: list[str],
    operations: tuple[str, ...] = ("abs_diff", "product"),
) -> tuple[sparse.csr_matrix, list[str]]:
    """Combine the `text1_*` / `text2_*` columns of `df` into pairwise features.

    For each suffix, and for each operation in order, one float32 column named
    `{suffix}_{operation}` is produced. Columns are converted one at a time so
    that no dense (rows x features) matrix is ever held in memory.

    Returns:
        (X, feature_names): CSR matrix and the matching column names.

    Raises:
        ValueError: on an unknown operation name or when no column is produced.
        KeyError: when a `text1_`/`text2_` column is missing from `df`.
    """
    unknown = [name for name in operations if name not in _PAIRWISE_OPERATIONS]
    if unknown:
        raise ValueError(f"Unknown pairwise operations: {unknown}")

    columns: list[sparse.csr_matrix] = []
    feature_names: list[str] = []
    for suffix in suffixes:
        left = df[f"text1_{suffix}"].to_numpy(dtype="float32")
        right = df[f"text2_{suffix}"].to_numpy(dtype="float32")
        for name in operations:
            values = _PAIRWISE_OPERATIONS[name](left, right)
            columns.append(sparse.csr_matrix(values.reshape(-1, 1)))
            feature_names.append(f"{suffix}_{name}")

    if not columns:
        raise ValueError("No pairwise features selected; check suffixes and operations.")

    return sparse.hstack(columns, format="csr", dtype="float32"), feature_names


X_by_split: dict[str, Any] = {}
y_by_split: dict[str, np.ndarray] = {}
feature_names: list[str] = []  # identical for every split; kept from the last one
for split, df in ngram_dict_df.items():
    X, feature_names = build_pairwise_matrix(df, suffixes, config.pairwise_operations)
    y = df["same"].to_numpy(dtype=np.int8, copy=False)  # for binary label

    X_by_split[split] = X
    y_by_split[split] = y

    # Stored non-zero counts recorded in the notebook for train / validation /
    # test: 73810633, 16708698, 16475773.
    print(int(X.nnz))

# %% inspect the training matrix
X_by_split["train"]
# %% fit the classifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from xgboost import XGBClassifier

model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.3,
    min_child_weight=3,
    reg_lambda=5.0,
    reg_alpha=1.0,
    random_state=config.random_state,  # single source of truth for the seed (42)
    n_jobs=2,
    tree_method="hist",
)
model.fit(X_by_split["train"], y_by_split["train"])

# %% validation probabilities
# XGBClassifier always provides predict_proba. The previous hasattr() guard
# could only turn a failure here into a NameError further down.
validation_proba = model.predict_proba(X_by_split["validation"])[:, 1]


# %% metrics and threshold search
def _safe_roc_auc(y_true: np.ndarray, y_proba: np.ndarray) -> float:
    """ROC AUC, or NaN when `y_true` holds fewer than two classes.

    `roc_auc_score` raises ValueError for single-class input; NaN keeps the
    remaining metrics usable in that case.
    """
    if np.unique(y_true).size < 2:
        return float("nan")
    return float(roc_auc_score(y_true, y_proba))


def compute_metrics(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    threshold: float,
    roc_auc: float | None = None,
) -> dict[str, Any]:
    """Binary-classification metrics for `y_proba` cut at `threshold`.

    Args:
        y_true: ground-truth labels in {0, 1}.
        y_proba: predicted probability of class 1.
        threshold: a sample is predicted positive when `y_proba >= threshold`.
        roc_auc: optional precomputed ROC AUC. It does not depend on the
            threshold, so callers sweeping thresholds can compute it once.

    Returns:
        Dict with threshold, accuracy, precision, recall, f1,
        balanced_accuracy, specificity, youden_j, roc_auc (rounded to 6
        decimals) and the integer confusion counts tn, fp, fn, tp.
    """
    y_pred = (y_proba >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    sensitivity = recall_score(y_true, y_pred, zero_division=0)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    youden_j = sensitivity + specificity - 1.0
    if roc_auc is None:
        roc_auc = _safe_roc_auc(y_true, y_proba)

    return {
        "threshold": round(threshold, 6),
        "accuracy": round(accuracy_score(y_true, y_pred), 6),
        "precision": round(precision_score(y_true, y_pred, zero_division=0), 6),
        "recall": round(sensitivity, 6),
        "f1": round(f1_score(y_true, y_pred, zero_division=0), 6),
        "balanced_accuracy": round(balanced_accuracy, 6),
        "specificity": round(specificity, 6),
        "youden_j": round(youden_j, 6),
        "roc_auc": round(roc_auc, 6),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    }


def find_best_threshold(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    config: Config = config,
) -> tuple[float, dict[str, Any]]:
    """Grid-search the threshold that maximises `config.threshold_metric`.

    The grid is `0.0, step, 2*step, ...` below 1.0. The baseline is 0.5 and a
    grid point replaces it only when strictly better, so ties keep the earlier
    candidate. A non-positive step falls back to evaluating 0.5 alone.

    Returns:
        (best_threshold, metrics at that threshold).

    Raises:
        KeyError: if `config.threshold_metric` is not a key of the metrics dict.
    """
    step = float(config.threshold_grid_step)
    if step > 0:
        # Build the grid in float64 and round away accumulation noise. A
        # float32 grid turned 0.51 into 0.5099999904632568 in the stored result.
        thresholds = np.round(np.arange(0.0, 1.0, step, dtype=np.float64), 10)
    else:
        thresholds = np.array([0.5], dtype=np.float64)

    roc_auc = _safe_roc_auc(y_true, y_proba)  # threshold-independent: compute once

    best_threshold = 0.5
    best_metrics = compute_metrics(y_true, y_proba, threshold=best_threshold, roc_auc=roc_auc)
    best_score = float(best_metrics[config.threshold_metric])

    for threshold in thresholds:
        metrics = compute_metrics(y_true, y_proba, threshold=float(threshold), roc_auc=roc_auc)
        score = float(metrics[config.threshold_metric])
        if score > best_score:
            best_threshold = float(threshold)
            best_metrics = metrics
            best_score = score

    return best_threshold, best_metrics


# %% pick the threshold on validation
best_threshold, validation_metrics = find_best_threshold(
    y_by_split["validation"], validation_proba, config=config
)

# %%
best_threshold

# %% evaluate on test with the validation-selected threshold
test_proba = model.predict_proba(X_by_split["test"])[:, 1]
test_metrics = compute_metrics(y_by_split["test"], test_proba, threshold=best_threshold)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
# %% metrics summary
# Recorded run (threshold 0.51):
#   validation: accuracy 0.758463, f1 0.781690, roc_auc 0.846681 (tn 2427, fp 977, fn 821, tp 3219)
#   test:       accuracy 0.760764, f1 0.778043, roc_auc 0.849923 (tn 2522, fp 999, fn 768, tp 3097)
metrics_summary_df = pd.DataFrame([
    {"split": "validation", **validation_metrics},
    {"split": "test", **test_metrics},
])
metrics_summary_df

# %% [markdown]
# # Inference

# %% imports and fitted artifacts
import json
import math
import pickle
import time
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from textstat import textstat

from normalization import normalize_text, Config as NormalizationConfig
from masking_regex import mask_split as regex_mask_split
from masking_spacy import (
    load_nlp_model,
    _apply_ner_mask,
    _build_linguistic_record,
    Config as SpacyMaskingConfig,
)

from statistical_features import extract_split_statistics, Config as StatisticalConfig
from tfidf_features import record_to_tfidf_text, Config as TFIDFConfig
from ngram_features import (
    build_space_free_char_ngrams,
    record_to_pos_sequence,
    Config as NGramConfig,
)

BASE_DIR = Path.cwd().parent
SAVE_DIR = BASE_DIR / "saved"


def _load_json(path: Path) -> dict:
    """Read a UTF-8 JSON file."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _load_pickle(path: Path):
    """Unpickle a fitted artifact.

    SECURITY: pickle executes arbitrary code on load. Only use this on
    artifacts written by this project's own training run.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# load configs/artifacts fitted during training
normalization_config = NormalizationConfig(
    **_load_json(SAVE_DIR / "normalization" / "normalization_config.json")
)

spacy_cfg_dict = _load_json(SAVE_DIR / "masking" / "spacy_config.json")
spacy_cfg_dict["verbose"] = False
spacy_config = SpacyMaskingConfig(**spacy_cfg_dict)

# NOTE(review): the statistical config is read from the "masking" folder -
# confirm this is where training wrote it.
statistical_config = StatisticalConfig(
    **_load_json(SAVE_DIR / "masking" / "statistical_config.json")
)

tfidf_config = TFIDFConfig(**_load_json(SAVE_DIR / "tfidf_features" / "tfidf_config.json"))
tfidf_vectorizer = _load_pickle(SAVE_DIR / "tfidf_features" / "vectorizer.pkl")

ngram_config = NGramConfig(**_load_json(SAVE_DIR / "ngram_features" / "ngram_config.json"))
char_vectorizer = _load_pickle(SAVE_DIR / "ngram_features" / "char_vectorizer.pkl")
pos_vectorizer = _load_pickle(SAVE_DIR / "ngram_features" / "pos_vectorizer.pkl")

nlp = load_nlp_model(config=spacy_config)


# %% inference helpers
def _predict_positive_proba(model, X):
    """Return P(label == 1) for the first row of `X` as a Python float.

    Uses `predict_proba` when available; otherwise squashes the first
    `decision_function` score with a logistic sigmoid.

    Raises:
        ValueError: if the model offers neither method.
    """
    if hasattr(model, "predict_proba"):
        return float(model.predict_proba(X)[0, 1])
    if hasattr(model, "decision_function"):
        score = float(model.decision_function(X)[0])
        # Numerically stable sigmoid: never exponentiate a large positive value.
        if score >= 0.0:
            return 1.0 / (1.0 + math.exp(-score))
        exp_score = math.exp(score)
        return exp_score / (1.0 + exp_score)
    raise ValueError("Model must support predict_proba or decision_function.")


# Kept for any cell that still reads it; predict_similarity derives its own
# pairs from its `suffixes` argument.
pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in suffixes]

# Readability column suffix -> textstat function computing it.
_READABILITY_METRICS = {
    "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
    "gunning_fog": textstat.gunning_fog,
    "smog": textstat.smog_index,
    "coleman_liau": textstat.coleman_liau_index,
}


def _spacy_mask_one_text(text, nlp):
    """Run spaCy on `text`; return (NER-masked text, linguistic record)."""
    doc = nlp(text)
    masked_text, _ = _apply_ner_mask(text, doc)
    return masked_text, _build_linguistic_record(doc)


def _vectorizer_features(vectorizer, docs, prefix):
    """Transform `docs` and name the first row's values `{prefix}_{i:05d}`."""
    row = vectorizer.transform(docs).toarray()[0]
    return {f"{prefix}_{i:05d}": value for i, value in enumerate(row)}


def predict_similarity(text1: str, text2: str, model, suffixes, threshold=None, show_timing=True):
    """Score one text pair with the trained pipeline.

    Steps: normalisation, regex masking, spaCy masking, then statistical,
    TF-IDF, char n-gram, POS n-gram and readability features. The pairwise
    vector holds `[abs_diff, product]` per suffix, the same layout as the
    training matrix, and is passed to `model`.

    Args:
        text1, text2: raw texts to compare.
        model: fitted estimator with `predict_proba` or `decision_function`.
        suffixes: feature suffixes, in the order used to build the training
            matrix. This argument now drives the feature layout; it was
            previously ignored in favour of a module-level global.
        threshold: decision threshold; 0.5 when None.
        show_timing: print a per-step timing table.

    Returns:
        Dict with probability_same, predicted_label, threshold, normalized and
        masked texts, timings, plus missing_feature_count (the number of
        expected feature columns that were absent and zero-filled).
    """
    threshold = 0.5 if threshold is None else float(threshold)
    suffixes = list(suffixes)
    timings = {}

    def toc(label, start):
        timings[label] = time.perf_counter() - start

    total_start = time.perf_counter()

    # 1. normalization
    t = time.perf_counter()
    pair_df = pd.DataFrame(
        [{
            "text1": normalize_text(text1, config=normalization_config),
            "text2": normalize_text(text2, config=normalization_config),
            "same": 0,  # placeholder label
        }]
    )
    toc("1_normalization", t)

    # 2. regex masking
    t = time.perf_counter()
    regex_masked_df, _ = regex_mask_split(pair_df)
    toc("2_regex_masking", t)

    # 3a. the spaCy model is already loaded at module level; this step only
    # sets nlp_n_process. The label is kept so the timing keys stay stable.
    t = time.perf_counter()
    spacy_config.nlp_n_process = 1
    toc("3a_spacy_load_model", t)

    # 3b. spaCy inference text1
    t = time.perf_counter()
    masked_text1, record1 = _spacy_mask_one_text(regex_masked_df.iloc[0]["text1"], nlp)
    toc("3b_spacy_text1", t)

    # 3c. spaCy inference text2
    t = time.perf_counter()
    masked_text2, record2 = _spacy_mask_one_text(regex_masked_df.iloc[0]["text2"], nlp)
    toc("3c_spacy_text2", t)

    # 3d. assemble masked outputs
    t = time.perf_counter()
    masked_df = regex_masked_df.copy()
    masked_df.at[0, "text1"] = masked_text1
    masked_df.at[0, "text2"] = masked_text2
    split_cache = {"text1": [record1], "text2": [record2]}
    toc("3d_spacy_postprocess", t)

    # 4. statistical features
    t = time.perf_counter()
    feature_df = extract_split_statistics(
        masked_df,
        split_cache=split_cache,
        split_name="inference",
        config=statistical_config,
    )
    # Features are collected in a plain name -> value dict. Repeatedly
    # concatenating a one-row, very wide DataFrame and converting it back to a
    # dict dominated the runtime.
    feature_values = feature_df.iloc[0].to_dict()
    toc("4_statistical_features", t)

    # 5. TF-IDF features
    t = time.perf_counter()
    for column in ["text1", "text2"]:
        docs = [record_to_tfidf_text(record, config=tfidf_config) for record in split_cache[column]]
        feature_values.update(_vectorizer_features(tfidf_vectorizer, docs, f"{column}_tfidf"))
    toc("5_tfidf_features", t)

    # 6a. char n-gram features
    t = time.perf_counter()
    for column in ["text1", "text2"]:
        char_docs = [
            " ".join(build_space_free_char_ngrams(text, n=ngram_config.char_ngram_n))
            for text in masked_df[column].tolist()
        ]
        feature_values.update(
            _vectorizer_features(
                char_vectorizer, char_docs, f"{column}_char{ngram_config.char_ngram_n}_tfidf"
            )
        )
    toc("6a_char_ngram_features", t)

    # 6b. POS n-gram features
    # NOTE(review): pos_ngram_range is interpolated verbatim into the column
    # name; confirm it renders exactly as in the training columns, otherwise
    # these features are zero-filled (see missing_feature_count).
    t = time.perf_counter()
    for column in ["text1", "text2"]:
        pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
        feature_values.update(
            _vectorizer_features(
                pos_vectorizer, pos_docs, f"{column}_pos{ngram_config.pos_ngram_range}_tfidf"
            )
        )
    toc("6b_pos_ngram_features", t)

    # 6c. readability features
    t = time.perf_counter()
    for column in ["text1", "text2"]:
        masked_text = masked_df.iloc[0][column]
        for metric_name, metric_fn in _READABILITY_METRICS.items():
            feature_values[f"{column}_readability_{metric_name}"] = round(metric_fn(masked_text), 5)
    toc("6c_readability_features", t)

    # 7. build pairwise feature matrix: [abs_diff, product] per suffix
    t = time.perf_counter()
    left_cols = [f"text1_{suffix}" for suffix in suffixes]
    right_cols = [f"text2_{suffix}" for suffix in suffixes]
    missing_feature_count = sum(
        1 for name in left_cols + right_cols if name not in feature_values
    )
    n_pairs = len(suffixes)
    left = np.fromiter(
        (feature_values.get(name, 0.0) for name in left_cols), dtype=np.float32, count=n_pairs
    )
    right = np.fromiter(
        (feature_values.get(name, 0.0) for name in right_cols), dtype=np.float32, count=n_pairs
    )
    X_pair = np.empty((1, 2 * n_pairs), dtype=np.float32)
    X_pair[0, 0::2] = np.abs(left - right)  # abs_diff
    X_pair[0, 1::2] = left * right  # product
    toc("7_build_pairwise_matrix", t)

    # 8. inference
    t = time.perf_counter()
    probability_same = _predict_positive_proba(model, X_pair)
    predicted_label = int(probability_same >= threshold)
    toc("8_model_inference", t)

    toc("total", total_start)

    if show_timing:
        timing_df = (
            pd.DataFrame(
                [{"step": step, "seconds": seconds} for step, seconds in timings.items()]
            )
            .sort_values("seconds", ascending=False, ignore_index=True)
        )
        timing_df["percent"] = 100 * timing_df["seconds"] / timings["total"]
        print(timing_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    return {
        "probability_same": probability_same,
        "predicted_label": predicted_label,
        "threshold": threshold,
        "normalized_text1": pair_df.iloc[0]["text1"],
        "normalized_text2": pair_df.iloc[0]["text2"],
        "masked_text1": masked_df.iloc[0]["text1"],
        "masked_text2": masked_df.iloc[0]["text2"],
        "timings": timings,
        "missing_feature_count": missing_feature_count,
    }


# %% example
text1 = "First text here."
text2 = "Second text here."

result = predict_similarity(
    text1=text1,
    text2=text2,
    model=model,
    suffixes=suffixes,
    threshold=best_threshold,
)

print("probability_same:", result["probability_same"])
print("predicted_label:", result["predicted_label"])
print("threshold:", result["threshold"])
print("masked_text1:", result["masked_text1"])
print("masked_text2:", result["masked_text2"])
# %% inspect the n-gram feature configuration loaded for inference
# The recorded display listed, among others: char_ngram_n=4,
# pos_ngram_range=[2, 3], char_tfidf_max_features=50000,
# pos_tfidf_max_features=5000.
ngram_config

# %% scratch: print the integers 0..18, one per line
for number in range(19):
    print(number)

# Notebook metadata: kernel "python3" (display name ".venv"), Python 3.12.3,
# nbformat 4.5.