Upload notebooks/04_forensic/42_duplicate_detection.ipynb with huggingface_hub
notebooks/04_forensic/42_duplicate_detection.ipynb
ADDED
@@ -0,0 +1,214 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 42 - Duplicate Detection via Page Embeddings\n",
    "\n",
    "Pipeline notebook that finds near-duplicate pages using pgvector cosine distance.\n",
    "\n",
    "Uses `CROSS JOIN LATERAL` to efficiently find the top-5 nearest neighbours per page\n",
    "and filters by a cosine similarity threshold.\n",
    "\n",
    "**Outputs:**\n",
    "- `duplicate_pairs` table rows\n",
    "- `page_features`: `is_duplicate` = 1.0 for flagged pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "source_section = None\n",
    "similarity_threshold = 0.95\n",
    "batch_size = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, warnings, time\n",
    "sys.path.insert(0, '/opt/epstein_env/research')\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from research_lib.config import COLLECTIONS\n",
    "from research_lib.db import fetch_df, fetch_all, get_conn, bulk_insert, upsert_feature\n",
    "from research_lib.incremental import start_run, finish_run\n",
    "\n",
    "print('Libraries loaded.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Start run ----\n",
    "PIPELINE = 'duplicate_detection'\n",
    "run_id = start_run(PIPELINE, source_section=source_section, parameters={\n",
    "    'similarity_threshold': similarity_threshold,\n",
    "    'batch_size': batch_size,\n",
    "})\n",
    "\n",
    "# Get page IDs with embeddings\n",
    "section_filter = ''\n",
    "params = []\n",
    "if source_section:\n",
    "    section_filter = 'AND d.source_section = %s'\n",
    "    params.append(source_section)\n",
    "\n",
    "page_ids_df = fetch_df(f\"\"\"\n",
    "    SELECT p.id\n",
    "    FROM pages p\n",
    "    JOIN documents d ON d.id = p.document_id\n",
    "    WHERE p.embedding IS NOT NULL\n",
    "    {section_filter}\n",
    "    ORDER BY p.id\n",
    "\"\"\", params or None)\n",
    "\n",
    "all_page_ids = page_ids_df['id'].tolist()\n",
    "print(f'Total pages with embeddings: {len(all_page_ids)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Process in batches using CROSS JOIN LATERAL ----\n",
    "total_pairs = 0\n",
    "total_batches = (len(all_page_ids) + batch_size - 1) // batch_size\n",
    "all_duplicate_page_ids = set()\n",
    "\n",
    "for batch_idx in range(total_batches):\n",
    "    start = batch_idx * batch_size\n",
    "    end = min(start + batch_size, len(all_page_ids))\n",
    "    batch_ids = all_page_ids[start:end]\n",
    "    print(f'Batch {batch_idx + 1}/{total_batches}: pages {start}-{end - 1}')\n",
    "\n",
    "    t0 = time.time()\n",
    "\n",
    "    # Build the query -- find top-5 nearest neighbours for each page in batch\n",
    "    id_list = ','.join(str(i) for i in batch_ids)\n",
    "    sql = f\"\"\"\n",
    "        SELECT p1.id AS page_id_a, p2.id AS page_id_b,\n",
    "               1 - (p1.embedding <=> p2.embedding) AS similarity\n",
    "        FROM pages p1\n",
    "        CROSS JOIN LATERAL (\n",
    "            SELECT id, embedding FROM pages\n",
    "            WHERE id > p1.id AND embedding IS NOT NULL\n",
    "            ORDER BY p1.embedding <=> embedding\n",
    "            LIMIT 5\n",
    "        ) p2\n",
    "        WHERE p1.id IN ({id_list})\n",
    "          AND (1 - (p1.embedding <=> p2.embedding)) >= %s\n",
    "    \"\"\"\n",
    "\n",
    "    pairs_df = fetch_df(sql, [similarity_threshold])\n",
    "    elapsed = time.time() - t0\n",
    "    print(f'  Found {len(pairs_df)} pairs in {elapsed:.1f}s')\n",
    "\n",
    "    if len(pairs_df) > 0:\n",
    "        # Insert into duplicate_pairs\n",
    "        pair_rows = [\n",
    "            (int(r.page_id_a), int(r.page_id_b), float(r.similarity))\n",
    "            for r in pairs_df.itertuples()\n",
    "        ]\n",
    "        n = bulk_insert(\n",
    "            'duplicate_pairs',\n",
    "            ['page_id_a', 'page_id_b', 'similarity'],\n",
    "            pair_rows,\n",
    "            on_conflict='(page_id_a, page_id_b) DO NOTHING',\n",
    "        )\n",
    "        print(f'  Inserted {n} duplicate_pairs rows')\n",
    "        total_pairs += n\n",
    "\n",
    "        # Track duplicate page IDs\n",
    "        all_duplicate_page_ids.update(pairs_df['page_id_a'].tolist())\n",
    "        all_duplicate_page_ids.update(pairs_df['page_id_b'].tolist())\n",
    "\n",
    "print(f'\\nTotal duplicate pairs inserted: {total_pairs}')\n",
    "print(f'Unique pages flagged as duplicates: {len(all_duplicate_page_ids)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Flag pages in page_features ----\n",
    "if all_duplicate_page_ids:\n",
    "    dup_rows = [\n",
    "        (int(pid), 'is_duplicate', 1.0, None)\n",
    "        for pid in all_duplicate_page_ids\n",
    "    ]\n",
    "    n = upsert_feature(\n",
    "        'page_features',\n",
    "        ['page_id', 'feature_name'],\n",
    "        ['feature_value', 'feature_json'],\n",
    "        dup_rows,\n",
    "    )\n",
    "    print(f'Flagged {n} pages as is_duplicate in page_features')\n",
    "\n",
    "finish_run(run_id, documents_processed=len(all_page_ids))\n",
    "print(f'Run {run_id} complete.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Stats: documents with duplicates ----\n",
    "doc_dup_df = fetch_df(\"\"\"\n",
    "    SELECT d.source_section,\n",
    "           COUNT(DISTINCT p.document_id) AS docs_with_duplicates,\n",
    "           COUNT(*) AS duplicate_pages\n",
    "    FROM page_features pf\n",
    "    JOIN pages p ON p.id = pf.page_id\n",
    "    JOIN documents d ON d.id = p.document_id\n",
    "    WHERE pf.feature_name = 'is_duplicate' AND pf.feature_value = 1.0\n",
    "    GROUP BY d.source_section\n",
    "    ORDER BY docs_with_duplicates DESC\n",
    "\"\"\")\n",
    "\n",
    "print('Documents with duplicate pages by collection:')\n",
    "print(doc_dup_df.to_string(index=False))\n",
    "\n",
    "total_dup_count = fetch_df('SELECT COUNT(*) AS cnt FROM duplicate_pairs')\n",
    "print(f\"\\nTotal duplicate pairs in database: {total_dup_count['cnt'].iloc[0]}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
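
For context, the notebook's queries presuppose a pgvector-backed schema. Below is a minimal sketch of what that schema might look like: the table and column names (pages.embedding, duplicate_pairs, page_features) are taken from the queries above, while the vector dimension, the HNSW index, and the foreign keys are illustrative assumptions, not part of this upload.

    -- Sketch only: dimension 768, the HNSW index, and the foreign keys are assumptions.
    CREATE EXTENSION IF NOT EXISTS vector;

    -- <=> is pgvector's cosine-distance operator, so 1 - (a <=> b) is cosine similarity;
    -- a cosine-ops index lets the ORDER BY p1.embedding <=> embedding lateral lookup
    -- run as an approximate nearest-neighbour scan (HNSW requires pgvector 0.5+).
    ALTER TABLE pages ADD COLUMN IF NOT EXISTS embedding vector(768);
    CREATE INDEX IF NOT EXISTS pages_embedding_cosine_idx
        ON pages USING hnsw (embedding vector_cosine_ops);

    -- The ON CONFLICT (page_id_a, page_id_b) DO NOTHING clause in the batch loop
    -- needs a unique constraint on the pair.
    CREATE TABLE IF NOT EXISTS duplicate_pairs (
        page_id_a  bigint NOT NULL REFERENCES pages(id),
        page_id_b  bigint NOT NULL REFERENCES pages(id),
        similarity double precision NOT NULL,
        UNIQUE (page_id_a, page_id_b)
    );

With the default parameters, only neighbour pairs with cosine similarity of at least 0.95 are recorded, so the pages flagged as is_duplicate are near-identical rather than merely topically similar.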