Upload notebooks/03_topic_classification/31_document_clustering.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/03_topic_classification/31_document_clustering.ipynb +234 -0

notebooks/03_topic_classification/31_document_clustering.ipynb ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 31 - Document Clustering\n",
+    "\n",
+    "Pipeline notebook for K-Means document clustering using pre-computed embeddings.\n",
+    "\n",
+    "Loads document-level embeddings (averaged page embeddings), runs MiniBatchKMeans,\n",
+    "and stores cluster assignments in the `document_features` table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "source_section = None\n",
+    "n_clusters = 20\n",
+    "batch_size = 50000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.cluster import MiniBatchKMeans\n",
+    "from sklearn.metrics import silhouette_score\n",
+    "from collections import Counter\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "from research_lib.db import fetch_df, upsert_feature\n",
+    "from research_lib.incremental import start_run, finish_run, get_unprocessed_documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start run\n",
+    "# The settings this execution was launched with are handed to start_run;\n",
+    "# the returned run_id is what finish_run receives at the end of the notebook.\n",
+    "run_parameters = {\n",
+    "    'n_clusters': n_clusters,\n",
+    "    'batch_size': batch_size,\n",
+    "}\n",
+    "run_id = start_run(\n",
+    "    'document_clustering',\n",
+    "    source_section=source_section,\n",
+    "    parameters=run_parameters,\n",
+    ")\n",
+    "print(f'Started run {run_id}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load document-level embeddings (average of page embeddings)\n",
+    "#\n",
+    "# Filters are collected in a list and joined, so the WHERE clause is well-formed\n",
+    "# whether or not a source_section filter is requested (an empty filter must not\n",
+    "# leave a dangling `AND` with no `WHERE` in front of it).\n",
+    "conditions = []\n",
+    "params = []\n",
+    "if source_section:\n",
+    "    # The value is bound as a query parameter, never interpolated into the SQL text\n",
+    "    conditions.append('d.source_section = %s')\n",
+    "    params.append(source_section)\n",
+    "# Pages without an embedding are always excluded from the average\n",
+    "conditions.append('p.embedding IS NOT NULL')\n",
+    "where_clause = 'WHERE ' + ' AND '.join(conditions)\n",
+    "\n",
+    "sql = f\"\"\"\n",
+    "    SELECT d.id as document_id, d.source_section,\n",
+    "           AVG(p.embedding) as embedding\n",
+    "    FROM documents d\n",
+    "    JOIN pages p ON p.document_id = d.id\n",
+    "    {where_clause}\n",
+    "    GROUP BY d.id, d.source_section\n",
+    "    ORDER BY d.id\n",
+    "\"\"\"\n",
+    "# fetch_df is given None rather than an empty list when there is nothing to bind\n",
+    "doc_df = fetch_df(sql, params or None)\n",
+    "print(f'Loaded embeddings for {len(doc_df)} documents')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert embeddings to numpy array\n",
+    "#\n",
+    "# Fail early with an actionable message: np.stack raises an opaque\n",
+    "# 'need at least one array to stack' ValueError when the query returned no rows.\n",
+    "if len(doc_df) == 0:\n",
+    "    raise ValueError(\n",
+    "        f'No documents with embeddings found for source_section={source_section!r}; nothing to cluster'\n",
+    "    )\n",
+    "\n",
+    "# Assumes each value in the embedding column is already a numeric sequence of equal\n",
+    "# length (TODO confirm how fetch_df decodes the averaged vector column).\n",
+    "embeddings = np.stack(doc_df['embedding'].values).astype(np.float32)\n",
+    "doc_ids = doc_df['document_id'].tolist()\n",
+    "\n",
+    "print(f'Embeddings shape: {embeddings.shape}')\n",
+    "\n",
+    "# Adjust n_clusters if we have fewer documents\n",
+    "actual_n_clusters = min(n_clusters, len(doc_ids))\n",
+    "if actual_n_clusters < n_clusters:\n",
+    "    print(f'Adjusted n_clusters from {n_clusters} to {actual_n_clusters} (fewer documents than clusters)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run MiniBatchKMeans clustering\n",
+    "# Estimator settings are gathered in one place; random_state is fixed so repeated\n",
+    "# runs start from the same seed.\n",
+    "kmeans_config = dict(\n",
+    "    n_clusters=actual_n_clusters,\n",
+    "    batch_size=batch_size,\n",
+    "    random_state=42,\n",
+    "    n_init=3,\n",
+    "    max_iter=300,\n",
+    "    verbose=1,\n",
+    ")\n",
+    "print(f'Running MiniBatchKMeans with {actual_n_clusters} clusters...')\n",
+    "kmeans = MiniBatchKMeans(**kmeans_config)\n",
+    "# One label per row of `embeddings`, in the same order as doc_ids\n",
+    "cluster_labels = kmeans.fit_predict(embeddings)\n",
+    "print(f'Clustering complete. Inertia: {kmeans.inertia_:.2f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute silhouette score (sample if dataset is large)\n",
+    "#\n",
+    "# Sampling is two-stage: above `presample_limit` documents a fixed random subset is\n",
+    "# drawn first, then silhouette_score itself draws up to `sil_sample_size` points.\n",
+    "presample_limit = 50000\n",
+    "sil_sample_size = 10000\n",
+    "\n",
+    "print('Computing silhouette score...')\n",
+    "if len(doc_ids) > presample_limit:\n",
+    "    # Sample for efficiency\n",
+    "    rng = np.random.RandomState(42)\n",
+    "    sample_idx = rng.choice(len(doc_ids), size=presample_limit, replace=False)\n",
+    "    sil_embeddings = embeddings[sample_idx]\n",
+    "    sil_labels = cluster_labels[sample_idx]\n",
+    "else:\n",
+    "    sil_embeddings = embeddings\n",
+    "    sil_labels = cluster_labels\n",
+    "\n",
+    "try:\n",
+    "    sil_score = silhouette_score(\n",
+    "        sil_embeddings,\n",
+    "        sil_labels,\n",
+    "        metric='cosine',\n",
+    "        sample_size=min(sil_sample_size, len(sil_labels)),\n",
+    "        random_state=42,\n",
+    "    )\n",
+    "except ValueError as exc:\n",
+    "    # silhouette_score needs 2 <= n_labels <= n_samples - 1. With very few documents\n",
+    "    # (or a single resulting cluster) the score is undefined; record NaN and report\n",
+    "    # why, so the cluster assignments can still be stored by the following cells.\n",
+    "    sil_score = float('nan')\n",
+    "    print(f'Silhouette score undefined: {exc}')\n",
+    "print(f'Silhouette score: {sil_score:.4f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store cluster assignments in document_features\n",
+    "# Each row is (document_id, feature_name, feature_value, feature_json); presumably\n",
+    "# this order has to line up with unique_cols followed by data_cols below.\n",
+    "feature_name = 'cluster_id'\n",
+    "rows = []\n",
+    "for doc_id, cluster_label in zip(doc_ids, cluster_labels):\n",
+    "    # Labels are numpy integers; store them as plain decimal strings\n",
+    "    feature_value = str(int(cluster_label))\n",
+    "    rows.append((doc_id, feature_name, feature_value, None))  # None -> feature_json\n",
+    "\n",
+    "print(f'Upserting {len(rows)} cluster assignments...')\n",
+    "upserted = upsert_feature(\n",
+    "    'document_features',\n",
+    "    unique_cols=['document_id', 'feature_name'],\n",
+    "    data_cols=['feature_value', 'feature_json'],\n",
+    "    rows=rows,\n",
+    ")\n",
+    "print(f'Upserted {upserted} rows')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Finish run\n",
+    "# Report how many documents received a cluster assignment in this run\n",
+    "documents_processed = len(doc_ids)\n",
+    "finish_run(run_id, documents_processed=documents_processed)\n",
+    "print(f'Run {run_id} completed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary: cluster sizes\n",
+    "summary_lines = [\n",
+    "    '=== Document Clustering Summary ===',\n",
+    "    f'Source section: {source_section or \"all\"}',\n",
+    "    f'Documents clustered: {len(doc_ids)}',\n",
+    "    f'Number of clusters: {actual_n_clusters}',\n",
+    "    f'Silhouette score: {sil_score:.4f}',\n",
+    "    f'Inertia: {kmeans.inertia_:.2f}',\n",
+    "]\n",
+    "print('\\n'.join(summary_lines))\n",
+    "\n",
+    "# Tally documents per cluster label and list the largest clusters first\n",
+    "cluster_counts = Counter(cluster_labels)\n",
+    "total_docs = len(doc_ids)\n",
+    "print('\\nCluster sizes (sorted by size):')\n",
+    "for cluster_id, count in cluster_counts.most_common():\n",
+    "    pct = 100 * count / total_docs\n",
+    "    print(f'  Cluster {cluster_id:3d}: {count:6d} documents ({pct:.1f}%)')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}