Upload notebooks/03_topic_classification/30_topic_modeling.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/03_topic_classification/30_topic_modeling.ipynb +333 -0

notebooks/03_topic_classification/30_topic_modeling.ipynb ADDED Viewed

	@@ -0,0 +1,333 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 30 - Topic Modeling\n",
+    "\n",
+    "Pipeline notebook for BERTopic-based topic modeling using pre-computed embeddings.\n",
+    "\n",
+    "Loads document embeddings (averaged page embeddings) from the database, fits a BERTopic\n",
+    "model with UMAP + HDBSCAN, and stores discovered topics and document-topic assignments\n",
+    "in the `topics` and `document_topics` tables."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "# source_section: value matched against documents.source_section when loading data.\n",
+    "source_section = \"doj_disclosures\"\n",
+    "# min_topic_size: passed to HDBSCAN as min_cluster_size.\n",
+    "min_topic_size = 50\n",
+    "# nr_topics: forwarded to BERTopic; the value \"auto\" is translated to None before the call.\n",
+    "nr_topics = \"auto\"\n",
+    "# sample_size: maximum number of documents used to fit the model; larger datasets are\n",
+    "# randomly sampled for fitting and the full set is then assigned topics via transform().\n",
+    "sample_size = 100000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "from research_lib.db import fetch_df, fetch_all, bulk_insert, get_conn\n",
+    "from research_lib.incremental import start_run, finish_run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start run: record the parameters of this execution; the returned id is\n",
+    "# passed to finish_run() once the notebook has stored its results\n",
+    "run_parameters = {\n",
+    "    'min_topic_size': min_topic_size,\n",
+    "    'nr_topics': nr_topics,\n",
+    "    'sample_size': sample_size,\n",
+    "}\n",
+    "run_id = start_run('topic_modeling', source_section=source_section, parameters=run_parameters)\n",
+    "print(f'Started run {run_id}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load document-level embeddings (average of page embeddings)\n",
+    "#\n",
+    "# The queries built from `where_clause` always append `AND ...` conditions after it,\n",
+    "# so the clause must never be empty: without a section filter we fall back to a\n",
+    "# no-op predicate instead of producing `JOIN ... AND ...` (a SQL syntax error).\n",
+    "if source_section:\n",
+    "    where_clause = 'WHERE d.source_section = %s'\n",
+    "    params = [source_section]\n",
+    "else:\n",
+    "    where_clause = 'WHERE 1=1'\n",
+    "    params = []\n",
+    "\n",
+    "# One row per document: the mean of its non-null page embeddings\n",
+    "sql = f\"\"\"\n",
+    "    SELECT d.id as document_id, d.source_section,\n",
+    "           AVG(p.embedding) as embedding\n",
+    "    FROM documents d\n",
+    "    JOIN pages p ON p.document_id = d.id\n",
+    "    {where_clause}\n",
+    "    AND p.embedding IS NOT NULL\n",
+    "    GROUP BY d.id, d.source_section\n",
+    "\"\"\"\n",
+    "# Pass None rather than an empty list when the query has no placeholders\n",
+    "doc_embeddings_df = fetch_df(sql, params or None)\n",
+    "print(f'Loaded embeddings for {len(doc_embeddings_df)} documents')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Also load concatenated page text per document for topic representation\n",
+    "#\n",
+    "# The filter list is built locally so the query is valid whether or not a section\n",
+    "# filter is set (a bare `AND ...` with no preceding WHERE is a SQL syntax error).\n",
+    "text_conditions = [\"p.ocr_text IS NOT NULL\", \"p.ocr_text != ''\"]\n",
+    "text_params = []\n",
+    "if source_section:\n",
+    "    text_conditions.insert(0, 'd.source_section = %s')\n",
+    "    text_params.append(source_section)\n",
+    "text_where = 'WHERE ' + ' AND '.join(text_conditions)\n",
+    "\n",
+    "# Pages are concatenated in page order, separated by a single space\n",
+    "text_sql = f\"\"\"\n",
+    "    SELECT d.id as document_id,\n",
+    "           STRING_AGG(p.ocr_text, ' ' ORDER BY p.page_number) as full_text\n",
+    "    FROM documents d\n",
+    "    JOIN pages p ON p.document_id = d.id\n",
+    "    {text_where}\n",
+    "    GROUP BY d.id\n",
+    "\"\"\"\n",
+    "# Pass None rather than an empty list when the query has no placeholders\n",
+    "text_df = fetch_df(text_sql, text_params or None)\n",
+    "print(f'Loaded text for {len(text_df)} documents')\n",
+    "\n",
+    "# Merge: keep only documents that have both an embedding and text\n",
+    "merged_df = doc_embeddings_df.merge(text_df, on='document_id', how='inner')\n",
+    "print(f'Documents with both embeddings and text: {len(merged_df)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the merged frame into parallel sequences (all three share the frame's row order)\n",
+    "doc_ids = merged_df['document_id'].tolist()\n",
+    "docs_text = merged_df['full_text'].tolist()\n",
+    "\n",
+    "# Convert embeddings to numpy array: stack the per-document vectors along a new first axis\n",
+    "# NOTE(review): assumes each `embedding` value is already array-like (not a string) -- confirm fetch_df's output\n",
+    "embeddings = np.stack(merged_df['embedding'].to_numpy())\n",
+    "\n",
+    "print(f'Embeddings shape: {embeddings.shape}')\n",
+    "print(f'Documents: {len(docs_text)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sample if dataset is larger than sample_size\n",
+    "n_docs = len(docs_text)\n",
+    "if n_docs <= sample_size:\n",
+    "    # Small enough: fit on every document\n",
+    "    fit_embeddings, fit_texts, fit_doc_ids = embeddings, docs_text, doc_ids\n",
+    "else:\n",
+    "    print(f'Sampling {sample_size} documents from {len(docs_text)} for fitting...')\n",
+    "    rng = np.random.RandomState(42)  # fixed seed keeps the sample reproducible\n",
+    "    # Draw distinct row positions, then put them back in ascending order\n",
+    "    sample_idx = np.sort(rng.choice(n_docs, size=sample_size, replace=False))\n",
+    "    fit_embeddings = embeddings[sample_idx]\n",
+    "    fit_texts = [docs_text[pos] for pos in sample_idx]\n",
+    "    fit_doc_ids = [doc_ids[pos] for pos in sample_idx]\n",
+    "\n",
+    "print(f'Fitting on {len(fit_texts)} documents')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# BERTopic with pre-computed embeddings\n",
+    "from bertopic import BERTopic\n",
+    "from umap import UMAP\n",
+    "from hdbscan import HDBSCAN\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "\n",
+    "# Dimensionality reduction to 5 components with a pinned random_state\n",
+    "umap_model = UMAP(n_components=5, n_neighbors=15, metric='cosine', random_state=42)\n",
+    "\n",
+    "# Clustering; min_topic_size is used as the minimum cluster size\n",
+    "hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', prediction_data=True)\n",
+    "\n",
+    "# Unigram + bigram counts with English stop words removed\n",
+    "vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))\n",
+    "\n",
+    "# The notebook-level value \"auto\" is mapped to None before it reaches BERTopic.\n",
+    "# NOTE(review): BERTopic also accepts nr_topics='auto' itself -- confirm None is the intended meaning.\n",
+    "if nr_topics == \"auto\":\n",
+    "    effective_nr_topics = None\n",
+    "else:\n",
+    "    effective_nr_topics = nr_topics\n",
+    "\n",
+    "topic_model = BERTopic(\n",
+    "    embedding_model=None,  # pre-computed\n",
+    "    umap_model=umap_model,\n",
+    "    hdbscan_model=hdbscan_model,\n",
+    "    vectorizer_model=vectorizer,\n",
+    "    nr_topics=effective_nr_topics,\n",
+    "    verbose=True,\n",
+    ")\n",
+    "\n",
+    "print('Fitting BERTopic model...')\n",
+    "# The texts feed the topic keywords; the supplied embeddings are used instead of an embedding model\n",
+    "topics, probs = topic_model.fit_transform(fit_texts, fit_embeddings)\n",
+    "print(f'Fit complete. Found {len(set(topics)) - (1 if -1 in topics else 0)} topics.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If we sampled, transform the full dataset\n",
+    "was_sampled = len(docs_text) > sample_size\n",
+    "if not was_sampled:\n",
+    "    # The model was fitted on every document, so the fit results already cover the dataset\n",
+    "    all_topics, all_probs = topics, probs\n",
+    "else:\n",
+    "    print('Transforming full dataset...')\n",
+    "    all_topics, all_probs = topic_model.transform(docs_text, embeddings)\n",
+    "\n",
+    "print(f'All documents assigned topics.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store topics in topics table\n",
+    "topic_info = topic_model.get_topic_info()\n",
+    "\n",
+    "# Count documents per topic from the final assignments. The Count column of\n",
+    "# get_topic_info() only covers the documents the model was fitted on, which\n",
+    "# undercounts whenever the model was fitted on a sample of the dataset.\n",
+    "assigned_counts = pd.Series(all_topics).value_counts()\n",
+    "\n",
+    "topic_rows = []\n",
+    "for raw_topic_id in topic_info['Topic']:\n",
+    "    topic_id = int(raw_topic_id)\n",
+    "    if topic_id == -1:\n",
+    "        continue  # Skip outlier topic\n",
+    "\n",
+    "    # Get top words for this topic; empty-string placeholders are dropped so\n",
+    "    # labels and keyword lists never contain blank entries\n",
+    "    topic_words = topic_model.get_topic(topic_id)\n",
+    "    keywords = [w for w, _ in topic_words[:10] if w] if topic_words else []\n",
+    "    label = ', '.join(keywords[:5]) if keywords else f'Topic {topic_id}'\n",
+    "\n",
+    "    topic_rows.append((\n",
+    "        f'bertopic_{topic_id}',  # topic_name\n",
+    "        label,  # topic_label\n",
+    "        ','.join(keywords),  # keywords\n",
+    "        int(assigned_counts.get(topic_id, 0)),  # document_count\n",
+    "        source_section,  # source_section\n",
+    "        'topic_modeling',  # model_name\n",
+    "    ))\n",
+    "\n",
+    "# Nothing is written when the model produced no non-outlier topics\n",
+    "if topic_rows:\n",
+    "    inserted = bulk_insert(\n",
+    "        'topics',\n",
+    "        ['topic_name', 'topic_label', 'keywords', 'document_count', 'source_section', 'model_name'],\n",
+    "        topic_rows,\n",
+    "        on_conflict='DO NOTHING',\n",
+    "    )\n",
+    "    print(f'Inserted {inserted} topics')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store document-topic assignments\n",
+    "# Probabilities are optional: a row gets None when there is no entry at its position\n",
+    "n_probs = 0 if all_probs is None else len(all_probs)\n",
+    "\n",
+    "assignment_rows = []\n",
+    "for position, (doc_id, topic_id) in enumerate(zip(doc_ids, all_topics)):\n",
+    "    if topic_id != -1:  # Skip outlier assignments\n",
+    "        prob = float(all_probs[position]) if position < n_probs else None\n",
+    "        assignment_rows.append((doc_id, f'bertopic_{topic_id}', prob, 'topic_modeling'))\n",
+    "\n",
+    "# Nothing is written when every document was an outlier\n",
+    "if assignment_rows:\n",
+    "    assignment_columns = ['document_id', 'topic_name', 'probability', 'model_name']\n",
+    "    inserted = bulk_insert(\n",
+    "        'document_topics',\n",
+    "        assignment_columns,\n",
+    "        assignment_rows,\n",
+    "        on_conflict='DO NOTHING',\n",
+    "    )\n",
+    "    print(f'Inserted {inserted} document-topic assignments')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Finish run\n",
+    "processed_count = len(doc_ids)  # every loaded document, outliers included\n",
+    "finish_run(run_id, documents_processed=processed_count)\n",
+    "print(f'Run {run_id} completed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary\n",
+    "print('=== Topic Modeling Summary ===')\n",
+    "print(f'Source section: {source_section or \"all\"}')\n",
+    "print(f'Documents processed: {len(doc_ids)}')\n",
+    "\n",
+    "# -1 is the outlier label, so it is left out of the topic count\n",
+    "n_topics = len(set(all_topics) - {-1})\n",
+    "n_outliers = len([t for t in all_topics if t == -1])\n",
+    "print(f'Topics discovered: {n_topics}')\n",
+    "print(f'Outlier documents: {n_outliers} ({100*n_outliers/len(all_topics):.1f}%)')\n",
+    "\n",
+    "print('\\nTopic overview:')\n",
+    "# First 20 rows of the topic table, each with up to five of its top words\n",
+    "for _, row in topic_info.head(20).iterrows():\n",
+    "    topic_id = row['Topic']\n",
+    "    topic_words = topic_model.get_topic(topic_id) or []\n",
+    "    top_words = ', '.join(word for word, _ in topic_words[:5])\n",
+    "    print(f'  Topic {topic_id:3d}: {row[\"Count\"]:5d} docs | {top_words}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}