Upload notebooks/03_topic_classification/32_keyword_extraction.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/03_topic_classification/32_keyword_extraction.ipynb +242 -0

notebooks/03_topic_classification/32_keyword_extraction.ipynb ADDED Viewed

	@@ -0,0 +1,242 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 32 - Keyword Extraction\n",
+    "\n",
+    "Pipeline notebook for TF-IDF keyword extraction from document OCR text.\n",
+    "\n",
+    "Concatenates page-level OCR text per document, fits a TF-IDF vectorizer across the\n",
+    "corpus, and extracts the top-K keywords per document. Results are stored in the\n",
+    "`document_keywords` table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "source_section = None\n",
+    "top_k = 20\n",
+    "batch_size = 5000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from collections import Counter, defaultdict\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "from research_lib.db import fetch_df, bulk_insert\n",
+    "from research_lib.incremental import start_run, finish_run, get_unprocessed_documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start run: register the job together with the parameters it was launched with\n",
+    "run_parameters = {'top_k': top_k, 'batch_size': batch_size}\n",
+    "run_id = start_run('keyword_extraction', source_section=source_section, parameters=run_parameters)\n",
+    "print(f'Started run {run_id}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load concatenated page text per document\n",
+    "# Every filter is collected in one list so the WHERE clause is well-formed both with\n",
+    "# and without a source_section restriction (a bare AND right after the JOIN is a SQL syntax error).\n",
+    "conditions = ['p.ocr_text IS NOT NULL', \"p.ocr_text != ''\"]\n",
+    "params = []\n",
+    "if source_section:\n",
+    "    conditions.append('d.source_section = %s')\n",
+    "    params.append(source_section)\n",
+    "where_clause = 'WHERE ' + ' AND '.join(conditions)\n",
+    "\n",
+    "# Pages are joined in page order so each document yields a single text blob\n",
+    "sql = f\"\"\"\n",
+    "    SELECT d.id as document_id, d.source_section,\n",
+    "           STRING_AGG(p.ocr_text, ' ' ORDER BY p.page_number) as full_text\n",
+    "    FROM documents d\n",
+    "    JOIN pages p ON p.document_id = d.id\n",
+    "    {where_clause}\n",
+    "    GROUP BY d.id, d.source_section\n",
+    "    ORDER BY d.id\n",
+    "\"\"\"\n",
+    "docs_df = fetch_df(sql, params or None)\n",
+    "print(f'Loaded text for {len(docs_df)} documents')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit TF-IDF vectorizer over the whole corpus (one row per document)\n",
+    "print('Fitting TF-IDF vectorizer...')\n",
+    "vectorizer_options = dict(\n",
+    "    max_features=50000,\n",
+    "    max_df=0.95,\n",
+    "    min_df=2,\n",
+    "    stop_words='english',\n",
+    "    ngram_range=(1, 2),\n",
+    "    sublinear_tf=True,\n",
+    "    dtype=np.float32,\n",
+    ")\n",
+    "tfidf = TfidfVectorizer(**vectorizer_options)\n",
+    "\n",
+    "# Missing text is treated as an empty document so row positions stay aligned with docs_df\n",
+    "corpus_text = docs_df['full_text'].fillna('')\n",
+    "tfidf_matrix = tfidf.fit_transform(corpus_text)\n",
+    "feature_names = np.array(tfidf.get_feature_names_out())\n",
+    "print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')\n",
+    "print(f'Vocabulary size: {len(feature_names)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract top_k keywords per document in batches\n",
+    "# Produces all_rows: tuples of (document_id, keyword, tfidf_score, rank), rank starting at 1.\n",
+    "all_rows = []\n",
+    "doc_ids = docs_df['document_id'].tolist()\n",
+    "\n",
+    "n_batches = (len(doc_ids) + batch_size - 1) // batch_size\n",
+    "for batch_idx in tqdm(range(n_batches), desc='Extracting keywords'):\n",
+    "    start = batch_idx * batch_size\n",
+    "    end = min(start + batch_size, len(doc_ids))\n",
+    "\n",
+    "    batch_matrix = tfidf_matrix[start:end].tocsr()\n",
+    "    batch_doc_ids = doc_ids[start:end]\n",
+    "\n",
+    "    for i in range(batch_matrix.shape[0]):\n",
+    "        # Work on the stored (non-zero) entries of the row only, instead of\n",
+    "        # materialising a dense vocabulary-sized array for every document.\n",
+    "        row_start, row_end = batch_matrix.indptr[i], batch_matrix.indptr[i + 1]\n",
+    "        if row_start == row_end:\n",
+    "            continue\n",
+    "        row_scores = batch_matrix.data[row_start:row_end]\n",
+    "        row_features = batch_matrix.indices[row_start:row_end]\n",
+    "\n",
+    "        # Descending by score; the stable sort keeps equal scores in stored order,\n",
+    "        # so ranks are reproducible between runs.\n",
+    "        top_positions = np.argsort(-row_scores, kind='stable')[:top_k]\n",
+    "\n",
+    "        for rank, pos in enumerate(top_positions, 1):\n",
+    "            score = float(row_scores[pos])\n",
+    "            if score <= 0:\n",
+    "                break\n",
+    "            # Convert the numpy string scalar to a built-in str before storing it\n",
+    "            keyword = str(feature_names[row_features[pos]])\n",
+    "            all_rows.append((\n",
+    "                batch_doc_ids[i],\n",
+    "                keyword,\n",
+    "                score,\n",
+    "                rank,\n",
+    "            ))\n",
+    "\n",
+    "print(f'Total keyword entries: {len(all_rows)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Insert into document_keywords table\n",
+    "if not all_rows:\n",
+    "    print('No keywords to insert.')\n",
+    "else:\n",
+    "    # Write in fixed-size slices rather than sending every row in one call\n",
+    "    chunk_size = 50000\n",
+    "    total_inserted = 0\n",
+    "    chunk_offsets = range(0, len(all_rows), chunk_size)\n",
+    "    for offset in tqdm(chunk_offsets, desc='Inserting keywords'):\n",
+    "        total_inserted += bulk_insert(\n",
+    "            'document_keywords',\n",
+    "            ['document_id', 'keyword', 'tfidf_score', 'rank'],\n",
+    "            all_rows[offset:offset + chunk_size],\n",
+    "            on_conflict='DO NOTHING',\n",
+    "        )\n",
+    "\n",
+    "    print(f'Inserted {total_inserted} keyword rows')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Finish run: report how many documents were loaded for this run\n",
+    "documents_processed = len(doc_ids)\n",
+    "finish_run(run_id, documents_processed=documents_processed)\n",
+    "print(f'Run {run_id} completed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary: strongest keywords per collection (source_section)\n",
+    "print('=== Keyword Extraction Summary ===')\n",
+    "print(f'Source section: {source_section or \"all\"}')\n",
+    "print(f'Documents processed: {len(doc_ids)}')\n",
+    "print(f'Total keyword entries: {len(all_rows)}')\n",
+    "\n",
+    "# Aggregate keywords by collection: for every document, add the scores of its\n",
+    "# summary_top_n highest-scoring terms to that document's section counter.\n",
+    "summary_top_n = 5\n",
+    "summary_matrix = tfidf_matrix.tocsr()\n",
+    "keyword_scores_by_section = defaultdict(Counter)\n",
+    "# enumerate() yields the positional row number, which is what the matrix is indexed by;\n",
+    "# the DataFrame index label is only equal to it for a default RangeIndex.\n",
+    "for doc_idx, section in enumerate(docs_df['source_section'].tolist()):\n",
+    "    row_start, row_end = summary_matrix.indptr[doc_idx], summary_matrix.indptr[doc_idx + 1]\n",
+    "    row_scores = summary_matrix.data[row_start:row_end]\n",
+    "    row_features = summary_matrix.indices[row_start:row_end]\n",
+    "    # Rank only the stored entries of the row; no dense vocabulary-sized array is built\n",
+    "    for pos in np.argsort(-row_scores, kind='stable')[:summary_top_n]:\n",
+    "        if row_scores[pos] > 0:\n",
+    "            # float() so the running totals accumulate in double precision\n",
+    "            keyword_scores_by_section[section][str(feature_names[row_features[pos]])] += float(row_scores[pos])\n",
+    "\n",
+    "print('\\nTop 20 keywords per collection:')\n",
+    "# A None section sorts last instead of raising when compared with string section names\n",
+    "for section in sorted(keyword_scores_by_section, key=lambda s: (s is None, s)):\n",
+    "    print(f'\\n  {section}:')\n",
+    "    for keyword, score in keyword_scores_by_section[section].most_common(20):\n",
+    "        print(f'    {keyword:30s} {score:.2f}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}