datamatters24
/

research-document-archive

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Entity Explorer\n",
+    "\n",
+    "Explore named entities extracted by the OCR/NLP pipeline:\n",
+    "- Top entities by frequency for each major entity type (PERSON, ORG, GPE, DATE)\n",
+    "- Entity type distribution\n",
+    "- Entity count per collection heatmap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "from research_lib.db import fetch_df\n",
+    "from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
+    "\n",
+    "set_style()\n",
+    "print(\"Libraries loaded.\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Top 50 entities by frequency for PERSON, ORG, GPE, DATE\n",
+    "entity_types = [\"PERSON\", \"ORG\", \"GPE\", \"DATE\"]\n",
+    "TOP_N = 50   # rows fetched and printed per entity type\n",
+    "PLOT_N = 30  # rows drawn per panel; fewer than TOP_N so tick labels stay readable\n",
+    "\n",
+    "# Run each query exactly once and reuse the frame for both the chart and the\n",
+    "# printed table. Querying twice doubles the database work and, because ties in\n",
+    "# `freq` have no defined order, can make the table disagree with the chart.\n",
+    "top_by_type = {}\n",
+    "for etype in entity_types:\n",
+    "    # etype only ever comes from the hard-coded list above, so interpolating it\n",
+    "    # into the SQL text is acceptable here; never route user input through this.\n",
+    "    top_by_type[etype] = fetch_df(f\"\"\"\n",
+    "        SELECT entity_text, COUNT(*) AS freq\n",
+    "        FROM entities\n",
+    "        WHERE entity_type = '{etype}'\n",
+    "        GROUP BY entity_text\n",
+    "        ORDER BY freq DESC\n",
+    "        LIMIT {TOP_N}\n",
+    "    \"\"\")\n",
+    "\n",
+    "# One panel per entity type on a 2x2 grid (matches the four types listed above).\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(20, 20))\n",
+    "axes = axes.flatten()\n",
+    "\n",
+    "for ax, etype in zip(axes, entity_types):\n",
+    "    # Plot only the first PLOT_N rows for readability; the full result is printed below.\n",
+    "    plot_df = top_by_type[etype].head(PLOT_N)\n",
+    "    if len(plot_df) > 0:\n",
+    "        # Descending positions: the first (most frequent) row gets the highest y value.\n",
+    "        y_pos = range(len(plot_df) - 1, -1, -1)\n",
+    "        ax.barh(\n",
+    "            y_pos,\n",
+    "            plot_df[\"freq\"],\n",
+    "            color=sns.color_palette(\"viridis\", len(plot_df)),\n",
+    "        )\n",
+    "        ax.set_yticks(y_pos)\n",
+    "        ax.set_yticklabels(plot_df[\"entity_text\"], fontsize=8)\n",
+    "        ax.set_xlabel(\"Frequency\")\n",
+    "    # The title is set even when the query returned no rows, leaving an empty labelled panel.\n",
+    "    ax.set_title(f\"Top {etype} Entities\")\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "save_fig(fig, \"top_entities_by_type\")\n",
+    "plt.show()\n",
+    "\n",
+    "# Print full top 50 for each type (same frames that were plotted above)\n",
+    "for etype, df_top in top_by_type.items():\n",
+    "    print(f\"\\n=== Top {TOP_N} {etype} ===\")\n",
+    "    print(df_top.to_string(index=False))"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Entity type distribution pie chart\n",
+    "type_dist_sql = \"\"\"\n",
+    "    SELECT entity_type, COUNT(*) AS freq\n",
+    "    FROM entities\n",
+    "    GROUP BY entity_type\n",
+    "    ORDER BY freq DESC\n",
+    "\"\"\"\n",
+    "df_type_dist = fetch_df(type_dist_sql)\n",
+    "\n",
+    "# One wedge per entity type, each annotated with its percentage of all entities.\n",
+    "fig, ax = plt.subplots(figsize=(10, 10))\n",
+    "_wedges, _labels, pct_labels = ax.pie(\n",
+    "    df_type_dist[\"freq\"],\n",
+    "    labels=df_type_dist[\"entity_type\"],\n",
+    "    colors=sns.color_palette(\"Set2\", len(df_type_dist)),\n",
+    "    autopct=\"%1.1f%%\",\n",
+    "    pctdistance=0.85,\n",
+    ")\n",
+    "# Shrink the percentage annotations so they fit inside the wedges.\n",
+    "for pct_label in pct_labels:\n",
+    "    pct_label.set_fontsize(9)\n",
+    "ax.set_title(\"Entity Type Distribution\")\n",
+    "plt.tight_layout()\n",
+    "save_fig(fig, \"entity_type_distribution\")\n",
+    "plt.show()\n",
+    "\n",
+    "# Add each type's percentage share, then print the table and the grand total.\n",
+    "total_entities = df_type_dist[\"freq\"].sum()\n",
+    "df_type_dist[\"pct\"] = (df_type_dist[\"freq\"] / total_entities * 100).round(1)\n",
+    "print(\"\\nEntity type counts:\")\n",
+    "print(df_type_dist.to_string(index=False))\n",
+    "print(f\"\\nTotal entities: {total_entities:,}\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Entity count per collection heatmap\n",
+    "df_heatmap = fetch_df(\"\"\"\n",
+    "    SELECT d.source_section, e.entity_type, COUNT(*) AS freq\n",
+    "    FROM entities e\n",
+    "    JOIN pages p ON p.id = e.page_id\n",
+    "    JOIN documents d ON d.id = p.document_id\n",
+    "    GROUP BY d.source_section, e.entity_type\n",
+    "    ORDER BY d.source_section, e.entity_type\n",
+    "\"\"\")\n",
+    "\n",
+    "if df_heatmap.empty:\n",
+    "    # An empty matrix gives sns.heatmap no values to derive a colour range from,\n",
+    "    # so report the situation instead of letting the cell fail.\n",
+    "    print(\"No entities found; skipping collection heatmap.\")\n",
+    "else:\n",
+    "    # Reshape to collections (rows) x entity types (columns). The query already\n",
+    "    # returns one row per pair, so \"sum\" is a pure reshape; it is spelled out\n",
+    "    # because the pivot_table default (mean) is the wrong aggregate for counts.\n",
+    "    # Pairs absent from the query result are filled with 0.\n",
+    "    pivot = df_heatmap.pivot_table(\n",
+    "        index=\"source_section\",\n",
+    "        columns=\"entity_type\",\n",
+    "        values=\"freq\",\n",
+    "        aggfunc=\"sum\",\n",
+    "        fill_value=0,\n",
+    "    )\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(14, 8))\n",
+    "    sns.heatmap(\n",
+    "        pivot,\n",
+    "        annot=True,\n",
+    "        fmt=\",.0f\",  # thousands separator, no decimals\n",
+    "        cmap=\"YlOrRd\",\n",
+    "        linewidths=0.5,\n",
+    "        ax=ax,\n",
+    "    )\n",
+    "    ax.set_title(\"Entity Count by Collection and Type\")\n",
+    "    ax.set_xlabel(\"Entity Type\")\n",
+    "    ax.set_ylabel(\"Collection\")\n",
+    "    plt.tight_layout()\n",
+    "    save_fig(fig, \"entity_collection_heatmap\")\n",
+    "    plt.show()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}