Upload notebooks/05_cross_analysis/52_summary_dashboard.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/05_cross_analysis/52_summary_dashboard.ipynb +390 -0

notebooks/05_cross_analysis/52_summary_dashboard.ipynb ADDED Viewed

	@@ -0,0 +1,390 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 52 - Summary Dashboard\n",
+    "\n",
+    "Master overview dashboard pulling from all analysis tables.\n",
+    "\n",
+    "- Collection stats (from `collection_stats` materialized view)\n",
+    "- Top entities across all collections\n",
+    "- Top topics per collection\n",
+    "- Forensic alerts: most-redacted documents, lowest OCR confidence, classification stamps\n",
+    "- Entity network summary: most connected nodes, largest communities\n",
+    "- Key metrics as a printable report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, warnings, json\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from datetime import datetime\n",
+    "\n",
+    "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
+    "from research_lib.db import fetch_df, fetch_all\n",
+    "from research_lib.plotting import (\n",
+    "    set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
+    ")\n",
+    "\n",
+    "# Apply the plotting style from research_lib.plotting, then log readiness.\n",
+    "set_style()\n",
+    "print('Summary Dashboard loaded.')\n",
+    "generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n",
+    "print(f'Report generated: {generated_at}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Collection Statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Collection stats from materialized view ----\n",
+    "# The view is queried first; if that raises, per-section document and page\n",
+    "# totals are computed directly from the documents table instead.\n",
+    "view_query = 'SELECT * FROM collection_stats ORDER BY source_section'\n",
+    "documents_query = \"\"\"\n",
+    "        SELECT source_section,\n",
+    "               COUNT(*) AS document_count,\n",
+    "               SUM(page_count) AS total_pages\n",
+    "        FROM documents\n",
+    "        GROUP BY source_section\n",
+    "        ORDER BY source_section\n",
+    "    \"\"\"\n",
+    "\n",
+    "try:\n",
+    "    collection_stats = fetch_df(view_query)\n",
+    "    print('Collection Statistics:')\n",
+    "    print(collection_stats.to_string(index=False))\n",
+    "except Exception as view_error:\n",
+    "    print(f'collection_stats view not available: {view_error}')\n",
+    "    # Fallback: compute from documents table\n",
+    "    collection_stats = fetch_df(documents_query)\n",
+    "    print('Collection Statistics (from documents table):')\n",
+    "    print(collection_stats.to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Top Entities Across All Collections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Top 10 entities across all collections ----\n",
+    "# Every column is qualified with its table alias (matching the WHERE clause)\n",
+    "# so the join cannot raise an ambiguous-column error, and ties on doc_count\n",
+    "# are broken explicitly so repeated runs return the same ten rows.\n",
+    "top_entities = fetch_df(\"\"\"\n",
+    "    SELECT e.entity_text, e.entity_type,\n",
+    "           COUNT(*) AS mention_count,\n",
+    "           COUNT(DISTINCT e.document_id) AS doc_count,\n",
+    "           COUNT(DISTINCT d.source_section) AS collection_count\n",
+    "    FROM entities e\n",
+    "    JOIN documents d ON d.id = e.document_id\n",
+    "    WHERE e.entity_type IN ('PERSON', 'ORG', 'GPE')\n",
+    "    GROUP BY e.entity_text, e.entity_type\n",
+    "    ORDER BY doc_count DESC, mention_count DESC, e.entity_text, e.entity_type\n",
+    "    LIMIT 10\n",
+    "\"\"\")\n",
+    "\n",
+    "print('Top 10 Entities (by document count):')\n",
+    "print(top_entities.to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Top Topics Per Collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Topics per collection ----\n",
+    "# Print the first five topics of each source section; rows arrive ordered by\n",
+    "# document_count descending within a section.\n",
+    "try:\n",
+    "    topics_df = fetch_df(\"\"\"\n",
+    "        SELECT source_section, topic_label, document_count, top_words\n",
+    "        FROM topics\n",
+    "        WHERE topic_label IS NOT NULL\n",
+    "        ORDER BY source_section, document_count DESC\n",
+    "    \"\"\")\n",
+    "\n",
+    "    if topics_df.empty:\n",
+    "        print('No topic data available.')\n",
+    "    else:\n",
+    "        shown_columns = ['topic_label', 'document_count']\n",
+    "        for section_key in topics_df['source_section'].unique():\n",
+    "            in_section = topics_df['source_section'] == section_key\n",
+    "            top_five = topics_df.loc[in_section].head(5)\n",
+    "            # Fall back to the raw section key when no label is configured.\n",
+    "            heading = COLLECTION_LABELS.get(section_key, section_key)\n",
+    "            print(f'\\n--- {heading} ---')\n",
+    "            print(top_five[shown_columns].to_string(index=False))\n",
+    "except Exception as topics_error:\n",
+    "    print(f'Topics table not available: {topics_error}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Forensic Alerts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Documents with most redactions ----\n",
+    "# Lists the ten documents with the highest positive `total_redactions`\n",
+    "# feature value. A failure is reported together with the underlying error,\n",
+    "# so a broken query is not mistaken for missing data.\n",
+    "try:\n",
+    "    most_redacted = fetch_df(\"\"\"\n",
+    "        SELECT df.document_id, d.source_section, d.filename,\n",
+    "               df.feature_value AS total_redactions\n",
+    "        FROM document_features df\n",
+    "        JOIN documents d ON d.id = df.document_id\n",
+    "        WHERE df.feature_name = 'total_redactions' AND df.feature_value > 0\n",
+    "        ORDER BY df.feature_value DESC\n",
+    "        LIMIT 10\n",
+    "    \"\"\")\n",
+    "    print('ALERT: Most Redacted Documents')\n",
+    "    if not most_redacted.empty:\n",
+    "        print(most_redacted.to_string(index=False))\n",
+    "    else:\n",
+    "        print('  No redacted documents found.')\n",
+    "except Exception as e:\n",
+    "    print(f'  Redaction data not available: {e}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Documents with lowest OCR confidence ----\n",
+    "# Lists the ten documents with the smallest `avg_ocr_confidence` value.\n",
+    "# Rows without a value are excluded so a missing measurement is never shown\n",
+    "# as a low-confidence document. A failure is reported with its error.\n",
+    "try:\n",
+    "    lowest_conf = fetch_df(\"\"\"\n",
+    "        SELECT df.document_id, d.source_section, d.filename,\n",
+    "               df.feature_value AS avg_ocr_confidence\n",
+    "        FROM document_features df\n",
+    "        JOIN documents d ON d.id = df.document_id\n",
+    "        WHERE df.feature_name = 'avg_ocr_confidence'\n",
+    "          AND df.feature_value IS NOT NULL\n",
+    "        ORDER BY df.feature_value ASC\n",
+    "        LIMIT 10\n",
+    "    \"\"\")\n",
+    "    print('\\nALERT: Lowest OCR Confidence Documents')\n",
+    "    if not lowest_conf.empty:\n",
+    "        print(lowest_conf.to_string(index=False))\n",
+    "    else:\n",
+    "        print('  No OCR confidence data found.')\n",
+    "except Exception as e:\n",
+    "    print(f'  OCR confidence data not available: {e}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Classification stamps summary ----\n",
+    "def _as_stamp_list(raw):\n",
+    "    \"\"\"Normalise one feature_json value to a list of stamps.\n",
+    "\n",
+    "    JSON text is decoded first. A bare string counts as a single stamp\n",
+    "    (extending a list with it would add one entry per character); lists,\n",
+    "    tuples and dicts are expanded element by element; anything else\n",
+    "    yields an empty list.\n",
+    "    \"\"\"\n",
+    "    parsed = json.loads(raw) if isinstance(raw, str) else raw\n",
+    "    if isinstance(parsed, str):\n",
+    "        return [parsed] if parsed else []\n",
+    "    if isinstance(parsed, (list, tuple, dict)):\n",
+    "        return list(parsed)\n",
+    "    return []\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    stamps_df = fetch_df(\"\"\"\n",
+    "        SELECT df.feature_json AS stamps, d.source_section\n",
+    "        FROM document_features df\n",
+    "        JOIN documents d ON d.id = df.document_id\n",
+    "        WHERE df.feature_name = 'classification_stamps'\n",
+    "          AND df.feature_json IS NOT NULL\n",
+    "          AND df.feature_json != '[]'\n",
+    "    \"\"\")\n",
+    "\n",
+    "    # One list per row, keeping only rows that actually carry a stamp so the\n",
+    "    # total printed below is not inflated by values that decode to nothing.\n",
+    "    stamps_per_row = [_as_stamp_list(raw) for raw in stamps_df.get('stamps', [])]\n",
+    "    stamps_per_row = [stamps for stamps in stamps_per_row if stamps]\n",
+    "    all_stamps = [stamp for stamps in stamps_per_row for stamp in stamps]\n",
+    "\n",
+    "    print('\\nALERT: Classification Stamps Found')\n",
+    "    if all_stamps:\n",
+    "        stamp_counts = pd.Series(all_stamps).value_counts()\n",
+    "        print(stamp_counts.to_string())\n",
+    "        print(f'\\nTotal documents with stamps: {len(stamps_per_row)}')\n",
+    "    else:\n",
+    "        print('  No stamps found.')\n",
+    "except Exception as e:\n",
+    "    print(f'  Stamp data not available: {e}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Entity Network Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Most connected entities (by relationship count) ----\n",
+    "# An entity can appear on either side of a relationship, so occurrences as\n",
+    "# entity_a and as entity_b are counted separately and then summed per entity.\n",
+    "# The outer SELECT must aggregate the per-side counts: selecting the raw\n",
+    "# column while grouping only by entity is rejected by PostgreSQL, which made\n",
+    "# this cell always fall through to the 'not available' message.\n",
+    "try:\n",
+    "    connected = fetch_df(\"\"\"\n",
+    "        SELECT entity,\n",
+    "               CAST(SUM(side_count) AS BIGINT) AS relationship_count\n",
+    "        FROM (\n",
+    "            SELECT entity_a AS entity, COUNT(*) AS side_count\n",
+    "            FROM entity_relationships\n",
+    "            GROUP BY entity_a\n",
+    "            UNION ALL\n",
+    "            SELECT entity_b AS entity, COUNT(*) AS side_count\n",
+    "            FROM entity_relationships\n",
+    "            GROUP BY entity_b\n",
+    "        ) sub\n",
+    "        GROUP BY entity\n",
+    "        ORDER BY relationship_count DESC, entity\n",
+    "        LIMIT 20\n",
+    "    \"\"\")\n",
+    "    print('Most Connected Entities (by relationship count):')\n",
+    "    if not connected.empty:\n",
+    "        print(connected.to_string(index=False))\n",
+    "    else:\n",
+    "        print('  No entity relationships found.')\n",
+    "except Exception as e:\n",
+    "    print(f'  Entity relationship data not available: {e}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Community summary (if available) ----\n",
+    "# Counts document_features rows per `community` value stored under the\n",
+    "# 'community_id' feature and shows the ten largest groups.\n",
+    "# NOTE(review): the heading says entity communities but the rows come from\n",
+    "# document_features -- confirm what a row represents.\n",
+    "try:\n",
+    "    communities = fetch_df(\"\"\"\n",
+    "        SELECT feature_json->>'community' AS community,\n",
+    "               COUNT(*) AS member_count\n",
+    "        FROM document_features\n",
+    "        WHERE feature_name = 'community_id'\n",
+    "        GROUP BY feature_json->>'community'\n",
+    "        ORDER BY member_count DESC\n",
+    "        LIMIT 10\n",
+    "    \"\"\")\n",
+    "    print('\\nLargest Entity Communities:')\n",
+    "    if not communities.empty:\n",
+    "        print(communities.to_string(index=False))\n",
+    "    else:\n",
+    "        print('  No community assignments found.')\n",
+    "except Exception as e:\n",
+    "    # Community data may not be available yet; say so instead of printing\n",
+    "    # nothing, so the gap is visible in the dashboard.\n",
+    "    print(f'  Community data not available: {e}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Key Metrics Report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Printable summary report ----\n",
+    "# documents, pages and entities are treated as required: a failure there\n",
+    "# propagates. Every other section is best-effort, but an unavailable table is\n",
+    "# reported in the output instead of being silently skipped, so gaps are\n",
+    "# visible in the printed report.\n",
+    "REPORT_RULE = '=' * 70\n",
+    "LABEL_WIDTH = 19  # width of the label column, e.g. 'Topics Discovered: '\n",
+    "NOT_AVAILABLE = 'n/a'\n",
+    "\n",
+    "\n",
+    "def _table_row_count(table_name):\n",
+    "    \"\"\"Return the number of rows in table_name as a plain int.\n",
+    "\n",
+    "    table_name is interpolated into the SQL text, so pass only the fixed\n",
+    "    table names used in this cell, never external input.\n",
+    "    \"\"\"\n",
+    "    counts = fetch_df(f'SELECT COUNT(*) AS cnt FROM {table_name}')\n",
+    "    return int(counts['cnt'].iloc[0])\n",
+    "\n",
+    "\n",
+    "def _metric_line(label, value):\n",
+    "    \"\"\"Format one report line: padded label, then right-aligned value.\"\"\"\n",
+    "    shown = f'{value:,}' if isinstance(value, int) else str(value)\n",
+    "    return f'{label:<{LABEL_WIDTH}}{shown:>10}'\n",
+    "\n",
+    "\n",
+    "report_generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n",
+    "print(REPORT_RULE)\n",
+    "print('RESEARCH ANALYSIS -- KEY METRICS REPORT')\n",
+    "print(f'Generated: {report_generated_at}')\n",
+    "print(REPORT_RULE)\n",
+    "\n",
+    "# Document counts (all three queries run before anything is printed)\n",
+    "core_counts = [\n",
+    "    (label, _table_row_count(table_name))\n",
+    "    for label, table_name in (\n",
+    "        ('Total Documents:', 'documents'),\n",
+    "        ('Total Pages:', 'pages'),\n",
+    "        ('Total Entities:', 'entities'),\n",
+    "    )\n",
+    "]\n",
+    "print()\n",
+    "for label, count in core_counts:\n",
+    "    print(_metric_line(label, count))\n",
+    "\n",
+    "# Feature counts\n",
+    "try:\n",
+    "    feat_counts = fetch_df(\"\"\"\n",
+    "        SELECT feature_name, COUNT(*) AS cnt\n",
+    "        FROM document_features\n",
+    "        GROUP BY feature_name\n",
+    "        ORDER BY cnt DESC\n",
+    "    \"\"\")\n",
+    "    print('\\nDocument Features Computed:')\n",
+    "    for name, cnt in zip(feat_counts['feature_name'], feat_counts['cnt']):\n",
+    "        print(f'  {name:<30} {int(cnt):>8,}')\n",
+    "except Exception as e:\n",
+    "    print(f'\\nDocument Features Computed: not available ({e})')\n",
+    "\n",
+    "# Topic and duplicate-pair counts\n",
+    "print()\n",
+    "for label, table_name in (\n",
+    "    ('Topics Discovered:', 'topics'),\n",
+    "    ('Duplicate Pairs:', 'duplicate_pairs'),\n",
+    "):\n",
+    "    try:\n",
+    "        count = _table_row_count(table_name)\n",
+    "    except Exception:\n",
+    "        # Table not present yet: keep the line in the report, marked n/a.\n",
+    "        count = NOT_AVAILABLE\n",
+    "    print(_metric_line(label, count))\n",
+    "\n",
+    "# Pipeline runs\n",
+    "try:\n",
+    "    runs = fetch_df(\"\"\"\n",
+    "        SELECT pipeline_name, status, COUNT(*) AS runs,\n",
+    "               SUM(documents_processed) AS total_docs_processed\n",
+    "        FROM analysis_runs\n",
+    "        GROUP BY pipeline_name, status\n",
+    "        ORDER BY pipeline_name, status\n",
+    "    \"\"\")\n",
+    "    print('\\nPipeline Runs:')\n",
+    "    print(runs.to_string(index=False))\n",
+    "except Exception as e:\n",
+    "    print(f'\\nPipeline Runs: not available ({e})')\n",
+    "\n",
+    "print('\\n' + REPORT_RULE)\n",
+    "print('END OF REPORT')\n",
+    "print(REPORT_RULE)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}