datamatters24 committed on
Commit
bd76614
·
verified ·
1 Parent(s): cbbd15e

Upload notebooks/04_forensic/44_forensic_dashboard.ipynb with huggingface_hub

Browse files
notebooks/04_forensic/44_forensic_dashboard.ipynb ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 44 - Forensic Dashboard\n",
8
+ "\n",
9
+ "Interactive dashboard summarizing forensic analysis results:\n",
10
+ "- OCR confidence by collection (heatmap)\n",
11
+ "- Redaction counts per collection (bar chart)\n",
12
+ "- Top 20 most redacted documents (table)\n",
13
+ "- Classification stamp distribution (pie chart)\n",
14
+ "- Documents with lowest OCR confidence (table)"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import sys, warnings, json\n",
24
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
25
+ "warnings.filterwarnings('ignore')\n",
26
+ "\n",
27
+ "import pandas as pd\n",
28
+ "import numpy as np\n",
29
+ "import matplotlib.pyplot as plt\n",
30
+ "import seaborn as sns\n",
31
+ "\n",
32
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
33
+ "from research_lib.db import fetch_df\n",
34
+ "from research_lib.plotting import (\n",
35
+ " set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
36
+ ")\n",
37
+ "\n",
38
+ "set_style()\n",
39
+ "print('Libraries loaded.')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# ---- Load document features ----\n",
49
+ "conf_df = fetch_df(\"\"\"\n",
50
+ " SELECT df.document_id, d.source_section, d.filename,\n",
51
+ " df.feature_value AS avg_ocr_confidence\n",
52
+ " FROM document_features df\n",
53
+ " JOIN documents d ON d.id = df.document_id\n",
54
+ " WHERE df.feature_name = 'avg_ocr_confidence'\n",
55
+ "\"\"\")\n",
56
+ "\n",
57
+ "redact_df = fetch_df(\"\"\"\n",
58
+ " SELECT df.document_id, d.source_section, d.filename,\n",
59
+ " df.feature_value AS total_redactions\n",
60
+ " FROM document_features df\n",
61
+ " JOIN documents d ON d.id = df.document_id\n",
62
+ " WHERE df.feature_name = 'total_redactions'\n",
63
+ "\"\"\")\n",
64
+ "\n",
65
+ "stamp_df = fetch_df(\"\"\"\n",
66
+ " SELECT df.document_id, d.source_section,\n",
67
+ " df.feature_json AS stamps\n",
68
+ " FROM document_features df\n",
69
+ " JOIN documents d ON d.id = df.document_id\n",
70
+ " WHERE df.feature_name = 'classification_stamps'\n",
71
+ "\"\"\")\n",
72
+ "\n",
73
+ "print(f'Confidence data: {len(conf_df)} docs')\n",
74
+ "print(f'Redaction data: {len(redact_df)} docs')\n",
75
+ "print(f'Stamp data: {len(stamp_df)} docs')"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "# ---- Heatmap: OCR Confidence by Collection ----\n",
85
+ "if not conf_df.empty:\n",
86
+ " # Create confidence bands per collection\n",
87
+ " bins = [0, 20, 40, 60, 80, 100]\n",
88
+ " labels = ['0-20', '20-40', '40-60', '60-80', '80-100']\n",
89
+ " conf_df['conf_band'] = pd.cut(\n",
90
+ " conf_df['avg_ocr_confidence'], bins=bins, labels=labels, include_lowest=True\n",
91
+ " )\n",
92
+ " pivot = conf_df.groupby(['source_section', 'conf_band']).size().unstack(fill_value=0)\n",
93
+ " # Normalize to percentages\n",
94
+ " pivot_pct = pivot.div(pivot.sum(axis=1), axis=0) * 100\n",
95
+ "\n",
96
+ " fig, ax = plt.subplots(figsize=(12, 8))\n",
97
+ " sns.heatmap(\n",
98
+ " pivot_pct, annot=True, fmt='.1f', cmap='RdYlGn',\n",
99
+ " cbar_kws={'label': '% of Documents'}, ax=ax,\n",
100
+ " )\n",
101
+ " ax.set_title('OCR Confidence Distribution by Collection (%)')\n",
102
+ " ax.set_xlabel('Confidence Band')\n",
103
+ " ax.set_ylabel('Collection')\n",
104
+ " plt.tight_layout()\n",
105
+ " save_fig(fig, 'forensic_ocr_heatmap')\n",
106
+ " plt.show()\n",
107
+ "else:\n",
108
+ " print('No OCR confidence data available.')"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "# ---- Bar chart: documents with redactions per collection ----\n",
118
+ "if not redact_df.empty:\n",
119
+ " redacted_only = redact_df[redact_df['total_redactions'] > 0]\n",
120
+ " by_collection = (\n",
121
+ " redacted_only.groupby('source_section')\n",
122
+ " .size()\n",
123
+ " .reset_index(name='docs_with_redactions')\n",
124
+ " .sort_values('docs_with_redactions', ascending=False)\n",
125
+ " )\n",
126
+ "\n",
127
+ " colors = [collection_color(s) for s in by_collection['source_section']]\n",
128
+ "\n",
129
+ " fig, ax = plt.subplots(figsize=(12, 6))\n",
130
+ " ax.bar(by_collection['source_section'], by_collection['docs_with_redactions'], color=colors)\n",
131
+ " ax.set_title('Documents with Detected Redactions by Collection')\n",
132
+ " ax.set_ylabel('Number of Documents')\n",
133
+ " plt.xticks(rotation=45, ha='right')\n",
134
+ " plt.tight_layout()\n",
135
+ " save_fig(fig, 'forensic_redactions_by_collection')\n",
136
+ " plt.show()\n",
137
+ "else:\n",
138
+ " print('No redaction data available.')"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "# ---- Table: Top 20 most redacted documents ----\n",
148
+ "if not redact_df.empty:\n",
149
+ " top_redacted = (\n",
150
+ " redact_df.nlargest(20, 'total_redactions')\n",
151
+ " [['document_id', 'source_section', 'filename', 'total_redactions']]\n",
152
+ " )\n",
153
+ " print('Top 20 Most Redacted Documents:')\n",
154
+ " print(top_redacted.to_string(index=False))\n",
155
+ "else:\n",
156
+ " print('No redaction data available.')"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "# ---- Pie chart: classification stamp distribution ----\n",
166
+ "if not stamp_df.empty:\n",
167
+ " # Parse stamps JSON and flatten\n",
168
+ " all_stamps = []\n",
169
+ " for _, row in stamp_df.iterrows():\n",
170
+ " stamps = row['stamps']\n",
171
+ " if isinstance(stamps, str):\n",
172
+ " stamps = json.loads(stamps)\n",
173
+ " if stamps:\n",
174
+ " all_stamps.extend(stamps)\n",
175
+ "\n",
176
+ " if all_stamps:\n",
177
+ " stamp_counts = pd.Series(all_stamps).value_counts()\n",
178
+ "\n",
179
+ " fig, ax = plt.subplots(figsize=(10, 8))\n",
180
+ " ax.pie(\n",
181
+ " stamp_counts.values,\n",
182
+ " labels=stamp_counts.index,\n",
183
+ " autopct='%1.1f%%',\n",
184
+ " startangle=140,\n",
185
+ " colors=sns.color_palette('Set2', len(stamp_counts)),\n",
186
+ " )\n",
187
+ " ax.set_title('Classification Stamp Distribution')\n",
188
+ " plt.tight_layout()\n",
189
+ " save_fig(fig, 'forensic_stamps_pie')\n",
190
+ " plt.show()\n",
191
+ " else:\n",
192
+ " print('No classification stamps found in any documents.')\n",
193
+ "else:\n",
194
+ " print('No stamp data available.')"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "# ---- Table: Documents with lowest OCR confidence ----\n",
204
+ "if not conf_df.empty:\n",
205
+ " lowest = (\n",
206
+ " conf_df.nsmallest(20, 'avg_ocr_confidence')\n",
207
+ " [['document_id', 'source_section', 'filename', 'avg_ocr_confidence']]\n",
208
+ " )\n",
209
+ " print('Documents with Lowest OCR Confidence:')\n",
210
+ " print(lowest.to_string(index=False))\n",
211
+ "else:\n",
212
+ " print('No OCR confidence data available.')"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "# ---- Combined forensic risk score ----\n",
222
+ "# Quick summary combining redactions + low confidence\n",
223
+ "if not conf_df.empty and not redact_df.empty:\n",
224
+ " merged = conf_df[['document_id', 'source_section', 'filename', 'avg_ocr_confidence']].merge(\n",
225
+ " redact_df[['document_id', 'total_redactions']],\n",
226
+ " on='document_id', how='left',\n",
227
+ " )\n",
228
+ " merged['total_redactions'] = merged['total_redactions'].fillna(0)\n",
229
+ "\n",
230
+ " # Flag: low confidence + has redactions\n",
231
+ " flagged = merged[\n",
232
+ " (merged['avg_ocr_confidence'] < 40) & (merged['total_redactions'] > 0)\n",
233
+ " ].sort_values('avg_ocr_confidence')\n",
234
+ "\n",
235
+ " print(f'\\nDocuments with BOTH low confidence (<40) AND redactions: {len(flagged)}')\n",
236
+ " if len(flagged) > 0:\n",
237
+ " print(flagged.head(20).to_string(index=False))"
238
+ ]
239
+ }
240
+ ],
241
+ "metadata": {
242
+ "kernelspec": {
243
+ "display_name": "Python 3",
244
+ "language": "python",
245
+ "name": "python3"
246
+ },
247
+ "language_info": {
248
+ "name": "python",
249
+ "version": "3.10.0"
250
+ }
251
+ },
252
+ "nbformat": 4,
253
+ "nbformat_minor": 5
254
+ }