datamatters24 committed on
Commit
fd37e45
·
verified ·
1 Parent(s): fbfe2c4

Upload notebooks/01_exploration/10_collection_overview.ipynb with huggingface_hub

Browse files
notebooks/01_exploration/10_collection_overview.ipynb ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Collection Overview\n",
8
+ "\n",
9
+ "High-level statistics and visualizations across all document collections:\n",
10
+ "- Document counts per collection\n",
11
+ "- Page count distributions\n",
12
+ "- OCR confidence distributions\n",
13
+ "- Total size per collection\n",
14
+ "- Processing timeline"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "metadata": {},
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt\n",
23
+ "import seaborn as sns\n",
24
+ "import plotly.express as px\n",
25
+ "\n",
26
+ "from research_lib.db import fetch_df\n",
27
+ "from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
28
+ "\n",
29
+ "set_style()\n",
30
+ "print(\"Libraries loaded.\")"
31
+ ],
32
+ "execution_count": null,
33
+ "outputs": []
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "metadata": {},
38
+ "source": [
39
+ "# Document counts per collection\n",
40
+ "df_counts = fetch_df(\"\"\"\n",
41
+ " SELECT source_section, COUNT(*) AS doc_count\n",
42
+ " FROM documents\n",
43
+ " GROUP BY source_section\n",
44
+ " ORDER BY doc_count DESC\n",
45
+ "\"\"\")\n",
46
+ "\n",
47
+ "fig, ax = plt.subplots(figsize=(12, 6))\n",
48
+ "colors = [COLLECTION_COLORS.get(s, \"#999999\") for s in df_counts[\"source_section\"]]\n",
49
+ "ax.barh(df_counts[\"source_section\"], df_counts[\"doc_count\"], color=colors)\n",
50
+ "ax.set_xlabel(\"Number of Documents\")\n",
51
+ "ax.set_title(\"Document Count by Collection\")\n",
52
+ "for i, (v, label) in enumerate(zip(df_counts[\"doc_count\"], df_counts[\"source_section\"])):\n",
53
+ " ax.text(v + max(df_counts[\"doc_count\"]) * 0.01, i, f\"{v:,}\", va=\"center\")\n",
54
+ "plt.tight_layout()\n",
55
+ "save_fig(fig, \"collection_doc_counts\")\n",
56
+ "plt.show()\n",
57
+ "\n",
58
+ "print(f\"\\nTotal documents: {df_counts['doc_count'].sum():,}\")"
59
+ ],
60
+ "execution_count": null,
61
+ "outputs": []
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "metadata": {},
66
+ "source": [
67
+ "# Page count distribution per collection (histogram)\n",
68
+ "df_pages = fetch_df(\"\"\"\n",
69
+ " SELECT d.source_section, d.id AS doc_id, COUNT(p.id) AS page_count\n",
70
+ " FROM documents d\n",
71
+ " LEFT JOIN pages p ON p.document_id = d.id\n",
72
+ " GROUP BY d.source_section, d.id\n",
73
+ "\"\"\")\n",
74
+ "\n",
75
+ "fig, ax = plt.subplots(figsize=(14, 6))\n",
76
+ "for section in sorted(df_pages[\"source_section\"].unique()):\n",
77
+ " subset = df_pages[df_pages[\"source_section\"] == section]\n",
78
+ " color = COLLECTION_COLORS.get(section, \"#999999\")\n",
79
+ " ax.hist(subset[\"page_count\"], bins=50, alpha=0.6, label=section, color=color)\n",
80
+ "ax.set_xlabel(\"Pages per Document\")\n",
81
+ "ax.set_ylabel(\"Number of Documents\")\n",
82
+ "ax.set_title(\"Page Count Distribution by Collection\")\n",
83
+ "ax.legend()\n",
84
+ "plt.tight_layout()\n",
85
+ "save_fig(fig, \"page_count_distribution\")\n",
86
+ "plt.show()\n",
87
+ "\n",
88
+ "print(\"\\nPage count statistics per collection:\")\n",
89
+ "print(df_pages.groupby(\"source_section\")[\"page_count\"].describe().round(1).to_string())"
90
+ ],
91
+ "execution_count": null,
92
+ "outputs": []
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "metadata": {},
97
+ "source": [
98
+ "# OCR confidence distribution per collection (box plots)\n",
99
+ "df_ocr = fetch_df(\"\"\"\n",
100
+ " SELECT d.source_section, p.ocr_confidence\n",
101
+ " FROM pages p\n",
102
+ " JOIN documents d ON d.id = p.document_id\n",
103
+ " WHERE p.ocr_confidence IS NOT NULL\n",
104
+ "\"\"\")\n",
105
+ "\n",
106
+ "fig, ax = plt.subplots(figsize=(14, 7))\n",
107
+ "palette = {s: COLLECTION_COLORS.get(s, \"#999999\") for s in df_ocr[\"source_section\"].unique()}\n",
108
+ "sns.boxplot(\n",
109
+ " data=df_ocr,\n",
110
+ " x=\"source_section\",\n",
111
+ " y=\"ocr_confidence\",\n",
112
+ " palette=palette,\n",
113
+ " ax=ax,\n",
114
+ ")\n",
115
+ "ax.set_xlabel(\"Collection\")\n",
116
+ "ax.set_ylabel(\"OCR Confidence\")\n",
117
+ "ax.set_title(\"OCR Confidence Distribution by Collection\")\n",
118
+ "plt.xticks(rotation=45, ha=\"right\")\n",
119
+ "plt.tight_layout()\n",
120
+ "save_fig(fig, \"ocr_confidence_distribution\")\n",
121
+ "plt.show()\n",
122
+ "\n",
123
+ "print(\"\\nOCR confidence statistics per collection:\")\n",
124
+ "print(df_ocr.groupby(\"source_section\")[\"ocr_confidence\"].describe().round(2).to_string())"
125
+ ],
126
+ "execution_count": null,
127
+ "outputs": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "metadata": {},
132
+ "source": [
133
+ "# Total size per collection\n",
134
+ "df_size = fetch_df(\"\"\"\n",
135
+ " SELECT source_section,\n",
136
+ " COUNT(*) AS doc_count,\n",
137
+ " SUM(file_size_bytes) AS total_bytes,\n",
138
+ " AVG(file_size_bytes) AS avg_bytes\n",
139
+ " FROM documents\n",
140
+ " WHERE file_size_bytes IS NOT NULL\n",
141
+ " GROUP BY source_section\n",
142
+ " ORDER BY total_bytes DESC\n",
143
+ "\"\"\")\n",
144
+ "\n",
145
+ "df_size[\"total_gb\"] = df_size[\"total_bytes\"] / (1024**3)\n",
146
+ "df_size[\"avg_mb\"] = df_size[\"avg_bytes\"] / (1024**2)\n",
147
+ "\n",
148
+ "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
149
+ "\n",
150
+ "colors = [COLLECTION_COLORS.get(s, \"#999999\") for s in df_size[\"source_section\"]]\n",
151
+ "axes[0].barh(df_size[\"source_section\"], df_size[\"total_gb\"], color=colors)\n",
152
+ "axes[0].set_xlabel(\"Total Size (GB)\")\n",
153
+ "axes[0].set_title(\"Total Collection Size\")\n",
154
+ "\n",
155
+ "axes[1].barh(df_size[\"source_section\"], df_size[\"avg_mb\"], color=colors)\n",
156
+ "axes[1].set_xlabel(\"Average Document Size (MB)\")\n",
157
+ "axes[1].set_title(\"Average Document Size\")\n",
158
+ "\n",
159
+ "plt.tight_layout()\n",
160
+ "save_fig(fig, \"collection_sizes\")\n",
161
+ "plt.show()\n",
162
+ "\n",
163
+ "print(df_size[[\"source_section\", \"doc_count\", \"total_gb\", \"avg_mb\"]].to_string(index=False))"
164
+ ],
165
+ "execution_count": null,
166
+ "outputs": []
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "metadata": {},
171
+ "source": [
172
+ "# Processing timeline (if processed_at column exists)\n",
173
+ "try:\n",
174
+ " df_timeline = fetch_df(\"\"\"\n",
175
+ " SELECT source_section,\n",
176
+ " DATE(processed_at) AS process_date,\n",
177
+ " COUNT(*) AS docs_processed\n",
178
+ " FROM documents\n",
179
+ " WHERE processed_at IS NOT NULL\n",
180
+ " GROUP BY source_section, DATE(processed_at)\n",
181
+ " ORDER BY process_date\n",
182
+ " \"\"\")\n",
183
+ "\n",
184
+ " if len(df_timeline) > 0:\n",
185
+ " fig = px.line(\n",
186
+ " df_timeline,\n",
187
+ " x=\"process_date\",\n",
188
+ " y=\"docs_processed\",\n",
189
+ " color=\"source_section\",\n",
190
+ " title=\"Document Processing Timeline\",\n",
191
+ " labels={\"process_date\": \"Date\", \"docs_processed\": \"Documents Processed\"},\n",
192
+ " )\n",
193
+ " fig.update_layout(width=1000, height=500)\n",
194
+ " fig.show()\n",
195
+ " else:\n",
196
+ " print(\"No processed_at data available for timeline.\")\n",
197
+ "\n",
198
+ "except Exception as e:\n",
199
+ " print(f\"Timeline not available: {e}\")\n",
200
+ " print(\"The 'processed_at' column may not exist in the documents table.\")"
201
+ ],
202
+ "execution_count": null,
203
+ "outputs": []
204
+ }
205
+ ],
206
+ "metadata": {
207
+ "kernelspec": {
208
+ "display_name": "Python 3",
209
+ "language": "python",
210
+ "name": "python3"
211
+ },
212
+ "language_info": {
213
+ "name": "python",
214
+ "version": "3.10.0"
215
+ }
216
+ },
217
+ "nbformat": 4,
218
+ "nbformat_minor": 5
219
+ }