Upload notebooks/01_exploration/12_sample_documents.ipynb with huggingface_hub
notebooks/01_exploration/12_sample_documents.ipynb
ADDED
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample Documents — OCR Quality Check\n",
    "\n",
    "Draws a random sample of 10 documents per collection to inspect OCR quality:\n",
    "- Basic metadata (file path, total pages, OCR confidence stats)\n",
    "- First 500 characters of OCR text from page 1\n",
    "- Documents flagged for average OCR confidence below 40"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "from research_lib.db import fetch_df\n",
    "\n",
    "pd.set_option(\"display.max_colwidth\", 120)\n",
    "pd.set_option(\"display.max_rows\", 200)\n",
    "print(\"Libraries loaded.\")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Random sample of 10 docs per collection with metadata\n",
    "df_samples = fetch_df(\"\"\"\n",
    "    WITH ranked AS (\n",
    "        SELECT d.id AS doc_id,\n",
    "               d.source_section,\n",
    "               d.file_path,\n",
    "               d.total_pages,\n",
    "               AVG(p.ocr_confidence) AS avg_confidence,\n",
    "               MIN(p.ocr_confidence) AS min_confidence,\n",
    "               MAX(p.ocr_confidence) AS max_confidence,\n",
    "               ROW_NUMBER() OVER (PARTITION BY d.source_section ORDER BY RANDOM()) AS rn\n",
    "        FROM documents d\n",
    "        LEFT JOIN pages p ON p.document_id = d.id\n",
    "        GROUP BY d.id, d.source_section, d.file_path, d.total_pages\n",
    "    )\n",
    "    SELECT doc_id, source_section, file_path, total_pages,\n",
    "           ROUND(avg_confidence::numeric, 2) AS avg_confidence,\n",
    "           ROUND(min_confidence::numeric, 2) AS min_confidence,\n",
    "           ROUND(max_confidence::numeric, 2) AS max_confidence\n",
    "    FROM ranked\n",
    "    WHERE rn <= 10\n",
    "    ORDER BY source_section, doc_id\n",
    "\"\"\")\n",
    "\n",
    "for section in sorted(df_samples[\"source_section\"].unique()):\n",
    "    subset = df_samples[df_samples[\"source_section\"] == section]\n",
    "    print(f\"\\n{'='*80}\")\n",
    "    print(f\"Collection: {section} ({len(subset)} samples)\")\n",
    "    print(f\"{'='*80}\")\n",
    "    display(subset.drop(columns=[\"source_section\"]).reset_index(drop=True))"
   ],
   "execution_count": null,
   "outputs": []
  },
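  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For context alongside the 10-document samples, the cell below is a minimal sketch that counts documents per collection. It assumes only the `documents.source_section` column already used above."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Sketch: total document count per collection, so the samples above can\n",
    "# be read against collection size. Uses only columns queried earlier.\n",
    "df_counts = fetch_df(\"\"\"\n",
    "    SELECT source_section, COUNT(*) AS n_documents\n",
    "    FROM documents\n",
    "    GROUP BY source_section\n",
    "    ORDER BY source_section\n",
    "\"\"\")\n",
    "display(df_counts)"
   ],
   "execution_count": null,
   "outputs": []
  },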
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# For each sample doc, show first 500 chars of OCR text from page 1\n",
    "doc_ids = df_samples[\"doc_id\"].tolist()\n",
    "\n",
    "if doc_ids:\n",
    "    ids_str = \",\".join(str(i) for i in doc_ids)\n",
    "    df_text = fetch_df(f\"\"\"\n",
    "        SELECT p.document_id AS doc_id,\n",
    "               d.source_section,\n",
    "               LEFT(p.ocr_text, 500) AS text_preview,\n",
    "               p.ocr_confidence\n",
    "        FROM pages p\n",
    "        JOIN documents d ON d.id = p.document_id\n",
    "        WHERE p.document_id IN ({ids_str})\n",
    "          AND p.page_number = 1\n",
    "        ORDER BY d.source_section, p.document_id\n",
    "    \"\"\")\n",
    "\n",
    "    for _, row in df_text.iterrows():\n",
    "        print(f\"\\n--- Doc {row['doc_id']} [{row['source_section']}] (conf: {row['ocr_confidence']}) ---\")\n",
    "        print(row[\"text_preview\"])\n",
    "        print(\"...\")\n",
    "else:\n",
    "    print(\"No sample documents found.\")"
   ],
   "execution_count": null,
   "outputs": []
  },
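  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before flagging documents below a fixed cutoff, it helps to see the overall distribution of per-document average confidence. The cell below is a sketch built on the same `documents`/`pages` schema; the threshold of 40 used next is easier to judge against these quantiles."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Sketch: distribution of per-document average OCR confidence, to put the\n",
    "# fixed threshold of 40 (used in the next cell) in context.\n",
    "df_conf = fetch_df(\"\"\"\n",
    "    SELECT AVG(p.ocr_confidence) AS avg_confidence\n",
    "    FROM documents d\n",
    "    JOIN pages p ON p.document_id = d.id\n",
    "    GROUP BY d.id\n",
    "\"\"\")\n",
    "display(df_conf[\"avg_confidence\"].describe())"
   ],
   "execution_count": null,
   "outputs": []
  },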
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Flag documents with average OCR confidence < 40\n",
    "df_low_quality = fetch_df(\"\"\"\n",
    "    SELECT d.id AS doc_id,\n",
    "           d.source_section,\n",
    "           d.file_path,\n",
    "           d.total_pages,\n",
    "           ROUND(AVG(p.ocr_confidence)::numeric, 2) AS avg_confidence,\n",
    "           COUNT(p.id) AS pages_with_ocr\n",
    "    FROM documents d\n",
    "    JOIN pages p ON p.document_id = d.id\n",
    "    GROUP BY d.id, d.source_section, d.file_path, d.total_pages\n",
    "    HAVING AVG(p.ocr_confidence) < 40\n",
    "    ORDER BY AVG(p.ocr_confidence) ASC\n",
    "\"\"\")\n",
    "\n",
    "print(f\"Documents with avg OCR confidence < 40: {len(df_low_quality)}\")\n",
    "print()\n",
    "\n",
    "if len(df_low_quality) > 0:\n",
    "    # Summary by collection\n",
    "    summary = df_low_quality.groupby(\"source_section\").agg(\n",
    "        count=(\"doc_id\", \"count\"),\n",
    "        avg_conf=(\"avg_confidence\", \"mean\"),\n",
    "    ).round(2)\n",
    "    print(\"Low-quality documents by collection:\")\n",
    "    display(summary)\n",
    "\n",
    "    print(\"\\nShowing first 50 low-quality documents:\")\n",
    "    display(df_low_quality.head(50))\n",
    "else:\n",
    "    print(\"No documents below the confidence threshold.\")"
   ],
   "execution_count": null,
   "outputs": []
  },
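  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, persist the flagged list so low-confidence documents can be queued for re-OCR or manual review. This is a sketch; the output filename is an assumption, not part of the project layout."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Sketch: write the flagged documents to CSV for follow-up.\n",
    "# NOTE: the output path below is an assumption; adjust as needed.\n",
    "if len(df_low_quality) > 0:\n",
    "    out_path = \"low_confidence_documents.csv\"\n",
    "    df_low_quality.to_csv(out_path, index=False)\n",
    "    print(f\"Wrote {len(df_low_quality)} rows to {out_path}\")\n",
    "else:\n",
    "    print(\"Nothing to export.\")"
   ],
   "execution_count": null,
   "outputs": []
  }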
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}