datamatters24 committed on
Commit
fbfe2c4
·
verified ·
1 Parent(s): 07ab7bf

Upload notebooks/01_exploration/12_sample_documents.ipynb with huggingface_hub

Browse files
notebooks/01_exploration/12_sample_documents.ipynb ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sample Documents — OCR Quality Check\n",
8
+ "\n",
9
+ "Random sample of up to 10 documents from each collection to inspect OCR quality:\n",
10
+ "- Basic metadata (file path, total pages, OCR confidence stats)\n",
11
+ "- First 500 characters of OCR text from page 1\n",
12
+ "- Flag all documents (not only the sampled ones) with average OCR confidence below 40"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "metadata": {},
18
+ "source": [
19
+ "import pandas as pd\n",
20
+ "from IPython.display import display, HTML\n",
21
+ "\n",
22
+ "from research_lib.db import fetch_df\n",
23
+ "\n",
24
+ "pd.set_option(\"display.max_colwidth\", 120)\n",
25
+ "pd.set_option(\"display.max_rows\", 200)\n",
26
+ "print(\"Libraries loaded.\")"
27
+ ],
28
+ "execution_count": null,
29
+ "outputs": []
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "metadata": {},
34
+ "source": [
35
+ "# Random sample of up to 10 docs per collection with metadata\n",
36
+ "df_samples = fetch_df(\"\"\"\n",
37
+ " WITH ranked AS (\n",
38
+ " SELECT d.id AS doc_id,\n",
39
+ " d.source_section,\n",
40
+ " d.file_path,\n",
41
+ " d.total_pages,\n",
42
+ " AVG(p.ocr_confidence) AS avg_confidence,\n",
43
+ " MIN(p.ocr_confidence) AS min_confidence,\n",
44
+ " MAX(p.ocr_confidence) AS max_confidence,\n",
45
+ " ROW_NUMBER() OVER (PARTITION BY d.source_section ORDER BY RANDOM()) AS rn\n",
46
+ " FROM documents d\n",
47
+ " LEFT JOIN pages p ON p.document_id = d.id\n",
48
+ " GROUP BY d.id, d.source_section, d.file_path, d.total_pages\n",
49
+ " )\n",
50
+ " SELECT doc_id, source_section, file_path, total_pages,\n",
51
+ " ROUND(avg_confidence::numeric, 2) AS avg_confidence,\n",
52
+ " ROUND(min_confidence::numeric, 2) AS min_confidence,\n",
53
+ " ROUND(max_confidence::numeric, 2) AS max_confidence\n",
54
+ " FROM ranked\n",
55
+ " WHERE rn <= 10\n",
56
+ " ORDER BY source_section, doc_id\n",
57
+ "\"\"\")\n",
58
+ "\n",
59
+ "for section in sorted(df_samples[\"source_section\"].unique()):\n",
60
+ " subset = df_samples[df_samples[\"source_section\"] == section]\n",
61
+ " print(f\"\\n{'='*80}\")\n",
62
+ " print(f\"Collection: {section} ({len(subset)} samples)\")\n",
63
+ " print(f\"{'='*80}\")\n",
64
+ " display(subset.drop(columns=[\"source_section\"]).reset_index(drop=True))"
65
+ ],
66
+ "execution_count": null,
67
+ "outputs": []
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "metadata": {},
72
+ "source": [
73
+ "# For each sample doc, show first 500 chars of OCR text from page 1\n",
74
+ "doc_ids = df_samples[\"doc_id\"].tolist()\n",
75
+ "\n",
76
+ "if doc_ids:\n",
77
+ " ids_str = \",\".join(str(i) for i in doc_ids)\n",
78
+ " df_text = fetch_df(f\"\"\"\n",
79
+ " SELECT p.document_id AS doc_id,\n",
80
+ " d.source_section,\n",
81
+ " LEFT(p.ocr_text, 500) AS text_preview,\n",
82
+ " p.ocr_confidence\n",
83
+ " FROM pages p\n",
84
+ " JOIN documents d ON d.id = p.document_id\n",
85
+ " WHERE p.document_id IN ({ids_str})\n",
86
+ " AND p.page_number = 1\n",
87
+ " ORDER BY d.source_section, p.document_id\n",
88
+ " \"\"\")\n",
89
+ "\n",
90
+ " for _, row in df_text.iterrows():\n",
91
+ " print(f\"\\n--- Doc {row['doc_id']} [{row['source_section']}] (conf: {row['ocr_confidence']}) ---\")\n",
92
+ " print(row[\"text_preview\"])\n",
93
+ " print(\"...\")\n",
94
+ "else:\n",
95
+ " print(\"No sample documents found.\")"
96
+ ],
97
+ "execution_count": null,
98
+ "outputs": []
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "metadata": {},
103
+ "source": [
104
+ "# Flag documents with average OCR confidence < 40\n",
105
+ "df_low_quality = fetch_df(\"\"\"\n",
106
+ " SELECT d.id AS doc_id,\n",
107
+ " d.source_section,\n",
108
+ " d.file_path,\n",
109
+ " d.total_pages,\n",
110
+ " ROUND(AVG(p.ocr_confidence)::numeric, 2) AS avg_confidence,\n",
111
+ " COUNT(p.id) AS pages_with_ocr\n",
112
+ " FROM documents d\n",
113
+ " JOIN pages p ON p.document_id = d.id\n",
114
+ " GROUP BY d.id, d.source_section, d.file_path, d.total_pages\n",
115
+ " HAVING AVG(p.ocr_confidence) < 40\n",
116
+ " ORDER BY AVG(p.ocr_confidence) ASC\n",
117
+ "\"\"\")\n",
118
+ "\n",
119
+ "print(f\"Documents with avg OCR confidence < 40: {len(df_low_quality)}\")\n",
120
+ "print()\n",
121
+ "\n",
122
+ "if len(df_low_quality) > 0:\n",
123
+ " # Summary by collection\n",
124
+ " summary = df_low_quality.groupby(\"source_section\").agg(\n",
125
+ " count=(\"doc_id\", \"count\"),\n",
126
+ " avg_conf=(\"avg_confidence\", \"mean\"),\n",
127
+ " ).round(2)\n",
128
+ " print(\"Low-quality documents by collection:\")\n",
129
+ " display(summary)\n",
130
+ "\n",
131
+ " print(f\"\\nShowing first 50 low-quality documents:\")\n",
132
+ " display(df_low_quality.head(50))\n",
133
+ "else:\n",
134
+ " print(\"No documents below the confidence threshold.\")"
135
+ ],
136
+ "execution_count": null,
137
+ "outputs": []
138
+ }
139
+ ],
140
+ "metadata": {
141
+ "kernelspec": {
142
+ "display_name": "Python 3",
143
+ "language": "python",
144
+ "name": "python3"
145
+ },
146
+ "language_info": {
147
+ "name": "python",
148
+ "version": "3.10.0"
149
+ }
150
+ },
151
+ "nbformat": 4,
152
+ "nbformat_minor": 5
153
+ }