datamatters24 committed on
Commit
fd37e45
·
verified ·
1 Parent(s): fbfe2c4

Upload notebooks/01_exploration/10_collection_overview.ipynb with huggingface_hub

Browse files
notebooks/01_exploration/10_collection_overview.ipynb ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Collection Overview\n",
8
+ "\n",
9
+ "High-level statistics and visualizations across all document collections:\n",
10
+ "- Document counts per collection\n",
11
+ "- Page count distributions\n",
12
+ "- OCR confidence distributions\n",
13
+ "- Total size per collection\n",
14
+ "- Processing timeline"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "metadata": {},
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt\n",
23
+ "import seaborn as sns\n",
24
+ "import plotly.express as px\n",
25
+ "\n",
26
+ "from research_lib.db import fetch_df\n",
27
+ "from research_lib.plotting import set_style, save_fig, COLLECTION_COLORS\n",
28
+ "\n",
29
+ "set_style()\n",
30
+ "print(\"Libraries loaded.\")"
31
+ ],
32
+ "execution_count": null,
33
+ "outputs": []
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "metadata": {},
38
+ "source": [
39
+ "# Document counts per collection\n",
40
+ "df_counts = fetch_df(\"\"\"\n",
41
+ " SELECT source_section, COUNT(*) AS doc_count\n",
42
+ " FROM documents\n",
43
+ " GROUP BY source_section\n",
44
+ " ORDER BY doc_count DESC\n",
45
+ "\"\"\")\n",
46
+ "\n",
47
+ "fig, ax = plt.subplots(figsize=(12, 6))\n",
48
+ "colors = [COLLECTION_COLORS.get(s, \"#999999\") for s in df_counts[\"source_section\"]]\n",
49
+ "ax.barh(df_counts[\"source_section\"], df_counts[\"doc_count\"], color=colors)\n",
50
+ "ax.set_xlabel(\"Number of Documents\")\n",
51
+ "ax.set_title(\"Document Count by Collection\")\n",
52
+ "for i, (v, label) in enumerate(zip(df_counts[\"doc_count\"], df_counts[\"source_section\"])):\n",
53
+ " ax.text(v + max(df_counts[\"doc_count\"]) * 0.01, i, f\"{v:,}\", va=\"center\")\n",
54
+ "plt.tight_layout()\n",
55
+ "save_fig(fig, \"collection_doc_counts\")\n",
56
+ "plt.show()\n",
57
+ "\n",
58
+ "print(f\"\\nTotal documents: {df_counts['doc_count'].sum():,}\")"
59
+ ],
60
+ "execution_count": null,
61
+ "outputs": []
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "metadata": {},
66
+ "source": [
67
+ "# Page count distribution per collection (histogram)\n",
68
+ "df_pages = fetch_df(\"\"\"\n",
69
+ " SELECT d.source_section, d.id AS doc_id, COUNT(p.id) AS page_count\n",
70
+ " FROM documents d\n",
71
+ " LEFT JOIN pages p ON p.document_id = d.id\n",
72
+ " GROUP BY d.source_section, d.id\n",
73
+ "\"\"\")\n",
74
+ "\n",
75
+ "fig, ax = plt.subplots(figsize=(14, 6))\n",
76
+ "for section in sorted(df_pages[\"source_section\"].unique()):\n",
77
+ " subset = df_pages[df_pages[\"source_section\"] == section]\n",
78
+ " color = COLLECTION_COLORS.get(section, \"#999999\")\n",
79
+ " ax.hist(subset[\"page_count\"], bins=50, alpha=0.6, label=section, color=color)\n",
80
+ "ax.set_xlabel(\"Pages per Document\")\n",
81
+ "ax.set_ylabel(\"Number of Documents\")\n",
82
+ "ax.set_title(\"Page Count Distribution by Collection\")\n",
83
+ "ax.legend()\n",
84
+ "plt.tight_layout()\n",
85
+ "save_fig(fig, \"page_count_distribution\")\n",
86
+ "plt.show()\n",
87
+ "\n",
88
+ "print(\"\\nPage count statistics per collection:\")\n",
89
+ "print(df_pages.groupby(\"source_section\")[\"page_count\"].describe().round(1).to_string())"
90
+ ],
91
+ "execution_count": null,
92
+ "outputs": []
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "metadata": {},
97
+ "source": [
98
+ "# OCR confidence distribution per collection (box plots)\n",
99
+ "df_ocr = fetch_df(\"\"\"\n",
100
+ " SELECT d.source_section, p.ocr_confidence\n",
101
+ " FROM pages p\n",
102
+ " JOIN documents d ON d.id = p.document_id\n",
103
+ " WHERE p.ocr_confidence IS NOT NULL\n",
104
+ "\"\"\")\n",
105
+ "\n",
106
+ "fig, ax = plt.subplots(figsize=(14, 7))\n",
107
+ "palette = {s: COLLECTION_COLORS.get(s, \"#999999\") for s in df_ocr[\"source_section\"].unique()}\n",
108
+ "sns.boxplot(\n",
109
+ " data=df_ocr,\n",
110
+ " x=\"source_section\",\n",
111
+ " y=\"ocr_confidence\",\n",
112
+ " palette=palette,\n",
113
+ " ax=ax,\n",
114
+ ")\n",
115
+ "ax.set_xlabel(\"Collection\")\n",
116
+ "ax.set_ylabel(\"OCR Confidence\")\n",
117
+ "ax.set_title(\"OCR Confidence Distribution by Collection\")\n",
118
+ "plt.xticks(rotation=45, ha=\"right\")\n",
119
+ "plt.tight_layout()\n",
120
+ "save_fig(fig, \"ocr_confidence_distribution\")\n",
121
+ "plt.show()\n",
122
+ "\n",
123
+ "print(\"\\nOCR confidence statistics per collection:\")\n",
124
+ "print(df_ocr.groupby(\"source_section\")[\"ocr_confidence\"].describe().round(2).to_string())"
125
+ ],
126
+ "execution_count": null,
127
+ "outputs": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "metadata": {},
132
+ "source": [
133
+ "# Total size per collection\n",
134
+ "df_size = fetch_df(\"\"\"\n",
135
+ " SELECT source_section,\n",
136
+ " COUNT(*) AS doc_count,\n",
137
+ " SUM(file_size_bytes) AS total_bytes,\n",
138
+ " AVG(file_size_bytes) AS avg_bytes\n",
139
+ " FROM documents\n",
140
+ " WHERE file_size_bytes IS NOT NULL\n",
141
+ " GROUP BY source_section\n",
142
+ " ORDER BY total_bytes DESC\n",
143
+ "\"\"\")\n",
144
+ "\n",
145
+ "df_size[\"total_gb\"] = df_size[\"total_bytes\"] / (1024**3)\n",
146
+ "df_size[\"avg_mb\"] = df_size[\"avg_bytes\"] / (1024**2)\n",
147
+ "\n",
148
+ "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
149
+ "\n",
150
+ "colors = [COLLECTION_COLORS.get(s, \"#999999\") for s in df_size[\"source_section\"]]\n",
151
+ "axes[0].barh(df_size[\"source_section\"], df_size[\"total_gb\"], color=colors)\n",
152
+ "axes[0].set_xlabel(\"Total Size (GB)\")\n",
153
+ "axes[0].set_title(\"Total Collection Size\")\n",
154
+ "\n",
155
+ "axes[1].barh(df_size[\"source_section\"], df_size[\"avg_mb\"], color=colors)\n",
156
+ "axes[1].set_xlabel(\"Average Document Size (MB)\")\n",
157
+ "axes[1].set_title(\"Average Document Size\")\n",
158
+ "\n",
159
+ "plt.tight_layout()\n",
160
+ "save_fig(fig, \"collection_sizes\")\n",
161
+ "plt.show()\n",
162
+ "\n",
163
+ "print(df_size[[\"source_section\", \"doc_count\", \"total_gb\", \"avg_mb\"]].to_string(index=False))"
164
+ ],
165
+ "execution_count": null,
166
+ "outputs": []
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "metadata": {},
171
+ "source": [
172
+ "# Processing timeline (if processed_at column exists)\n",
173
+ "try:\n",
174
+ " df_timeline = fetch_df(\"\"\"\n",
175
+ " SELECT source_section,\n",
176
+ " DATE(processed_at) AS process_date,\n",
177
+ " COUNT(*) AS docs_processed\n",
178
+ " FROM documents\n",
179
+ " WHERE processed_at IS NOT NULL\n",
180
+ " GROUP BY source_section, DATE(processed_at)\n",
181
+ " ORDER BY process_date\n",
182
+ " \"\"\")\n",
183
+ "\n",
184
+ " if len(df_timeline) > 0:\n",
185
+ " fig = px.line(\n",
186
+ " df_timeline,\n",
187
+ " x=\"process_date\",\n",
188
+ " y=\"docs_processed\",\n",
189
+ " color=\"source_section\",\n",
190
+ " title=\"Document Processing Timeline\",\n",
191
+ " labels={\"process_date\": \"Date\", \"docs_processed\": \"Documents Processed\"},\n",
192
+ " )\n",
193
+ " fig.update_layout(width=1000, height=500)\n",
194
+ " fig.show()\n",
195
+ " else:\n",
196
+ " print(\"No processed_at data available for timeline.\")\n",
197
+ "\n",
198
+ "except Exception as e:\n",
199
+ " print(f\"Timeline not available: {e}\")\n",
200
+ " print(\"The 'processed_at' column may not exist in the documents table.\")"
201
+ ],
202
+ "execution_count": null,
203
+ "outputs": []
204
+ }
205
+ ],
206
+ "metadata": {
207
+ "kernelspec": {
208
+ "display_name": "Python 3",
209
+ "language": "python",
210
+ "name": "python3"
211
+ },
212
+ "language_info": {
213
+ "name": "python",
214
+ "version": "3.10.0"
215
+ }
216
+ },
217
+ "nbformat": 4,
218
+ "nbformat_minor": 5
219
+ }