datamatters24 committed on
Commit
00a070c
·
verified ·
1 Parent(s): eddc997

Upload notebooks/05_cross_analysis/50_timeline_analysis.ipynb with huggingface_hub

Browse files
notebooks/05_cross_analysis/50_timeline_analysis.ipynb ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 50 - Timeline Analysis\n",
8
+ "\n",
9
+ "Pipeline notebook that builds a timeline from DATE entities extracted during NER.\n",
10
+ "\n",
11
+ "- Parses DATE entity text into actual date objects using `dateutil.parser`\n",
12
+ "- Builds entity-date co-occurrence: PERSON/ORG entities appearing on the same page as DATE entities\n",
13
+ "- Stores `timeline_dates` as JSONB in `document_features`\n",
14
+ "- Plots date frequency histogram and summarizes date ranges per collection"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "tags": [
22
+ "parameters"
23
+ ]
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "# Parameters\n",
28
+ "source_section = None"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "import sys, warnings, json\n",
38
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
39
+ "warnings.filterwarnings('ignore')\n",
40
+ "\n",
41
+ "import pandas as pd\n",
42
+ "import numpy as np\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "from dateutil import parser as dateutil_parser\n",
45
+ "from collections import defaultdict\n",
46
+ "\n",
47
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
48
+ "from research_lib.db import fetch_df, upsert_feature\n",
49
+ "from research_lib.incremental import (\n",
50
+ " start_run, finish_run, get_processed_doc_ids,\n",
51
+ ")\n",
52
+ "from research_lib.plotting import (\n",
53
+ " set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
54
+ ")\n",
55
+ "\n",
56
+ "set_style()\n",
57
+ "print('Libraries loaded.')"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# ---- Start incremental run ----\n",
67
+ "PIPELINE = 'timeline_analysis'\n",
68
+ "run_id = start_run(PIPELINE, source_section=source_section)\n",
69
+ "\n",
70
+ "processed_ids = get_processed_doc_ids(PIPELINE, feature_name='timeline_dates')\n",
71
+ "print(f'Already processed: {len(processed_ids)} documents')"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "# ---- Query DATE entities ----\n",
81
+ "section_filter = ''\n",
82
+ "params = []\n",
83
+ "if source_section:\n",
84
+ " section_filter = 'AND d.source_section = %s'\n",
85
+ " params.append(source_section)\n",
86
+ "\n",
87
+ "exclude_clause = ''\n",
88
+ "if processed_ids:\n",
89
+ " exclude_clause = f\"AND e.document_id NOT IN ({','.join(str(i) for i in processed_ids)})\"\n",
90
+ "\n",
91
+ "date_entities = fetch_df(f\"\"\"\n",
92
+ " SELECT e.id, e.document_id, e.page_id, e.entity_text, e.entity_type,\n",
93
+ " d.source_section\n",
94
+ " FROM entities e\n",
95
+ " JOIN documents d ON d.id = e.document_id\n",
96
+ " WHERE e.entity_type = 'DATE'\n",
97
+ " {section_filter}\n",
98
+ " {exclude_clause}\n",
99
+ " ORDER BY e.document_id, e.page_id\n",
100
+ "\"\"\", params or None)\n",
101
+ "\n",
102
+ "print(f'DATE entities to process: {len(date_entities)}')\n",
103
+ "print(f'Across {date_entities[\"document_id\"].nunique()} documents' if len(date_entities) > 0 else '')"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# ---- Parse dates ----\n",
113
+ "def try_parse_date(text):\n",
114
+ " \"\"\"Attempt to parse a date string. Returns ISO format string or None.\"\"\"\n",
115
+ " try:\n",
116
+ " dt = dateutil_parser.parse(text, fuzzy=True)\n",
117
+ " # Reject dates clearly out of range\n",
118
+ " if dt.year < 1900 or dt.year > 2030:\n",
119
+ " return None\n",
120
+ " return dt.strftime('%Y-%m-%d')\n",
121
+ " except (ValueError, OverflowError, TypeError):\n",
122
+ " return None\n",
123
+ "\n",
124
+ "if len(date_entities) > 0:\n",
125
+ " date_entities['parsed_date'] = date_entities['entity_text'].apply(try_parse_date)\n",
126
+ " valid_dates = date_entities.dropna(subset=['parsed_date'])\n",
127
+ " print(f'Successfully parsed: {len(valid_dates)} / {len(date_entities)} '\n",
128
+ " f'({len(valid_dates)/len(date_entities)*100:.1f}%)')\n",
129
+ "else:\n",
130
+ " valid_dates = pd.DataFrame()"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# ---- Build entity-date co-occurrence ----\n",
140
+ "# Find PERSON/ORG entities on the same page as DATE entities\n",
141
+ "if len(valid_dates) > 0:\n",
142
+ " # Get unique page IDs that have dates\n",
143
+ " date_page_ids = valid_dates['page_id'].dropna().unique().tolist()\n",
144
+ "\n",
145
+ " if date_page_ids:\n",
146
+ " page_id_list = ','.join(str(int(pid)) for pid in date_page_ids[:50000]) # cap for safety\n",
147
+ " cooccurrence_df = fetch_df(f\"\"\"\n",
148
+ " SELECT e.entity_text, e.entity_type, e.page_id, e.document_id\n",
149
+ " FROM entities e\n",
150
+ " WHERE e.page_id IN ({page_id_list})\n",
151
+ " AND e.entity_type IN ('PERSON', 'ORG')\n",
152
+ " \"\"\")\n",
153
+ " print(f'PERSON/ORG entities co-occurring with dates: {len(cooccurrence_df)}')\n",
154
+ "\n",
155
+ " # Merge: for each page, which persons/orgs are near which dates\n",
156
+ " date_page = valid_dates[['page_id', 'parsed_date']].drop_duplicates()\n",
157
+ " cooc_merged = cooccurrence_df.merge(date_page, on='page_id', how='inner')\n",
158
+ " print(f'Co-occurrence pairs: {len(cooc_merged)}')\n",
159
+ " else:\n",
160
+ " cooc_merged = pd.DataFrame()\n",
161
+ "else:\n",
162
+ " cooc_merged = pd.DataFrame()"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "# ---- Store timeline_dates in document_features ----\n",
172
+ "if len(valid_dates) > 0:\n",
173
+ " doc_dates = (\n",
174
+ " valid_dates.groupby('document_id')['parsed_date']\n",
175
+ " .apply(lambda x: sorted(x.unique().tolist()))\n",
176
+ " .reset_index()\n",
177
+ " )\n",
178
+ "\n",
179
+ " rows = [\n",
180
+ " (int(r.document_id), 'timeline_dates', None, json.dumps(r.parsed_date))\n",
181
+ " for r in doc_dates.itertuples()\n",
182
+ " ]\n",
183
+ " n = upsert_feature(\n",
184
+ " 'document_features',\n",
185
+ " ['document_id', 'feature_name'],\n",
186
+ " ['feature_value', 'feature_json'],\n",
187
+ " rows,\n",
188
+ " )\n",
189
+ " print(f'Stored timeline_dates for {n} documents')\n",
190
+ "\n",
191
+ " finish_run(run_id, documents_processed=len(doc_dates))\n",
192
+ "else:\n",
193
+ " finish_run(run_id, documents_processed=0)\n",
194
+ " print('No valid dates to store.')"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "# ---- Plot: date frequency histogram ----\n",
204
+ "if len(valid_dates) > 0:\n",
205
+ " valid_dates['date_obj'] = pd.to_datetime(valid_dates['parsed_date'], errors='coerce')\n",
206
+ " date_series = valid_dates['date_obj'].dropna()\n",
207
+ "\n",
208
+ " fig, ax = plt.subplots(figsize=(14, 6))\n",
209
+ " ax.hist(date_series.dt.year, bins=range(1900, 2031), color='#2563eb',\n",
210
+ " edgecolor='white', alpha=0.8)\n",
211
+ " ax.set_title('Distribution of Dates Found in Documents')\n",
212
+ " ax.set_xlabel('Year')\n",
213
+ " ax.set_ylabel('Frequency')\n",
214
+ " plt.tight_layout()\n",
215
+ " save_fig(fig, 'timeline_date_histogram')\n",
216
+ " plt.show()\n",
217
+ "else:\n",
218
+ " print('No dates to plot.')"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": [
227
+ "# ---- Most common date ranges per collection ----\n",
228
+ "if len(valid_dates) > 0:\n",
229
+ " valid_dates['year'] = pd.to_datetime(valid_dates['parsed_date'], errors='coerce').dt.year\n",
230
+ " by_collection = (\n",
231
+ " valid_dates.dropna(subset=['year'])\n",
232
+ " .groupby('source_section')['year']\n",
233
+ " .agg(['min', 'max', 'median', 'count'])\n",
234
+ " .sort_values('count', ascending=False)\n",
235
+ " )\n",
236
+ " print('Date ranges per collection:')\n",
237
+ " print(by_collection.to_string())\n",
238
+ "\n",
239
+ " # Top 10 most common specific dates\n",
240
+ " top_dates = valid_dates['parsed_date'].value_counts().head(10)\n",
241
+ " print('\\nTop 10 most frequently mentioned dates:')\n",
242
+ " print(top_dates.to_string())\n",
243
+ "else:\n",
244
+ " print('No date data available.')"
245
+ ]
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "kernelspec": {
250
+ "display_name": "Python 3",
251
+ "language": "python",
252
+ "name": "python3"
253
+ },
254
+ "language_info": {
255
+ "name": "python",
256
+ "version": "3.10.0"
257
+ }
258
+ },
259
+ "nbformat": 4,
260
+ "nbformat_minor": 5
261
+ }