datamatters24 committed on
Commit
00a070c
·
verified ·
1 Parent(s): eddc997

Upload notebooks/05_cross_analysis/50_timeline_analysis.ipynb with huggingface_hub

Browse files
notebooks/05_cross_analysis/50_timeline_analysis.ipynb ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 50 - Timeline Analysis\n",
8
+ "\n",
9
+ "Pipeline notebook that builds a timeline from DATE entities extracted during NER.\n",
10
+ "\n",
11
+ "- Parses DATE entity text into actual date objects using `dateutil.parser`\n",
12
+ "- Builds entity-date co-occurrence: PERSON/ORG entities appearing on the same page as DATE entities\n",
13
+ "- Stores `timeline_dates` as JSONB in `document_features`\n",
14
+ "- Plots date frequency histogram and summarizes date ranges per collection"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "tags": [
22
+ "parameters"
23
+ ]
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "# Parameters\n",
28
+ "source_section = None"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "import sys, warnings, json\n",
38
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
39
+ "warnings.filterwarnings('ignore')\n",
40
+ "\n",
41
+ "import pandas as pd\n",
42
+ "import numpy as np\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "from dateutil import parser as dateutil_parser\n",
45
+ "from collections import defaultdict\n",
46
+ "\n",
47
+ "from research_lib.config import COLLECTIONS, COLLECTION_LABELS\n",
48
+ "from research_lib.db import fetch_df, upsert_feature\n",
49
+ "from research_lib.incremental import (\n",
50
+ " start_run, finish_run, get_processed_doc_ids,\n",
51
+ ")\n",
52
+ "from research_lib.plotting import (\n",
53
+ " set_style, save_fig, COLLECTION_COLORS, collection_color,\n",
54
+ ")\n",
55
+ "\n",
56
+ "set_style()\n",
57
+ "print('Libraries loaded.')"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# ---- Start incremental run ----\n",
67
+ "PIPELINE = 'timeline_analysis'\n",
68
+ "run_id = start_run(PIPELINE, source_section=source_section)\n",
69
+ "\n",
70
+ "processed_ids = get_processed_doc_ids(PIPELINE, feature_name='timeline_dates')\n",
71
+ "print(f'Already processed: {len(processed_ids)} documents')"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "# ---- Query DATE entities ----\n",
81
+ "section_filter = ''\n",
82
+ "params = []\n",
83
+ "if source_section:\n",
84
+ " section_filter = 'AND d.source_section = %s'\n",
85
+ " params.append(source_section)\n",
86
+ "\n",
87
+ "exclude_clause = ''\n",
88
+ "if processed_ids:\n",
89
+ " exclude_clause = f\"AND e.document_id NOT IN ({','.join(str(i) for i in processed_ids)})\"\n",
90
+ "\n",
91
+ "date_entities = fetch_df(f\"\"\"\n",
92
+ " SELECT e.id, e.document_id, e.page_id, e.entity_text, e.entity_type,\n",
93
+ " d.source_section\n",
94
+ " FROM entities e\n",
95
+ " JOIN documents d ON d.id = e.document_id\n",
96
+ " WHERE e.entity_type = 'DATE'\n",
97
+ " {section_filter}\n",
98
+ " {exclude_clause}\n",
99
+ " ORDER BY e.document_id, e.page_id\n",
100
+ "\"\"\", params or None)\n",
101
+ "\n",
102
+ "print(f'DATE entities to process: {len(date_entities)}')\n",
103
+ "print(f'Across {date_entities[\"document_id\"].nunique()} documents' if len(date_entities) > 0 else '')"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# ---- Parse dates ----\n",
113
+ "def try_parse_date(text):\n",
114
+ " \"\"\"Attempt to parse a date string. Returns ISO format string or None.\"\"\"\n",
115
+ " try:\n",
116
+ " dt = dateutil_parser.parse(text, fuzzy=True)\n",
117
+ " # Reject dates clearly out of range\n",
118
+ " if dt.year < 1900 or dt.year > 2030:\n",
119
+ " return None\n",
120
+ " return dt.strftime('%Y-%m-%d')\n",
121
+ " except (ValueError, OverflowError, TypeError):\n",
122
+ " return None\n",
123
+ "\n",
124
+ "if len(date_entities) > 0:\n",
125
+ " date_entities['parsed_date'] = date_entities['entity_text'].apply(try_parse_date)\n",
126
+ " valid_dates = date_entities.dropna(subset=['parsed_date'])\n",
127
+ " print(f'Successfully parsed: {len(valid_dates)} / {len(date_entities)} '\n",
128
+ " f'({len(valid_dates)/len(date_entities)*100:.1f}%)')\n",
129
+ "else:\n",
130
+ " valid_dates = pd.DataFrame()"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# ---- Build entity-date co-occurrence ----\n",
140
+ "# Find PERSON/ORG entities on the same page as DATE entities\n",
141
+ "if len(valid_dates) > 0:\n",
142
+ " # Get unique page IDs that have dates\n",
143
+ " date_page_ids = valid_dates['page_id'].dropna().unique().tolist()\n",
144
+ "\n",
145
+ " if date_page_ids:\n",
146
+ " page_id_list = ','.join(str(int(pid)) for pid in date_page_ids[:50000]) # cap for safety\n",
147
+ " cooccurrence_df = fetch_df(f\"\"\"\n",
148
+ " SELECT e.entity_text, e.entity_type, e.page_id, e.document_id\n",
149
+ " FROM entities e\n",
150
+ " WHERE e.page_id IN ({page_id_list})\n",
151
+ " AND e.entity_type IN ('PERSON', 'ORG')\n",
152
+ " \"\"\")\n",
153
+ " print(f'PERSON/ORG entities co-occurring with dates: {len(cooccurrence_df)}')\n",
154
+ "\n",
155
+ " # Merge: for each page, which persons/orgs are near which dates\n",
156
+ " date_page = valid_dates[['page_id', 'parsed_date']].drop_duplicates()\n",
157
+ " cooc_merged = cooccurrence_df.merge(date_page, on='page_id', how='inner')\n",
158
+ " print(f'Co-occurrence pairs: {len(cooc_merged)}')\n",
159
+ " else:\n",
160
+ " cooc_merged = pd.DataFrame()\n",
161
+ "else:\n",
162
+ " cooc_merged = pd.DataFrame()"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "# ---- Store timeline_dates in document_features ----\n",
172
+ "if len(valid_dates) > 0:\n",
173
+ " doc_dates = (\n",
174
+ " valid_dates.groupby('document_id')['parsed_date']\n",
175
+ " .apply(lambda x: sorted(x.unique().tolist()))\n",
176
+ " .reset_index()\n",
177
+ " )\n",
178
+ "\n",
179
+ " rows = [\n",
180
+ " (int(r.document_id), 'timeline_dates', None, json.dumps(r.parsed_date))\n",
181
+ " for r in doc_dates.itertuples()\n",
182
+ " ]\n",
183
+ " n = upsert_feature(\n",
184
+ " 'document_features',\n",
185
+ " ['document_id', 'feature_name'],\n",
186
+ " ['feature_value', 'feature_json'],\n",
187
+ " rows,\n",
188
+ " )\n",
189
+ " print(f'Stored timeline_dates for {n} documents')\n",
190
+ "\n",
191
+ " finish_run(run_id, documents_processed=len(doc_dates))\n",
192
+ "else:\n",
193
+ " finish_run(run_id, documents_processed=0)\n",
194
+ " print('No valid dates to store.')"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "# ---- Plot: date frequency histogram ----\n",
204
+ "if len(valid_dates) > 0:\n",
205
+ " valid_dates['date_obj'] = pd.to_datetime(valid_dates['parsed_date'], errors='coerce')\n",
206
+ " date_series = valid_dates['date_obj'].dropna()\n",
207
+ "\n",
208
+ " fig, ax = plt.subplots(figsize=(14, 6))\n",
209
+ " ax.hist(date_series.dt.year, bins=range(1900, 2031), color='#2563eb',\n",
210
+ " edgecolor='white', alpha=0.8)\n",
211
+ " ax.set_title('Distribution of Dates Found in Documents')\n",
212
+ " ax.set_xlabel('Year')\n",
213
+ " ax.set_ylabel('Frequency')\n",
214
+ " plt.tight_layout()\n",
215
+ " save_fig(fig, 'timeline_date_histogram')\n",
216
+ " plt.show()\n",
217
+ "else:\n",
218
+ " print('No dates to plot.')"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": [
227
+ "# ---- Most common date ranges per collection ----\n",
228
+ "if len(valid_dates) > 0:\n",
229
+ " valid_dates['year'] = pd.to_datetime(valid_dates['parsed_date'], errors='coerce').dt.year\n",
230
+ " by_collection = (\n",
231
+ " valid_dates.dropna(subset=['year'])\n",
232
+ " .groupby('source_section')['year']\n",
233
+ " .agg(['min', 'max', 'median', 'count'])\n",
234
+ " .sort_values('count', ascending=False)\n",
235
+ " )\n",
236
+ " print('Date ranges per collection:')\n",
237
+ " print(by_collection.to_string())\n",
238
+ "\n",
239
+ " # Top 10 most common specific dates\n",
240
+ " top_dates = valid_dates['parsed_date'].value_counts().head(10)\n",
241
+ " print('\\nTop 10 most frequently mentioned dates:')\n",
242
+ " print(top_dates.to_string())\n",
243
+ "else:\n",
244
+ " print('No date data available.')"
245
+ ]
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "kernelspec": {
250
+ "display_name": "Python 3",
251
+ "language": "python",
252
+ "name": "python3"
253
+ },
254
+ "language_info": {
255
+ "name": "python",
256
+ "version": "3.10.0"
257
+ }
258
+ },
259
+ "nbformat": 4,
260
+ "nbformat_minor": 5
261
+ }