datamatters24 committed on
Commit
fe76792
·
verified ·
1 Parent(s): 6c33ee2

Upload ml/03_correlate_crises.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/03_correlate_crises.py +322 -0
ml/03_correlate_crises.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Stage 3: Correlate documents with historical crisis events.
4
+
5
+ Scoring methods:
6
+ 1. Date overlap — document date falls within event date range (or within buffer)
7
+ 2. Keyword match — OCR text contains event-specific keywords
8
+ 3. Entity overlap — extracted entities match event keywords
9
+ 4. Collection affinity — source_section naturally maps to certain events
10
+
11
+ Each method contributes a partial score; combined score determines relevance.
12
+
13
+ Populates: document_events table
14
+ """
15
+
16
+ import json
17
+ import logging
18
+ import re
19
+ import sys
20
+ from datetime import date, timedelta
21
+
22
+ import psycopg2
23
+ import psycopg2.extras
24
+ from config import BATCH_SIZE
25
+ from db import get_conn, fetch_all
26
+
27
# Log to stdout so the pipeline output is visible in container/job logs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log = logging.getLogger(__name__)

# Buffer days: documents created within this window of an event still correlate
DATE_BUFFER_DAYS = 365

# Minimum total score to record a correlation
MIN_SCORE = 0.15

# Collection -> event affinity (source_section maps naturally to events).
# Keys are `documents.source_section` values; values are `event_name` strings
# that must match `historical_events.event_name` exactly.
COLLECTION_AFFINITY = {
    "jfk_assassination": ["JFK Assassination", "RFK Assassination", "MLK Assassination",
                          "Church Committee Investigations", "Warren Commission"],
    "cia_mkultra": ["MKUltra Program", "Church Committee Investigations"],
    "cia_stargate": ["CIA Stargate / Remote Viewing Program"],
    "cia_declassified": ["Bay of Pigs Invasion", "Cuban Missile Crisis",
                         "Area 51 / U-2 Program", "Iran-Contra Affair"],
    "lincoln_archives": ["Lincoln Assassination", "Civil War End / Reconstruction"],
}
50
+
51
+
52
def load_events():
    """Fetch all rows of historical_events and normalize them for scoring.

    The keywords column may arrive as a JSON string or an already-decoded
    list; both forms are accepted. Keywords are lower-cased so downstream
    matching is case-insensitive, and point-in-time events (no end_date)
    fall back to their start_date as the end.
    """
    normalized = []
    for row in fetch_all("SELECT * FROM historical_events ORDER BY id"):
        keywords = row["keywords"]
        if isinstance(keywords, str):
            keywords = json.loads(keywords)
        normalized.append({
            "id": row["id"],
            "name": row["event_name"],
            "start": row["start_date"],
            # Single-day events store NULL end_date; treat as start_date.
            "end": row["end_date"] or row["start_date"],
            "category": row["category"],
            "keywords": [word.lower() for word in keywords],
        })
    return normalized
69
+
70
+
71
+ def score_date_overlap(doc_date: date | None, doc_range_start: date | None,
72
+ doc_range_end: date | None, event: dict) -> float:
73
+ """Score based on temporal overlap between document date and event."""
74
+ if not doc_date:
75
+ return 0.0
76
+
77
+ ev_start = event["start"] - timedelta(days=DATE_BUFFER_DAYS)
78
+ ev_end = event["end"] + timedelta(days=DATE_BUFFER_DAYS)
79
+
80
+ # Direct overlap: doc date within event range (no buffer)
81
+ if event["start"] <= doc_date <= event["end"]:
82
+ return 0.5
83
+
84
+ # Within buffer range
85
+ if ev_start <= doc_date <= ev_end:
86
+ # Score decays with distance
87
+ if doc_date < event["start"]:
88
+ days_away = (event["start"] - doc_date).days
89
+ else:
90
+ days_away = (doc_date - event["end"]).days
91
+ decay = max(0, 1.0 - days_away / DATE_BUFFER_DAYS)
92
+ return 0.3 * decay
93
+
94
+ # Check doc range overlap with event range
95
+ if doc_range_start and doc_range_end:
96
+ if doc_range_start <= event["end"] and doc_range_end >= event["start"]:
97
+ # Partial overlap
98
+ overlap_start = max(doc_range_start, event["start"])
99
+ overlap_end = min(doc_range_end, event["end"])
100
+ overlap_days = (overlap_end - overlap_start).days + 1
101
+ doc_span = (doc_range_end - doc_range_start).days + 1
102
+ if doc_span > 0:
103
+ return 0.3 * min(overlap_days / doc_span, 1.0)
104
+
105
+ return 0.0
106
+
107
+
108
def score_keyword_match(doc_id: int, event: dict, conn) -> tuple[float, list[str]]:
    """Check the first five OCR'd pages of a document for event keywords.

    Short keywords (fewer than 5 chars) are matched on word boundaries to
    reduce false positives; longer keywords use plain substring search.
    All keywords are assumed pre-lowercased (see load_events).

    Returns:
        (score, matched_keywords): 0.0 with no matches; otherwise 0.15 plus
        up to 0.25 scaled by the fraction of keywords hit, capped at 0.4.
    """
    keywords = event["keywords"]
    if not keywords:
        return 0.0, []

    with conn.cursor() as cur:
        cur.execute(
            """SELECT string_agg(ocr_text, ' ') as combined_text
               FROM (
                   SELECT ocr_text FROM pages
                   WHERE document_id = %s AND ocr_text IS NOT NULL
                   ORDER BY page_number
                   LIMIT 5
               ) sub""",
            (doc_id,),
        )
        row = cur.fetchone()

    if not row or not row[0]:
        return 0.0, []

    haystack = row[0].lower()
    hits = []
    for keyword in keywords:
        if len(keyword) < 5:
            # Word-boundary match keeps short tokens from firing inside
            # unrelated words.
            if re.search(r'\b' + re.escape(keyword) + r'\b', haystack):
                hits.append(keyword)
        elif keyword in haystack:
            hits.append(keyword)

    if not hits:
        return 0.0, hits

    # More keyword matches -> higher score, capped at 0.4.
    fraction = len(hits) / len(keywords)
    return min(0.4, 0.15 + 0.25 * fraction), hits
151
+
152
+
153
def score_entity_match(doc_id: int, event: dict, conn) -> tuple[float, list[str]]:
    """Score overlap between a document's named entities and event keywords.

    Fetches the document's distinct PERSON/ORG/GPE/EVENT/NORP entities
    (lower-cased in SQL) and counts a keyword as matched when it contains,
    or is contained in, any entity string.

    Returns:
        (score, matched_keywords): 0.0 with no matches; otherwise 0.1 plus
        up to 0.2 scaled by the fraction of keywords matched, capped at 0.3.
    """
    keywords = event["keywords"]
    if not keywords:
        return 0.0, []

    with conn.cursor() as cur:
        cur.execute(
            """SELECT DISTINCT lower(entity_text) as ent
               FROM entities
               WHERE document_id = %s
               AND entity_type IN ('PERSON', 'ORG', 'GPE', 'EVENT', 'NORP')""",
            (doc_id,),
        )
        entity_set = {record[0] for record in cur.fetchall()}

    if not entity_set:
        return 0.0, []

    # Substring containment in either direction counts; each keyword is
    # counted at most once regardless of how many entities it matches.
    hits = [kw for kw in keywords
            if any(kw in ent or ent in kw for ent in entity_set)]

    if not hits:
        return 0.0, hits

    return min(0.3, 0.1 + 0.2 * len(hits) / len(keywords)), hits
185
+
186
+
187
def score_collection_affinity(source_section: str, event: dict) -> float:
    """Return 0.2 when the document's collection naturally maps to the event.

    The mapping lives in the module-level COLLECTION_AFFINITY table; any
    collection not listed there (or event not in its list) scores 0.0.
    """
    return 0.2 if event["name"] in COLLECTION_AFFINITY.get(source_section, []) else 0.0
193
+
194
+
195
def process_correlations():
    """Main correlation loop.

    Scores every not-yet-correlated document against every historical event
    using up to four partial signals (date overlap, collection affinity,
    keyword match, entity match) and upserts rows into document_events
    whenever the combined score reaches MIN_SCORE. Batches are flushed every
    500 documents; a per-event summary is logged at the end.
    """
    events = load_events()
    log.info(f"Loaded {len(events)} historical events")

    conn = get_conn()
    conn.autocommit = False

    # BUG FIX: wrap the whole scoring pass in try/finally so the connection
    # is released even when a query or flush raises mid-loop (previously the
    # connection leaked on any exception).
    try:
        # Get documents with dates, that haven't been correlated yet
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT d.id, d.source_section, d.file_path,
                       dd.estimated_date, dd.date_range_start, dd.date_range_end
                FROM documents d
                LEFT JOIN document_dates dd ON dd.document_id = d.id
                WHERE NOT EXISTS (
                    SELECT 1 FROM document_events de WHERE de.document_id = d.id
                )
                ORDER BY d.id
            """)
            docs = cur.fetchall()

        total = len(docs)
        log.info(f"Processing {total} documents for crisis correlation")

        batch = []
        processed = 0
        correlations_found = 0

        for doc in docs:
            doc_id = doc["id"]

            for event in events:
                methods = []
                details = {}
                total_score = 0.0

                # 1. Date overlap
                date_score = score_date_overlap(
                    doc["estimated_date"], doc["date_range_start"],
                    doc["date_range_end"], event
                )
                if date_score > 0:
                    total_score += date_score
                    methods.append("date")
                    details["date_score"] = round(date_score, 3)

                # 2. Collection affinity (cheap — no DB query)
                affinity = score_collection_affinity(doc["source_section"] or "", event)
                if affinity > 0:
                    total_score += affinity
                    methods.append("collection")

                # Only do expensive keyword/entity lookups if we already have
                # some signal or if the collection has natural affinity
                if total_score > 0.05 or affinity > 0:
                    # 3. Keyword match
                    kw_score, kw_matched = score_keyword_match(doc_id, event, conn)
                    if kw_score > 0:
                        total_score += kw_score
                        methods.append("keyword")
                        details["matched_keywords"] = kw_matched

                    # 4. Entity match
                    ent_score, ent_matched = score_entity_match(doc_id, event, conn)
                    if ent_score > 0:
                        total_score += ent_score
                        methods.append("entity")
                        details["matched_entities"] = ent_matched

                if total_score >= MIN_SCORE:
                    batch.append((
                        doc_id, event["id"], round(total_score, 4),
                        json.dumps(methods), json.dumps(details),
                    ))
                    correlations_found += 1

            processed += 1
            if processed % 500 == 0:
                if batch:
                    _flush_batch(conn, batch)
                    batch = []
                log.info(
                    f"Progress: {processed}/{total} ({processed*100//total}%) "
                    f"— {correlations_found} correlations found"
                )

        if batch:
            _flush_batch(conn, batch)
    finally:
        conn.close()

    log.info(f"Done. {processed} docs processed, {correlations_found} correlations found.")

    # Print summary (fetch_all opens its own connection via the db helper)
    stats = fetch_all("""
        SELECT he.event_name, COUNT(*) as doc_count,
               ROUND(AVG(de.relevance_score)::numeric, 3) as avg_score
        FROM document_events de
        JOIN historical_events he ON he.id = de.event_id
        GROUP BY he.event_name
        ORDER BY doc_count DESC
    """)
    log.info("Crisis correlation summary:")
    for row in stats:
        log.info(f"  {row['event_name']}: {row['doc_count']} docs (avg score: {row['avg_score']})")
300
+
301
+
302
def _flush_batch(conn, batch):
    """Upsert a batch of correlation tuples into document_events and commit.

    Each tuple is (document_id, event_id, relevance_score, match_methods,
    details); re-correlating an existing (document_id, event_id) pair
    overwrites the stored score/methods/details and refreshes created_at.
    """
    upsert_sql = """INSERT INTO document_events
           (document_id, event_id, relevance_score, match_methods, details)
           VALUES (%s, %s, %s, %s, %s)
           ON CONFLICT (document_id, event_id) DO UPDATE SET
               relevance_score = EXCLUDED.relevance_score,
               match_methods = EXCLUDED.match_methods,
               details = EXCLUDED.details,
               created_at = NOW()
        """
    with conn.cursor() as cur:
        # execute_batch groups many rows per round trip (500 at a time).
        psycopg2.extras.execute_batch(cur, upsert_sql, batch, page_size=500)
    conn.commit()
319
+
320
+
321
# Entry point: run the full correlation pass when executed as a script.
if __name__ == "__main__":
    process_correlations()