datamatters24 committed on
Commit
5be2108
·
verified ·
1 Parent(s): da2c1b4

Upload ml/12_sentiment_analysis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/12_sentiment_analysis.py +155 -0
ml/12_sentiment_analysis.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 9: Sentiment Analysis
4
+
5
+ Computes polarity and subjectivity per document using TextBlob.
6
+ - Polarity: -1.0 (negative) to 1.0 (positive)
7
+ - Subjectivity: 0.0 (objective/factual) to 1.0 (subjective/opinion)
8
+
9
+ Analyzes first 5 pages of OCR text per document.
10
+ Stores results in document_features (sentiment).
11
+
12
+ Runs on: Hetzner CPU
13
+ """
14
+
15
+ import json
16
+ import logging
17
+
18
+ import psycopg2.extras
19
+ from textblob import TextBlob
20
+
21
+ from db import get_conn
22
+
23
# Root logger: timestamped, level-aligned lines suitable for batch-job logs.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s")
log = logging.getLogger(__name__)

BATCH_SIZE = 500       # documents fetched and inserted per DB round trip
MAX_TEXT_LEN = 5000    # characters per document analyzed (truncated for speed)
28
+
29
+
30
def get_pending_docs(conn, limit):
    """Fetch up to ``limit`` documents still lacking a 'sentiment' feature.

    Args:
        conn: open psycopg2 connection.
        limit: maximum number of rows to return.

    Returns:
        List of ``(document_id, text)`` tuples, where ``text`` is the
        OCR text of the document's first 5 pages concatenated in page order.

    Note:
        Uses NOT EXISTS rather than ``id NOT IN (subquery)``: if the
        subquery ever yields a NULL document_id, NOT IN evaluates to
        unknown for every row and silently returns nothing, stalling the
        pipeline. NOT EXISTS is NULL-safe and usually plans as an anti-join.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT d.id, string_agg(p.ocr_text, ' ' ORDER BY p.page_number) as text
            FROM documents d
            JOIN pages p ON p.document_id = d.id AND p.page_number <= 5
            WHERE NOT EXISTS (
                SELECT 1 FROM document_features df
                WHERE df.document_id = d.id AND df.feature_name = 'sentiment'
            )
            GROUP BY d.id
            ORDER BY d.id
            LIMIT %s
        """, (limit,))
        return cur.fetchall()
44
+
45
+
46
def analyze_sentiment(text):
    """Compute TextBlob sentiment for ``text``.

    Args:
        text: raw document text (may be None or empty).

    Returns:
        Dict with 'polarity' (-1.0 negative .. 1.0 positive) and
        'subjectivity' (0.0 objective .. 1.0 subjective), each rounded to
        4 decimal places — or None when the stripped text is under 50
        characters (too little signal to score).
    """
    if not text or len(text.strip()) < 50:
        return None

    # Analyze only the leading slice to keep per-document cost bounded.
    sentiment = TextBlob(text[:MAX_TEXT_LEN]).sentiment
    return {
        'polarity': round(sentiment.polarity, 4),
        'subjectivity': round(sentiment.subjectivity, 4),
    }
59
+
60
+
61
def _count_pending(conn):
    """Count documents (with OCR'd pages 1-5) that lack a sentiment feature.

    NOT EXISTS instead of NOT IN: NULL-safe (a single NULL document_id in
    the subquery would otherwise zero out the count) and anti-join friendly.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT COUNT(DISTINCT d.id) FROM documents d
            JOIN pages p ON p.document_id = d.id AND p.page_number <= 5
            WHERE NOT EXISTS (
                SELECT 1 FROM document_features df
                WHERE df.document_id = d.id AND df.feature_name = 'sentiment'
            )
        """)
        return cur.fetchone()[0]


def _analyze_batches(conn, total):
    """Score pending documents batch by batch; returns (processed, skipped).

    Every fetched document gets a feature row — documents with too little
    text get a neutral placeholder (note='insufficient_text') so they are
    not re-fetched forever by get_pending_docs.
    """
    processed = 0
    skipped = 0

    while True:
        docs = get_pending_docs(conn, BATCH_SIZE)
        if not docs:
            break

        rows = []
        for doc_id, text in docs:
            result = analyze_sentiment(text)
            if result:
                rows.append((
                    doc_id, 'sentiment',
                    result['polarity'],
                    json.dumps(result),
                ))
            else:
                # Mark as processed with a neutral score so progress is monotonic.
                rows.append((
                    doc_id, 'sentiment', 0.0,
                    json.dumps({'polarity': 0.0, 'subjectivity': 0.0, 'note': 'insufficient_text'}),
                ))
                skipped += 1

        with conn.cursor() as cur:
            psycopg2.extras.execute_batch(
                cur,
                """INSERT INTO document_features (document_id, feature_name, feature_value, feature_json)
                   VALUES (%s, %s, %s, %s::jsonb)
                   ON CONFLICT (document_id, feature_name) DO NOTHING""",
                rows,
                page_size=500,
            )
        conn.commit()

        processed += len(docs)
        # BATCH_SIZE divides 5000, so this fires every 10 batches.
        if processed % 5000 == 0:
            log.info("  %d/%d analyzed (%d skipped)", processed, total, skipped)

    return processed, skipped


def _store_summary(conn):
    """Aggregate per-section sentiment averages and upsert into analytics_cache.

    Placeholder rows (feature_json contains a 'note' key) are excluded so
    neutral 0.0 scores for skipped documents do not bias the averages.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT d.source_section,
                   AVG((df.feature_json->>'polarity')::float) as avg_polarity,
                   AVG((df.feature_json->>'subjectivity')::float) as avg_subjectivity,
                   COUNT(*) as doc_count
            FROM document_features df
            JOIN documents d ON d.id = df.document_id
            WHERE df.feature_name = 'sentiment'
              AND df.feature_json->>'note' IS NULL
            GROUP BY d.source_section
            ORDER BY avg_polarity
        """)
        stats = cur.fetchall()

    sentiment_summary = {}
    for section, avg_pol, avg_sub, count in stats:
        sentiment_summary[section] = {
            'avg_polarity': round(avg_pol, 4),
            'avg_subjectivity': round(avg_sub, 4),
            'doc_count': count,
        }
        log.info("  %s: polarity=%.4f, subjectivity=%.4f (%d docs)",
                 section, avg_pol, avg_sub, count)

    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO analytics_cache (key, value)
            VALUES ('sentiment_summary', %s::jsonb)
            ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = NOW()
        """, (json.dumps(sentiment_summary),))
    conn.commit()


def main():
    """Run the sentiment-analysis phase: score pending docs, cache aggregates.

    The connection is closed in a ``finally`` block so it is released even
    when a batch or aggregation step raises (the original leaked it on error).
    """
    conn = get_conn()
    try:
        total = _count_pending(conn)
        log.info("Analyzing sentiment for %d documents", total)

        processed, skipped = _analyze_batches(conn, total)
        _store_summary(conn)

        log.info("Done. %d documents analyzed (%d skipped).", processed, skipped)
    finally:
        conn.close()
152
+
153
+
154
# Script entry point: run the full sentiment-analysis phase.
if __name__ == "__main__":
    main()