Upload notebooks/03_topic_classification/33_sentiment_analysis.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

notebooks/03_topic_classification/33_sentiment_analysis.ipynb +326 -0

notebooks/03_topic_classification/33_sentiment_analysis.ipynb ADDED Viewed

	@@ -0,0 +1,326 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 33 - Sentiment Analysis\n",
+    "\n",
+    "Pipeline notebook for page-level sentiment analysis using TextBlob.\n",
+    "\n",
+    "Computes polarity and subjectivity per page, stores page-level polarity in `page_features`,\n",
+    "and stores per-document aggregates (mean, min, max) of both scores in `document_features`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "source_section = None\n",
+    "batch_size = 1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/opt/epstein_env/research')\n",
+    "\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from textblob import TextBlob\n",
+    "from collections import defaultdict\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "from research_lib.db import fetch_df, fetch_all, upsert_feature\n",
+    "from research_lib.incremental import (\n",
+    "    start_run, finish_run, get_unprocessed_documents, get_processed_doc_ids,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Register this pipeline run; run_id is handed back to finish_run in the final step\n",
+    "run_parameters = {'batch_size': batch_size}\n",
+    "run_id = start_run('sentiment_analysis', source_section=source_section, parameters=run_parameters)\n",
+    "print(f'Started run {run_id}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Look up the documents that already have a document-level polarity feature\n",
+    "processed_ids = get_processed_doc_ids(\n",
+    "    'sentiment_analysis',\n",
+    "    feature_table='document_features',\n",
+    "    feature_name='sentiment_polarity',\n",
+    ")\n",
+    "print(f'Already processed: {len(processed_ids)} documents')\n",
+    "\n",
+    "# Assemble the WHERE clause used to select pages that still need scoring\n",
+    "where_clauses = [\"p.ocr_text IS NOT NULL\", \"p.ocr_text != ''\"]\n",
+    "params = []\n",
+    "\n",
+    "# Optional restriction to one source section (bound as a query parameter)\n",
+    "if source_section:\n",
+    "    where_clauses.append('d.source_section = %s')\n",
+    "    params.append(source_section)\n",
+    "\n",
+    "# Exclude already-processed documents; their ids are inlined into the SQL text\n",
+    "if processed_ids:\n",
+    "    excluded_ids = \",\".join(str(i) for i in processed_ids)\n",
+    "    where_clauses.append(f'p.document_id NOT IN ({excluded_ids})')\n",
+    "\n",
+    "where_sql = 'WHERE ' + ' AND '.join(where_clauses)\n",
+    "\n",
+    "# Count the matching pages\n",
+    "count_sql = f\"\"\"\n",
+    "    SELECT COUNT(*) FROM pages p\n",
+    "    JOIN documents d ON d.id = p.document_id\n",
+    "    {where_sql}\n",
+    "\"\"\"\n",
+    "count_rows = fetch_all(count_sql, params or None)\n",
+    "total_pages = count_rows[0]['count']\n",
+    "print(f'Pages to process: {total_pages}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Process pages in batches\n",
+    "# Each entry: (page_id, document_id, polarity, subjectivity)\n",
+    "page_sentiments = []\n",
+    "offset = 0\n",
+    "\n",
+    "MIN_TEXT_CHARS = 10      # pages with less stripped text than this are skipped\n",
+    "MAX_TEXT_CHARS = 50000   # longer texts are truncated before scoring\n",
+    "\n",
+    "# The statement text is the same for every batch; only the LIMIT/OFFSET values change.\n",
+    "# p.id is appended as the final sort key so rows that tie on\n",
+    "# (document_id, page_number) keep a fixed position between batches; without a\n",
+    "# total order, LIMIT/OFFSET paging may skip or repeat pages.\n",
+    "batch_sql = f\"\"\"\n",
+    "    SELECT p.id as page_id, p.document_id, p.ocr_text\n",
+    "    FROM pages p\n",
+    "    JOIN documents d ON d.id = p.document_id\n",
+    "    {where_sql}\n",
+    "    ORDER BY p.document_id, p.page_number, p.id\n",
+    "    LIMIT %s OFFSET %s\n",
+    "\"\"\"\n",
+    "\n",
+    "# The context manager closes the progress bar even if a batch raises\n",
+    "with tqdm(total=total_pages, desc='Analyzing sentiment') as pbar:\n",
+    "    while True:\n",
+    "        batch_params = (params or []) + [batch_size, offset]\n",
+    "        batch_df = fetch_df(batch_sql, batch_params)\n",
+    "\n",
+    "        # An empty batch means every matching page has been read\n",
+    "        if batch_df.empty:\n",
+    "            break\n",
+    "\n",
+    "        for row in batch_df.itertuples(index=False):\n",
+    "            text = row.ocr_text\n",
+    "            if not text or len(text.strip()) < MIN_TEXT_CHARS:\n",
+    "                continue\n",
+    "\n",
+    "            # Truncate very long texts for efficiency; score once, read both fields\n",
+    "            sentiment = TextBlob(text[:MAX_TEXT_CHARS]).sentiment\n",
+    "            page_sentiments.append((\n",
+    "                row.page_id,\n",
+    "                row.document_id,\n",
+    "                sentiment.polarity,\n",
+    "                sentiment.subjectivity,\n",
+    "            ))\n",
+    "\n",
+    "        offset += batch_size\n",
+    "        # Progress counts fetched pages, including those skipped as too short\n",
+    "        pbar.update(len(batch_df))\n",
+    "\n",
+    "print(f'Analyzed {len(page_sentiments)} pages')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store page-level sentiment in page_features (polarity only; feature_json is left NULL)\n",
+    "# Tuple order presumably mirrors unique_cols followed by data_cols below - confirm against upsert_feature\n",
+    "page_rows = []\n",
+    "for page_id, _doc_id, polarity, _subjectivity in page_sentiments:\n",
+    "    feature_value = str(round(polarity, 6))\n",
+    "    page_rows.append((page_id, 'sentiment_polarity', feature_value, None))\n",
+    "\n",
+    "if len(page_rows) > 0:\n",
+    "    print(f'Upserting {len(page_rows)} page-level polarity features...')\n",
+    "    upserted = upsert_feature(\n",
+    "        'page_features',\n",
+    "        unique_cols=['page_id', 'feature_name'],\n",
+    "        data_cols=['feature_value', 'feature_json'],\n",
+    "        rows=page_rows,\n",
+    "    )\n",
+    "    print(f'Upserted {upserted} page_features rows')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate per document: mean, min and max of both polarity and subjectivity\n",
+    "doc_sentiments = defaultdict(lambda: {'polarities': [], 'subjectivities': []})\n",
+    "\n",
+    "for _page_id, doc_id, polarity, subjectivity in page_sentiments:\n",
+    "    scores = doc_sentiments[doc_id]\n",
+    "    scores['polarities'].append(polarity)\n",
+    "    scores['subjectivities'].append(subjectivity)\n",
+    "\n",
+    "\n",
+    "def _summarize_scores(values):\n",
+    "    \"\"\"Return (feature_value, feature_json) for one document's page-level scores.\n",
+    "\n",
+    "    feature_value is the mean rounded to 6 decimals, as a string; feature_json\n",
+    "    is a JSON object with the rounded mean, min, max and the page count.\n",
+    "    \"\"\"\n",
+    "    mean_value = float(np.mean(values))\n",
+    "    stats = {\n",
+    "        'mean': round(mean_value, 6),\n",
+    "        'min': round(float(np.min(values)), 6),\n",
+    "        'max': round(float(np.max(values)), 6),\n",
+    "        'n_pages': len(values),\n",
+    "    }\n",
+    "    return str(round(mean_value, 6)), json.dumps(stats)\n",
+    "\n",
+    "\n",
+    "# Build document-level feature rows: (document_id, feature_name, feature_value, feature_json)\n",
+    "doc_polarity_rows = []\n",
+    "doc_subjectivity_rows = []\n",
+    "\n",
+    "for doc_id, scores in doc_sentiments.items():\n",
+    "    doc_polarity_rows.append(\n",
+    "        (doc_id, 'sentiment_polarity', *_summarize_scores(scores['polarities']))\n",
+    "    )\n",
+    "    doc_subjectivity_rows.append(\n",
+    "        (doc_id, 'sentiment_subjectivity', *_summarize_scores(scores['subjectivities']))\n",
+    "    )\n",
+    "\n",
+    "print(f'Document-level features prepared for {len(doc_sentiments)} documents')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store document-level sentiment features; both feature sets go through the same upsert call\n",
+    "feature_batches = (\n",
+    "    ('polarity', doc_polarity_rows),\n",
+    "    ('subjectivity', doc_subjectivity_rows),\n",
+    ")\n",
+    "\n",
+    "for label, rows in feature_batches:\n",
+    "    # Nothing to write for an empty batch\n",
+    "    if not rows:\n",
+    "        continue\n",
+    "    print(f'Upserting document {label} features...')\n",
+    "    upserted = upsert_feature(\n",
+    "        'document_features',\n",
+    "        unique_cols=['document_id', 'feature_name'],\n",
+    "        data_cols=['feature_value', 'feature_json'],\n",
+    "        rows=rows,\n",
+    "    )\n",
+    "    print(f'  {label.capitalize()}: {upserted} rows')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Close out the run, reporting how many documents had features prepared\n",
+    "documents_processed = len(doc_sentiments)\n",
+    "finish_run(run_id, documents_processed=documents_processed)\n",
+    "print(f'Run {run_id} completed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Distribution summary\n",
+    "def _print_stats(title, values):\n",
+    "    \"\"\"Print mean, median, standard deviation and range of values under a title line.\"\"\"\n",
+    "    print(f'\\n{title}:')\n",
+    "    print(f'  Mean:   {np.mean(values):.4f}')\n",
+    "    print(f'  Median: {np.median(values):.4f}')\n",
+    "    print(f'  Std:    {np.std(values):.4f}')\n",
+    "    print(f'  Range:  [{np.min(values):.4f}, {np.max(values):.4f}]')\n",
+    "\n",
+    "\n",
+    "print('=== Sentiment Analysis Summary ===')\n",
+    "print(f'Source section: {source_section or \"all\"}')\n",
+    "print(f'Pages analyzed: {len(page_sentiments)}')\n",
+    "print(f'Documents analyzed: {len(doc_sentiments)}')\n",
+    "\n",
+    "if page_sentiments:\n",
+    "    # Tuple positions 2 and 3 hold polarity and subjectivity\n",
+    "    all_pol = [entry[2] for entry in page_sentiments]\n",
+    "    all_subj = [entry[3] for entry in page_sentiments]\n",
+    "\n",
+    "    _print_stats('Page-level polarity', all_pol)\n",
+    "    _print_stats('Page-level subjectivity', all_subj)\n",
+    "\n",
+    "    # Polarity distribution buckets\n",
+    "    buckets = [\n",
+    "        ('Negative (< -0.1)', sum(1 for p in all_pol if p < -0.1)),\n",
+    "        ('Neutral (-0.1..0.1)', sum(1 for p in all_pol if -0.1 <= p <= 0.1)),\n",
+    "        ('Positive (> 0.1)', sum(1 for p in all_pol if p > 0.1)),\n",
+    "    ]\n",
+    "    print(f'\\nPolarity distribution:')\n",
+    "    for label, count in buckets:\n",
+    "        print(f'  {label}: {count} ({100*count/len(all_pol):.1f}%)')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}