datamatters24 committed on
Commit
8f695bc
·
verified ·
1 Parent(s): 00a070c

Upload notebooks/03_topic_classification/33_sentiment_analysis.ipynb with huggingface_hub

Browse files
notebooks/03_topic_classification/33_sentiment_analysis.ipynb ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 33 - Sentiment Analysis\n",
8
+ "\n",
9
+ "Pipeline notebook for page-level sentiment analysis using TextBlob.\n",
10
+ "\n",
11
+ "Computes polarity and subjectivity per page, aggregates per document (mean, min, max),\n",
12
+ "and stores page-level polarity in `page_features` and document-level polarity and subjectivity aggregates in `document_features`."
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {
19
+ "tags": [
20
+ "parameters"
21
+ ]
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# Parameters\n",
26
+ "source_section = None\n",
27
+ "batch_size = 1000"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "import sys\n",
37
+ "sys.path.insert(0, '/opt/epstein_env/research')\n",
38
+ "\n",
39
+ "import json\n",
40
+ "import numpy as np\n",
41
+ "import pandas as pd\n",
42
+ "from textblob import TextBlob\n",
43
+ "from collections import defaultdict\n",
44
+ "from tqdm.auto import tqdm\n",
45
+ "\n",
46
+ "from research_lib.db import fetch_df, fetch_all, upsert_feature\n",
47
+ "from research_lib.incremental import (\n",
48
+ " start_run, finish_run, get_unprocessed_documents, get_processed_doc_ids,\n",
49
+ ")"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# Start run\n",
59
+ "run_id = start_run(\n",
60
+ " 'sentiment_analysis',\n",
61
+ " source_section=source_section,\n",
62
+ " parameters={'batch_size': batch_size},\n",
63
+ ")\n",
64
+ "print(f'Started run {run_id}')"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "# Look up already-processed document IDs so they can be excluded below\n",
74
+ "processed_ids = get_processed_doc_ids(\n",
75
+ " 'sentiment_analysis',\n",
76
+ " feature_table='document_features',\n",
77
+ " feature_name='sentiment_polarity',\n",
78
+ ")\n",
79
+ "print(f'Already processed: {len(processed_ids)} documents')\n",
80
+ "\n",
81
+ "# Build query for unprocessed pages\n",
82
+ "where_clauses = [\"p.ocr_text IS NOT NULL\", \"p.ocr_text != ''\"]\n",
83
+ "params = []\n",
84
+ "\n",
85
+ "if source_section:\n",
86
+ " where_clauses.append('d.source_section = %s')\n",
87
+ " params.append(source_section)\n",
88
+ "\n",
89
+ "if processed_ids:\n",
90
+ " where_clauses.append(f'p.document_id NOT IN ({\",\".join(str(i) for i in processed_ids)})')\n",
91
+ "\n",
92
+ "where_sql = 'WHERE ' + ' AND '.join(where_clauses)\n",
93
+ "\n",
94
+ "# Count total pages\n",
95
+ "count_sql = f\"\"\"\n",
96
+ " SELECT COUNT(*) FROM pages p\n",
97
+ " JOIN documents d ON d.id = p.document_id\n",
98
+ " {where_sql}\n",
99
+ "\"\"\"\n",
100
+ "total_pages = fetch_all(count_sql, params or None)[0]['count']\n",
101
+ "print(f'Pages to process: {total_pages}')"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "# Process pages in batches\n",
111
+ "page_sentiments = [] # (page_id, document_id, polarity, subjectivity)\n",
112
+ "offset = 0\n",
113
+ "\n",
114
+ "pbar = tqdm(total=total_pages, desc='Analyzing sentiment')\n",
115
+ "while True:\n",
116
+ " sql = f\"\"\"\n",
117
+ " SELECT p.id as page_id, p.document_id, p.ocr_text\n",
118
+ " FROM pages p\n",
119
+ " JOIN documents d ON d.id = p.document_id\n",
120
+ " {where_sql}\n",
121
+ " ORDER BY p.document_id, p.page_number\n",
122
+ " LIMIT %s OFFSET %s\n",
123
+ " \"\"\"\n",
124
+ " batch_params = (params or []) + [batch_size, offset]\n",
125
+ " batch_df = fetch_df(sql, batch_params)\n",
126
+ "\n",
127
+ " if batch_df.empty:\n",
128
+ " break\n",
129
+ "\n",
130
+ " for _, row in batch_df.iterrows():\n",
131
+ " text = row['ocr_text']\n",
132
+ " if not text or len(text.strip()) < 10:\n",
133
+ " continue\n",
134
+ "\n",
135
+ " # Truncate very long texts for efficiency\n",
136
+ " blob = TextBlob(text[:50000])\n",
137
+ " polarity = blob.sentiment.polarity\n",
138
+ " subjectivity = blob.sentiment.subjectivity\n",
139
+ "\n",
140
+ " page_sentiments.append((\n",
141
+ " row['page_id'],\n",
142
+ " row['document_id'],\n",
143
+ " polarity,\n",
144
+ " subjectivity,\n",
145
+ " ))\n",
146
+ "\n",
147
+ " offset += batch_size\n",
148
+ " pbar.update(len(batch_df))\n",
149
+ "\n",
150
+ "pbar.close()\n",
151
+ "print(f'Analyzed {len(page_sentiments)} pages')"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "# Store page-level polarity in page_features (per-page subjectivity is not stored)\n",
161
+ "page_rows = [\n",
162
+ " (\n",
163
+ " page_id,\n",
164
+ " 'sentiment_polarity',\n",
165
+ " str(round(polarity, 6)),\n",
166
+ " None,\n",
167
+ " )\n",
168
+ " for page_id, doc_id, polarity, subjectivity in page_sentiments\n",
169
+ "]\n",
170
+ "\n",
171
+ "if page_rows:\n",
172
+ " print(f'Upserting {len(page_rows)} page-level polarity features...')\n",
173
+ " upserted = upsert_feature(\n",
174
+ " 'page_features',\n",
175
+ " unique_cols=['page_id', 'feature_name'],\n",
176
+ " data_cols=['feature_value', 'feature_json'],\n",
177
+ " rows=page_rows,\n",
178
+ " )\n",
179
+ " print(f'Upserted {upserted} page_features rows')"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "# Aggregate per document: mean, min, max of both polarity and subjectivity\n",
189
+ "doc_sentiments = defaultdict(lambda: {'polarities': [], 'subjectivities': []})\n",
190
+ "\n",
191
+ "for page_id, doc_id, polarity, subjectivity in page_sentiments:\n",
192
+ " doc_sentiments[doc_id]['polarities'].append(polarity)\n",
193
+ " doc_sentiments[doc_id]['subjectivities'].append(subjectivity)\n",
194
+ "\n",
195
+ "# Build document-level feature rows\n",
196
+ "doc_polarity_rows = []\n",
197
+ "doc_subjectivity_rows = []\n",
198
+ "\n",
199
+ "for doc_id, data in doc_sentiments.items():\n",
200
+ " polarities = data['polarities']\n",
201
+ " subjectivities = data['subjectivities']\n",
202
+ "\n",
203
+ " mean_pol = float(np.mean(polarities))\n",
204
+ " min_pol = float(np.min(polarities))\n",
205
+ " max_pol = float(np.max(polarities))\n",
206
+ " mean_subj = float(np.mean(subjectivities))\n",
207
+ "\n",
208
+ " doc_polarity_rows.append((\n",
209
+ " doc_id,\n",
210
+ " 'sentiment_polarity',\n",
211
+ " str(round(mean_pol, 6)),\n",
212
+ " json.dumps({\n",
213
+ " 'mean': round(mean_pol, 6),\n",
214
+ " 'min': round(min_pol, 6),\n",
215
+ " 'max': round(max_pol, 6),\n",
216
+ " 'n_pages': len(polarities),\n",
217
+ " }),\n",
218
+ " ))\n",
219
+ "\n",
220
+ " doc_subjectivity_rows.append((\n",
221
+ " doc_id,\n",
222
+ " 'sentiment_subjectivity',\n",
223
+ " str(round(mean_subj, 6)),\n",
224
+ " json.dumps({\n",
225
+ " 'mean': round(mean_subj, 6),\n",
226
+ " 'min': round(float(np.min(subjectivities)), 6),\n",
227
+ " 'max': round(float(np.max(subjectivities)), 6),\n",
228
+ " 'n_pages': len(subjectivities),\n",
229
+ " }),\n",
230
+ " ))\n",
231
+ "\n",
232
+ "print(f'Document-level features prepared for {len(doc_sentiments)} documents')"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "# Store document-level sentiment features\n",
242
+ "if doc_polarity_rows:\n",
243
+ " print('Upserting document polarity features...')\n",
244
+ " upserted = upsert_feature(\n",
245
+ " 'document_features',\n",
246
+ " unique_cols=['document_id', 'feature_name'],\n",
247
+ " data_cols=['feature_value', 'feature_json'],\n",
248
+ " rows=doc_polarity_rows,\n",
249
+ " )\n",
250
+ " print(f' Polarity: {upserted} rows')\n",
251
+ "\n",
252
+ "if doc_subjectivity_rows:\n",
253
+ " print('Upserting document subjectivity features...')\n",
254
+ " upserted = upsert_feature(\n",
255
+ " 'document_features',\n",
256
+ " unique_cols=['document_id', 'feature_name'],\n",
257
+ " data_cols=['feature_value', 'feature_json'],\n",
258
+ " rows=doc_subjectivity_rows,\n",
259
+ " )\n",
260
+ " print(f' Subjectivity: {upserted} rows')"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "# Finish run\n",
270
+ "finish_run(run_id, documents_processed=len(doc_sentiments))\n",
271
+ "print(f'Run {run_id} completed.')"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "# Distribution summary\n",
281
+ "print('=== Sentiment Analysis Summary ===')\n",
282
+ "print(f'Source section: {source_section or \"all\"}')\n",
283
+ "print(f'Pages analyzed: {len(page_sentiments)}')\n",
284
+ "print(f'Documents analyzed: {len(doc_sentiments)}')\n",
285
+ "\n",
286
+ "if page_sentiments:\n",
287
+ " all_pol = [s[2] for s in page_sentiments]\n",
288
+ " all_subj = [s[3] for s in page_sentiments]\n",
289
+ "\n",
290
+ " print(f'\\nPage-level polarity:')\n",
291
+ " print(f' Mean: {np.mean(all_pol):.4f}')\n",
292
+ " print(f' Median: {np.median(all_pol):.4f}')\n",
293
+ " print(f' Std: {np.std(all_pol):.4f}')\n",
294
+ " print(f' Range: [{np.min(all_pol):.4f}, {np.max(all_pol):.4f}]')\n",
295
+ "\n",
296
+ " print(f'\\nPage-level subjectivity:')\n",
297
+ " print(f' Mean: {np.mean(all_subj):.4f}')\n",
298
+ " print(f' Median: {np.median(all_subj):.4f}')\n",
299
+ " print(f' Std: {np.std(all_subj):.4f}')\n",
300
+ " print(f' Range: [{np.min(all_subj):.4f}, {np.max(all_subj):.4f}]')\n",
301
+ "\n",
302
+ " # Polarity distribution buckets\n",
303
+ " negative = sum(1 for p in all_pol if p < -0.1)\n",
304
+ " neutral = sum(1 for p in all_pol if -0.1 <= p <= 0.1)\n",
305
+ " positive = sum(1 for p in all_pol if p > 0.1)\n",
306
+ " print(f'\\nPolarity distribution:')\n",
307
+ " print(f' Negative (< -0.1): {negative} ({100*negative/len(all_pol):.1f}%)')\n",
308
+ " print(f' Neutral (-0.1..0.1): {neutral} ({100*neutral/len(all_pol):.1f}%)')\n",
309
+ " print(f' Positive (> 0.1): {positive} ({100*positive/len(all_pol):.1f}%)')"
310
+ ]
311
+ }
312
+ ],
313
+ "metadata": {
314
+ "kernelspec": {
315
+ "display_name": "Python 3",
316
+ "language": "python",
317
+ "name": "python3"
318
+ },
319
+ "language_info": {
320
+ "name": "python",
321
+ "version": "3.10.0"
322
+ }
323
+ },
324
+ "nbformat": 4,
325
+ "nbformat_minor": 5
326
+ }