datamatters24 committed on
Commit
5be2108
·
verified ·
1 Parent(s): da2c1b4

Upload ml/12_sentiment_analysis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/12_sentiment_analysis.py +155 -0
ml/12_sentiment_analysis.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 9: Sentiment Analysis
4
+
5
+ Computes polarity and subjectivity per document using TextBlob.
6
+ - Polarity: -1.0 (negative) to 1.0 (positive)
7
+ - Subjectivity: 0.0 (objective/factual) to 1.0 (subjective/opinion)
8
+
9
+ Analyzes first 5 pages of OCR text per document.
10
+ Stores results in document_features (sentiment).
11
+
12
+ Runs on: Hetzner CPU
13
+ """
14
+
15
+ import json
16
+ import logging
17
+
18
+ import psycopg2.extras
19
+ from textblob import TextBlob
20
+
21
+ from db import get_conn
22
+
23
# Root logger: timestamped, level-aligned lines suitable for batch-job logs.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s")
log = logging.getLogger(__name__)

BATCH_SIZE = 500       # documents fetched and inserted per DB round trip
MAX_TEXT_LEN = 5000    # characters per document analyzed (truncated for speed)
28
+
29
+
30
def get_pending_docs(conn, limit):
    """Fetch up to ``limit`` documents still lacking a 'sentiment' feature.

    Args:
        conn: open psycopg2 connection.
        limit: maximum number of rows to return.

    Returns:
        List of ``(document_id, text)`` tuples, where ``text`` is the
        OCR text of the document's first 5 pages concatenated in page order.

    Note:
        Uses NOT EXISTS rather than ``id NOT IN (subquery)``: if the
        subquery ever yields a NULL document_id, NOT IN evaluates to
        unknown for every row and silently returns nothing, stalling the
        pipeline. NOT EXISTS is NULL-safe and usually plans as an anti-join.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT d.id, string_agg(p.ocr_text, ' ' ORDER BY p.page_number) as text
            FROM documents d
            JOIN pages p ON p.document_id = d.id AND p.page_number <= 5
            WHERE NOT EXISTS (
                SELECT 1 FROM document_features df
                WHERE df.document_id = d.id AND df.feature_name = 'sentiment'
            )
            GROUP BY d.id
            ORDER BY d.id
            LIMIT %s
        """, (limit,))
        return cur.fetchall()
44
+
45
+
46
def analyze_sentiment(text):
    """Compute TextBlob sentiment for ``text``.

    Args:
        text: raw document text (may be None or empty).

    Returns:
        Dict with 'polarity' (-1.0 negative .. 1.0 positive) and
        'subjectivity' (0.0 objective .. 1.0 subjective), each rounded to
        4 decimal places — or None when the stripped text is under 50
        characters (too little signal to score).
    """
    if not text or len(text.strip()) < 50:
        return None

    # Analyze only the leading slice to keep per-document cost bounded.
    sentiment = TextBlob(text[:MAX_TEXT_LEN]).sentiment
    return {
        'polarity': round(sentiment.polarity, 4),
        'subjectivity': round(sentiment.subjectivity, 4),
    }
59
+
60
+
61
def _count_pending(conn):
    """Count documents (with OCR'd pages 1-5) that lack a sentiment feature.

    NOT EXISTS instead of NOT IN: NULL-safe (a single NULL document_id in
    the subquery would otherwise zero out the count) and anti-join friendly.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT COUNT(DISTINCT d.id) FROM documents d
            JOIN pages p ON p.document_id = d.id AND p.page_number <= 5
            WHERE NOT EXISTS (
                SELECT 1 FROM document_features df
                WHERE df.document_id = d.id AND df.feature_name = 'sentiment'
            )
        """)
        return cur.fetchone()[0]


def _analyze_batches(conn, total):
    """Score pending documents batch by batch; returns (processed, skipped).

    Every fetched document gets a feature row — documents with too little
    text get a neutral placeholder (note='insufficient_text') so they are
    not re-fetched forever by get_pending_docs.
    """
    processed = 0
    skipped = 0

    while True:
        docs = get_pending_docs(conn, BATCH_SIZE)
        if not docs:
            break

        rows = []
        for doc_id, text in docs:
            result = analyze_sentiment(text)
            if result:
                rows.append((
                    doc_id, 'sentiment',
                    result['polarity'],
                    json.dumps(result),
                ))
            else:
                # Mark as processed with a neutral score so progress is monotonic.
                rows.append((
                    doc_id, 'sentiment', 0.0,
                    json.dumps({'polarity': 0.0, 'subjectivity': 0.0, 'note': 'insufficient_text'}),
                ))
                skipped += 1

        with conn.cursor() as cur:
            psycopg2.extras.execute_batch(
                cur,
                """INSERT INTO document_features (document_id, feature_name, feature_value, feature_json)
                   VALUES (%s, %s, %s, %s::jsonb)
                   ON CONFLICT (document_id, feature_name) DO NOTHING""",
                rows,
                page_size=500,
            )
        conn.commit()

        processed += len(docs)
        # BATCH_SIZE divides 5000, so this fires every 10 batches.
        if processed % 5000 == 0:
            log.info("  %d/%d analyzed (%d skipped)", processed, total, skipped)

    return processed, skipped


def _store_summary(conn):
    """Aggregate per-section sentiment averages and upsert into analytics_cache.

    Placeholder rows (feature_json contains a 'note' key) are excluded so
    neutral 0.0 scores for skipped documents do not bias the averages.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT d.source_section,
                   AVG((df.feature_json->>'polarity')::float) as avg_polarity,
                   AVG((df.feature_json->>'subjectivity')::float) as avg_subjectivity,
                   COUNT(*) as doc_count
            FROM document_features df
            JOIN documents d ON d.id = df.document_id
            WHERE df.feature_name = 'sentiment'
              AND df.feature_json->>'note' IS NULL
            GROUP BY d.source_section
            ORDER BY avg_polarity
        """)
        stats = cur.fetchall()

    sentiment_summary = {}
    for section, avg_pol, avg_sub, count in stats:
        sentiment_summary[section] = {
            'avg_polarity': round(avg_pol, 4),
            'avg_subjectivity': round(avg_sub, 4),
            'doc_count': count,
        }
        log.info("  %s: polarity=%.4f, subjectivity=%.4f (%d docs)",
                 section, avg_pol, avg_sub, count)

    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO analytics_cache (key, value)
            VALUES ('sentiment_summary', %s::jsonb)
            ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = NOW()
        """, (json.dumps(sentiment_summary),))
    conn.commit()


def main():
    """Run the sentiment-analysis phase: score pending docs, cache aggregates.

    The connection is closed in a ``finally`` block so it is released even
    when a batch or aggregation step raises (the original leaked it on error).
    """
    conn = get_conn()
    try:
        total = _count_pending(conn)
        log.info("Analyzing sentiment for %d documents", total)

        processed, skipped = _analyze_batches(conn, total)
        _store_summary(conn)

        log.info("Done. %d documents analyzed (%d skipped).", processed, skipped)
    finally:
        conn.close()
152
+
153
+
154
# Script entry point: run the full sentiment-analysis phase.
if __name__ == "__main__":
    main()