Danialebrat commited on
Commit
d61f3de
·
1 Parent(s): dac4ffa

Adding files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -35
  2. Dockerfile +0 -20
  3. README.md +0 -19
  4. processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md +437 -0
  5. processing_brand_sentiment/README.md +402 -0
  6. processing_brand_sentiment/config_files/analysis_categories.json +123 -0
  7. processing_brand_sentiment/config_files/brand_config.json +111 -0
  8. processing_brand_sentiment/config_files/workflow_config.json +60 -0
  9. processing_brand_sentiment/database/__init__.py +8 -0
  10. processing_brand_sentiment/database/snowflake_connection.py +240 -0
  11. processing_brand_sentiment/database/sql/create_comments_output_table.sql +161 -0
  12. processing_brand_sentiment/database/sql/create_output_table.sql +250 -0
  13. processing_brand_sentiment/database/sql/fetch_comments.sql +82 -0
  14. processing_brand_sentiment/database/sql/fetch_forum_posts.sql +106 -0
  15. processing_brand_sentiment/database/sql/init_comments_output_table.sql +78 -0
  16. processing_brand_sentiment/database/sql/init_output_table.sql +89 -0
  17. processing_brand_sentiment/main.py +1088 -0
  18. processing_brand_sentiment/utils/__init__.py +8 -0
  19. processing_brand_sentiment/utils/html_parser.py +253 -0
  20. processing_brand_sentiment/workflow/__init__.py +10 -0
  21. processing_brand_sentiment/workflow/agents/__init__.py +39 -0
  22. processing_brand_sentiment/workflow/agents/base_agent.py +169 -0
  23. processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py +211 -0
  24. processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py +570 -0
  25. processing_brand_sentiment/workflow/agents/output_validator_agent.py +408 -0
  26. processing_brand_sentiment/workflow/agents/preprocessor_agent.py +408 -0
  27. processing_brand_sentiment/workflow/agents/relevance_validator_agent.py +289 -0
  28. processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py +388 -0
  29. processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py +431 -0
  30. processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py +434 -0
  31. processing_brand_sentiment/workflow/comment_orchestrator.py +558 -0
  32. processing_brand_sentiment/workflow/orchestrator.py +551 -0
  33. requirements.txt +0 -0
  34. src/streamlit_app.py +0 -40
  35. visualization/README.md +347 -0
  36. visualization/SnowFlakeConnection.py +150 -0
  37. visualization/agents/README.md +320 -0
  38. visualization/agents/__init__.py +8 -0
  39. visualization/agents/base_agent.py +88 -0
  40. visualization/agents/content_summary_agent.py +366 -0
  41. visualization/app.py +180 -0
  42. visualization/components/dashboard.py +538 -0
  43. visualization/components/reply_required.py +324 -0
  44. visualization/components/sentiment_analysis.py +671 -0
  45. visualization/config/viz_config.json +87 -0
  46. visualization/data/data_loader.py +427 -0
  47. visualization/img/musora.png +0 -0
  48. visualization/requirements.txt +17 -0
  49. visualization/utils/data_processor.py +604 -0
  50. visualization/utils/llm_helper.py +149 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile DELETED
@@ -1,20 +0,0 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1,19 +0,0 @@
1
- ---
2
- title: Brand Sentiment Analysis
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Interactive UI for brand sentiment Analysis
12
- ---
13
-
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Brand Sentiment Analysis - Architecture Redesign Proposal
2
+
3
+ ## Executive Summary
4
+
5
+ This document proposes a redesigned multi-agent architecture to address accuracy issues identified during manual evaluation. The new design separates **fact extraction** from **analysis**, adds strict validation, and improves content preprocessing.
6
+
7
+ ---
8
+
9
+ ## Current Issues Analysis
10
+
11
+ | Issue | Root Cause | Impact |
12
+ |-------|------------|--------|
13
+ | **B8X/B8 variation** | Word-boundary matching misses aliases | Missing relevant posts |
14
+ | **Competitor products attributed to Sabian** | LLM lacks competitor awareness, no strict list enforcement | False positives, wrong product attribution |
15
+ | **Short text language detection** | Lingua fails on short brand-heavy text | Skipping valid English posts |
16
+ | **False positive relevance** | Single-pass relevance + no verification | Pizza oven marked as Sabian discussion |
17
+ | **Long posts with overlapping content** | Poor quote separation, raw thread context | Confusing LLM, extraction from wrong content |
18
+
19
+ ---
20
+
21
+ ## Proposed Architecture
22
+
23
+ ### Design Principles
24
+
25
+ 1. **Separation of Concerns**: Fact extraction vs. interpretation/analysis
26
+ 2. **Strict Validation**: Enforce predefined value lists at every step
27
+ 3. **Structured Data Flow**: Each agent receives clean, relevant input
28
+ 4. **Fail-Safe Defaults**: Conservative approach - when uncertain, mark as not relevant
29
+
30
+ ### New Workflow
31
+
32
+ ```
33
+ ┌─────────────────────────────────────────────────────────────────┐
34
+ │ 1. CONTENT PREPROCESSOR │
35
+ │ (No LLM) │
36
+ │ • Enhanced HTML parsing (better quote separation) │
37
+ │ • Text cleaning and normalization │
38
+ │ • Language detection (skip for short texts < 50 chars) │
39
+ │ • Keyword screening with aliases (B8 → B8X) │
40
+ │ • Extract: cleaned_content, quoted_content, raw_thread_context │
41
+ └─────────────────────────────┬───────────────────────────────────┘
42
+
43
+
44
+ ┌───────────────────────────────┐
45
+ │ Has any Sabian-related │
46
+ │ keywords (primary/contextual)?│
47
+ └───────────────┬───────────────┘
48
+ │ │
49
+ YES NO
50
+ │ │
51
+ ▼ ▼
52
+ ┌─────────────────────────────────┐ ┌──────────────────┐
53
+ │ 2. RELEVANCE & EXTRACTION │ │ Mark as │
54
+ │ AGENT (LLM #1) │ │ NOT RELEVANT │
55
+ │ │ │ (0 LLM calls) │
56
+ │ INPUT: │ └──────────────────┘
57
+ │ • cleaned_content │
58
+ │ • quoted_content │
59
+ │ • raw_thread_context │
60
+ │ • keywords_found │
61
+ │ │
62
+ │ OUTPUT: │
63
+ │ • IS_RELEVANT: boolean │
64
+ │ • RELEVANCE_CONFIDENCE: h/m/l │
65
+ │ • RELEVANCE_REASON: string │
66
+ │ • PRODUCTS_MENTIONED: [] │ ← STRICT: only from predefined list
67
+ │ • SABIAN_MENTION_CONTEXT │
68
+ │ • AUTHOR_ROLE │
69
+ │ • COMPETITORS_MENTIONED: [] │ ← Brand names only, no products
70
+ │ • THREAD_CONTEXT_SUMMARY │ ← 1-2 sentence summary
71
+ └─────────────────┬───────────────┘
72
+
73
+
74
+ ┌─────────────────┐
75
+ │ IS_RELEVANT? │
76
+ └────────┬────────┘
77
+ │ │
78
+ YES NO
79
+ │ │
80
+ ▼ ▼
81
+ ┌─────────────────────────────────┐ ┌──────────────────┐
82
+ │ 3. SENTIMENT & INTENT │ │ Store with │
83
+ │ ANALYZER (LLM #2) │ │ is_relevant=F │
84
+ │ │ │ (1 LLM call) │
85
+ │ INPUT (structured): │ └──────────────────┘
86
+ │ • cleaned_content │
87
+ │ • PRODUCTS_MENTIONED │ ← Pre-validated list
88
+ │ • SABIAN_MENTION_CONTEXT │
89
+ │ • AUTHOR_ROLE │
90
+ │ • COMPETITORS_MENTIONED │
91
+ │ • THREAD_CONTEXT_SUMMARY │ ← Clean, concise context
92
+ │ │
93
+ │ OUTPUT: │
94
+ │ • SENTIMENT_LEVEL │
95
+ │ • EMOTION_TYPE │
96
+ │ • SENTIMENT_CONFIDENCE │
97
+ │ • SARCASM_DETECTED │
98
+ │ • PRODUCT_ATTRIBUTES: [] │
99
+ │ • COMPETITOR_PRODUCTS_OWNED: []│
100
+ │ • COMPARISON_TYPE │
101
+ │ • INTENTS: [] │
102
+ │ • PURCHASE_STAGE │
103
+ │ • DECISION_DRIVERS: [] │
104
+ │ • PAIN_POINTS: [] │
105
+ │ • DELIGHT_FACTORS: [] │
106
+ │ • ANALYSIS_NOTES │
107
+ └─────────────────┬───────────────┘
108
+
109
+
110
+ ┌─────────────────────────────────┐
111
+ │ 4. OUTPUT VALIDATOR │
112
+ │ (No LLM - Rule-based) │
113
+ │ │
114
+ │ • Verify all values from lists │
115
+ │ • Check logical consistency │
116
+ │ • Flag anomalies for review │
117
+ │ • Set processing_status │
118
+ └─────────────────────────────────┘
119
+ ```
120
+
121
+ ---
122
+
123
+ ## API Call Summary
124
+
125
+ | Scenario | Current Calls | New Calls | Notes |
126
+ |----------|--------------|-----------|-------|
127
+ | No keywords found | 0 | 0 | Same |
128
+ | Primary keywords, relevant | 1 | 2 | +1 for better extraction |
129
+ | Primary keywords, not relevant | 1 | 1 | Extraction determines not relevant |
130
+ | Ambiguous keywords, relevant | 2 | 2 | Same |
131
+ | Ambiguous keywords, not relevant | 2 | 1 | Early exit after extraction |
132
+
133
+ **Net Impact**: Slight increase for some cases, but significantly better accuracy.
134
+
135
+ ---
136
+
137
+ ## Agent Specifications
138
+
139
+ ### Agent 1: Content Preprocessor (No LLM)
140
+
141
+ **File**: `workflow/agents/content_preprocessor_agent.py`
142
+
143
+ **Improvements over current**:
144
+ 1. Enhanced HTML parsing with better quote/reply separation
145
+ 2. Product alias mapping (B8 → B8X, etc.)
146
+ 3. Skip language detection for texts < 50 characters
147
+ 4. Always process if primary Sabian keywords found (regardless of language detection)
148
+
149
+ **Product Aliases** (add to brand_config.json):
150
+ ```json
151
+ "product_aliases": {
152
+ "B8": "B8X",
153
+ "sbrs": "SBR",
154
+ "hand hammered": "HH",
155
+ "hand-hammered": "HH"
156
+ }
157
+ ```
158
+
159
+ ---
160
+
161
+ ### Agent 2: Relevance & Extraction Agent (LLM #1)
162
+
163
+ **File**: `workflow/agents/relevance_extraction_agent.py`
164
+
165
+ **Purpose**: Determine relevance with HIGH confidence and extract verifiable facts.
166
+
167
+ **Key Design Decisions**:
168
+
169
+ 1. **Strict Product Matching**:
170
+ - Provide explicit product list in prompt
171
+ - Instruction: "ONLY return products that EXACTLY match items in this list"
172
+ - Return empty list if no exact matches (not hallucinated guesses)
173
+
174
+ 2. **Competitor Awareness**:
175
+ - List competitor BRAND names (not products)
176
+ - Instruction: "Products like '2002', 'Signature', 'K Custom' belong to competitors, NOT Sabian"
177
+ - Prevent cross-brand attribution
178
+
179
+ 3. **Thread Context Summarization**:
180
+ - Summarize in 1-2 sentences maximum
181
+ - Focus only on information relevant to understanding the post's context
182
+
183
+ 4. **Conservative Relevance**:
184
+ - When uncertain, mark as NOT relevant
185
+ - Require explicit Sabian product/brand mention IN THE POST CONTENT
186
+ - Quoted content mentioning Sabian does NOT make post relevant
187
+
188
+ **System Prompt Structure**:
189
+ ```
190
+ You are a brand mention extractor for Sabian cymbals. Your job is to:
191
+ 1. Determine if the POST CONTENT discusses Sabian products
192
+ 2. Extract ONLY facts, not interpretations
193
+
194
+ ## CRITICAL RULES
195
+
196
+ ### Rule 1: Relevance Based on POST CONTENT Only
197
+ - The post is relevant ONLY if the POST CONTENT itself mentions Sabian products
198
+ - Quoted/parent content mentioning Sabian does NOT make the post relevant
199
+ - Generic replies ("Thanks!", "Got it!") are NEVER relevant
200
+
201
+ ### Rule 2: Strict Product Matching
202
+ SABIAN PRODUCTS (use ONLY these exact values):
203
+ [HHX, HH, AAX, AA, Artisan, FRX, Omni, Chopper, Stratus, XSR, B8X, SBR]
204
+
205
+ - Return ONLY products from this list
206
+ - If you see a product not in this list, do NOT include it
207
+ - "2002", "Signature", "Sound Edge", "Formula 602" are PAISTE products, NOT Sabian
208
+ - "K Custom", "A Custom", "K Zildjian" are ZILDJIAN products, NOT Sabian
209
+ - When uncertain, return empty list []
210
+
211
+ ### Rule 3: Competitor Brand Awareness
212
+ COMPETITOR BRANDS: [Zildjian, Paiste, Meinl, Dream Cymbals, Istanbul Agop, Bosphorus]
213
+
214
+ - Only return competitor BRAND names in competitors_mentioned
215
+ - Do NOT guess competitor products
216
+
217
+ ### Rule 4: Thread Context Summary
218
+ - Summarize thread context in 1-2 sentences maximum
219
+ - Focus on what helps understand the post's topic
220
+ - If thread is about pizza ovens, say "Thread discusses pizza ovens and cooking"
221
+
222
+ ## OUTPUT FORMAT
223
+ Return ONLY valid JSON:
224
+ {
225
+ "is_relevant": boolean,
226
+ "relevance_confidence": "high" | "medium" | "low",
227
+ "relevance_reason": "1-2 sentences explaining decision",
228
+ "products_mentioned": [], // ONLY from Sabian list above
229
+ "sabian_mention_context": "primary_focus" | "significant_mention" | "casual_mention" | "comparison_context" | null,
230
+ "author_role": "current_owner" | "past_owner" | "potential_buyer" | "never_owned" | "unknown",
231
+ "competitors_mentioned": [], // Brand names only
232
+ "thread_context_summary": "1-2 sentence summary"
233
+ }
234
+ ```
235
+
236
+ ---
237
+
238
+ ### Agent 3: Sentiment & Intent Analyzer (LLM #2)
239
+
240
+ **File**: `workflow/agents/sentiment_analyzer_agent.py`
241
+
242
+ **Purpose**: Deep analysis on VERIFIED relevant posts with STRUCTURED input.
243
+
244
+ **Key Design Decisions**:
245
+
246
+ 1. **Receives Pre-Validated Input**:
247
+ - Products already extracted and validated
248
+ - Thread context already summarized
249
+ - Author role already determined
250
+
251
+ 2. **Focused Analysis**:
252
+ - Sentiment TOWARDS SABIAN ONLY
253
+ - Intent classification
254
+ - Pain points / Delights (author's own experience only)
255
+ - Purchase journey (author's own journey only)
256
+
257
+ 3. **No Hallucination on Products**:
258
+ - Products are GIVEN in input, not re-extracted
259
+ - Can only discuss attributes of provided products
260
+
261
+ **System Prompt Structure**:
262
+ ```
263
+ You are a sentiment analyst for Sabian cymbal discussions.
264
+
265
+ ## INPUT CONTEXT (Pre-validated, trust these values)
266
+ - Products mentioned: {products_mentioned}
267
+ - Sabian mention context: {sabian_mention_context}
268
+ - Author role: {author_role}
269
+ - Thread summary: {thread_context_summary}
270
+ - Competitors mentioned: {competitors_mentioned}
271
+
272
+ ## YOUR TASK
273
+ Analyze the sentiment, emotions, and intents in this post about Sabian.
274
+
275
+ ## CRITICAL RULES
276
+
277
+ ### Rule 1: Sabian-Specific Sentiment
278
+ - Sentiment MUST be about Sabian, NOT overall post tone
279
+ - Example: "Love my new kit! The SBR cymbals sound terrible."
280
+ - Overall: positive | Sabian sentiment: NEGATIVE
281
+
282
+ ### Rule 2: Author Perspective Only
283
+ These fields are ONLY for author's OWN experience:
284
+ - purchase_stage, decision_drivers, pain_points, delight_factors
285
+ - If author is giving ADVICE to others, these should be null/empty
286
+
287
+ ### Rule 3: Use Only Valid Values
288
+ [List all valid values for each field]
289
+
290
+ ## OUTPUT FORMAT
291
+ {
292
+ "sentiment_level": "...",
293
+ "emotion_type": "..." or null,
294
+ "sentiment_confidence": "high" | "medium" | "low",
295
+ "sarcasm_detected": boolean,
296
+ "product_attributes": [],
297
+ "competitor_products_owned": [],
298
+ "comparison_type": "..." or null,
299
+ "intents": [],
300
+ "purchase_stage": "..." or null,
301
+ "decision_drivers": [],
302
+ "pain_points": [],
303
+ "delight_factors": [],
304
+ "analysis_notes": "1-2 sentences"
305
+ }
306
+ ```
307
+
308
+ ---
309
+
310
+ ### Agent 4: Output Validator (No LLM)
311
+
312
+ **File**: `workflow/agents/output_validator_agent.py`
313
+
314
+ **Purpose**: Final validation and anomaly detection.
315
+
316
+ **Validation Rules**:
317
+
318
+ 1. **List Validation**:
319
+ - All products_mentioned are in Sabian product list
320
+ - All competitors_mentioned are in competitor list
321
+ - All categorical values are from predefined lists
322
+
323
+ 2. **Logical Consistency**:
324
+ - If is_relevant=True, products_mentioned should not be empty (flag if empty)
325
+ - If sabian_mention_context="primary_focus", products_mentioned should have items
326
+ - If sentiment_level="very_negative", pain_points should not be empty (warn)
327
+
328
+ 3. **Anomaly Flagging**:
329
+ - Flag for manual review if inconsistencies detected
330
+ - Add `validation_flags` field to output
331
+
332
+ ---
333
+
334
+ ## Configuration Changes
335
+
336
+ ### brand_config.json Updates
337
+
338
+ ```json
339
+ {
340
+ "brand": {
341
+ "name": "Sabian",
342
+ "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
343
+ "product_aliases": {
344
+ "B8": "B8X",
345
+ "sbrs": "SBR",
346
+ "hhx's": "HHX",
347
+ "aax's": "AAX"
348
+ },
349
+ "competitor_products_warning": [
350
+ "2002", "Signature", "Sound Edge", "Formula 602", "Giant Beat",
351
+ "K Custom", "A Custom", "K Zildjian", "A Zildjian", "S Family",
352
+ "Byzance", "Pure Alloy", "HCS",
353
+ "Bliss", "Contact", "Energy"
354
+ ],
355
+ "competitors": [...]
356
+ },
357
+ "preprocessing": {
358
+ "min_length_for_language_detection": 50,
359
+ "always_process_if_primary_keyword": true
360
+ }
361
+ }
362
+ ```
363
+
364
+ ---
365
+
366
+ ## File Structure
367
+
368
+ ```
369
+ processing_brand_sentiment/
370
+ ├── config_files/
371
+ │ ├── brand_config.json # Updated with aliases, warnings
372
+ │ ├── workflow_config.json # Agent configurations
373
+ │ └── analysis_categories.json # Category definitions (unchanged)
374
+ ├── workflow/
375
+ │ ├── orchestrator.py # Updated workflow graph
376
+ │ └── agents/
377
+ │ ├── base_agent.py # Base class (unchanged)
378
+ │ ├── content_preprocessor_agent.py # Enhanced preprocessing
379
+ │ ├── relevance_extraction_agent.py # NEW: Extraction + relevance
380
+ │ ├── sentiment_analyzer_agent.py # NEW: Focused analysis
381
+ │ └── output_validator_agent.py # NEW: Validation
382
+ ```
383
+
384
+ ---
385
+
386
+ ## Migration Path
387
+
388
+ ### Phase 1: Configuration Updates
389
+ 1. Update brand_config.json with product aliases
390
+ 2. Add competitor product warnings
391
+ 3. Update preprocessing settings
392
+
393
+ ### Phase 2: New Agents
394
+ 1. Create relevance_extraction_agent.py
395
+ 2. Create sentiment_analyzer_agent.py
396
+ 3. Create output_validator_agent.py
397
+ 4. Update content_preprocessor_agent.py
398
+
399
+ ### Phase 3: Orchestrator Update
400
+ 1. Update workflow graph with new flow
401
+ 2. Update state definition
402
+ 3. Add new routing logic
403
+
404
+ ### Phase 4: Testing & Validation
405
+ 1. Run on test batch with known issues
406
+ 2. Compare accuracy metrics
407
+ 3. Fine-tune prompts based on results
408
+
409
+ ---
410
+
411
+ ## Expected Improvements
412
+
413
+ | Issue | Current Behavior | Expected After |
414
+ |-------|------------------|----------------|
415
+ | B8/B8X | Missed | Caught via alias mapping |
416
+ | Paiste products as Sabian | Attributed to Sabian | Correctly identified as competitor |
417
+ | Short text language | Marked as Latin | Processed as English |
418
+ | False positive (pizza) | Marked relevant | Marked not relevant |
419
+ | Long confusing context | Raw text confuses LLM | Summarized 1-2 sentences |
420
+
421
+ ---
422
+
423
+ ## Success Metrics
424
+
425
+ 1. **Relevance Accuracy**: >99% (currently ~90%)
426
+ 2. **Product Attribution Accuracy**: >99% (currently ~85%)
427
+ 3. **Sentiment Accuracy**: >95% (currently unknown)
428
+ 4. **False Positive Rate**: <1%
429
+ 5. **False Negative Rate**: <1%
430
+
431
+ ---
432
+
433
+ ## Questions for Review
434
+
435
+ 1. Should we add a manual review queue for flagged posts?
436
+ 2. Should thread_context_summary be stored in output for debugging?
437
+ 3. Preferred batch size for re-processing existing data?
processing_brand_sentiment/README.md ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Brand Sentiment Analysis Pipeline
2
+
3
+ A modular, scalable system for analyzing forum discussions and social media comments about specific brands using an agentic workflow with LLMs. The initial implementation focuses on **Sabian** (a cymbal manufacturer), but the architecture supports easy addition of new brands through configuration.
4
+
5
+ ## Overview
6
+
7
+ The pipeline fetches data from Snowflake (forum posts and/or social media comments), preprocesses them (parsing HTML for forums or cleaning plain text for comments), detects language, validates brand relevance, performs comprehensive sentiment and intelligence extraction using OpenAI's API, and stores enriched results back to Snowflake.
8
+
9
+ ## Data Sources
10
+
11
+ | Source | Table | Output Table | Description |
12
+ |--------|-------|--------------|-------------|
13
+ | **Forums** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS` | `SABIAN_BRAND_ANALYSIS` | Forum posts with thread context |
14
+ | **Comments** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` | `SABIAN_BRAND_ANALYSIS_COMMENTS` | Social media comments with content context |
15
+
16
+ ## Architecture v4.0
17
+
18
+ The system uses a 4-agent pipeline that separates **fact extraction** from **analysis** for improved accuracy. Both data sources share the same extraction, analysis, and validation agents - only the preprocessor differs.
19
+
20
+ ```
21
+ ┌─────────────────────────────────────────────────────────────────┐
22
+ │ 1a. CONTENT PREPROCESSOR (Forums) │
23
+ │ (No LLM) │
24
+ │ - HTML parsing with quote/reply separation │
25
+ │ - Product alias mapping (B8 → B8X) │
26
+ │ - Smart language detection │
27
+ │ - Keyword-based relevance screening │
28
+ ├─────────────────────────────────────────────────────────────────┤
29
+ │ 1b. COMMENT PREPROCESSOR (Comments) │
30
+ │ (No LLM) │
31
+ │ - Plain text cleaning (no HTML) │
32
+ │ - Product alias mapping (B8 → B8X) │
33
+ │ - Smart language detection │
34
+ │ - Keyword-based relevance screening │
35
+ │ - Context: content title + description + parent comment │
36
+ └─────────────────────────────┬───────────────────────────────────┘
37
+
38
+
39
+ ┌───────────────────────────────┐
40
+ │ Has Sabian-related keywords? │
41
+ └───────────────┬───────────────┘
42
+ │ │
43
+ YES NO
44
+ │ │
45
+ ▼ ▼
46
+ ┌─────────────────────────────────┐ ┌──────────────────┐
47
+ │ 2. RELEVANCE & EXTRACTION │ │ Mark as │
48
+ │ AGENT (LLM #1) │ │ NOT RELEVANT │
49
+ │ [SHARED] │ │ (0 LLM calls) │
50
+ │ - Validates relevance │ └──────────────────┘
51
+ │ - Extracts products (strict) │
52
+ │ - Identifies author role │
53
+ │ - Summarizes context │
54
+ │ - Detects competitors │
55
+ └─────────────────┬───────────────┘
56
+
57
+
58
+ ┌─────────────────┐
59
+ │ IS_RELEVANT? │
60
+ └────────┬────────┘
61
+ │ │
62
+ YES NO
63
+ │ │
64
+ ▼ ▼
65
+ ┌─────────────────────────────────┐ ┌──────────────────┐
66
+ │ 3. SENTIMENT & INTENT │ │ Store with │
67
+ │ ANALYZER (LLM #2) │ │ is_relevant=F │
68
+ │ [SHARED] │ │ (1 LLM call) │
69
+ │ - Sabian-specific sentiment │ └──────────────────┘
70
+ │ - Intent classification │
71
+ │ - Pain points / Delights │
72
+ ��� - Purchase journey (author) │
73
+ │ - Competitor products owned │
74
+ └─────────────────┬───────────────┘
75
+
76
+
77
+ ┌─────────────────────────────────┐
78
+ │ 4. OUTPUT VALIDATOR │
79
+ │ (No LLM - Rule-based) │
80
+ │ [SHARED] │
81
+ │ - Validates all values │
82
+ │ - Checks logical consistency │
83
+ │ - Flags anomalies for review │
84
+ └─────────────────────────────────┘
85
+ ```
86
+
87
+ ## Features
88
+
89
+ - **Multi-Source Support**: Process forums, social media comments, or both
90
+ - **4-Agent Pipeline**: Separation of extraction and analysis for improved accuracy
91
+ - **Strict Product Matching**: Only returns products from predefined list, preventing hallucination
92
+ - **Competitor Awareness**: Knows which products belong to competitors
93
+ - **Smart Language Detection**: Skips detection for short texts, always processes if primary keywords found
94
+ - **Product Alias Mapping**: Handles variations (B8 → B8X, "hand hammered" → HH)
95
+ - **Thread/Comment Context**: LLM summarizes context for clarity
96
+ - **Validation & Anomaly Detection**: Rule-based validator catches errors and flags edge cases
97
+ - **Author Perspective Tracking**: Distinguishes author's own experience from advice to others
98
+ - **Platform Tracking**: Records source platform for each processed item
99
+
100
+ ## Project Structure
101
+
102
+ ```
103
+ processing_brand_sentiment/
104
+ ├── config_files/
105
+ │ ├── brand_config.json # Brand products, aliases, competitors, keywords, data sources
106
+ │ ├── workflow_config.json # LLM settings, batch sizes, output config (forums + comments)
107
+ │ └── analysis_categories.json # Sentiment, intent, pain point categories
108
+ ├── database/
109
+ │ ├── __init__.py
110
+ │ ├── snowflake_connection.py # Snowflake connection handler
111
+ │ └── sql/
112
+ │ ├── fetch_forum_posts.sql # Query for forum posts with thread context
113
+ │ ├── fetch_comments.sql # Query for social media comments with content context
114
+ │ ├── create_output_table.sql # Forum output schema with views
115
+ │ ├── init_output_table.sql # Forum table initialization
116
+ │ ├── create_comments_output_table.sql # Comment output schema with views
117
+ │ └── init_comments_output_table.sql # Comment table initialization
118
+ ├── workflow/
119
+ │ ├── __init__.py
120
+ │ ├── orchestrator.py # Forum LangGraph workflow coordinator
121
+ │ ├── comment_orchestrator.py # Comment LangGraph workflow coordinator
122
+ │ └── agents/
123
+ │ ├── __init__.py
124
+ │ ├── base_agent.py # Abstract base class
125
+ │ ├── content_preprocessor_agent.py # Forum: HTML parsing, alias mapping
126
+ │ ├── comment_preprocessor_agent.py # Comments: plain text, comment context
127
+ │ ├── sabian_relevance_extraction_agent.py # Shared: relevance + extraction
128
+ │ ├── sabian_sentiment_analyzer_agent.py # Shared: sentiment analysis
129
+ │ └── output_validator_agent.py # Shared: rule-based validation
130
+ ├── utils/
131
+ │ ├── __init__.py
132
+ │ └── html_parser.py # HTML content extraction (forums only)
133
+ ├── logs/ # Processing logs (auto-created)
134
+ ├── main.py # Main execution script (multi-source)
135
+ ├── .env # Environment variables
136
+ └── README.md # This file
137
+ ```
138
+
139
+ ## Setup
140
+
141
+ ### 1. Install Dependencies
142
+
143
+ ```bash
144
+ pip install langchain-openai langgraph snowflake-snowpark-python python-dotenv pandas beautifulsoup4 lingua-language-detector
145
+ ```
146
+
147
+ ### 2. Configure Environment Variables
148
+
149
+ Ensure `.env` file contains:
150
+
151
+ ```env
152
+ # Snowflake
153
+ SNOWFLAKE_USER=your_user
154
+ SNOWFLAKE_PASSWORD=your_password
155
+ SNOWFLAKE_ACCOUNT=your_account
156
+ SNOWFLAKE_ROLE=your_role
157
+ SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
158
+ SNOWFLAKE_WAREHOUSE=your_warehouse
159
+ SNOWFLAKE_SCHEMA=ML_FEATURES
160
+
161
+ # OpenAI
162
+ OPENAI_API_KEY=your_openai_key
163
+ ```
164
+
165
+ ### 3. Initialize Snowflake Tables
166
+
167
+ Run the initialization scripts before first processing:
168
+
169
+ ```sql
170
+ -- For forums
171
+ database/sql/init_output_table.sql
172
+
173
+ -- For social media comments
174
+ database/sql/init_comments_output_table.sql
175
+ ```
176
+
177
+ ## Usage
178
+
179
+ ### Process All Sources (Default)
180
+
181
+ ```bash
182
+ python main.py
183
+ ```
184
+
185
+ ### Process Forums Only
186
+
187
+ ```bash
188
+ python main.py --data-source forums
189
+ ```
190
+
191
+ ### Process Social Media Comments Only
192
+
193
+ ```bash
194
+ python main.py --data-source comments
195
+ ```
196
+
197
+ ### Process Limited Number
198
+
199
+ ```bash
200
+ python main.py --limit 100
201
+ python main.py --data-source comments --limit 50
202
+ ```
203
+
204
+ ### Sequential Processing (Debug Mode)
205
+
206
+ ```bash
207
+ python main.py --limit 50 --sequential
208
+ ```
209
+
210
+ ### First Run (Overwrite Mode)
211
+
212
+ ```bash
213
+ python main.py --overwrite --limit 100
214
+ ```
215
+
216
+ ### Command-Line Arguments
217
+
218
+ | Argument | Description | Default |
219
+ |----------|-------------|---------|
220
+ | `--limit N` | Process only N items per source | All unprocessed |
221
+ | `--overwrite` | Overwrite existing table | Append mode |
222
+ | `--sequential` | Single-threaded processing | Parallel |
223
+ | `--config-dir PATH` | Custom config directory | config_files/ |
224
+ | `--data-source SOURCE` | Source to process: `forums`, `comments`, `all` | `all` |
225
+
226
+ ## Configuration
227
+
228
+ ### brand_config.json
229
+
230
+ Key sections:
231
+
232
+ ```json
233
+ {
234
+ "brand": {
235
+ "name": "Sabian",
236
+ "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
237
+ "product_aliases": {
238
+ "b8": "B8X",
239
+ "hand hammered": "HH"
240
+ },
241
+ "competitor_products_warning": {
242
+ "paiste_products": ["2002", "signature", "sound edge", "formula 602"],
243
+ "zildjian_products": ["k custom", "a custom", "k zildjian"]
244
+ },
245
+ "competitors": [...]
246
+ },
247
+ "data_sources": {
248
+ "forums": {
249
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
250
+ "platform": "musora_forums"
251
+ },
252
+ "comments": {
253
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
254
+ "platform_column": "PLATFORM"
255
+ }
256
+ }
257
+ }
258
+ ```
259
+
260
+ ### analysis_categories.json
261
+
262
+ Defines valid values for all categorical fields:
263
+
264
+ - `author_role`: current_owner, past_owner, potential_buyer, never_owned, unknown
265
+ - `sabian_mention_context`: primary_focus, significant_mention, casual_mention, comparison_context
266
+ - `sentiment_level`: very_negative, negative, neutral, positive, very_positive
267
+ - `intents`: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
268
+ - `feedback_aspects`: sound_quality, price_value, durability, playability, versatility, customer_service, availability, aesthetics
269
+
270
+ ## Output Tables
271
+
272
+ ### Forum Output: `SABIAN_BRAND_ANALYSIS`
273
+
274
+ | Category | Key Columns |
275
+ |----------|-------------|
276
+ | **Identifiers** | POST_ID, THREAD_ID, POST_AUTHOR_ID, PLATFORM |
277
+ | **Content** | ORIGINAL_CONTENT, CLEANED_CONTENT, QUOTED_CONTENT, THREAD_CONTEXT_SUMMARY |
278
+ | **Thread** | THREAD_TITLE, THREAD_FIRST_POST, POST_CREATED_AT, THREAD_STARTED_AT |
279
+ | **Category** | CATEGORY_TITLE, CATEGORY_TOPIC |
280
+
281
+ ### Comment Output: `SABIAN_BRAND_ANALYSIS_COMMENTS`
282
+
283
+ | Category | Key Columns |
284
+ |----------|-------------|
285
+ | **Identifiers** | COMMENT_SK, COMMENT_ID, PLATFORM, AUTHOR_NAME, AUTHOR_ID |
286
+ | **Content** | ORIGINAL_TEXT, COMMENT_TIMESTAMP |
287
+ | **Context** | CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT |
288
+ | **Channel** | CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME |
289
+
290
+ ### Shared Analysis Columns (Both Tables)
291
+
292
+ | Category | Fields | Notes |
293
+ |----------|--------|-------|
294
+ | **Language** | DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH | Language detection |
295
+ | **Relevance** | IS_RELEVANT, RELEVANCE_CONFIDENCE, RELEVANCE_REASON | Brand relevance |
296
+ | **Extraction** | PRODUCTS_MENTIONED, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT | From Agent 1 |
297
+ | **Sentiment** | SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_CONFIDENCE | Sabian-specific |
298
+ | **Intents** | INTENTS (multi-label) | What author is trying to accomplish |
299
+ | **Journey** | PURCHASE_STAGE, DECISION_DRIVERS | Author perspective only |
300
+ | **Feedback** | PAIN_POINTS, DELIGHT_FACTORS | Author's own experience |
301
+ | **Competitive** | COMPETITORS_MENTIONED, COMPETITOR_PRODUCTS_OWNED, COMPARISON_TYPE | Competitive intel |
302
+ | **Validation** | VALIDATION_FLAGS, PROCESSING_STATUS | Anomaly detection |
303
+
304
+ ### Processing Status Values
305
+
306
+ | Status | Description |
307
+ |--------|-------------|
308
+ | `completed` | Successfully processed, no issues |
309
+ | `completed_with_flags` | Processed but has anomalies to review |
310
+ | `validation_failed` | Validation errors detected |
311
+ | `workflow_error` | Unexpected error during processing |
312
+
313
+ ### Available Views
314
+
315
+ #### Forum Views
316
+
317
+ | View | Description |
318
+ |------|-------------|
319
+ | `VW_SABIAN_RELEVANT_ANALYSIS` | Only relevant, successfully processed posts |
320
+ | `VW_SABIAN_FLAGGED_POSTS` | Posts with validation flags for review |
321
+ | `VW_SABIAN_SENTIMENT_DISTRIBUTION` | Sentiment breakdown statistics |
322
+ | `VW_SABIAN_PRODUCT_MENTIONS` | Product mention summary |
323
+ | `VW_SABIAN_COMPETITOR_ANALYSIS` | Competitor comparison analysis |
324
+ | `VW_SABIAN_PAIN_POINTS` | Pain point frequency analysis |
325
+ | `VW_SABIAN_AUTHOR_ROLES` | Author role distribution |
326
+ | `VW_SABIAN_COMPETITOR_OWNERSHIP` | Competitor brands owned by authors |
327
+ | `VW_SABIAN_VALIDATION_SUMMARY` | Processing status breakdown |
328
+
329
+ #### Comment Views
330
+
331
+ | View | Description |
332
+ |------|-------------|
333
+ | `VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS` | Relevant, successful comments |
334
+ | `VW_SABIAN_COMMENTS_FLAGGED` | Comments with validation flags |
335
+ | `VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION` | Sentiment by platform |
336
+ | `VW_SABIAN_COMMENTS_PRODUCT_MENTIONS` | Product mentions by platform |
337
+ | `VW_SABIAN_COMMENTS_VALIDATION_SUMMARY` | Processing status by platform |
338
+
339
+ ## API Call Efficiency
340
+
341
+ | Scenario | LLM Calls | Notes |
342
+ |----------|-----------|-------|
343
+ | No keywords found | 0 | Early exit in preprocessor |
344
+ | Primary keywords, relevant | 2 | Extraction + Analysis |
345
+ | Primary keywords, not relevant | 1 | Only Extraction |
346
+ | Non-English content | 0 | Skipped |
347
+
348
+ ## Key Design Decisions
349
+
350
+ ### Why Separate Forum and Comment Preprocessors?
351
+
352
+ 1. **Different input formats**: Forums use HTML (quotes, blockquotes), comments are plain text
353
+ 2. **Different context sources**: Forums have thread title + first post + category; comments have content title + description + parent comment
354
+ 3. **Shared analysis**: Both feed into the same extraction and analysis agents
355
+
356
+ ### Why Separate Output Tables?
357
+
358
+ 1. **Different identifiers**: Forums use POST_ID/THREAD_ID; comments use COMMENT_SK/COMMENT_ID/PLATFORM
359
+ 2. **Different metadata**: Forums have thread context; comments have content/channel metadata
360
+ 3. **Clean separation**: Avoids NULL columns and schema confusion
361
+ 4. **Shared analysis columns**: All extracted intelligence fields are identical
362
+
363
+ ### Why Platform Column for Forums?
364
+
365
+ The `PLATFORM` column was added to `SABIAN_BRAND_ANALYSIS` (defaulting to `musora_forums`) to enable cross-source analysis and maintain consistency with the comments table which uses the dynamic platform value from the source data.
366
+
367
+ ## Troubleshooting
368
+
369
+ ### "Table does not exist" on First Run
370
+
371
+ Run the appropriate init SQL in Snowflake first:
372
+ - Forums: `database/sql/init_output_table.sql`
373
+ - Comments: `database/sql/init_comments_output_table.sql`
374
+
375
+ ### No Comments Being Processed
376
+
377
+ Check that `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` table exists and contains data. The query joins with `DIM_CONTENT` and `DIM_CHANNEL` - verify these dimension tables have matching records.
378
+
379
+ ### Competitor Products Attributed to Sabian
380
+
381
+ Check `brand_config.json` for `competitor_products_warning` section. Add any missing competitor products.
382
+
383
+ ### API Rate Limits
384
+
385
+ Use `--sequential` mode or reduce `--limit`:
386
+ ```bash
387
+ python main.py --sequential --limit 50
388
+ ```
389
+
390
+ ## Schema Version History
391
+
392
+ | Version | Changes |
393
+ |---------|---------|
394
+ | 1.0 | Initial release |
395
+ | 2.0 | Added author_role, post_type, sabian_mention_context |
396
+ | 3.0 | Removed post_type (merged into intents), unified feedback_aspects |
397
+ | 4.0 | 4-agent pipeline, thread_context_summary, validation flags, product aliases |
398
+ | 4.0+ | Added social media comments support, PLATFORM column, separate comment output table |
399
+
400
+ ## License
401
+
402
+ Internal use only - Brand sentiment analysis project.
processing_brand_sentiment/config_files/analysis_categories.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "author_role": {
3
+ "description": "Author's relationship to Sabian products",
4
+ "categories": [
5
+ {"value": "current_owner", "description": "Currently owns/uses Sabian"},
6
+ {"value": "past_owner", "description": "Previously owned, sold/replaced"},
7
+ {"value": "potential_buyer", "description": "Considering purchasing Sabian"},
8
+ {"value": "never_owned", "description": "Explicitly doesn't own Sabian"},
9
+ {"value": "unknown", "description": "Cannot determine from post"}
10
+ ]
11
+ },
12
+ "sabian_mention_context": {
13
+ "description": "How prominently Sabian is discussed",
14
+ "categories": [
15
+ {"value": "primary_focus", "description": "Sabian is the main topic"},
16
+ {"value": "significant_mention", "description": "Discussed with detail, not main focus"},
17
+ {"value": "casual_mention", "description": "Brief mention among other topics"},
18
+ {"value": "comparison_context", "description": "Mentioned while comparing to competitors"}
19
+ ]
20
+ },
21
+ "sentiment": {
22
+ "brand_specific": true,
23
+ "description": "Sentiment TOWARDS SABIAN ONLY (not overall post tone)",
24
+ "levels": [
25
+ {"value": "very_negative", "description": "Strong criticism, anger, severe disappointment"},
26
+ {"value": "negative", "description": "Complaints, dissatisfaction, mild criticism"},
27
+ {"value": "neutral", "description": "Factual mention, balanced, no clear sentiment"},
28
+ {"value": "positive", "description": "Satisfaction, appreciation, mild praise"},
29
+ {"value": "very_positive", "description": "Enthusiasm, strong praise, highly recommend"}
30
+ ]
31
+ },
32
+ "emotions": {
33
+ "brand_specific": true,
34
+ "description": "Emotion towards SABIAN specifically",
35
+ "categories": [
36
+ {"value": "frustration", "description": "Annoyance with product issues"},
37
+ {"value": "disappointment", "description": "Unmet expectations"},
38
+ {"value": "anger", "description": "Strong negative emotion"},
39
+ {"value": "satisfaction", "description": "Expectations met, content"},
40
+ {"value": "excitement", "description": "Eagerness, anticipation"},
41
+ {"value": "curiosity", "description": "Interest, wanting to know more"},
42
+ {"value": "indifference", "description": "No strong feelings"}
43
+ ]
44
+ },
45
+ "intents": {
46
+ "multi_label": true,
47
+ "description": "What the author is trying to accomplish (can select multiple)",
48
+ "categories": [
49
+ {"value": "seeking_information", "description": "Asking questions, seeking advice/recommendations"},
50
+ {"value": "providing_information", "description": "Answering questions, giving advice, helping others"},
51
+ {"value": "sharing_experience", "description": "Personal experience, review, testimonial, purchase announcement"},
52
+ {"value": "comparing", "description": "Comparing brands/products against each other"},
53
+ {"value": "praising", "description": "Actively endorsing, recommending, advocating for Sabian"},
54
+ {"value": "criticizing", "description": "Actively complaining, warning others, reporting issues"},
55
+ {"value": "buying_selling", "description": "Listing gear for sale, looking to buy/trade"},
56
+ {"value": "general_discussion", "description": "General conversation not fitting above"}
57
+ ]
58
+ },
59
+ "purchase_stage": {
60
+ "author_perspective_only": true,
61
+ "description": "Author's own purchase journey stage (null if giving advice to others)",
62
+ "categories": [
63
+ {"value": "researching", "description": "Gathering info before buying"},
64
+ {"value": "deciding", "description": "Actively comparing, about to decide"},
65
+ {"value": "recently_purchased", "description": "Just bought the product"},
66
+ {"value": "long_term_owner", "description": "Owned for extended period"},
67
+ {"value": "selling_replacing", "description": "Selling or replacing gear"}
68
+ ]
69
+ },
70
+ "comparison_type": {
71
+ "description": "Type of competitive comparison (if comparing)",
72
+ "categories": [
73
+ {"value": "direct_comparison", "description": "Side-by-side evaluation"},
74
+ {"value": "preference_statement", "description": "Stating brand preference"},
75
+ {"value": "switching_to_sabian", "description": "Moving or Moved from competitor to Sabian"},
76
+ {"value": "switching_from_sabian", "description": "Moving or Moved from Sabian to competitor"}
77
+ ]
78
+ },
79
+ "feedback_aspects": {
80
+ "description": "Product/brand aspects discussed. Used for BOTH pain_points (negative) and delight_factors (positive)",
81
+ "categories": [
82
+ {"value": "sound_quality", "description": "Sound, tone, character, audio qualities"},
83
+ {"value": "price_value", "description": "Cost, value for money, deals"},
84
+ {"value": "durability", "description": "Build quality, longevity, cracking/wear"},
85
+ {"value": "playability", "description": "Feel, response, ease of playing"},
86
+ {"value": "versatility", "description": "Range of genres/applications, flexibility"},
87
+ {"value": "customer_service", "description": "Support, warranty, brand interaction"},
88
+ {"value": "availability", "description": "Stock, ease of finding/purchasing"},
89
+ {"value": "aesthetics", "description": "Appearance, finish, visual appeal"}
90
+ ]
91
+ },
92
+ "decision_drivers": {
93
+ "author_perspective_only": true,
94
+ "description": "What influenced AUTHOR's own purchase decision (empty if giving advice)",
95
+ "categories": [
96
+ {"value": "sound_quality", "description": "Sound characteristics"},
97
+ {"value": "price", "description": "Cost/budget considerations"},
98
+ {"value": "durability", "description": "Build quality, longevity"},
99
+ {"value": "artist_endorsement", "description": "Influenced by endorsed artists"},
100
+ {"value": "peer_recommendation", "description": "Friends/community recommended"},
101
+ {"value": "hands_on_testing", "description": "Tried before buying"},
102
+ {"value": "brand_loyalty", "description": "Previous positive experience"},
103
+ {"value": "versatility", "description": "Multi-genre/application use"},
104
+ {"value": "online_reviews", "description": "Read reviews that influenced"}
105
+ ]
106
+ },
107
+ "product_attributes": {
108
+ "description": "Attributes being discussed about Sabian products",
109
+ "categories": [
110
+ {"value": "sound_quality", "description": "Tone, character, audio qualities"},
111
+ {"value": "durability", "description": "Build quality, longevity"},
112
+ {"value": "price", "description": "Cost and value"},
113
+ {"value": "playability", "description": "Feel, response"},
114
+ {"value": "aesthetics", "description": "Appearance, finish"},
115
+ {"value": "volume", "description": "Loudness, projection"},
116
+ {"value": "sustain", "description": "How long sound lasts"},
117
+ {"value": "versatility", "description": "Range of applications"}
118
+ ]
119
+ },
120
+ "analysis_notes_guidelines": {
121
+ "description": "Keep to 1-2 sentences. Focus on Sabian-specific insights not captured by other fields."
122
+ }
123
+ }
processing_brand_sentiment/config_files/brand_config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "brand": {
3
+ "name": "Sabian",
4
+ "description": "Sabian is a Canadian manufacturer of cymbals founded in 1981",
5
+ "products": [
6
+ "HHX",
7
+ "AAX",
8
+ "Artisan",
9
+ "FRX",
10
+ "Omni",
11
+ "Chopper",
12
+ "Stratus",
13
+ "XSR",
14
+ "B8X",
15
+ "SBR"
16
+ ],
17
+ "product_aliases": {
18
+ "b8": "B8X",
19
+ "sbrs": "SBR",
20
+ "hhxs": "HHX",
21
+ "aaxs": "AAX",
22
+ "hhx's": "HHX",
23
+ "aax's": "AAX"
24
+ },
25
+ "product_descriptions": {
26
+ "HHX": "Hand Hammered Xtreme - Professional series with dark, complex tones",
27
+ "AAX": "Bright, cutting cymbals for modern music",
28
+ "Artisan": "Premium hand-crafted cymbals with unique character",
29
+ "FRX": "Frequency Reduced Xtreme - Lower volume cymbals",
30
+ "Omni": "Multi-purpose cymbals for various playing styles",
31
+ "Chopper": "Effect cymbals with unique sound",
32
+ "Stratus": "Dark, complex sounds for jazz and fusion",
33
+ "XSR": "Entry-level professional cymbals",
34
+ "B8X": "Bronze entry-level cymbals",
35
+ "SBR": "Entry-level brass cymbals"
36
+ },
37
+ "competitor_products_warning": {
38
+ "description": "Products that belong to competitors - DO NOT attribute to Sabian",
39
+ "paiste_products": ["2002", "signature", "sound edge", "formula 602", "giant beat", "pst", "rude", "masters", "traditionals", "twenty", "dark energy"],
40
+ "zildjian_products": ["k custom", "a custom", "k zildjian", "a zildjian", "s family", "i family", "l80", "kerope", "constantinople", "k sweet"],
41
+ "meinl_products": ["byzance", "pure alloy", "hcs", "classics custom", "mb20", "mb10", "soundcaster"],
42
+ "dream_products": ["bliss", "contact", "energy", "dark matter", "vintage bliss", "eclipse"],
43
+ "istanbul_products": ["agop", "xist", "traditional", "sultan", "mehmet"]
44
+ },
45
+ "competitors": [
46
+ {
47
+ "name": "Zildjian",
48
+ "aliases": ["zildjian", "zil", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"]
49
+ },
50
+ {
51
+ "name": "Meinl",
52
+ "aliases": ["meinl", "byzance", "classics"]
53
+ },
54
+ {
55
+ "name": "Paiste",
56
+ "aliases": ["paiste", "2002", "signature", "formula 602", "sound edge"]
57
+ },
58
+ {
59
+ "name": "Dream Cymbals",
60
+ "aliases": ["dream", "dream cymbals", "bliss"]
61
+ },
62
+ {
63
+ "name": "Istanbul Agop",
64
+ "aliases": ["istanbul", "agop", "istanbul agop", "istanbul mehmet"]
65
+ },
66
+ {
67
+ "name": "Bosphorus",
68
+ "aliases": ["bosphorus"]
69
+ }
70
+ ]
71
+ },
72
+ "relevance_keywords": {
73
+ "primary": {
74
+ "description": "Keywords that definitively indicate Sabian content",
75
+ "keywords": ["sabian", "hhx", "aax", "artisan", "frx", "omni", "chopper", "stratus", "xsr", "b8x", "sbr"]
76
+ },
77
+ "contextual": {
78
+ "description": "Ambiguous keywords that need context verification",
79
+ "keywords": ["b8"]
80
+ },
81
+ "cymbal_context": {
82
+ "description": "Keywords that provide cymbal-related context for disambiguation",
83
+ "keywords": ["cymbal", "cymbals", "crash", "ride", "hi-hat", "hihat", "hi hat", "splash", "china", "bell", "stack", "effects"]
84
+ }
85
+ },
86
+ "preprocessing": {
87
+ "min_length_for_language_detection": 50,
88
+ "default_language_for_short_text": "English",
89
+ "always_process_if_primary_keyword": true,
90
+ "min_content_length": 3
91
+ },
92
+ "filter_conditions": {
93
+ "exclude_access_levels": ["team", "house-coach"],
94
+ "exclude_post_states": ["deleted", "spam"],
95
+ "require_content_length_min": 3
96
+ },
97
+ "data_sources": {
98
+ "forums": {
99
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
100
+ "description": "Forum posts mentioning Sabian and their products",
101
+ "sql_query_file": "database/sql/fetch_forum_posts.sql",
102
+ "platform": "musora_forums"
103
+ },
104
+ "comments": {
105
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
106
+ "description": "Social media comments potentially related to Sabian brand",
107
+ "sql_query_file": "database/sql/fetch_comments.sql",
108
+ "platform_column": "PLATFORM"
109
+ }
110
+ }
111
+ }
processing_brand_sentiment/config_files/workflow_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "llm": {
3
+ "default_model": "gpt-5-nano",
4
+ "default_temperature": 0.2,
5
+ "max_retries": 3,
6
+ "timeout": 60
7
+ },
8
+ "agents": {
9
+ "preprocessor": {
10
+ "name": "PreprocessorAgent",
11
+ "description": "Deterministic agent for HTML parsing, text cleaning, language detection",
12
+ "model": "gpt-5-nano",
13
+ "temperature": 0.0,
14
+ "uses_llm": false
15
+ },
16
+ "relevance_validator": {
17
+ "name": "RelevanceValidatorAgent",
18
+ "description": "Lightweight LLM for disambiguation of ambiguous terms (HH, AA)",
19
+ "model": "gpt-5-nano",
20
+ "temperature": 0.0,
21
+ "max_retries": 2
22
+ },
23
+ "brand_analyzer": {
24
+ "name": "SabianAnalyzerAgent",
25
+ "description": "Comprehensive brand analysis for Sabian products",
26
+ "model": "gpt-5-nano",
27
+ "temperature": 0.2,
28
+ "max_retries": 3
29
+ }
30
+ },
31
+ "workflow": {
32
+ "parallel_processing": {
33
+ "enabled": true,
34
+ "worker_calculation": "CPU count - 2, max 5 workers",
35
+ "max_workers": 5,
36
+ "min_batch_size": 20,
37
+ "max_batch_size": 500
38
+ },
39
+ "thread_context": {
40
+ "enabled": true,
41
+ "include_thread_title": true,
42
+ "include_first_post": true
43
+ }
44
+ },
45
+ "output": {
46
+ "table_name": "SABIAN_BRAND_ANALYSIS",
47
+ "database": "SOCIAL_MEDIA_DB",
48
+ "schema": "ML_FEATURES"
49
+ },
50
+ "comments_output": {
51
+ "table_name": "SABIAN_BRAND_ANALYSIS_COMMENTS",
52
+ "database": "SOCIAL_MEDIA_DB",
53
+ "schema": "ML_FEATURES"
54
+ },
55
+ "logging": {
56
+ "level": "INFO",
57
+ "log_directory": "logs",
58
+ "log_file_prefix": "brand_sentiment_processing"
59
+ }
60
+ }
processing_brand_sentiment/database/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Database module for brand sentiment analysis.
3
+ Contains Snowflake connection handler and SQL query utilities.
4
+ """
5
+
6
+ from .snowflake_connection import SnowFlakeConn
7
+
8
+ __all__ = ['SnowFlakeConn']
processing_brand_sentiment/database/snowflake_connection.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Snowflake connection handler for brand sentiment analysis.
3
+ Provides methods for reading data, executing queries, and storing results.
4
+ """
5
+
6
+ import os
7
+ from snowflake.snowpark import Session
8
+ from dotenv import load_dotenv
9
+ import logging
10
+ import pandas as pd
11
+ from typing import Optional, List, Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Load environment variables
16
+ load_dotenv()
17
+
18
+
19
class SnowFlakeConn:
    """
    Handles Snowflake database connections and operations for brand sentiment analysis.

    Credentials are read from SNOWFLAKE_* environment variables (populated by
    load_dotenv() at module import). Read helpers return pandas DataFrames with
    lower-cased column names; writes upper-case column names to match Snowflake
    conventions.
    """

    def __init__(self):
        """Initialize Snowflake connection (opens a session immediately)."""
        self.session = self.connect_to_snowflake()

    def connect_to_snowflake(self) -> Session:
        """
        Create a connection to Snowflake using environment variables.

        Returns:
            Snowflake Session object

        Raises:
            ValueError: If any required SNOWFLAKE_* environment variable is unset.
        """
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        logger.info("Successfully connected to Snowflake")
        return session

    def get_credential(self, key: str) -> str:
        """
        Get credential from environment variables.

        Args:
            key: Environment variable name

        Returns:
            Credential value

        Raises:
            ValueError: If the variable is unset. Failing fast here gives a
                clearer error than letting a silent None reach the Snowflake
                session builder (the previous behavior, which also contradicted
                the declared return type).
        """
        value = os.getenv(key)
        if value is None:
            raise ValueError(f"Missing required environment variable: {key}")
        return value

    @staticmethod
    def _apply_limit(query: str, limit: Optional[int]) -> str:
        """
        Append a LIMIT clause to a query when a truthy limit is given.

        Strips surrounding whitespace before removing the trailing semicolon so
        files with Windows line endings are handled correctly. Shared by both
        fetch helpers to keep their limit handling consistent.
        """
        if limit:
            return query.strip().rstrip(';') + f"\nLIMIT {limit};"
        return query

    @staticmethod
    def _warn_on_missing_columns(df: pd.DataFrame, required_cols: List[str]) -> None:
        """Log a warning (non-fatal) for any expected columns absent from df."""
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.warning(f"Missing expected columns: {missing_cols}")

    def run_read_query(self, query: str, description: str = "data") -> pd.DataFrame:
        """
        Execute a SQL query that fetches data.

        Args:
            query: SQL query string
            description: Description of what data is being fetched (for logging)

        Returns:
            Pandas DataFrame containing query results, with lower-cased column
            names (downstream code expects lowercase).

        Raises:
            Exception: Re-raises any error from query execution after logging.
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            dataframe.columns = dataframe.columns.str.lower()
            logger.info(f"Successfully read {len(dataframe)} rows for {description}")
            return dataframe
        except Exception as e:
            logger.error(f"Error reading {description}: {e}")
            raise

    def store_df_to_snowflake(
        self,
        table_name: str,
        dataframe: pd.DataFrame,
        database: str = "SOCIAL_MEDIA_DB",
        schema: str = "ML_FEATURES",
        overwrite: bool = False
    ) -> None:
        """
        Store a DataFrame to Snowflake.

        Column names are upper-cased to match Snowflake conventions, and the
        table is auto-created when it does not exist.

        Args:
            table_name: Target table name
            dataframe: DataFrame to store
            database: Target database
            schema: Target schema
            overwrite: If True, overwrite existing data; if False, append

        Raises:
            Exception: Re-raises any error from the write after logging.
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(
                df=dataframe,
                table_name=table_name.strip().upper(),
                auto_create_table=True,
                overwrite=overwrite,
                use_logical_type=True
            )
            logger.info(f"Successfully stored {len(dataframe)} rows to {table_name}")

        except Exception as e:
            logger.error(f"Error storing data to {table_name}: {e}")
            raise

    def execute_sql_file(self, file_path: str) -> Optional[List[Any]]:
        """
        Execute SQL queries from a file.

        NOTE(review): session.sql() executes a single statement; a file that
        contains multiple semicolon-separated statements will likely fail —
        confirm before pointing this at multi-statement scripts.

        Args:
            file_path: Path to SQL file

        Returns:
            Query result rows, or None on failure (errors are logged, not
            raised — deliberate best-effort behavior).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            logger.info(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            logger.error(f"Error executing SQL file {file_path}: {e}")
            return None

    def execute_query(self, query: str, description: str = "query") -> Optional[List[Any]]:
        """
        Execute a SQL query and return results.

        Args:
            query: SQL query string
            description: Description of the query for logging

        Returns:
            Query result rows, or None on failure (errors are logged, not
            raised — deliberate best-effort behavior).
        """
        try:
            result = self.session.sql(query).collect()
            logger.info(f"Successfully executed {description}")
            return result
        except Exception as e:
            logger.error(f"Error executing {description}: {e}")
            return None

    def fetch_forum_posts_with_context(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch forum posts with thread context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of posts to fetch

        Returns:
            DataFrame containing forum posts with context

        Raises:
            Exception: Re-raises file or query errors after logging.
        """
        try:
            with open(sql_file_path, 'r', encoding='utf-8') as f:
                query = self._apply_limit(f.read(), limit)

            df = self.run_read_query(query, "forum posts with context")

            # Non-fatal sanity check: downstream processing expects these columns.
            self._warn_on_missing_columns(df, ['post_id', 'post_content', 'thread_id'])
            return df

        except Exception as e:
            logger.error(f"Error fetching forum posts: {e}")
            raise

    def fetch_comments(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch social media comments with context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of comments to fetch

        Returns:
            DataFrame containing comments with context

        Raises:
            Exception: Re-raises file or query errors after logging.
        """
        try:
            with open(sql_file_path, 'r', encoding='utf-8') as f:
                query = self._apply_limit(f.read(), limit)

            df = self.run_read_query(query, "social media comments with context")

            # Non-fatal sanity check: downstream processing expects these columns.
            self._warn_on_missing_columns(
                df, ['comment_sk', 'comment_id', 'comment_text', 'platform']
            )
            return df

        except Exception as e:
            logger.error(f"Error fetching comments: {e}")
            raise

    def close_connection(self) -> None:
        """Close the Snowflake session, logging (not raising) any error."""
        try:
            self.session.close()
            logger.info("Snowflake connection closed")
        except Exception as e:
            logger.error(f"Error closing connection: {e}")
processing_brand_sentiment/database/sql/create_comments_output_table.sql ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Create the output table for Sabian brand sentiment analysis on social media comments
2
+ -- Stores processed comments with extracted brand intelligence
3
+ -- Schema Version 4.0: Same analysis fields as forum table, different source identifiers
+ -- NOTE: list-valued analysis fields below are serialized as JSON-array strings
+ -- inside VARCHAR columns (see inline comments); use PARSE_JSON when querying them.
4
+
5
+ CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
6
+ -- Source identifiers (comment-specific)
7
+ COMMENT_SK NUMBER(38,0),
8
+ COMMENT_ID VARCHAR(16777216),
9
+ ORIGINAL_TEXT VARCHAR(16777216),
10
+ PLATFORM VARCHAR(16777216),
11
+ COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
12
+ AUTHOR_NAME VARCHAR(16777216),
13
+ AUTHOR_ID VARCHAR(16777216),
14
+ CONTENT_SK NUMBER(38,0),
15
+ CONTENT_ID VARCHAR(16777216),
16
+ CONTENT_DESCRIPTION VARCHAR(16777216),
17
+ CHANNEL_SK NUMBER(38,0),
18
+ CHANNEL_NAME VARCHAR(16777216),
19
+ CHANNEL_DISPLAY_NAME VARCHAR(16777216),
20
+ PARENT_COMMENT_ID VARCHAR(16777216),
21
+ PARENT_COMMENT_TEXT VARCHAR(16777216),
22
+
23
+ -- Language detection
24
+ DETECTED_LANGUAGE VARCHAR(100),
25
+ LANGUAGE_CODE VARCHAR(10),
26
+ IS_ENGLISH BOOLEAN,
27
+
28
+ -- Relevance assessment
29
+ IS_RELEVANT BOOLEAN,
30
+ RELEVANCE_CONFIDENCE VARCHAR(20),
31
+ RELEVANCE_REASON VARCHAR(500),
32
+
33
+ -- Author classification
34
+ AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
35
+ SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context
36
+
37
+ -- Sentiment analysis
38
+ SENTIMENT_LEVEL VARCHAR(20),
39
+ EMOTION_TYPE VARCHAR(50),
40
+ SENTIMENT_TARGET VARCHAR(50),
41
+ SENTIMENT_CONFIDENCE VARCHAR(20),
42
+
43
+ -- Product information (stored as JSON arrays)
44
+ PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
45
+ PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
46
+ PURCHASE_STAGE VARCHAR(50),
47
+
48
+ -- Competitive intelligence
49
+ COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
50
+ COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
51
+ COMPARISON_TYPE VARCHAR(50),
52
+ COMPETITIVE_POSITIONING VARCHAR(500),
53
+ BRAND_SWITCHING VARCHAR(100),
54
+
55
+ -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
56
+ INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
57
+ DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
58
+ PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
59
+ DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
60
+
61
+ -- Analysis notes
62
+ ANALYSIS_NOTES VARCHAR(16777216),
63
+ SARCASM_DETECTED BOOLEAN,
64
+
65
+ -- Validation results
66
+ VALIDATION_PASSED BOOLEAN,
67
+ VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
68
+ VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
69
+ VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags
70
+
71
+ -- Processing metadata
72
+ PROCESSING_SUCCESS BOOLEAN,
73
+ PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
74
+ PROCESSING_ERRORS VARCHAR(16777216),
75
+ PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
76
+ WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
77
+ )
78
+ COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
79
+
-- NOTE: the CREATE INDEX statements formerly emitted here have been removed.
-- Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes exist only for hybrid/Unistore tables), so each of those
-- statements failed at execution time. Snowflake prunes micro-partitions
-- automatically using column metadata; if scans over this table become
-- expensive at volume, define a clustering key instead, for example:
--
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
--       CLUSTER BY (PLATFORM, PROCESSED_AT);
89
+
-- Convenience view: only comments that are both brand-relevant and
-- successfully processed.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;

-- Comments the validator flagged for human review.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_FLAGGED AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    PLATFORM,
    ORIGINAL_TEXT,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE PROCESSING_STATUS = 'completed_with_flags'
   OR VALIDATION_PASSED = FALSE
ORDER BY PROCESSED_AT DESC;

-- Sentiment distribution per platform / emotion / target.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION AS
SELECT
    PLATFORM,
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*)                            AS COMMENT_COUNT,
    COUNT_IF(SARCASM_DETECTED = TRUE)   AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY PLATFORM, SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY COMMENT_COUNT DESC;

-- Per-product mention counts, split by sentiment; explodes the
-- PRODUCTS_MENTIONED JSON array.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_PRODUCT_MENTIONS AS
SELECT
    PLATFORM,
    TRIM(item.VALUE::STRING)                                              AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*)                                                              AS MENTION_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive'))            AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative'))            AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS item
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY PLATFORM, TRIM(item.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;

-- Roll-up of processing/validation outcomes per platform.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_VALIDATION_SUMMARY AS
SELECT
    PLATFORM,
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*)                            AS COMMENT_COUNT,
    COUNT_IF(IS_RELEVANT = TRUE)        AS RELEVANT_COUNT,
    COUNT_IF(IS_RELEVANT = FALSE)       AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
GROUP BY PLATFORM, PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY COMMENT_COUNT DESC;
processing_brand_sentiment/database/sql/create_output_table.sql ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-- Output table for Sabian brand sentiment analysis of forum posts.
-- One row per processed post with the intelligence extracted by the 4-agent
-- pipeline. Schema v4.0 adds THREAD_CONTEXT_SUMMARY, validation fields and
-- a processing status.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216),        -- raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096),    -- v4.0: summarized thread context used for analysis

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Forum category
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection results
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Brand-relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50),             -- one of: current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50),  -- one of: primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (JSON-array strings)
    PRODUCTS_MENTIONED VARCHAR(16777216),   -- e.g. ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216),   -- e.g. ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence (JSON-array strings where noted)
    COMPETITORS_MENTIONED VARCHAR(16777216),      -- e.g. ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216),  -- competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer-journey signals, AUTHOR perspective only (null when the
    -- author is advising others rather than describing themselves)
    INTENTS VARCHAR(16777216),            -- multi-label JSON array, e.g. ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216),   -- e.g. ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216),        -- e.g. ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216),    -- e.g. ["sound_quality", "durability"]

    -- Free-form analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Output-validator results (v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216),    -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216),  -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216),     -- JSON array of anomaly flags (e.g. "sarcasm_detected", "low_confidence_relevant")

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50),          -- v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: 4-agent pipeline with extraction/analysis separation, thread context summarization, and validation.';
89
+
-- NOTE: the CREATE INDEX statements formerly emitted here have been removed.
-- Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes exist only for hybrid/Unistore tables), so each of those
-- statements failed at execution time. Snowflake prunes micro-partitions
-- automatically using column metadata; if scans over this table become
-- expensive at volume, define a clustering key instead, for example:
--
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
--       CLUSTER BY (PROCESSED_AT, IS_RELEVANT);
99
+
-- Convenience view: only posts that are both brand-relevant and
-- successfully processed.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;

-- Posts the validator flagged for human review.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_FLAGGED_POSTS AS
SELECT
    POST_ID,
    THREAD_ID,
    CLEANED_CONTENT,
    THREAD_CONTEXT_SUMMARY,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE PROCESSING_STATUS = 'completed_with_flags'
   OR VALIDATION_PASSED = FALSE
ORDER BY PROCESSED_AT DESC;

-- Sentiment distribution across emotion and target.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_SENTIMENT_DISTRIBUTION AS
SELECT
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*)                          AS POST_COUNT,
    COUNT_IF(SARCASM_DETECTED = TRUE) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY POST_COUNT DESC;

-- Per-product mention counts, split by sentiment; explodes the
-- PRODUCTS_MENTIONED JSON array.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PRODUCT_MENTIONS AS
SELECT
    TRIM(item.VALUE::STRING)                                   AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*)                                                   AS MENTION_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS item
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY TRIM(item.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;

-- Competitor mentions with the sentiment expressed toward Sabian in the
-- same posts; explodes the COMPETITORS_MENTIONED JSON array.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_ANALYSIS AS
SELECT
    TRIM(item.VALUE::STRING)                                   AS COMPETITOR,
    COMPARISON_TYPE,
    BRAND_SWITCHING,
    COUNT(*)                                                   AS MENTION_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_SENTIMENT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_SENTIMENT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITORS_MENTIONED)) AS item
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITORS_MENTIONED IS NOT NULL
GROUP BY TRIM(item.VALUE::STRING), COMPARISON_TYPE, BRAND_SWITCHING
ORDER BY MENTION_COUNT DESC;

-- Pain-point frequency; explodes the PAIN_POINTS JSON array.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PAIN_POINTS AS
SELECT
    TRIM(item.VALUE::STRING)                AS PAIN_POINT,
    COUNT(*)                                AS OCCURRENCE_COUNT,
    ARRAY_AGG(DISTINCT SENTIMENT_LEVEL)     AS SENTIMENT_LEVELS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PAIN_POINTS)) AS item
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PAIN_POINTS IS NOT NULL
GROUP BY TRIM(item.VALUE::STRING)
ORDER BY OCCURRENCE_COUNT DESC;

-- Sentiment breakdown by author role and mention context.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_AUTHOR_ROLES AS
SELECT
    AUTHOR_ROLE,
    SABIAN_MENTION_CONTEXT,
    COUNT(*)                                                   AS POST_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL = 'neutral')                      AS NEUTRAL_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY AUTHOR_ROLE, SABIAN_MENTION_CONTEXT
ORDER BY POST_COUNT DESC;

-- How authors who own competitor gear feel about Sabian; explodes the
-- COMPETITOR_PRODUCTS_OWNED JSON array.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_OWNERSHIP AS
SELECT
    TRIM(item.VALUE::STRING)                                   AS COMPETITOR_OWNED,
    AUTHOR_ROLE,
    COUNT(*)                                                   AS AUTHOR_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_TOWARD_SABIAN,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_TOWARD_SABIAN
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITOR_PRODUCTS_OWNED)) AS item
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITOR_PRODUCTS_OWNED IS NOT NULL
GROUP BY TRIM(item.VALUE::STRING), AUTHOR_ROLE
ORDER BY AUTHOR_COUNT DESC;

-- Mention depth vs. sentiment, including a numeric sentiment score
-- (very_positive=2 ... very_negative=-2, unknown=0).
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_MENTION_DEPTH AS
SELECT
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    COUNT(*) AS POST_COUNT,
    AVG(CASE SENTIMENT_LEVEL
            WHEN 'very_positive' THEN 2
            WHEN 'positive' THEN 1
            WHEN 'neutral' THEN 0
            WHEN 'negative' THEN -1
            WHEN 'very_negative' THEN -2
            ELSE 0
        END) AS AVG_SENTIMENT_SCORE
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SABIAN_MENTION_CONTEXT, SENTIMENT_LEVEL
ORDER BY SABIAN_MENTION_CONTEXT, POST_COUNT DESC;

-- Roll-up of processing/validation outcomes (v4.0).
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_VALIDATION_SUMMARY AS
SELECT
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*)                      AS POST_COUNT,
    COUNT_IF(IS_RELEVANT = TRUE)  AS RELEVANT_COUNT,
    COUNT_IF(IS_RELEVANT = FALSE) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
GROUP BY PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY POST_COUNT DESC;
processing_brand_sentiment/database/sql/fetch_comments.sql ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-- Fetch unprocessed social media comments (with surrounding context) for
-- brand sentiment analysis.
-- Source: SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS (same structure
-- as CORE.FACT_COMMENTS). Joins add parent-comment text, content metadata
-- and channel info. Official brand accounts, empty comments, and comments
-- already present in the output table are skipped.

SELECT
    -- Comment identifiers and payload
    c.COMMENT_SK,
    c.COMMENT_ID,
    c.PLATFORM,
    c.MESSAGE              AS COMMENT_TEXT,
    c.CREATED_TIME         AS COMMENT_TIMESTAMP,
    c.AUTHOR_NAME,
    c.AUTHOR_ID,
    c.LIKE_COUNT,
    c.PARENT_COMMENT_ID,
    c.REPLIES_COUNT,
    c.COMMENT_LENGTH,
    c.IS_ACTIVE            AS COMMENT_IS_ACTIVE,

    -- Text of the comment this one replies to, if any (self-join)
    parent.MESSAGE         AS PARENT_COMMENT_TEXT,

    -- Content the comment was posted under
    content.CONTENT_SK,
    content.CONTENT_ID,
    content.CONTENT_TYPE,
    content.MESSAGE        AS CONTENT_DESCRIPTION,
    content.TITLE          AS CONTENT_TITLE,
    content.PERMALINK_URL,
    content.CREATED_TIME   AS CONTENT_TIMESTAMP,

    -- Channel that owns the content
    chan.CHANNEL_SK,
    chan.CHANNEL_NAME,
    chan.CHANNEL_DISPLAY_NAME

FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS c

-- Self-join: resolve the parent comment's text when one exists
LEFT JOIN SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS parent
    ON c.PARENT_COMMENT_ID = parent.COMMENT_ID
    AND c.PLATFORM = parent.PLATFORM

JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT content
    ON c.CONTENT_SK = content.CONTENT_SK

JOIN SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL chan
    ON content.CHANNEL_NAME = chan.CHANNEL_NAME
    AND content.PLATFORM = chan.PLATFORM

-- Anti-join against the output table: skip comments already analysed
LEFT JOIN SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS done
    ON c.COMMENT_SK = done.COMMENT_SK

WHERE
    -- Active records only
    c.IS_ACTIVE = TRUE
    AND content.IS_ACTIVE = TRUE
    AND chan.IS_ACTIVE = TRUE

    -- Skip posts by official brand accounts
    AND (c.AUTHOR_NAME IS NULL OR c.AUTHOR_NAME NOT IN (
        'Musora', 'Drumeo', 'Pianote',
        '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial'
    ))

    -- Only comments not yet processed
    AND done.COMMENT_SK IS NULL

    -- Only comments with an actual message
    AND c.MESSAGE IS NOT NULL
    AND TRIM(c.MESSAGE) != ''
    AND LENGTH(TRIM(c.MESSAGE)) > 0

ORDER BY c.CREATED_TIME DESC;
processing_brand_sentiment/database/sql/fetch_forum_posts.sql ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-- Fetch unprocessed forum posts (with thread context) for brand sentiment
-- analysis. Includes the thread's first post as context and parent/child
-- relationships. Team/house-coach posts, deleted posts, empty posts and
-- posts already present in the output table are skipped.

WITH thread_first_posts AS (
    -- Earliest post of each thread, used as thread context.
    -- QUALIFY + ROW_NUMBER gives deterministic first-post selection.
    SELECT
        THREAD_ID,
        POST_CONTENT    AS FIRST_POST_CONTENT,
        POST_AUTHOR_ID  AS FIRST_POST_AUTHOR_ID,
        POST_CREATED_AT AS FIRST_POST_CREATED_AT
    FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS
    WHERE POST_CONTENT IS NOT NULL
      AND TRIM(POST_CONTENT) != ''
    QUALIFY ROW_NUMBER() OVER (PARTITION BY THREAD_ID ORDER BY POST_CREATED_AT ASC) = 1
)

SELECT
    -- Post identifiers
    p.POST_ID,
    p.POST_AUTHOR_ID,
    p.THREAD_ID,

    -- Post body (may contain HTML with a quoted parent)
    p.POST_CONTENT,

    -- Post timestamps and state
    p.POST_CREATED_AT,
    p.POST_EDITED_ON,
    p.POST_PUBLISHED_ON,
    p.POST_STATE,

    -- Parent/child relationships (conversation context)
    p.PROMPTING_POST_ID,
    p.PARENT_ID,
    p.PARENT_CONTENT,
    p.PARENT_AUTHOR_ID,
    p.PARENT_CREATED_AT,
    p.CHILD_ID,
    p.CHILD_CONTENT,

    -- Thread metadata
    p.THREAD_TITLE,
    p.THREAD_SLUG,
    p.THREAD_STATE,
    p.THREAD_LOCKED,
    p.THREAD_PINNED,
    p.THREAD_POST_COUNT,
    p.THREAD_PUBLISHED_ON,

    -- First post of the thread (context)
    first_post.FIRST_POST_CONTENT    AS THREAD_FIRST_POST,
    first_post.FIRST_POST_CREATED_AT AS THREAD_STARTED_AT,

    -- Category metadata
    p.CATEGORY_ID,
    p.CATEGORY_BRAND,
    p.CATEGORY_DESCRIPTION,
    p.CATEGORY_TITLE,
    p.CATEGORY_TOPIC,
    p.CATEGORY_SLUG,

    -- Access levels (used for filtering below)
    p.POST_AUTHOR_ACCESS_LEVEL,
    p.PARENT_AUTHOR_ACCESS_LEVEL,
    p.CHILD_AUTHOR_ACCESS_LEVEL

FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS p

-- Attach the thread's opening post for context
LEFT JOIN thread_first_posts first_post
    ON p.THREAD_ID = first_post.THREAD_ID

-- Anti-join against the output table: skip posts already analysed
LEFT JOIN SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS done
    ON p.POST_ID = done.POST_ID

WHERE
    -- Skip internal posts by team and house-coach accounts
    (p.POST_AUTHOR_ACCESS_LEVEL IS NULL OR p.POST_AUTHOR_ACCESS_LEVEL NOT IN ('team', 'house-coach'))

    -- Skip deleted posts
    AND (p.POST_STATE IS NULL OR p.POST_STATE != 'deleted')
    AND p.POST_DELETED_AT IS NULL

    -- Only posts not yet processed
    AND done.POST_ID IS NULL

    -- Only posts with actual content
    AND p.POST_CONTENT IS NOT NULL
    AND TRIM(p.POST_CONTENT) != ''
    AND LENGTH(TRIM(p.POST_CONTENT)) > 0

ORDER BY p.POST_CREATED_AT DESC;
processing_brand_sentiment/database/sql/init_comments_output_table.sql ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-- Bootstrap script: create the (empty) comments output table before the
-- first pipeline run, so the fetch query's anti-join against it does not
-- fail with "table not found".
-- Definition matches create_comments_output_table.sql exactly.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Comment-specific source identifiers
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection results
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Brand-relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50),             -- one of: current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50),  -- one of: primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (JSON-array strings)
    PRODUCTS_MENTIONED VARCHAR(16777216),   -- e.g. ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216),   -- e.g. ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence (JSON-array strings where noted)
    COMPETITORS_MENTIONED VARCHAR(16777216),      -- e.g. ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216),  -- competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer-journey signals, AUTHOR perspective only (null when the
    -- author is advising others rather than describing themselves)
    INTENTS VARCHAR(16777216),            -- multi-label JSON array, e.g. ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216),   -- e.g. ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216),        -- e.g. ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216),    -- e.g. ["sound_quality", "durability"]

    -- Free-form analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Output-validator results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216),    -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216),  -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216),     -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50),          -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
processing_brand_sentiment/database/sql/init_output_table.sql ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-- Bootstrap script: create the (empty) forum output table before the first
-- pipeline run, so the fetch query's anti-join against it does not fail
-- with "table not found".
-- Schema Version 4.0: THREAD_CONTEXT_SUMMARY, validation fields, processing status.
--
-- FIX: the column ORDER here previously diverged from
-- create_output_table.sql (PURCHASE_STAGE appeared in the customer-journey
-- section instead of the product section). Both scripts use
-- CREATE TABLE IF NOT EXISTS on the SAME table name, so whichever ran first
-- silently determined the physical column order — breaking positional
-- INSERTs and SELECT * consumers depending on run order. The definition now
-- matches create_output_table.sql exactly.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216),        -- raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096),    -- v4.0: summarized thread context

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Forum category
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection results
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Brand-relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50),             -- one of: current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50),  -- one of: primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (JSON-array strings)
    PRODUCTS_MENTIONED VARCHAR(16777216),
    PRODUCT_ATTRIBUTES VARCHAR(16777216),
    PURCHASE_STAGE VARCHAR(50),          -- AUTHOR's own stage only (moved here to match create_output_table.sql)

    -- Competitive intelligence (JSON-array strings where noted)
    COMPETITORS_MENTIONED VARCHAR(16777216),
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216),  -- competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer-journey signals, AUTHOR perspective only (null if giving
    -- advice to others)
    INTENTS VARCHAR(16777216),            -- multi-label: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
    DECISION_DRIVERS VARCHAR(16777216),   -- AUTHOR's own decision drivers only
    PAIN_POINTS VARCHAR(16777216),        -- AUTHOR's negative feedback aspects (feedback_aspects categories)
    DELIGHT_FACTORS VARCHAR(16777216),    -- AUTHOR's positive feedback aspects (feedback_aspects categories)

    -- Free-form analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Output-validator results (v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216),    -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216),  -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216),     -- JSON array of anomaly flags

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50),          -- v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: Added thread_context_summary, validation fields, and processing status.';
processing_brand_sentiment/main.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main execution script for brand sentiment analysis workflow.
3
+ Orchestrates data fetching, processing, and storage using an agentic workflow.
4
+ Supports parallel processing with multiprocessing for improved performance.
5
+ Supports multiple data sources: forums, social media comments, or both.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import logging
11
+ import argparse
12
+ from datetime import datetime
13
+ import pandas as pd
14
+ from dotenv import load_dotenv
15
+ from multiprocessing import Pool, cpu_count
16
+ import traceback
17
+ from typing import Dict, Any, List
18
+
19
+ from database.snowflake_connection import SnowFlakeConn
20
+ from workflow.orchestrator import BrandAnalysisWorkflow
21
+ from workflow.comment_orchestrator import CommentAnalysisWorkflow
22
+
23
+ # Get the directory where this script is located
24
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
25
+
26
+ # Load environment variables
27
+ load_dotenv(os.path.join(SCRIPT_DIR, '.env'))
28
+
29
+ # Ensure logs directory exists
30
+ LOGS_DIR = os.path.join(SCRIPT_DIR, 'logs')
31
+ os.makedirs(LOGS_DIR, exist_ok=True)
32
+
33
+ # Configure logging
34
+ logging.basicConfig(
35
+ level=logging.INFO,
36
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
37
+ handlers=[
38
+ logging.FileHandler(
39
+ os.path.join(LOGS_DIR, f'brand_sentiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
40
+ ),
41
+ logging.StreamHandler()
42
+ ]
43
+ )
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ # ============================================================
48
+ # Configuration Loading
49
+ # ============================================================
50
+
51
def load_configs(config_dir: str = None) -> Dict[str, Dict]:
    """
    Load all configuration files (workflow, brand, analysis categories).

    Args:
        config_dir: Directory containing config files. Defaults to the
            ``config_files`` directory next to this script.

    Returns:
        Dictionary with keys 'workflow', 'brand', 'categories', each
        mapping to the parsed JSON content of its config file.

    Raises:
        FileNotFoundError: If any expected config file is missing.
        json.JSONDecodeError: If a config file is not valid JSON.
    """
    if config_dir is None:
        config_dir = os.path.join(SCRIPT_DIR, 'config_files')

    # Key -> filename map keeps loading uniform and easy to extend.
    config_files = {
        'workflow': 'workflow_config.json',
        'brand': 'brand_config.json',
        'categories': 'analysis_categories.json',
    }

    configs = {}
    for key, filename in config_files.items():
        # Explicit encoding avoids platform-dependent default-codec surprises.
        with open(os.path.join(config_dir, filename), 'r', encoding='utf-8') as f:
            configs[key] = json.load(f)

    return configs
79
+
80
+
81
+ # ============================================================
82
+ # Batch Processing Utilities
83
+ # ============================================================
84
+
85
def calculate_optimal_batch_size(
    total_posts: int,
    num_workers: int,
    min_batch: int = 20,
    max_batch: int = 500
) -> int:
    """
    Determine a batch size for splitting work across parallel workers.

    The ideal size is an even split of the workload per worker, clamped
    to the [min_batch, max_batch] range. Workloads no larger than
    min_batch are processed as a single batch.

    Args:
        total_posts: Total number of posts to process
        num_workers: Number of parallel workers
        min_batch: Minimum batch size
        max_batch: Maximum batch size

    Returns:
        Optimal batch size
    """
    # Tiny workloads: one batch covering everything.
    if total_posts <= min_batch:
        return total_posts

    per_worker = total_posts // num_workers
    return max(min_batch, min(max_batch, per_worker))
110
+
111
+
112
def safe_to_json(value: Any) -> Any:
    """
    Safely convert a value to a JSON string for storage.

    Lists become JSON array strings; nulls (None/NaN) and empty
    values normalize to None; everything else passes through.

    Args:
        value: Value to convert

    Returns:
        JSON string if list, None if null/empty, original value otherwise
    """
    # Explicit nulls
    if value is None:
        return None
    # float NaN from pandas counts as null too
    if isinstance(value, float) and pd.isna(value):
        return None
    # Lists serialize to JSON; an empty list normalizes to None
    if isinstance(value, list):
        return json.dumps(value) if value else None
    # Strings pass through, with '' normalized to None
    if isinstance(value, str):
        return value or None
    # Anything else (bool, int, ...) is left untouched
    return value
134
+
135
+
136
def safe_json_list_length(value: Any) -> int:
    """
    Safely measure the length of a JSON array stored as a string.

    Non-strings (including None and NaN), empty/placeholder strings,
    invalid JSON, and non-array JSON all count as length 0.

    Args:
        value: Value to parse (expected JSON string of array)

    Returns:
        Length of the array, or 0 if invalid/empty
    """
    # None, NaN, and every other non-string value have no length.
    if not isinstance(value, str):
        return 0
    # Fast path for empty / placeholder payloads.
    if value in ('', '[]', 'null'):
        return 0
    try:
        parsed = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return 0
    return len(parsed) if isinstance(parsed, list) else 0
162
+
163
+
164
def calculate_batch_stats(df: pd.DataFrame) -> Dict[str, int]:
    """
    Calculate summary statistics from a batch of processed results.

    Every counter defaults to 0; columns that are absent or all-null
    simply leave their counters untouched, so partial results are safe.

    Args:
        df: DataFrame with processed results

    Returns:
        Dictionary with statistics
    """
    stat_keys = (
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        # Author role stats
        'current_owner_count', 'potential_buyer_count', 'primary_focus_count',
    )
    stats = {key: 0 for key in stat_keys}

    # Nothing to count in an empty frame.
    if df.empty:
        return stats

    # Relevance: count True/False among non-null flags only.
    if 'IS_RELEVANT' in df.columns:
        flags = df['IS_RELEVANT'].dropna()
        if not flags.empty:
            as_bool = flags.astype(bool)
            stats['relevant_count'] = int(as_bool.sum())
            stats['not_relevant_count'] = int((~as_bool).sum())

    # JSON-array columns: total number of items across all rows.
    for col, key in (('PRODUCTS_MENTIONED', 'products_mentioned_count'),
                     ('COMPETITORS_MENTIONED', 'competitors_mentioned_count')):
        if col in df.columns:
            stats[key] = int(df[col].apply(safe_json_list_length).sum())

    # Sentiment distribution (positive vs negative buckets).
    if 'SENTIMENT_LEVEL' in df.columns:
        sentiments = df['SENTIMENT_LEVEL'].dropna()
        if not sentiments.empty:
            stats['positive_sentiment_count'] = int(
                sentiments.isin(('positive', 'very_positive')).sum()
            )
            stats['negative_sentiment_count'] = int(
                sentiments.isin(('negative', 'very_negative')).sum()
            )

    # Author role breakdown.
    if 'AUTHOR_ROLE' in df.columns:
        roles = df['AUTHOR_ROLE'].dropna()
        if not roles.empty:
            stats['current_owner_count'] = int(roles.eq('current_owner').sum())
            stats['potential_buyer_count'] = int(roles.eq('potential_buyer').sum())

    # How often the brand was the post's primary focus.
    if 'SABIAN_MENTION_CONTEXT' in df.columns:
        contexts = df['SABIAN_MENTION_CONTEXT'].dropna()
        if not contexts.empty:
            stats['primary_focus_count'] = int(contexts.eq('primary_focus').sum())

    return stats
237
+
238
+
239
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate statistics from multiple batch results into a single summary.

    All numeric counters are summed across batches; 'failed_batches'
    counts batches whose 'success' flag is falsy, and each failure is
    logged with its batch number and error message.

    Args:
        results: List of batch result dictionaries

    Returns:
        Aggregated statistics dictionary
    """
    count_keys = (
        'total_processed', 'total_stored', 'failed_count',
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        'current_owner_count', 'potential_buyer_count', 'primary_focus_count',
    )
    aggregated = {key: sum(r.get(key, 0) for r in results) for key in count_keys}

    failed = [r for r in results if not r.get('success', False)]
    aggregated['failed_batches'] = len(failed)

    # Surface failures so they are visible in the run log.
    if failed:
        logger.error(f"{len(failed)} batch(es) failed:")
        for item in failed:
            logger.error(f"  Batch {item.get('batch_num')}: {item.get('error')}")

    return aggregated
273
+
274
+
275
+ # ============================================================
276
+ # Forum Processing (existing functionality)
277
+ # ============================================================
278
+
279
# Columns that should be converted from lists to JSON strings
# before storage (the output table stores them as VARCHAR columns
# holding JSON arrays). Applied by prepare_forum_output_dataframe
# via safe_to_json.
FORUM_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]
286
+
287
# Column mapping from forum workflow state (lowercase keys) to output
# table columns (uppercase). Order and names must stay in sync with the
# SABIAN_BRAND_ANALYSIS table schema.
FORUM_COLUMN_MAPPING = {
    # Post / thread identifiers and content
    'post_id': 'POST_ID',
    'thread_id': 'THREAD_ID',
    'post_author_id': 'POST_AUTHOR_ID',
    'original_content': 'ORIGINAL_CONTENT',
    'cleaned_content': 'CLEANED_CONTENT',
    'quoted_content': 'QUOTED_CONTENT',
    'raw_thread_context': 'THREAD_CONTEXT',
    'thread_context_summary': 'THREAD_CONTEXT_SUMMARY',
    'thread_title': 'THREAD_TITLE',
    'thread_first_post': 'THREAD_FIRST_POST',
    'post_created_at': 'POST_CREATED_AT',
    'thread_started_at': 'THREAD_STARTED_AT',
    'category_title': 'CATEGORY_TITLE',
    'category_topic': 'CATEGORY_TOPIC',
    # Language detection
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    # Relevance
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    # Author context and sentiment
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    # Products and competitive intelligence
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    # Customer journey
    'intents': 'INTENTS',
    'purchase_stage': 'PURCHASE_STAGE',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    # Analysis notes and validation
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Processing metadata
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
337
+
338
+
339
def prepare_forum_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map forum processing results onto the output table's column layout.

    Missing source columns become all-None target columns; JSON-array
    fields are serialized to strings via safe_to_json.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    for source_col, target_col in FORUM_COLUMN_MAPPING.items():
        # Columns absent from the results are stored as nulls.
        if source_col not in df.columns:
            prepared[target_col] = None
            continue
        series = df[source_col].copy()
        if source_col in FORUM_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        prepared[target_col] = series

    # Attach storage metadata.
    prepared['PLATFORM'] = 'musora_forums'
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
366
+
367
+
368
def process_forum_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of forum posts.
    Runs in a separate process, so it creates its own Snowflake
    connection and workflow instance rather than sharing state.

    Args:
        batch_data: Tuple containing (batch_num, posts, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. Always contains 'batch_num',
        'success', 'total_processed', 'total_stored', 'failed_count',
        and 'error'; per-category counters are included on success.
    """
    batch_num, posts, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"ForumWorker-{batch_num}")

    # Created inside try; closed in finally so the connection is not
    # leaked when an exception fires mid-batch (previously the close
    # happened only on the success path).
    snowflake = None

    try:
        worker_logger.info(f"Forum Batch {batch_num}: Starting processing of {len(posts)} posts")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = BrandAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process posts
        results = workflow.process_batch(posts)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results
        initial_count = len(results_df)
        df_successful = results_df[results_df['success'] == True].copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Forum Batch {batch_num}: Processed {initial_count} posts, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_forum_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the first batch may truncate the table; all others append.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Forum Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Forum Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Forum Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(posts),
            'total_stored': 0,
            'failed_count': len(posts),
            'error': error_msg
        }

    finally:
        # Always release the connection, even on failure.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(
                    f"Forum Batch {batch_num}: Failed to close Snowflake connection cleanly"
                )
458
+
459
+
460
+ # ============================================================
461
+ # Comment Processing (new functionality)
462
+ # ============================================================
463
+
464
# Columns that should be converted from lists to JSON strings (same analysis fields)
# Applied by prepare_comment_output_dataframe via safe_to_json; the list
# intentionally mirrors FORUM_JSON_ARRAY_COLUMNS so both outputs share a schema.
COMMENT_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]
471
+
472
# Column mapping from comment workflow state (lowercase keys) to output
# table columns (uppercase). Must stay in sync with the
# SABIAN_BRAND_ANALYSIS_COMMENTS table schema; the analysis section
# mirrors FORUM_COLUMN_MAPPING so both outputs can be queried uniformly.
COMMENT_COLUMN_MAPPING = {
    # Comment-specific identifiers
    'comment_sk': 'COMMENT_SK',
    'comment_id': 'COMMENT_ID',
    'original_text': 'ORIGINAL_TEXT',
    'platform': 'PLATFORM',
    'comment_timestamp': 'COMMENT_TIMESTAMP',
    'author_name': 'AUTHOR_NAME',
    'author_id': 'AUTHOR_ID',
    'content_sk': 'CONTENT_SK',
    'content_id': 'CONTENT_ID',
    'content_description': 'CONTENT_DESCRIPTION',
    'channel_sk': 'CHANNEL_SK',
    'channel_name': 'CHANNEL_NAME',
    'channel_display_name': 'CHANNEL_DISPLAY_NAME',
    'parent_comment_id': 'PARENT_COMMENT_ID',
    'parent_comment_text': 'PARENT_COMMENT_TEXT',
    # Analysis fields (same as forums)
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'purchase_stage': 'PURCHASE_STAGE',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
525
+
526
+
527
def prepare_comment_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map comment processing results onto the output table's column layout.

    Missing source columns become all-None target columns; JSON-array
    fields are serialized to strings via safe_to_json. Unlike the forum
    variant, PLATFORM comes from the source data itself.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    for source_col, target_col in COMMENT_COLUMN_MAPPING.items():
        # Columns absent from the results are stored as nulls.
        if source_col not in df.columns:
            prepared[target_col] = None
            continue
        series = df[source_col].copy()
        if source_col in COMMENT_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        prepared[target_col] = series

    # Attach storage metadata.
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
553
+
554
+
555
def process_comment_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of social media comments.
    Runs in a separate process, so it creates its own Snowflake
    connection and workflow instance rather than sharing state.

    Args:
        batch_data: Tuple containing (batch_num, comments, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. Always contains 'batch_num',
        'success', 'total_processed', 'total_stored', 'failed_count',
        and 'error'; per-category counters are included on success.
    """
    batch_num, comments, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"CommentWorker-{batch_num}")

    # Created inside try; closed in finally so the connection is not
    # leaked when an exception fires mid-batch (previously the close
    # happened only on the success path).
    snowflake = None

    try:
        worker_logger.info(f"Comment Batch {batch_num}: Starting processing of {len(comments)} comments")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize comment workflow for this worker
        workflow = CommentAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process comments
        results = workflow.process_batch(comments)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results
        initial_count = len(results_df)
        df_successful = results_df[results_df['success'] == True].copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Comment Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_comment_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the first batch may truncate the table; all others append.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Comment Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Comment Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Comment Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(comments),
            'total_stored': 0,
            'failed_count': len(comments),
            'error': error_msg
        }

    finally:
        # Always release the connection, even on failure.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(
                    f"Comment Batch {batch_num}: Failed to close Snowflake connection cleanly"
                )
645
+
646
+
647
+ # ============================================================
648
+ # Main Processor Class
649
+ # ============================================================
650
+
651
+ class BrandSentimentProcessor:
652
+ """
653
+ Main processor class that orchestrates the entire workflow.
654
+ Supports processing forums, social media comments, or both.
655
+ """
656
+
657
+ def __init__(self, config_dir: str = None):
658
+ """
659
+ Initialize the processor.
660
+
661
+ Args:
662
+ config_dir: Directory containing configuration files
663
+ """
664
+ # Load configurations
665
+ self.configs = load_configs(config_dir)
666
+
667
+ # Initialize Snowflake connection
668
+ self.snowflake = SnowFlakeConn()
669
+
670
+ # Get OpenAI API key
671
+ self.api_key = os.getenv("OPENAI_API_KEY")
672
+ if not self.api_key:
673
+ raise ValueError("OPENAI_API_KEY not found in environment variables")
674
+
675
+ # Get output configurations
676
+ self.forum_output_config = self.configs['workflow'].get('output', {
677
+ 'table_name': 'SABIAN_BRAND_ANALYSIS',
678
+ 'database': 'SOCIAL_MEDIA_DB',
679
+ 'schema': 'ML_FEATURES'
680
+ })
681
+
682
+ self.comment_output_config = self.configs['workflow'].get('comments_output', {
683
+ 'table_name': 'SABIAN_BRAND_ANALYSIS_COMMENTS',
684
+ 'database': 'SOCIAL_MEDIA_DB',
685
+ 'schema': 'ML_FEATURES'
686
+ })
687
+
688
+ logger.info("BrandSentimentProcessor initialized successfully")
689
+
690
+ def fetch_forum_posts(self, limit: int = None) -> pd.DataFrame:
691
+ """
692
+ Fetch forum posts from Snowflake.
693
+
694
+ Args:
695
+ limit: Optional limit on number of posts
696
+
697
+ Returns:
698
+ DataFrame containing post data
699
+ """
700
+ logger.info("Fetching forum posts...")
701
+
702
+ sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_forum_posts.sql')
703
+ df = self.snowflake.fetch_forum_posts_with_context(sql_path, limit)
704
+
705
+ logger.info(f"Fetched {len(df)} forum posts")
706
+ return df
707
+
708
+ def fetch_comments(self, limit: int = None) -> pd.DataFrame:
709
+ """
710
+ Fetch social media comments from Snowflake.
711
+
712
+ Args:
713
+ limit: Optional limit on number of comments
714
+
715
+ Returns:
716
+ DataFrame containing comment data
717
+ """
718
+ logger.info("Fetching social media comments...")
719
+
720
+ sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_comments.sql')
721
+ df = self.snowflake.fetch_comments(sql_path, limit)
722
+
723
+ logger.info(f"Fetched {len(df)} social media comments")
724
+ return df
725
+
726
+ def calculate_num_workers(self) -> int:
727
+ """
728
+ Calculate number of parallel workers.
729
+
730
+ Returns:
731
+ Number of workers
732
+ """
733
+ parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
734
+ max_workers = parallel_config.get('max_workers', 5)
735
+
736
+ num_cpus = cpu_count()
737
+ num_workers = max(1, min(max_workers, num_cpus - 2))
738
+
739
+ logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
740
+ return num_workers
741
+
742
+ # ---- Forum Processing ----
743
+
744
+ def process_forums_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
745
+ """
746
+ Process forum posts using parallel workers.
747
+
748
+ Args:
749
+ df: DataFrame containing posts
750
+ overwrite: Whether to overwrite existing table
751
+
752
+ Returns:
753
+ Dictionary with aggregated statistics
754
+ """
755
+ posts = df.to_dict('records')
756
+ total_posts = len(posts)
757
+
758
+ logger.info(f"Processing {total_posts} forum posts using parallel processing...")
759
+
760
+ num_workers = self.calculate_num_workers()
761
+
762
+ parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
763
+ min_batch = parallel_config.get('min_batch_size', 20)
764
+ max_batch = parallel_config.get('max_batch_size', 400)
765
+
766
+ batch_size = calculate_optimal_batch_size(total_posts, num_workers, min_batch, max_batch)
767
+ logger.info(f"Forum batch size: {batch_size}")
768
+
769
+ # Create batches
770
+ batches = []
771
+ for i in range(0, total_posts, batch_size):
772
+ batch = posts[i:i + batch_size]
773
+ batch_num = (i // batch_size) + 1
774
+ batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.forum_output_config))
775
+
776
+ total_batches = len(batches)
777
+ logger.info(f"Split into {total_batches} forum batches")
778
+
779
+ # Process in parallel
780
+ with Pool(processes=num_workers) as pool:
781
+ results = pool.map(process_forum_batch_worker, batches)
782
+
783
+ return aggregate_results(results)
784
+
785
+ def process_forums_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
786
+ """
787
+ Process forum posts sequentially (for debugging).
788
+
789
+ Args:
790
+ df: DataFrame containing posts
791
+ overwrite: Whether to overwrite existing table
792
+
793
+ Returns:
794
+ Dictionary with statistics
795
+ """
796
+ logger.info(f"Processing {len(df)} forum posts using sequential processing...")
797
+
798
+ posts = df.to_dict('records')
799
+ batch_data = (1, posts, self.configs, self.api_key, overwrite, self.forum_output_config)
800
+ result = process_forum_batch_worker(batch_data)
801
+
802
+ return {
803
+ 'total_processed': result.get('total_processed', 0),
804
+ 'total_stored': result.get('total_stored', 0),
805
+ 'failed_count': result.get('failed_count', 0),
806
+ 'relevant_count': result.get('relevant_count', 0),
807
+ 'not_relevant_count': result.get('not_relevant_count', 0),
808
+ 'products_mentioned_count': result.get('products_mentioned_count', 0),
809
+ 'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
810
+ 'positive_sentiment_count': result.get('positive_sentiment_count', 0),
811
+ 'negative_sentiment_count': result.get('negative_sentiment_count', 0),
812
+ 'current_owner_count': result.get('current_owner_count', 0),
813
+ 'potential_buyer_count': result.get('potential_buyer_count', 0),
814
+ 'primary_focus_count': result.get('primary_focus_count', 0),
815
+ 'failed_batches': 0 if result.get('success', False) else 1
816
+ }
817
+
818
+ # ---- Comment Processing ----
819
+
820
+ def process_comments_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
821
+ """
822
+ Process social media comments using parallel workers.
823
+
824
+ Args:
825
+ df: DataFrame containing comments
826
+ overwrite: Whether to overwrite existing table
827
+
828
+ Returns:
829
+ Dictionary with aggregated statistics
830
+ """
831
+ comments = df.to_dict('records')
832
+ total_comments = len(comments)
833
+
834
+ logger.info(f"Processing {total_comments} comments using parallel processing...")
835
+
836
+ num_workers = self.calculate_num_workers()
837
+
838
+ parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
839
+ min_batch = parallel_config.get('min_batch_size', 20)
840
+ max_batch = parallel_config.get('max_batch_size', 400)
841
+
842
+ batch_size = calculate_optimal_batch_size(total_comments, num_workers, min_batch, max_batch)
843
+ logger.info(f"Comment batch size: {batch_size}")
844
+
845
+ # Create batches
846
+ batches = []
847
+ for i in range(0, total_comments, batch_size):
848
+ batch = comments[i:i + batch_size]
849
+ batch_num = (i // batch_size) + 1
850
+ batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.comment_output_config))
851
+
852
+ total_batches = len(batches)
853
+ logger.info(f"Split into {total_batches} comment batches")
854
+
855
+ # Process in parallel
856
+ with Pool(processes=num_workers) as pool:
857
+ results = pool.map(process_comment_batch_worker, batches)
858
+
859
+ return aggregate_results(results)
860
+
861
+ def process_comments_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
862
+ """
863
+ Process social media comments sequentially (for debugging).
864
+
865
+ Args:
866
+ df: DataFrame containing comments
867
+ overwrite: Whether to overwrite existing table
868
+
869
+ Returns:
870
+ Dictionary with statistics
871
+ """
872
+ logger.info(f"Processing {len(df)} comments using sequential processing...")
873
+
874
+ comments = df.to_dict('records')
875
+ batch_data = (1, comments, self.configs, self.api_key, overwrite, self.comment_output_config)
876
+ result = process_comment_batch_worker(batch_data)
877
+
878
+ return {
879
+ 'total_processed': result.get('total_processed', 0),
880
+ 'total_stored': result.get('total_stored', 0),
881
+ 'failed_count': result.get('failed_count', 0),
882
+ 'relevant_count': result.get('relevant_count', 0),
883
+ 'not_relevant_count': result.get('not_relevant_count', 0),
884
+ 'products_mentioned_count': result.get('products_mentioned_count', 0),
885
+ 'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
886
+ 'positive_sentiment_count': result.get('positive_sentiment_count', 0),
887
+ 'negative_sentiment_count': result.get('negative_sentiment_count', 0),
888
+ 'current_owner_count': result.get('current_owner_count', 0),
889
+ 'potential_buyer_count': result.get('potential_buyer_count', 0),
890
+ 'primary_focus_count': result.get('primary_focus_count', 0),
891
+ 'failed_batches': 0 if result.get('success', False) else 1
892
+ }
893
+
894
+ # ---- Unified Processing ----
895
+
896
+ def _log_source_summary(self, source_name: str, stats: Dict[str, Any], processing_time: float) -> None:
897
+ """
898
+ Log processing summary for a data source.
899
+
900
+ Args:
901
+ source_name: Name of the data source
902
+ stats: Processing statistics
903
+ processing_time: Time taken in seconds
904
+ """
905
+ logger.info(f" --- {source_name} ---")
906
+ logger.info(f" Total processed: {stats.get('total_processed', 0)}")
907
+ logger.info(f" Successfully stored: {stats.get('total_stored', 0)}")
908
+ logger.info(f" Failed: {stats.get('failed_count', 0)}")
909
+ logger.info(f" Relevant: {stats.get('relevant_count', 0)}")
910
+ logger.info(f" Not relevant: {stats.get('not_relevant_count', 0)}")
911
+ logger.info(f" Product mentions: {stats.get('products_mentioned_count', 0)}")
912
+ logger.info(f" Competitor mentions: {stats.get('competitors_mentioned_count', 0)}")
913
+ logger.info(f" Positive sentiment: {stats.get('positive_sentiment_count', 0)}")
914
+ logger.info(f" Negative sentiment: {stats.get('negative_sentiment_count', 0)}")
915
+ logger.info(f" Current owners: {stats.get('current_owner_count', 0)}")
916
+ logger.info(f" Potential buyers: {stats.get('potential_buyer_count', 0)}")
917
+ logger.info(f" Primary focus: {stats.get('primary_focus_count', 0)}")
918
+ if stats.get('failed_batches', 0) > 0:
919
+ logger.info(f" Failed batches: {stats['failed_batches']}")
920
+ logger.info(f" Processing time: {processing_time:.2f} seconds")
921
+ if stats.get('total_processed', 0) > 0:
922
+ logger.info(f" Average per item: {processing_time / stats['total_processed']:.2f} seconds")
923
+
924
    def run(
        self,
        limit: int = None,
        overwrite: bool = False,
        sequential: bool = False,
        data_source: str = 'all'
    ):
        """
        Run the complete processing pipeline.

        Fetches unprocessed items for each enabled source, dispatches them to
        the sequential or parallel processors, then logs a combined summary.
        The Snowflake connection is always closed on exit, even on failure.

        Args:
            limit: Optional limit on items to process per source
            overwrite: Whether to overwrite existing table
            sequential: Use sequential processing instead of parallel
            data_source: Which data source to process ('forums', 'comments', 'all')

        Raises:
            Exception: any error from fetching or processing is logged with a
                traceback and re-raised to the caller.
        """
        try:
            logger.info("=" * 80)
            logger.info("Starting Brand Sentiment Analysis Workflow")
            logger.info(f"Brand: {self.configs['brand'].get('brand', {}).get('name', 'Unknown')}")
            logger.info(f"Mode: {'SEQUENTIAL' if sequential else 'PARALLEL'}")
            logger.info(f"Data source: {data_source}")
            logger.info("=" * 80)

            # 'all' enables both sources; otherwise only the named one runs.
            process_forums = data_source in ('forums', 'all')
            process_comments = data_source in ('comments', 'all')

            # Track results for summary
            forum_stats = None
            forum_time = 0.0
            comment_stats = None
            comment_time = 0.0

            # ---- Process Forums ----
            if process_forums:
                logger.info("-" * 40)
                logger.info("Processing FORUMS")
                logger.info("-" * 40)

                df_posts = self.fetch_forum_posts(limit)

                if df_posts.empty:
                    logger.warning("No forum posts to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        forum_stats = self.process_forums_sequential(df_posts, overwrite)
                    else:
                        forum_stats = self.process_forums_parallel(df_posts, overwrite)

                    forum_time = (datetime.now() - start_time).total_seconds()

            # ---- Process Comments ----
            if process_comments:
                logger.info("-" * 40)
                logger.info("Processing SOCIAL MEDIA COMMENTS")
                logger.info("-" * 40)

                df_comments = self.fetch_comments(limit)

                if df_comments.empty:
                    logger.warning("No social media comments to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        comment_stats = self.process_comments_sequential(df_comments, overwrite)
                    else:
                        comment_stats = self.process_comments_parallel(df_comments, overwrite)

                    comment_time = (datetime.now() - start_time).total_seconds()

            # ---- Summary ----
            logger.info("=" * 80)
            logger.info("Processing Summary:")
            logger.info(f" Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f" Data source: {data_source}")

            # A source that was skipped (or had no rows) stays None and is
            # omitted from the summary entirely.
            if forum_stats is not None:
                self._log_source_summary("Forums", forum_stats, forum_time)

            if comment_stats is not None:
                self._log_source_summary("Social Media Comments", comment_stats, comment_time)

            logger.info("=" * 80)

        except Exception as e:
            logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
            raise

        finally:
            # Always release the Snowflake connection, even when processing failed.
            self.snowflake.close_connection()
            logger.info("Snowflake connection closed")
1018
+
1019
+
1020
+ # ============================================================
1021
+ # Legacy compatibility - keep old function names working
1022
+ # ============================================================
1023
+
1024
def prepare_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Legacy wrapper for forum output preparation.

    Kept so older callers importing `prepare_output_dataframe` keep working;
    simply delegates to `prepare_forum_output_dataframe`.
    """
    return prepare_forum_output_dataframe(df)
1027
+
1028
+
1029
def process_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """Legacy wrapper for forum batch worker.

    Kept for backward compatibility with callers (and pickled multiprocessing
    targets) that reference the old name; delegates to
    `process_forum_batch_worker`.
    """
    return process_forum_batch_worker(batch_data)
1032
+
1033
+
1034
+ # ============================================================
1035
+ # Main Entry Point
1036
+ # ============================================================
1037
+
1038
def main():
    """CLI entry point: parse command-line options and launch the processor."""
    parser = argparse.ArgumentParser(
        description="Brand Sentiment Analysis - Analyze forum posts and social media comments for brand intelligence"
    )

    # Flag specs kept in one table so the CLI surface is easy to scan.
    cli_flags = (
        ('--limit', dict(
            type=int,
            default=None,
            help='Limit number of items to process per source (default: all unprocessed)'
        )),
        ('--overwrite', dict(
            action='store_true',
            default=False,
            help='Overwrite existing Snowflake table (default: append)'
        )),
        ('--sequential', dict(
            action='store_true',
            default=False,
            help='Use sequential processing instead of parallel (for debugging)'
        )),
        ('--config-dir', dict(
            type=str,
            default=None,
            help='Path to configuration directory (default: config_files/)'
        )),
        ('--data-source', dict(
            type=str,
            choices=['forums', 'comments', 'all'],
            default='all',
            help='Data source to process: forums, comments, or all (default: all)'
        )),
    )
    for flag, options in cli_flags:
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    # Build the processor and hand every parsed option straight through.
    BrandSentimentProcessor(config_dir=opts.config_dir).run(
        limit=opts.limit,
        overwrite=opts.overwrite,
        sequential=opts.sequential,
        data_source=opts.data_source,
    )
1085
+
1086
+
1087
+ if __name__ == "__main__":
1088
+ main()
processing_brand_sentiment/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities module for brand sentiment analysis.
3
+ Contains HTML parsing and other helper functions.
4
+ """
5
+
6
+ from .html_parser import HTMLParser
7
+
8
+ __all__ = ['HTMLParser']
processing_brand_sentiment/utils/html_parser.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML Parser utility for extracting content from forum posts.
3
+ Handles the complex HTML structure where replies contain quoted parent content.
4
+ """
5
+
6
+ import re
7
+ import html
8
+ from typing import Dict, Optional, Tuple
9
+ from bs4 import BeautifulSoup
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class HTMLParser:
    """
    Parses HTML content from forum posts to extract actual reply content
    and quoted parent content separately.

    Forum replies embed the quoted parent post inside <blockquote> tags;
    everything outside the blockquote is the author's own reply.
    """

    def __init__(self):
        """Initialize the HTML parser."""
        pass

    def parse_post_content(self, html_content: str) -> Dict[str, Optional[str]]:
        """
        Parse HTML post content to extract reply and quoted content.

        The forum posts have a structure where:
        - <blockquote> contains the quoted parent post
        - Content outside blockquote is the actual reply

        Example input:
            <blockquote><span class="post-id">125015</span>
            <p class="quote-heading"><strong>JackO</strong><em> - Feb 3, 2015</em></p>
            <br /><p>Parent content here...</p></blockquote>
            <br /><p>Actual reply content here...</p>

        Args:
            html_content: Raw HTML content from POST_CONTENT field

        Returns:
            Dictionary with:
            - reply_content: The actual reply text (cleaned)
            - quoted_content: The quoted parent text (cleaned), if any
            - quoted_author: Author of the quoted post, if any
            - quoted_date: Date of the quoted post, if any
            - has_quote: Boolean indicating if post contains a quote
        """
        if not html_content or not html_content.strip():
            return {
                "reply_content": "",
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract quoted content from blockquotes
            quoted_content = None
            quoted_author = None
            quoted_date = None
            has_quote = False

            blockquotes = soup.find_all('blockquote')

            if blockquotes:
                has_quote = True
                quote_parts = []

                for blockquote in blockquotes:
                    # Extract quote heading info (author and date).
                    # If multiple blockquotes exist, the last one wins for
                    # author/date (matches historical behavior).
                    quote_heading = blockquote.find('p', class_='quote-heading')
                    if quote_heading:
                        author_tag = quote_heading.find('strong')
                        if author_tag:
                            quoted_author = author_tag.get_text(strip=True)

                        date_tag = quote_heading.find('em')
                        if date_tag:
                            # BUGFIX: the previous `.lstrip(' - ')` treated its
                            # argument as a character set (any run of spaces and
                            # hyphens), not the literal " - " prefix. Remove
                            # exactly one leading "-" separator instead, so a
                            # date that legitimately begins with dashes or was
                            # formatted differently is not mangled.
                            quoted_date = re.sub(r'^\s*-\s*', '', date_tag.get_text(strip=True))

                    # Get the quote text content (excluding heading)
                    # Remove the heading first to get just the content
                    if quote_heading:
                        quote_heading.decompose()

                    # Remove post-id spans (internal bookkeeping, not content)
                    for post_id_span in blockquote.find_all('span', class_='post-id'):
                        post_id_span.decompose()

                    quote_text = self._clean_text(blockquote.get_text())
                    if quote_text:
                        quote_parts.append(quote_text)

                    # Remove the blockquote from the soup so only the actual
                    # reply remains for extraction below.
                    blockquote.decompose()

                quoted_content = " ".join(quote_parts) if quote_parts else None

            # Get the remaining content (actual reply)
            reply_content = self._clean_text(soup.get_text())

            return {
                "reply_content": reply_content,
                "quoted_content": quoted_content,
                "quoted_author": quoted_author,
                "quoted_date": quoted_date,
                "has_quote": has_quote
            }

        except Exception as e:
            logger.warning(f"Error parsing HTML content: {e}")
            # Fallback: strip tags with a regex and treat everything as reply.
            return {
                "reply_content": self._clean_text(self._strip_html_tags(html_content)),
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

    def _clean_text(self, text: str) -> str:
        """
        Clean extracted text by removing extra whitespace and normalizing.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text (HTML entities decoded, whitespace collapsed)
        """
        if not text:
            return ""

        # Decode HTML entities
        text = html.unescape(text)

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

    def _strip_html_tags(self, html_content: str) -> str:
        """
        Fallback method to strip HTML tags if BeautifulSoup fails.

        Args:
            html_content: HTML content

        Returns:
            Text without HTML tags
        """
        # Remove HTML tags (replace with a space so adjacent words don't fuse)
        clean = re.sub(r'<[^>]+>', ' ', html_content)
        # Decode entities
        clean = html.unescape(clean)
        # Clean whitespace
        clean = re.sub(r'\s+', ' ', clean)
        return clean.strip()

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extract plain text from HTML content, preserving readability.

        Args:
            html_content: HTML content

        Returns:
            Plain text version
        """
        if not html_content:
            return ""

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Add newlines for block elements (later collapsed by _clean_text,
            # but they keep words from different blocks separated).
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for p in soup.find_all('p'):
                p.append('\n')

            text = soup.get_text()
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Error extracting plain text: {e}")
            return self._clean_text(self._strip_html_tags(html_content))

    def build_thread_context(
        self,
        thread_title: Optional[str],
        first_post_content: Optional[str],
        category_title: Optional[str] = None,
        category_topic: Optional[str] = None
    ) -> str:
        """
        Build a context string from thread information.

        Args:
            thread_title: Title of the discussion thread
            first_post_content: Content of the first post in the thread
            category_title: Category title
            category_topic: Category topic

        Returns:
            Formatted context string (" | "-joined parts; empty if no parts)
        """
        context_parts = []

        if category_title:
            context_parts.append(f"Category: {category_title}")

        if category_topic:
            context_parts.append(f"Topic: {category_topic}")

        if thread_title:
            context_parts.append(f"Thread: {thread_title}")

        if first_post_content:
            # Parse and clean the first post content
            parsed = self.parse_post_content(first_post_content)
            first_post_text = parsed.get("reply_content", "")
            if first_post_text:
                # Truncate if too long to keep the context prompt-friendly
                if len(first_post_text) > 500:
                    first_post_text = first_post_text[:500] + "..."
                context_parts.append(f"Original discussion: {first_post_text}")

        return " | ".join(context_parts) if context_parts else ""

    def is_empty_content(self, html_content: str) -> bool:
        """
        Check if HTML content is effectively empty.

        Args:
            html_content: HTML content to check

        Returns:
            True if content is empty or contains no meaningful text
        """
        if not html_content:
            return True

        text = self.extract_plain_text(html_content)
        return len(text.strip()) == 0
processing_brand_sentiment/workflow/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Workflow module for brand sentiment analysis.
3
+ Contains the LangGraph orchestrators and agent implementations.
4
+ Supports both forum posts and social media comments.
5
+ """
6
+
7
+ from .orchestrator import BrandAnalysisWorkflow
8
+ from .comment_orchestrator import CommentAnalysisWorkflow
9
+
10
+ __all__ = ['BrandAnalysisWorkflow', 'CommentAnalysisWorkflow']
processing_brand_sentiment/workflow/agents/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agents module for brand sentiment analysis v4.0.
3
+
4
+ Contains specialized agents for the 4-stage pipeline:
5
+ 1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (forums)
6
+ CommentPreprocessorAgent - Plain text cleaning, keyword detection (comments)
7
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction
8
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis
9
+ 4. OutputValidatorAgent - Rule-based validation
10
+ """
11
+
12
+ from .base_agent import BaseAgent
13
+ from .content_preprocessor_agent import ContentPreprocessorAgent
14
+ from .comment_preprocessor_agent import CommentPreprocessorAgent
15
+ from .sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
16
+ from .sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
17
+ from .output_validator_agent import OutputValidatorAgent
18
+
19
+ # Legacy imports for backward compatibility
20
+ from .preprocessor_agent import PreprocessorAgent
21
+ from .relevance_validator_agent import RelevanceValidatorAgent
22
+ from .sabian_analyzer_agent import SabianAnalyzerAgent
23
+
24
+ __all__ = [
25
+ # Base
26
+ 'BaseAgent',
27
+
28
+ # New agents (v4.0)
29
+ 'ContentPreprocessorAgent',
30
+ 'CommentPreprocessorAgent',
31
+ 'SabianRelevanceExtractionAgent',
32
+ 'SabianSentimentAnalyzerAgent',
33
+ 'OutputValidatorAgent',
34
+
35
+ # Legacy agents (for backward compatibility)
36
+ 'PreprocessorAgent',
37
+ 'RelevanceValidatorAgent',
38
+ 'SabianAnalyzerAgent'
39
+ ]
processing_brand_sentiment/workflow/agents/base_agent.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Agent class for all agents in the brand sentiment analysis workflow.
3
+ Provides a common interface and structure for extensibility.
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Any, Optional
8
+ import json
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class BaseAgent(ABC):
15
+ """
16
+ Abstract base class for all agents in the brand sentiment analysis workflow.
17
+ Provides common functionality and enforces consistent interface.
18
+ """
19
+
20
+ def __init__(self, name: str, config: Dict[str, Any]):
21
+ """
22
+ Initialize the base agent.
23
+
24
+ Args:
25
+ name: Name of the agent
26
+ config: Configuration dictionary for the agent
27
+ """
28
+ self.name = name
29
+ self.config = config
30
+ self.model = config.get("model", "gpt-5-nano")
31
+ self.temperature = config.get("temperature", 0.2)
32
+ self.max_retries = config.get("max_retries", 3)
33
+ logger.info(f"Initialized {self.name} with model {self.model}")
34
+
35
+ @abstractmethod
36
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
37
+ """
38
+ Process input data and return results.
39
+ This method must be implemented by all concrete agent classes.
40
+
41
+ Args:
42
+ input_data: Dictionary containing input data for processing
43
+
44
+ Returns:
45
+ Dictionary containing processing results
46
+ """
47
+ pass
48
+
49
+ @abstractmethod
50
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
51
+ """
52
+ Validate input data before processing.
53
+
54
+ Args:
55
+ input_data: Dictionary containing input data
56
+
57
+ Returns:
58
+ True if input is valid, False otherwise
59
+ """
60
+ pass
61
+
62
+ def get_name(self) -> str:
63
+ """Get the agent name."""
64
+ return self.name
65
+
66
+ def get_config(self) -> Dict[str, Any]:
67
+ """Get the agent configuration."""
68
+ return self.config
69
+
70
+ def log_processing(self, message: str, level: str = "info"):
71
+ """
72
+ Log processing information.
73
+
74
+ Args:
75
+ message: Log message
76
+ level: Log level (info, warning, error, debug)
77
+ """
78
+ log_method = getattr(logger, level, logger.info)
79
+ log_method(f"[{self.name}] {message}")
80
+
81
+ def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
82
+ """
83
+ Handle errors consistently across all agents.
84
+
85
+ Args:
86
+ error: The exception that occurred
87
+ context: Additional context about the error
88
+
89
+ Returns:
90
+ Error dictionary with details
91
+ """
92
+ error_msg = f"Error in {self.name}"
93
+ if context:
94
+ error_msg += f" ({context})"
95
+ error_msg += f": {str(error)}"
96
+
97
+ logger.error(error_msg)
98
+
99
+ return {
100
+ "success": False,
101
+ "error": str(error),
102
+ "agent": self.name,
103
+ "context": context
104
+ }
105
+
106
+ def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
107
+ """
108
+ Parse LLM response that may contain JSON wrapped in markdown code blocks.
109
+
110
+ Args:
111
+ response_content: Raw response content from LLM
112
+
113
+ Returns:
114
+ Parsed JSON dictionary
115
+
116
+ Raises:
117
+ json.JSONDecodeError: If JSON cannot be parsed
118
+ """
119
+ content = response_content.strip()
120
+
121
+ # Check if response is wrapped in markdown code block
122
+ if content.startswith("```json"):
123
+ # Remove ```json prefix and ``` suffix
124
+ content = content[7:] # Remove ```json
125
+ if content.endswith("```"):
126
+ content = content[:-3] # Remove trailing ```
127
+ content = content.strip()
128
+ elif content.startswith("```"):
129
+ # Remove generic ``` code block
130
+ content = content[3:]
131
+ if content.endswith("```"):
132
+ content = content[:-3]
133
+ content = content.strip()
134
+
135
+ # Parse the cleaned JSON
136
+ return json.loads(content)
137
+
138
+ def _safe_get(self, data: Dict[str, Any], key: str, default: Any = None) -> Any:
139
+ """
140
+ Safely get a value from a dictionary with a default.
141
+
142
+ Args:
143
+ data: Dictionary to get value from
144
+ key: Key to look up
145
+ default: Default value if key not found
146
+
147
+ Returns:
148
+ Value from dictionary or default
149
+ """
150
+ return data.get(key, default)
151
+
152
+ def _ensure_list(self, value: Any) -> list:
153
+ """
154
+ Ensure a value is a list.
155
+
156
+ Args:
157
+ value: Value to convert
158
+
159
+ Returns:
160
+ List version of value
161
+ """
162
+ if value is None:
163
+ return []
164
+ if isinstance(value, list):
165
+ return value
166
+ if isinstance(value, str):
167
+ # Try to parse as comma-separated
168
+ return [v.strip() for v in value.split(",") if v.strip()]
169
+ return [value]
processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comment Preprocessor Agent for brand sentiment analysis on social media comments.
3
+
4
+ Extends ContentPreprocessorAgent but handles plain text (no HTML parsing).
5
+ Builds context from content title, content description, and parent comment text
6
+ instead of thread title and first post.
7
+
8
+ Reuses: keyword sets, product alias mapping, language detection, relevance screening.
9
+ Overrides: process() method for plain text handling and comment-specific context building.
10
+ """
11
+
12
+ from typing import Dict, Any, Optional
13
+ import logging
14
+
15
+ from .content_preprocessor_agent import ContentPreprocessorAgent
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class CommentPreprocessorAgent(ContentPreprocessorAgent):
    """
    Agent that preprocesses social media comments for brand sentiment analysis.

    Inherits keyword detection, product alias mapping, language detection,
    and relevance screening from ContentPreprocessorAgent.

    Key differences from forum preprocessor:
    - No HTML parsing (comments are plain text)
    - Context built from content title + description + parent comment
    - Different input field names (comment_text vs post_content)
    """

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Comment Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords, products, and aliases
        """
        super().__init__(config, brand_config)
        # Override the name set by the parent so log lines attribute correctly.
        self.name = "CommentPreprocessorAgent"

        logger.info(
            f"CommentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields for comment processing.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required_fields = ["comment_sk", "comment_text"]
        return all(field in input_data for field in required_fields)

    def _build_comment_context(
        self,
        content_title: Optional[str] = None,
        content_description: Optional[str] = None,
        parent_comment_text: Optional[str] = None
    ) -> str:
        """
        Build context string from social media content and parent comment information.

        Long fields are truncated to 500 characters to keep the context
        prompt-sized.

        Args:
            content_title: Title of the social media post/content
            content_description: Description/message of the social media post
            parent_comment_text: Text of the parent comment (if this is a reply)

        Returns:
            Formatted context string (" | "-joined parts; empty if none)
        """
        context_parts = []

        if content_title:
            context_parts.append(f"Post title: {content_title}")

        if content_description:
            # Truncate if too long
            truncated = content_description[:500] + "..." if len(content_description) > 500 else content_description
            context_parts.append(f"Post description: {truncated}")

        if parent_comment_text:
            truncated = parent_comment_text[:500] + "..." if len(parent_comment_text) > 500 else parent_comment_text
            context_parts.append(f"Parent comment: {truncated}")

        return " | ".join(context_parts) if context_parts else ""

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a social media comment through the preprocessing pipeline.

        Unlike forum posts, comments are plain text (no HTML parsing needed).
        Context is built from content title, description, and parent comment.

        Args:
            input_data: Dictionary containing comment data with at least:
                - comment_sk: Comment surrogate key
                - comment_text: Raw comment text (plain text)
                - content_title: Title of the post (optional)
                - content_description: Description of the post (optional)
                - parent_comment_text: Parent comment text if reply (optional)

        Returns:
            Dictionary with preprocessing results; on failure, the error
            dictionary produced by handle_error().
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_sk, comment_text)",
                    **input_data
                }

            comment_text = input_data.get("comment_text", "")

            # Step 1: Clean text (plain text - no HTML parsing needed)
            cleaned_content = comment_text.strip() if comment_text else ""

            # Check for empty content
            if not cleaned_content or len(cleaned_content) < self.min_content_length:
                # NOTE(review): input_data is spread last here, so any incoming
                # key with the same name (e.g. "success") would override the
                # values above — confirm upstream rows never carry these keys.
                return {
                    "success": True,
                    "cleaned_content": cleaned_content,
                    "quoted_content": None,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "comment_text"}
                }

            # Step 2: Check relevance (reused from parent class)
            relevance_result = self._check_relevance(cleaned_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build comment context
            raw_thread_context = self._build_comment_context(
                content_title=input_data.get("content_title"),
                content_description=input_data.get("content_description"),
                parent_comment_text=input_data.get("parent_comment_text")
            )

            # Step 4: Detect language (reused from parent class)
            lang_result = self._detect_language(cleaned_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions (reused from parent class)
            products_found = self._extract_mentioned_products(cleaned_content)
            competitors_found = self._extract_mentioned_competitors(cleaned_content)

            # Determine quoted content (parent comment serves as quoted context)
            parent_comment = input_data.get("parent_comment_text")
            has_parent = parent_comment is not None and str(parent_comment).strip() != ""

            # Build result
            # NOTE(review): same spread-ordering caveat as above — keys coming
            # from input_data (minus comment_text) override the computed fields.
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": cleaned_content,
                "quoted_content": parent_comment if has_parent else None,
                "has_quote": has_parent,
                "quoted_author": None,
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data (exclude raw text to avoid duplication)
                **{k: v for k, v in input_data.items() if k not in ["comment_text"]}
            }

            # Keep original content for reference
            result["original_text"] = comment_text

            self.log_processing(
                f"Processed comment {input_data.get('comment_sk')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing comment {input_data.get('comment_sk')}")
processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Preprocessor Agent for brand sentiment analysis.
3
+ Handles HTML parsing, text cleaning, language detection, product alias mapping,
4
+ and initial relevance screening. This is a deterministic agent (no LLM calls).
5
+
6
+ Enhanced version with:
7
+ - Product alias mapping (B8 -> B8X)
8
+ - Smart language detection (skip for short texts)
9
+ - Always process if primary keywords found
10
+ - Better content separation
11
+ """
12
+
13
+ import re
14
+ from typing import Dict, Any, List, Optional, Set
15
+ from lingua import Language, LanguageDetectorBuilder
16
+ import logging
17
+
18
+ from .base_agent import BaseAgent
19
+ from utils.html_parser import HTMLParser
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class ContentPreprocessorAgent(BaseAgent):
    """
    Deterministic preprocessing agent for forum posts.

    Responsibilities:
    - Parse HTML into reply and quoted content
    - Clean and normalize text
    - Map product aliases onto canonical product names
    - Detect language (short texts get special handling)
    - Run an initial keyword-based relevance screen
    """

    # Lingua language objects mapped to their ISO 639-1 codes.
    LINGUA_TO_ISO = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Set up the HTML parser, thresholds, language detector, keyword sets
        and alias mappings.

        Args:
            config: Agent configuration passed through to the base class.
            brand_config: Brand-specific configuration with keywords,
                products, and product aliases.
        """
        super().__init__("ContentPreprocessorAgent", config)
        self.brand_config = brand_config
        self.html_parser = HTMLParser()

        # Preprocessing thresholds and behavior toggles.
        pre_cfg = brand_config.get("preprocessing", {})
        self.min_length_for_lang_detection = pre_cfg.get(
            "min_length_for_language_detection", 50
        )
        self.default_language = pre_cfg.get(
            "default_language_for_short_text", "English"
        )
        self.always_process_primary = pre_cfg.get(
            "always_process_if_primary_keyword", True
        )
        self.min_content_length = pre_cfg.get("min_content_length", 3)

        # Lingua detector built over every language it supports.
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

        # Derived lookup structures used by the relevance screen.
        self._build_keyword_sets()
        self._build_alias_mappings()

        logger.info(
            f"ContentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )
90
+
91
+ def _build_keyword_sets(self) -> None:
92
+ """Build keyword sets from brand configuration for efficient relevance checking."""
93
+ relevance_config = self.brand_config.get("relevance_keywords", {})
94
+
95
+ # Primary keywords - definitive Sabian mentions
96
+ primary = relevance_config.get("primary", {}).get("keywords", [])
97
+ self.primary_keywords: Set[str] = set(k.lower() for k in primary)
98
+
99
+ # Contextual keywords - need disambiguation (HH, AA)
100
+ contextual = relevance_config.get("contextual", {}).get("keywords", [])
101
+ self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)
102
+
103
+ # Cymbal context keywords - help disambiguate contextual terms
104
+ cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
105
+ self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)
106
+
107
+ # Competitor names and aliases for detection
108
+ competitors = self.brand_config.get("brand", {}).get("competitors", [])
109
+ self.competitor_keywords: Set[str] = set()
110
+ self.competitor_name_map: Dict[str, str] = {} # alias -> canonical name
111
+
112
+ for comp in competitors:
113
+ if isinstance(comp, dict):
114
+ name = comp.get("name", "")
115
+ self.competitor_keywords.add(name.lower())
116
+ self.competitor_name_map[name.lower()] = name
117
+ for alias in comp.get("aliases", []):
118
+ alias_lower = alias.lower()
119
+ self.competitor_keywords.add(alias_lower)
120
+ self.competitor_name_map[alias_lower] = name
121
+ else:
122
+ comp_str = str(comp).lower()
123
+ self.competitor_keywords.add(comp_str)
124
+ self.competitor_name_map[comp_str] = str(comp)
125
+
126
+ # Product names
127
+ products = self.brand_config.get("brand", {}).get("products", [])
128
+ self.product_keywords: Set[str] = set(p.lower() for p in products)
129
+ self.products_list = products # Keep original case
130
+
131
+ logger.debug(
132
+ f"Built keyword sets: {len(self.primary_keywords)} primary, "
133
+ f"{len(self.contextual_keywords)} contextual, "
134
+ f"{len(self.product_keywords)} products, "
135
+ f"{len(self.competitor_keywords)} competitor terms"
136
+ )
137
+
138
+ def _build_alias_mappings(self) -> None:
139
+ """Build product alias mappings from brand configuration."""
140
+ aliases = self.brand_config.get("brand", {}).get("product_aliases", {})
141
+
142
+ # Build alias -> canonical product mapping
143
+ self.product_aliases: Dict[str, str] = {}
144
+ for alias, canonical in aliases.items():
145
+ self.product_aliases[alias.lower()] = canonical
146
+
147
+ # Also add primary keywords that are aliases to contextual keywords
148
+ # e.g., "b8" should trigger contextual check since it maps to "B8X"
149
+ for alias in self.product_aliases.keys():
150
+ if alias not in self.primary_keywords:
151
+ self.contextual_keywords.add(alias)
152
+
153
+ logger.debug(f"Built {len(self.product_aliases)} product alias mappings")
154
+
155
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
156
+ """
157
+ Validate that input contains required fields.
158
+
159
+ Args:
160
+ input_data: Input dictionary
161
+
162
+ Returns:
163
+ True if valid, False otherwise
164
+ """
165
+ required_fields = ["post_id", "post_content"]
166
+ return all(field in input_data for field in required_fields)
167
+
168
    def _detect_language(self, text: str, has_primary_keywords: bool = False) -> Dict[str, Any]:
        """
        Detect the language of text using lingua library.

        Enhanced logic:
        - Skip detection for short texts (< min_length_for_lang_detection chars)
        - Always return English if primary Sabian keywords are found

        Args:
            text: Text to analyze
            has_primary_keywords: Whether primary Sabian keywords were found

        Returns:
            Dictionary with language detection results. Always contains
            "language", "language_code", "is_english", "confidence",
            "detection_skipped"; may also contain "skip_reason",
            "original_detected_language", "override_reason" or
            "detection_error" depending on the branch taken.
        """
        try:
            cleaned_text = text.strip()

            # If text is too short, default to English.
            # NOTE(review): language_code is hard-coded to "en" here even if
            # default_language is configured differently — confirm intended.
            if len(cleaned_text) < self.min_length_for_lang_detection:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": True,
                    "skip_reason": f"Text too short ({len(cleaned_text)} < {self.min_length_for_lang_detection} chars)"
                }

            # If primary keywords found and always_process_primary is True,
            # treat the post as English regardless of the detector's verdict.
            if has_primary_keywords and self.always_process_primary:
                # Still try to detect, but override if non-English.
                detected = self.language_detector.detect_language_of(cleaned_text)

                if detected == Language.ENGLISH:
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "high",
                        "detection_skipped": False,
                        "skip_reason": None
                    }
                else:
                    # Primary keyword found but detected as non-English.
                    # Force to English since Sabian is explicitly mentioned;
                    # the raw detection is preserved for auditing.
                    lang_name = detected.name.capitalize() if detected else "Unknown"
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "medium",
                        "detection_skipped": False,
                        "skip_reason": None,
                        "original_detected_language": lang_name,
                        "override_reason": "Primary Sabian keyword found, treating as English"
                    }

            # Standard detection path (no primary-keyword override).
            detected = self.language_detector.detect_language_of(cleaned_text)

            # Detector could not decide: fall back to the configured default.
            if detected is None:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            if detected == Language.ENGLISH:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            # Non-English: map the lingua enum to an ISO code (or "unknown"
            # for languages missing from LINGUA_TO_ISO).
            lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
            lang_name = detected.name.capitalize()

            return {
                "language": lang_name,
                "language_code": lang_code,
                "is_english": False,
                "confidence": "high",
                "detection_skipped": False,
                "skip_reason": None
            }

        except Exception as e:
            # Best-effort: on any detector failure assume the default language
            # rather than dropping the post; the error is recorded for audit.
            logger.warning(f"Language detection failed: {e}")
            return {
                "language": self.default_language,
                "language_code": "en",
                "is_english": True,
                "confidence": "low",
                "detection_skipped": False,
                "skip_reason": None,
                "detection_error": str(e)
            }
272
+
273
+ def _normalize_product_mentions(self, found_products: List[str]) -> List[str]:
274
+ """
275
+ Normalize product mentions using alias mappings.
276
+
277
+ Args:
278
+ found_products: List of product terms found
279
+
280
+ Returns:
281
+ List of canonical product names
282
+ """
283
+ normalized = []
284
+ for product in found_products:
285
+ product_lower = product.lower()
286
+
287
+ # Check if it's an alias
288
+ if product_lower in self.product_aliases:
289
+ canonical = self.product_aliases[product_lower]
290
+ if canonical not in normalized:
291
+ normalized.append(canonical)
292
+ # Check if it's a direct product match
293
+ elif product_lower in self.product_keywords:
294
+ # Find the original case version
295
+ for p in self.products_list:
296
+ if p.lower() == product_lower:
297
+ if p not in normalized:
298
+ normalized.append(p)
299
+ break
300
+
301
+ return normalized
302
+
303
+ def _check_relevance(self, text: str) -> Dict[str, Any]:
304
+ """
305
+ Check if text is relevant to the brand using keyword matching.
306
+
307
+ Enhanced to handle product aliases.
308
+
309
+ Returns:
310
+ Dictionary with relevance assessment
311
+ """
312
+ text_lower = text.lower()
313
+
314
+ # Tokenize for word boundary matching
315
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
316
+
317
+ # Also check for multi-word phrases (for aliases like "hand hammered")
318
+ all_aliases = set(self.product_aliases.keys())
319
+
320
+ # Check for primary keywords (definitive matches)
321
+ found_primary = self.primary_keywords.intersection(words)
322
+
323
+ # Check for product aliases in text
324
+ found_aliases = []
325
+ for alias in all_aliases:
326
+ if ' ' in alias:
327
+ # Multi-word alias - check in full text
328
+ if alias in text_lower:
329
+ found_aliases.append(alias)
330
+ elif alias in words:
331
+ found_aliases.append(alias)
332
+
333
+ # Map aliases to canonical products
334
+ alias_products = []
335
+ for alias in found_aliases:
336
+ if alias in self.product_aliases:
337
+ canonical = self.product_aliases[alias]
338
+ if canonical not in alias_products:
339
+ alias_products.append(canonical)
340
+
341
+ if found_primary or alias_products:
342
+ all_found = list(found_primary) + found_aliases
343
+ return {
344
+ "preliminary_relevant": True,
345
+ "needs_relevance_validation": False,
346
+ "found_keywords": all_found,
347
+ "mapped_products": alias_products,
348
+ "relevance_type": "primary",
349
+ "relevance_confidence": "high",
350
+ "has_primary_keywords": True
351
+ }
352
+
353
+ # Check for contextual keywords (need validation)
354
+ found_contextual = self.contextual_keywords.intersection(words)
355
+ if found_contextual:
356
+ # Check if there's cymbal context
357
+ found_cymbal_context = self.cymbal_context_keywords.intersection(words)
358
+ has_cymbal_context = len(found_cymbal_context) > 0
359
+
360
+ return {
361
+ "preliminary_relevant": True,
362
+ "needs_relevance_validation": True,
363
+ "found_keywords": list(found_contextual),
364
+ "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
365
+ "has_cymbal_context": has_cymbal_context,
366
+ "mapped_products": [],
367
+ "relevance_type": "contextual",
368
+ "relevance_confidence": "medium" if has_cymbal_context else "low",
369
+ "has_primary_keywords": False
370
+ }
371
+
372
+ # Check for competitor mentions (might be comparative discussion)
373
+ found_competitors = self.competitor_keywords.intersection(words)
374
+ if found_competitors:
375
+ return {
376
+ "preliminary_relevant": False,
377
+ "needs_relevance_validation": True,
378
+ "found_keywords": list(found_competitors),
379
+ "mapped_products": [],
380
+ "relevance_type": "competitor_only",
381
+ "relevance_confidence": "low",
382
+ "has_primary_keywords": False
383
+ }
384
+
385
+ # No relevant keywords found
386
+ return {
387
+ "preliminary_relevant": False,
388
+ "needs_relevance_validation": False,
389
+ "found_keywords": [],
390
+ "mapped_products": [],
391
+ "relevance_type": "none",
392
+ "relevance_confidence": "high",
393
+ "has_primary_keywords": False
394
+ }
395
+
396
+ def _extract_mentioned_products(self, text: str) -> List[str]:
397
+ """
398
+ Extract product names mentioned in the text, including aliases.
399
+
400
+ Args:
401
+ text: Text to search
402
+
403
+ Returns:
404
+ List of canonical product names found
405
+ """
406
+ text_lower = text.lower()
407
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
408
+
409
+ found_products = []
410
+
411
+ # Check direct product mentions
412
+ for product in self.products_list:
413
+ if product.lower() in words:
414
+ if product not in found_products:
415
+ found_products.append(product)
416
+
417
+ # Check aliases
418
+ for alias, canonical in self.product_aliases.items():
419
+ if ' ' in alias:
420
+ # Multi-word alias
421
+ if alias in text_lower:
422
+ if canonical not in found_products:
423
+ found_products.append(canonical)
424
+ elif alias in words:
425
+ if canonical not in found_products:
426
+ found_products.append(canonical)
427
+
428
+ return found_products
429
+
430
+ def _extract_mentioned_competitors(self, text: str) -> List[str]:
431
+ """
432
+ Extract competitor brand names mentioned in the text.
433
+
434
+ Args:
435
+ text: Text to search
436
+
437
+ Returns:
438
+ List of canonical competitor names found
439
+ """
440
+ text_lower = text.lower()
441
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
442
+
443
+ found_competitors = set()
444
+
445
+ for alias in self.competitor_keywords:
446
+ if ' ' in alias:
447
+ # Multi-word check
448
+ if alias in text_lower:
449
+ canonical = self.competitor_name_map.get(alias, alias)
450
+ found_competitors.add(canonical)
451
+ elif alias in words:
452
+ canonical = self.competitor_name_map.get(alias, alias)
453
+ found_competitors.add(canonical)
454
+
455
+ return list(found_competitors)
456
+
457
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline: HTML parsing -> empty-content short-circuit -> keyword
        relevance screen -> thread-context assembly -> language detection
        (informed by the relevance screen) -> product/competitor extraction.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results merged over the input data
            (minus the raw "post_content", which is preserved under
            "original_content"). On error, the base agent's handle_error
            result is returned instead.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content — short-circuit as non-relevant without
            # running language detection or extraction.
            if not reply_content or len(reply_content.strip()) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Check relevance FIRST (needed for language detection logic)
            relevance_result = self._check_relevance(reply_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build thread context (raw - will be summarized by extraction agent)
            raw_thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 4: Detect language (with smart handling)
            lang_result = self._detect_language(reply_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions from actual post content
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            # NOTE(review): the **input_data spread comes LAST, so any key
            # already present in input_data (e.g. "success") overrides the
            # computed fields above — confirm this precedence is intended.
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
processing_brand_sentiment/workflow/agents/output_validator_agent.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Output Validator Agent for brand sentiment analysis.
3
+
4
+ This agent performs rule-based validation on the final output to ensure:
5
+ 1. All values are from predefined lists
6
+ 2. Logical consistency between fields
7
+ 3. Anomaly detection for manual review flagging
8
+
9
+ This is a deterministic agent (no LLM calls) that acts as a quality gate.
10
+ """
11
+
12
+ from typing import Dict, Any, List, Set
13
+ import logging
14
+
15
+ from .base_agent import BaseAgent
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class OutputValidatorAgent(BaseAgent):
    """
    Rule-based quality gate over the final analysis output.

    Without any LLM calls, checks that values come from the predefined
    category lists, that fields are logically consistent with each other,
    and flags anomalies that may deserve manual review.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Output Validator Agent.

        Args:
            config: Agent configuration.
            brand_config: Brand-specific configuration.
            analysis_categories: Category definitions used for validation.
        """
        super().__init__("OutputValidatorAgent", config)
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # Precompute the allowed-value lookup sets used by the validators.
        self._build_valid_value_sets()

        logger.info("OutputValidatorAgent initialized")
50
+
51
+ def _build_valid_value_sets(self) -> None:
52
+ """Build sets of valid values for efficient validation."""
53
+ brand = self.brand_config.get("brand", {})
54
+
55
+ # Products
56
+ self.valid_products: Set[str] = set(
57
+ p.lower() for p in brand.get("products", [])
58
+ )
59
+ self.products_canonical = {p.lower(): p for p in brand.get("products", [])}
60
+
61
+ # Competitors
62
+ self.valid_competitors: Set[str] = set()
63
+ self.competitors_canonical = {}
64
+ for comp in brand.get("competitors", []):
65
+ if isinstance(comp, dict):
66
+ name = comp.get("name", "")
67
+ self.valid_competitors.add(name.lower())
68
+ self.competitors_canonical[name.lower()] = name
69
+
70
+ # Extract all category values
71
+ self.valid_values = {}
72
+
73
+ category_configs = {
74
+ "author_role": self.analysis_categories.get("author_role", {}),
75
+ "sabian_mention_context": self.analysis_categories.get("sabian_mention_context", {}),
76
+ "sentiment_level": self.analysis_categories.get("sentiment", {}),
77
+ "emotion_type": self.analysis_categories.get("emotions", {}),
78
+ "intents": self.analysis_categories.get("intents", {}),
79
+ "purchase_stage": self.analysis_categories.get("purchase_stage", {}),
80
+ "comparison_type": self.analysis_categories.get("comparison_type", {}),
81
+ "feedback_aspects": self.analysis_categories.get("feedback_aspects", {}),
82
+ "decision_drivers": self.analysis_categories.get("decision_drivers", {}),
83
+ "product_attributes": self.analysis_categories.get("product_attributes", {}),
84
+ }
85
+
86
+ for key, config in category_configs.items():
87
+ if "categories" in config:
88
+ self.valid_values[key] = set(
89
+ c["value"].lower() for c in config["categories"]
90
+ )
91
+ elif "levels" in config:
92
+ self.valid_values[key] = set(
93
+ c["value"].lower() for c in config["levels"]
94
+ )
95
+ else:
96
+ self.valid_values[key] = set()
97
+
98
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
99
+ """Validate that input contains required fields."""
100
+ # The validator accepts any input - it will validate what's there
101
+ return True
102
+
103
+ def _validate_list_values(
104
+ self,
105
+ values: List[Any],
106
+ valid_set: Set[str],
107
+ field_name: str
108
+ ) -> Dict[str, Any]:
109
+ """
110
+ Validate list values against a set of valid values.
111
+
112
+ Returns:
113
+ Dictionary with validation results
114
+ """
115
+ if not values:
116
+ return {"valid": True, "invalid_values": [], "field": field_name}
117
+
118
+ invalid = []
119
+ for v in values:
120
+ if isinstance(v, str) and v.lower() not in valid_set:
121
+ invalid.append(v)
122
+
123
+ return {
124
+ "valid": len(invalid) == 0,
125
+ "invalid_values": invalid,
126
+ "field": field_name
127
+ }
128
+
129
+ def _validate_single_value(
130
+ self,
131
+ value: Any,
132
+ valid_set: Set[str],
133
+ field_name: str,
134
+ allow_none: bool = True
135
+ ) -> Dict[str, Any]:
136
+ """
137
+ Validate a single value against a set of valid values.
138
+
139
+ Returns:
140
+ Dictionary with validation results
141
+ """
142
+ if value is None:
143
+ return {"valid": allow_none, "invalid_value": None if allow_none else value, "field": field_name}
144
+
145
+ if isinstance(value, str) and value.lower() in valid_set:
146
+ return {"valid": True, "invalid_value": None, "field": field_name}
147
+
148
+ return {"valid": False, "invalid_value": value, "field": field_name}
149
+
150
+ def _check_logical_consistency(self, data: Dict[str, Any]) -> List[str]:
151
+ """
152
+ Check for logical consistency between fields.
153
+
154
+ Note: Empty products_mentioned is OK even when relevant - users may
155
+ discuss the Sabian brand generally without specific products.
156
+
157
+ Returns:
158
+ List of inconsistency warnings
159
+ """
160
+ warnings = []
161
+ is_relevant = data.get("is_relevant", False)
162
+
163
+ # Check 1: If not relevant, certain fields should be empty/null
164
+ if not is_relevant:
165
+ if data.get("sabian_mention_context"):
166
+ warnings.append(
167
+ "sabian_mention_context should be null when is_relevant=False"
168
+ )
169
+ if data.get("sentiment_level") and data.get("sentiment_level") != "neutral":
170
+ warnings.append(
171
+ "sentiment_level should be null/neutral when is_relevant=False"
172
+ )
173
+
174
+ # Check 2: Comparison type should only be set if comparing intent exists
175
+ if data.get("comparison_type"):
176
+ intents = data.get("intents", [])
177
+ if "comparing" not in intents:
178
+ warnings.append(
179
+ "comparison_type is set but 'comparing' not in intents"
180
+ )
181
+
182
+ # Check 3: Author perspective fields consistency
183
+ # If author is giving advice (providing_information) without sharing experience,
184
+ # pain_points and delight_factors should typically be empty
185
+ intents = data.get("intents", [])
186
+ if "providing_information" in intents and "sharing_experience" not in intents:
187
+ if data.get("pain_points") or data.get("delight_factors"):
188
+ warnings.append(
189
+ "pain_points/delight_factors set for advice-giving post without sharing_experience intent"
190
+ )
191
+
192
+ return warnings
193
+
194
+ def _fix_overlapping_feedback(self, data: Dict[str, Any]) -> Dict[str, Any]:
195
+ """
196
+ Fix overlapping values between pain_points and delight_factors.
197
+
198
+ Rule: The same aspect cannot be both a pain point and a delight factor.
199
+ Resolution: Use sentiment to determine which to keep, or clear both if neutral.
200
+
201
+ Args:
202
+ data: Dictionary with analysis results
203
+
204
+ Returns:
205
+ Updated dictionary with fixed pain_points and delight_factors
206
+ """
207
+ pain_points = data.get("pain_points", []) or []
208
+ delight_factors = data.get("delight_factors", []) or []
209
+
210
+ if not pain_points or not delight_factors:
211
+ return data
212
+
213
+ # Find overlapping values
214
+ pain_set = set(p.lower() if isinstance(p, str) else p for p in pain_points)
215
+ delight_set = set(d.lower() if isinstance(d, str) else d for d in delight_factors)
216
+ overlap = pain_set.intersection(delight_set)
217
+
218
+ if not overlap:
219
+ return data
220
+
221
+ # Get sentiment to determine which to keep
222
+ sentiment = data.get("sentiment_level", "neutral")
223
+
224
+ # Create new lists without overlapping values
225
+ if sentiment in ["positive", "very_positive"]:
226
+ # Keep in delight_factors, remove from pain_points
227
+ new_pain_points = [p for p in pain_points if p.lower() not in overlap]
228
+ new_delight_factors = delight_factors
229
+ elif sentiment in ["negative", "very_negative"]:
230
+ # Keep in pain_points, remove from delight_factors
231
+ new_pain_points = pain_points
232
+ new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
233
+ else:
234
+ # Neutral sentiment - clear both (can't determine intent)
235
+ new_pain_points = [p for p in pain_points if p.lower() not in overlap]
236
+ new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
237
+
238
+ # Update data
239
+ data["pain_points"] = new_pain_points
240
+ data["delight_factors"] = new_delight_factors
241
+
242
+ logger.debug(
243
+ f"Fixed overlapping feedback: removed {overlap} from "
244
+ f"{'pain_points' if sentiment in ['positive', 'very_positive'] else 'delight_factors' if sentiment in ['negative', 'very_negative'] else 'both'}"
245
+ )
246
+
247
+ return data
248
+
249
+ def _detect_anomalies(self, data: Dict[str, Any]) -> List[str]:
250
+ """
251
+ Detect anomalies that might need manual review.
252
+
253
+ Returns:
254
+ List of anomaly flags
255
+ """
256
+ anomalies = []
257
+
258
+ # Anomaly 1: Low confidence relevance
259
+ if data.get("is_relevant") and data.get("relevance_confidence") == "low":
260
+ anomalies.append("low_confidence_relevant")
261
+
262
+ # Anomaly 2: Sarcasm detected - sentiment might be inverted
263
+ if data.get("sarcasm_detected"):
264
+ anomalies.append("sarcasm_detected")
265
+
266
+ # Anomaly 3: Very short content marked as relevant
267
+ content = data.get("cleaned_content", "")
268
+ if data.get("is_relevant") and len(content) < 20:
269
+ anomalies.append("short_relevant_content")
270
+
271
+ # Anomaly 4: Switching behavior detected
272
+ comparison_type = data.get("comparison_type", "")
273
+ if comparison_type in ["switching_to_sabian", "switching_from_sabian"]:
274
+ anomalies.append(f"brand_switching_{comparison_type}")
275
+
276
+ return anomalies
277
+
278
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process and validate the analysis output.

        Validation errors (invalid products/competitors, invalid categorical
        values) fail validation; invalid values in list fields and logical
        inconsistencies are recorded only as warnings. Anomalies become
        ``validation_flags`` and downgrade the status to
        ``completed_with_flags`` without failing.

        Args:
            input_data: Dictionary with all analysis results

        Returns:
            Dictionary with validation results added (``validation_passed``,
            ``validation_errors``, ``validation_warnings``,
            ``validation_flags``, ``processing_status``)
        """
        try:
            validation_errors = []
            validation_warnings = []

            # Skip detailed validation for non-relevant or skipped posts
            if not input_data.get("is_relevant", False) or input_data.get("analysis_skipped", False):
                return {
                    **input_data,
                    "validation_passed": True,
                    "validation_errors": [],
                    "validation_warnings": [],
                    "validation_flags": [],
                    "processing_status": "completed"
                }

            # Fix overlapping pain_points and delight_factors (safety net)
            input_data = self._fix_overlapping_feedback(input_data)

            # Validate products_mentioned (failure is a hard error)
            products_result = self._validate_list_values(
                input_data.get("products_mentioned", []),
                self.valid_products,
                "products_mentioned"
            )
            if not products_result["valid"]:
                validation_errors.append(
                    f"Invalid products: {products_result['invalid_values']}"
                )

            # Validate competitors_mentioned (failure is a hard error)
            competitors_result = self._validate_list_values(
                input_data.get("competitors_mentioned", []),
                self.valid_competitors,
                "competitors_mentioned"
            )
            if not competitors_result["valid"]:
                validation_errors.append(
                    f"Invalid competitors: {competitors_result['invalid_values']}"
                )

            # Validate categorical fields: (field name, key into valid_values, allow None)
            categorical_validations = [
                ("author_role", "author_role", True),
                ("sabian_mention_context", "sabian_mention_context", True),
                ("sentiment_level", "sentiment_level", True),
                ("emotion_type", "emotion_type", True),
                ("purchase_stage", "purchase_stage", True),
                ("comparison_type", "comparison_type", True),
            ]

            for field, valid_key, allow_none in categorical_validations:
                result = self._validate_single_value(
                    input_data.get(field),
                    self.valid_values.get(valid_key, set()),
                    field,
                    allow_none
                )
                if not result["valid"]:
                    validation_errors.append(
                        f"Invalid {field}: {result['invalid_value']}"
                    )

            # Validate list fields: (field name, key into valid_values).
            # Note pain_points and delight_factors share the feedback_aspects vocabulary.
            list_validations = [
                ("intents", "intents"),
                ("product_attributes", "product_attributes"),
                ("pain_points", "feedback_aspects"),
                ("delight_factors", "feedback_aspects"),
                ("decision_drivers", "decision_drivers"),
            ]

            # List-field problems are soft: warnings, not errors.
            for field, valid_key in list_validations:
                result = self._validate_list_values(
                    input_data.get(field, []),
                    self.valid_values.get(valid_key, set()),
                    field
                )
                if not result["valid"]:
                    validation_warnings.append(
                        f"Invalid values in {field}: {result['invalid_values']}"
                    )

            # Check logical consistency (cross-field sanity checks)
            consistency_warnings = self._check_logical_consistency(input_data)
            validation_warnings.extend(consistency_warnings)

            # Detect anomalies that should be surfaced for manual review
            anomalies = self._detect_anomalies(input_data)

            # Determine overall validation status: only hard errors fail validation
            validation_passed = len(validation_errors) == 0

            # Set processing status
            if validation_errors:
                processing_status = "validation_failed"
            elif anomalies:
                processing_status = "completed_with_flags"
            else:
                processing_status = "completed"

            result = {
                **input_data,
                "validation_passed": validation_passed,
                "validation_errors": validation_errors,
                "validation_warnings": validation_warnings,
                "validation_flags": anomalies,
                "processing_status": processing_status
            }

            # Only log when there is something noteworthy to report
            if validation_errors or validation_warnings or anomalies:
                self.log_processing(
                    f"Validation complete: passed={validation_passed}, "
                    f"errors={len(validation_errors)}, warnings={len(validation_warnings)}, "
                    f"flags={anomalies}",
                    "debug"
                )

            return result

        except Exception as e:
            return self.handle_error(e, "output validation")
processing_brand_sentiment/workflow/agents/preprocessor_agent.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocessor Agent for brand sentiment analysis.
3
+ Handles HTML parsing, text cleaning, language detection, and initial relevance screening.
4
+ This is a deterministic agent (no LLM calls except for language detection fallback).
5
+ """
6
+
7
+ import re
8
+ from typing import Dict, Any, List, Optional, Set
9
+ from lingua import Language, LanguageDetectorBuilder
10
+ import logging
11
+
12
+ from .base_agent import BaseAgent
13
+ from utils.html_parser import HTMLParser
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class PreprocessorAgent(BaseAgent):
19
+ """
20
+ Agent that preprocesses forum posts:
21
+ - Parses HTML to extract reply and quoted content
22
+ - Cleans and normalizes text
23
+ - Detects language
24
+ - Performs initial keyword-based relevance screening
25
+ """
26
+
27
+ # Lingua to ISO 639-1 language code mapping
28
+ LINGUA_TO_ISO = {
29
+ Language.ENGLISH: "en",
30
+ Language.SPANISH: "es",
31
+ Language.FRENCH: "fr",
32
+ Language.GERMAN: "de",
33
+ Language.ITALIAN: "it",
34
+ Language.PORTUGUESE: "pt",
35
+ Language.RUSSIAN: "ru",
36
+ Language.JAPANESE: "ja",
37
+ Language.KOREAN: "ko",
38
+ Language.CHINESE: "zh",
39
+ Language.ARABIC: "ar",
40
+ Language.HINDI: "hi",
41
+ Language.DUTCH: "nl",
42
+ Language.SWEDISH: "sv",
43
+ Language.POLISH: "pl",
44
+ Language.TURKISH: "tr"
45
+ }
46
+
47
    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords and products
        """
        super().__init__("PreprocessorAgent", config)
        self.brand_config = brand_config
        self.html_parser = HTMLParser()

        # Initialize lingua detector. Built from all supported languages:
        # broad coverage, at the cost of a slower one-time build.
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

        # Build keyword sets for efficient lookup (sets of lowercased terms)
        self._build_keyword_sets()

        logger.info("PreprocessorAgent initialized")
66
+
67
+ def _build_keyword_sets(self) -> None:
68
+ """Build keyword sets from brand configuration for efficient relevance checking."""
69
+ relevance_config = self.brand_config.get("relevance_keywords", {})
70
+
71
+ # Primary keywords - definitive Sabian mentions
72
+ primary = relevance_config.get("primary", {}).get("keywords", [])
73
+ self.primary_keywords: Set[str] = set(k.lower() for k in primary)
74
+
75
+ # Contextual keywords - need disambiguation (HH, AA)
76
+ contextual = relevance_config.get("contextual", {}).get("keywords", [])
77
+ self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)
78
+
79
+ # Cymbal context keywords - help disambiguate contextual terms
80
+ cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
81
+ self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)
82
+
83
+ # Competitor names for detection
84
+ competitors = self.brand_config.get("brand", {}).get("competitors", [])
85
+ self.competitor_keywords: Set[str] = set()
86
+ for comp in competitors:
87
+ if isinstance(comp, dict):
88
+ self.competitor_keywords.add(comp.get("name", "").lower())
89
+ for alias in comp.get("aliases", []):
90
+ self.competitor_keywords.add(alias.lower())
91
+ else:
92
+ self.competitor_keywords.add(str(comp).lower())
93
+
94
+ # Product names
95
+ products = self.brand_config.get("brand", {}).get("products", [])
96
+ self.product_keywords: Set[str] = set(p.lower() for p in products)
97
+
98
+ logger.info(f"Built keyword sets: {len(self.primary_keywords)} primary, "
99
+ f"{len(self.contextual_keywords)} contextual, "
100
+ f"{len(self.product_keywords)} products")
101
+
102
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
103
+ """
104
+ Validate that input contains required fields.
105
+
106
+ Args:
107
+ input_data: Input dictionary
108
+
109
+ Returns:
110
+ True if valid, False otherwise
111
+ """
112
+ required_fields = ["post_id", "post_content"]
113
+ return all(field in input_data for field in required_fields)
114
+
115
+ def _detect_language(self, text: str) -> Dict[str, Any]:
116
+ """
117
+ Detect the language of text using lingua library.
118
+
119
+ Args:
120
+ text: Text to analyze
121
+
122
+ Returns:
123
+ Dictionary with language detection results
124
+ """
125
+ try:
126
+ cleaned_text = text.strip()
127
+ if not cleaned_text or len(cleaned_text) < 3:
128
+ return {
129
+ "language": "English",
130
+ "language_code": "en",
131
+ "is_english": True,
132
+ "confidence": "low"
133
+ }
134
+
135
+ detected = self.language_detector.detect_language_of(cleaned_text)
136
+
137
+ if detected is None:
138
+ return {
139
+ "language": "English",
140
+ "language_code": "en",
141
+ "is_english": True,
142
+ "confidence": "low"
143
+ }
144
+
145
+ if detected == Language.ENGLISH:
146
+ return {
147
+ "language": "English",
148
+ "language_code": "en",
149
+ "is_english": True,
150
+ "confidence": "high"
151
+ }
152
+
153
+ lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
154
+ lang_name = detected.name.capitalize()
155
+
156
+ return {
157
+ "language": lang_name,
158
+ "language_code": lang_code,
159
+ "is_english": False,
160
+ "confidence": "high"
161
+ }
162
+
163
+ except Exception as e:
164
+ logger.warning(f"Language detection failed: {e}")
165
+ return {
166
+ "language": "English",
167
+ "language_code": "en",
168
+ "is_english": True,
169
+ "confidence": "low"
170
+ }
171
+
172
+ def _check_relevance(self, text: str) -> Dict[str, Any]:
173
+ """
174
+ Check if text is relevant to the brand using keyword matching.
175
+
176
+ Returns:
177
+ Dictionary with relevance assessment:
178
+ - preliminary_relevant: Initial relevance assessment
179
+ - needs_relevance_validation: True if contains ambiguous terms needing LLM check
180
+ - found_keywords: Keywords found in the text
181
+ - relevance_type: 'primary', 'contextual', or 'none'
182
+ """
183
+ text_lower = text.lower()
184
+
185
+ # Tokenize for word boundary matching
186
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
187
+
188
+ # Check for primary keywords (definitive matches)
189
+ found_primary = self.primary_keywords.intersection(words)
190
+ if found_primary:
191
+ return {
192
+ "preliminary_relevant": True,
193
+ "needs_relevance_validation": False,
194
+ "found_keywords": list(found_primary),
195
+ "relevance_type": "primary",
196
+ "relevance_confidence": "high"
197
+ }
198
+
199
+ # Check for contextual keywords (need validation)
200
+ found_contextual = self.contextual_keywords.intersection(words)
201
+ if found_contextual:
202
+ # Check if there's cymbal context
203
+ found_cymbal_context = self.cymbal_context_keywords.intersection(words)
204
+ has_cymbal_context = len(found_cymbal_context) > 0
205
+
206
+ return {
207
+ "preliminary_relevant": True, # Potentially relevant
208
+ "needs_relevance_validation": True, # Needs LLM confirmation
209
+ "found_keywords": list(found_contextual),
210
+ "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
211
+ "has_cymbal_context": has_cymbal_context,
212
+ "relevance_type": "contextual",
213
+ "relevance_confidence": "medium" if has_cymbal_context else "low"
214
+ }
215
+
216
+ # Check for competitor mentions (might be comparative discussion)
217
+ found_competitors = self.competitor_keywords.intersection(words)
218
+ if found_competitors:
219
+ # Has competitor mention but no Sabian mention
220
+ # Could still be relevant in a comparison context
221
+ return {
222
+ "preliminary_relevant": False,
223
+ "needs_relevance_validation": True, # LLM should check context
224
+ "found_keywords": list(found_competitors),
225
+ "relevance_type": "competitor_only",
226
+ "relevance_confidence": "low"
227
+ }
228
+
229
+ # No relevant keywords found
230
+ return {
231
+ "preliminary_relevant": False,
232
+ "needs_relevance_validation": False,
233
+ "found_keywords": [],
234
+ "relevance_type": "none",
235
+ "relevance_confidence": "high"
236
+ }
237
+
238
+ def _extract_mentioned_products(self, text: str) -> List[str]:
239
+ """
240
+ Extract product names mentioned in the text.
241
+
242
+ Args:
243
+ text: Text to search
244
+
245
+ Returns:
246
+ List of product names found
247
+ """
248
+ text_lower = text.lower()
249
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
250
+
251
+ found_products = []
252
+ products = self.brand_config.get("brand", {}).get("products", [])
253
+
254
+ for product in products:
255
+ if product.lower() in words:
256
+ found_products.append(product)
257
+
258
+ return found_products
259
+
260
+ def _extract_mentioned_competitors(self, text: str) -> List[str]:
261
+ """
262
+ Extract competitor names mentioned in the text.
263
+
264
+ Args:
265
+ text: Text to search
266
+
267
+ Returns:
268
+ List of competitor names found
269
+ """
270
+ text_lower = text.lower()
271
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
272
+
273
+ found_competitors = []
274
+ competitors = self.brand_config.get("brand", {}).get("competitors", [])
275
+
276
+ for comp in competitors:
277
+ if isinstance(comp, dict):
278
+ name = comp.get("name", "")
279
+ aliases = comp.get("aliases", [])
280
+
281
+ # Check name and aliases
282
+ if name.lower() in words:
283
+ if name not in found_competitors:
284
+ found_competitors.append(name)
285
+ else:
286
+ for alias in aliases:
287
+ if alias.lower() in words:
288
+ if name not in found_competitors:
289
+ found_competitors.append(name)
290
+ break
291
+ else:
292
+ if str(comp).lower() in words:
293
+ found_competitors.append(str(comp))
294
+
295
+ return found_competitors
296
+
297
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline: parse HTML -> empty-content short-circuit -> build thread
        context -> detect language -> keyword relevance screen -> extract
        product/competitor mentions. Relevance and extraction use only the
        author's own reply text, never quoted content or thread context.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content - nothing to analyze, mark and return early
            if not reply_content or len(reply_content.strip()) < 3:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Build thread context
            thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 3: Detect language
            lang_result = self._detect_language(reply_content)

            # Step 4: Check relevance - ONLY on the actual post content, NOT quoted/context
            # The quoted content and thread context are for understanding, not for relevance determination
            relevance_result = self._check_relevance(reply_content)

            # Step 5: Extract product and competitor mentions - ONLY from actual post content
            # We don't want to extract from quoted content as that will be processed separately
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "thread_context": thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],

                # Preserve original data
                # NOTE(review): spread last, so any of the computed keys above
                # that also exist in input_data are overwritten by the input
                # values - confirm this precedence is intended.
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
processing_brand_sentiment/workflow/agents/relevance_validator_agent.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Relevance Validator Agent for brand sentiment analysis.
3
+ Lightweight LLM-based agent that confirms whether ambiguous terms (HH, AA)
4
+ refer to Sabian products or generic terms.
5
+ """
6
+
7
+ from typing import Dict, Any
8
+ import json
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ import logging
12
+
13
+ from .base_agent import BaseAgent
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class RelevanceValidatorAgent(BaseAgent):
19
+ """
20
+ Agent that validates whether posts with ambiguous terms (like HH, AA)
21
+ are actually referring to Sabian products or generic terms.
22
+
23
+ This is a lightweight LLM call specifically for disambiguation.
24
+ """
25
+
26
    def __init__(self, config: Dict[str, Any], api_key: str, brand_config: Dict[str, Any]):
        """
        Initialize the Relevance Validator Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with product info
        """
        super().__init__("RelevanceValidatorAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config

        # self.model / self.temperature are presumably populated by
        # BaseAgent.__init__ from config - TODO confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Build disambiguation context from brand config
        self._build_disambiguation_context()

        logger.info("RelevanceValidatorAgent initialized")
49
+
50
+ def _build_disambiguation_context(self) -> None:
51
+ """Build context strings for disambiguation from brand config."""
52
+ brand = self.brand_config.get("brand", {})
53
+ ambiguous = brand.get("ambiguous_terms", {})
54
+
55
+ self.disambiguation_info = {}
56
+ for term, info in ambiguous.items():
57
+ if isinstance(info, dict):
58
+ self.disambiguation_info[term] = {
59
+ "description": info.get("description", ""),
60
+ "context_clues": info.get("disambiguation_context", [])
61
+ }
62
+ else:
63
+ self.disambiguation_info[term] = {
64
+ "description": str(info),
65
+ "context_clues": []
66
+ }
67
+
68
+ # Product descriptions for context
69
+ self.product_descriptions = brand.get("product_descriptions", {})
70
+
71
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
72
+ """
73
+ Validate that input contains required fields.
74
+
75
+ Args:
76
+ input_data: Input dictionary
77
+
78
+ Returns:
79
+ True if valid, False otherwise
80
+ """
81
+ required = ["cleaned_content", "relevance_keywords_found"]
82
+ return all(field in input_data for field in required)
83
+
84
    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance validation.

        Assembles the brand's product list and per-term disambiguation rules
        (from ``self.disambiguation_info``) into an instruction prompt that
        asks the LLM to judge relevance on the post content alone.

        Returns:
            The complete system prompt string.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        products = self.brand_config.get("brand", {}).get("products", [])

        # Build disambiguation rules - one bullet per ambiguous term
        disambiguation_rules = []
        for term, info in self.disambiguation_info.items():
            desc = info.get("description", "")
            clues = info.get("context_clues", [])
            rule = f"- '{term}': {desc}"
            if clues:
                rule += f" Context clues for {brand_name}: {', '.join(clues)}"
            disambiguation_rules.append(rule)

        disambiguation_text = "\n".join(disambiguation_rules) if disambiguation_rules else "No specific disambiguation rules."

        system_prompt = f"""You are an expert at identifying brand mentions in drum/cymbal forum discussions.

Your task is to determine if the POST CONTENT itself discusses {brand_name} products.

**CRITICAL RULE:**
- You must determine relevance based ONLY on the POST CONTENT
- The context (thread info, quoted/parent content) is provided to help you understand ambiguous terms
- But if the POST CONTENT itself does not mention or discuss {brand_name}, it is NOT relevant
- Example: If quoted content mentions Sabian but the post just says "Got it! Thanks!" → NOT relevant

**{brand_name} Product Lines:**
{', '.join(products)}

**Ambiguous Terms to Watch For:**
{disambiguation_text}

**Key Disambiguation Rules:**
- "HH" alone usually means "Hi-Hat" (a type of cymbal), NOT Sabian HH series
- "HH" WITH Sabian context IN THE POST (e.g., "Sabian HH", "HH crashes", "my HH ride") likely refers to Sabian
- "AA" alone might be a general abbreviation, NOT Sabian AA series
- "AA" WITH Sabian context IN THE POST (e.g., "Sabian AA", "AA cymbals", "AA medium ride") likely refers to Sabian
- Generic replies like "Thanks!", "Got it!", "Good point!" are NOT relevant even if context mentions {brand_name}

**Return JSON with:**
- is_relevant: boolean - true ONLY if the POST CONTENT itself discusses {brand_name} products
- confidence: "high", "medium", or "low"
- reason: brief explanation (1-2 sentences) - explain what IN THE POST made you decide
- detected_products: list of {brand_name} products mentioned IN THE POST (empty if none)

Return only valid JSON."""

        return system_prompt
133
+
134
    def validate_relevance(
        self,
        content: str,
        keywords_found: list,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """
        Validate whether content is relevant to the brand via one LLM call.

        Fails open: on an unparseable LLM response or any other error, the
        post is kept as relevant (low confidence) so the pipeline does not
        silently drop potentially relevant posts.

        Args:
            content: The cleaned post content
            keywords_found: Keywords that triggered validation
            thread_context: Thread context for additional context
            quoted_content: Quoted content if any

        Returns:
            Dictionary with validation results (``success``, ``is_relevant``,
            ``relevance_confidence``, ``relevance_reason``,
            ``detected_products``; ``error`` on failure)
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build context for the LLM (quoted content truncated to 300 chars)
        context_parts = []
        if thread_context:
            context_parts.append(f"Thread context: {thread_context}")
        if quoted_content:
            context_parts.append(f"Replying to: {quoted_content[:300]}...")

        context_str = "\n".join(context_parts) if context_parts else "No additional context."

        user_prompt = f"""Determine if this POST CONTENT discusses {brand_name} cymbal products.

**Keywords found in post:** {', '.join(keywords_found)}

**CONTEXT (for understanding ambiguous terms only - do NOT base relevance on this):**
{context_str}

**POST CONTENT TO EVALUATE (base your relevance decision ONLY on this):**
"{content}"

Does the POST CONTENT itself discuss {brand_name} products? Remember: generic replies are NOT relevant even if context mentions {brand_name}. Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                "is_relevant": result.get("is_relevant", False),
                "relevance_confidence": result.get("confidence", "low"),
                "relevance_reason": result.get("reason", ""),
                "detected_products": result.get("detected_products", [])
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error in relevance validation: {e}", "warning")
            # Default to relevant if we can't determine
            return {
                "success": True,
                "is_relevant": True,
                "relevance_confidence": "low",
                "relevance_reason": "Could not parse LLM response, defaulting to relevant",
                "detected_products": []
            }

        except Exception as e:
            self.log_processing(f"Relevance validation error: {e}", "error")
            return {
                "success": False,
                "is_relevant": True,  # Default to relevant on error
                "relevance_confidence": "low",
                "relevance_reason": f"Error during validation: {str(e)}",
                "detected_products": [],
                "error": str(e)
            }
214
+
215
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post to validate its relevance to the brand.

        Skips the LLM call entirely when ``needs_relevance_validation`` is
        False and simply promotes the preprocessor's preliminary assessment.

        Args:
            input_data: Dictionary containing:
                - cleaned_content: Cleaned post text
                - relevance_keywords_found: Keywords that triggered validation
                - thread_context: Optional thread context
                - quoted_content: Optional quoted content

        Returns:
            Dictionary with validation results and original data
        """
        try:
            if not self.validate_input(input_data):
                # NOTE(review): **input_data is spread last here, so an
                # existing "success"/"is_relevant" key in the input would
                # override these defaults - confirm intended.
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": True,  # Default to relevant
                    "relevance_confidence": "low",
                    **input_data
                }

            # Check if validation is actually needed
            if not input_data.get("needs_relevance_validation", False):
                # No validation needed, use preliminary assessment
                return {
                    "success": True,
                    "is_relevant": input_data.get("preliminary_relevant", False),
                    "relevance_confidence": input_data.get("relevance_confidence", "high"),
                    "relevance_reason": "No validation needed - preliminary assessment used",
                    "validation_performed": False,
                    **input_data
                }

            # Perform LLM validation
            validation_result = self.validate_relevance(
                content=input_data.get("cleaned_content", ""),
                keywords_found=input_data.get("relevance_keywords_found", []),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Merge results - LLM verdict overrides the preliminary fields
            result = {
                **input_data,
                "is_relevant": validation_result["is_relevant"],
                "relevance_confidence": validation_result["relevance_confidence"],
                "relevance_reason": validation_result["relevance_reason"],
                "validation_performed": True,
                "success": validation_result["success"]
            }

            # Update products detected if LLM found any
            if validation_result.get("detected_products"):
                existing_products = input_data.get("products_detected", [])
                llm_products = validation_result["detected_products"]
                # Merge without duplicates (set union loses ordering)
                all_products = list(set(existing_products + llm_products))
                result["products_detected"] = all_products

            if "error" in validation_result:
                result["validation_error"] = validation_result["error"]

            self.log_processing(
                f"Validated relevance for post: is_relevant={result['is_relevant']}, "
                f"confidence={result['relevance_confidence']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "relevance validation")
+ return self.handle_error(e, "relevance validation")
processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Analyzer Agent for comprehensive brand sentiment analysis.
3
+ LLM-based agent that extracts products, competitors, sentiment, intents,
4
+ pain points, and other brand intelligence from forum posts.
5
+ """
6
+
7
+ from typing import Dict, Any, List
8
+ import json
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ import logging
12
+
13
+ from .base_agent import BaseAgent
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class SabianAnalyzerAgent(BaseAgent):
19
+ """
20
+ Comprehensive brand analysis agent for Sabian cymbal discussions.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ config: Dict[str, Any],
26
+ api_key: str,
27
+ brand_config: Dict[str, Any],
28
+ analysis_categories: Dict[str, Any]
29
+ ):
30
+ super().__init__("SabianAnalyzerAgent", config)
31
+ self.api_key = api_key
32
+ self.brand_config = brand_config
33
+ self.analysis_categories = analysis_categories
34
+
35
+ self.llm = ChatOpenAI(
36
+ model=self.model,
37
+ temperature=self.temperature,
38
+ api_key=self.api_key
39
+ )
40
+
41
+ # Pre-compute valid values for validation
42
+ self._valid_values = self._compute_valid_values()
43
+ logger.info("SabianAnalyzerAgent initialized")
44
+
45
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
46
+ required = ["cleaned_content", "is_relevant"]
47
+ return all(field in input_data for field in required)
48
+
49
+ def _compute_valid_values(self) -> Dict[str, List[str]]:
50
+ """Pre-compute all valid values from config for validation."""
51
+ valid = {}
52
+
53
+ # Products from brand config
54
+ valid["products"] = self.brand_config.get("brand", {}).get("products", [])
55
+
56
+ # Competitors
57
+ competitor_names = []
58
+ for comp in self.brand_config.get("brand", {}).get("competitors", []):
59
+ if isinstance(comp, dict):
60
+ competitor_names.append(comp.get("name", ""))
61
+ valid["competitors"] = competitor_names
62
+
63
+ # Extract category values from analysis_categories
64
+ category_map = {
65
+ "author_role": "author_role",
66
+ "sabian_mention_context": "sabian_mention_context",
67
+ "sentiment_level": "sentiment",
68
+ "emotion_type": "emotions",
69
+ "intents": "intents",
70
+ "purchase_stage": "purchase_stage",
71
+ "comparison_type": "comparison_type",
72
+ "feedback_aspects": "feedback_aspects",
73
+ "decision_drivers": "decision_drivers",
74
+ "product_attributes": "product_attributes",
75
+ }
76
+
77
+ for key, config_key in category_map.items():
78
+ config_section = self.analysis_categories.get(config_key, {})
79
+ if "categories" in config_section:
80
+ valid[key] = [c["value"] for c in config_section["categories"]]
81
+ elif "levels" in config_section:
82
+ valid[key] = [c["value"] for c in config_section["levels"]]
83
+ else:
84
+ valid[key] = []
85
+
86
+ return valid
87
+
88
+ def _get_category_list(self, key: str) -> List[str]:
89
+ """Get list of valid values for a category."""
90
+ return self._valid_values.get(key, [])
91
+
92
    def _build_system_prompt(self) -> str:
        """Build optimized system prompt for brand analysis.

        Returns:
            A system prompt embedding the brand's product list, competitor
            names, and every whitelisted category value, plus the required
            JSON output schema. The template text is behavior-critical —
            downstream validation assumes the LLM follows it.
        """
        brand = self.brand_config.get("brand", {})
        brand_name = brand.get("name", "Sabian")
        products = brand.get("products", [])

        # Only dict-shaped competitor entries contribute names here.
        competitors = [c.get("name", "") for c in brand.get("competitors", []) if isinstance(c, dict)]

        # Get all valid values (pre-computed in __init__)
        v = self._valid_values

        return f"""You are a brand analyst extracting insights from forum posts about {brand_name} cymbals.

## STRICT RULES
1. Extract ONLY from POST CONTENT, never from quoted/context text
2. Use ONLY values from the lists below - return null/[] if no match
3. Sentiment must be about {brand_name} specifically, NOT overall post tone
4. pain_points/delight_factors use SAME value list (feedback_aspects) - classification determines positive vs negative

## VALID VALUES

**{brand_name} Products:** {products}
**Competitors:** {competitors}

| Field | Valid Values |
|-------|--------------|
| author_role | {v.get('author_role', [])} |
| sabian_mention_context | {v.get('sabian_mention_context', [])} |
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |

## KEY DISTINCTIONS

**Sentiment vs Intent:**
- sentiment_level = How author FEELS about {brand_name} (positive/negative/neutral)
- praising/criticizing intent = Author is actively ENDORSING or WARNING others

**Author-only fields (null if giving advice to others):**
- purchase_stage, decision_drivers, pain_points, delight_factors

**Example - Sabian-specific sentiment:**
Post: "Love my new drum kit! The SBR cymbals sound terrible though."
- Overall post: positive (happy about kit)
- {brand_name} sentiment: NEGATIVE (dislikes SBR sound)
- pain_points: ["sound_quality"]

## OUTPUT JSON
```json
{{
    "author_role": "value from list",
    "sabian_mention_context": "value from list",
    "sentiment_level": "value from list",
    "emotion_type": "value or null",
    "sentiment_confidence": "high|medium|low",
    "sarcasm_detected": false,
    "products_mentioned": [],
    "product_attributes": [],
    "competitors_mentioned": [],
    "competitor_products_owned": [],
    "comparison_type": "value or null",
    "intents": [],
    "purchase_stage": "value or null",
    "decision_drivers": [],
    "pain_points": [],
    "delight_factors": [],
    "analysis_notes": "1-2 sentences on key {brand_name}-specific insights"
}}
```

Return ONLY valid JSON."""
168
+
169
+ def analyze_post(
170
+ self,
171
+ content: str,
172
+ thread_context: str = "",
173
+ quoted_content: str = ""
174
+ ) -> Dict[str, Any]:
175
+ """Perform brand analysis on a post."""
176
+ brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
177
+
178
+ context_str = ""
179
+ if thread_context:
180
+ context_str += f"[Thread: {thread_context[:200]}] "
181
+ if quoted_content:
182
+ context_str += f"[Replying to: {quoted_content[:200]}...]"
183
+
184
+ user_prompt = f"""Analyze this post about {brand_name}.
185
+
186
+ CONTEXT (for understanding only, DO NOT extract from): {context_str or "None"}
187
+
188
+ POST CONTENT (extract from THIS only):
189
+ "{content}"
190
+
191
+ Return JSON only."""
192
+
193
+ try:
194
+ messages = [
195
+ SystemMessage(content=self._build_system_prompt()),
196
+ HumanMessage(content=user_prompt)
197
+ ]
198
+
199
+ response = self.llm.invoke(messages)
200
+ result = self._parse_llm_json_response(response.content)
201
+ validated = self._validate_and_normalize(result)
202
+
203
+ return {"success": True, **validated}
204
+
205
+ except json.JSONDecodeError as e:
206
+ self.log_processing(f"JSON decode error: {e}", "warning")
207
+ return {
208
+ "success": False,
209
+ "error": f"JSON parse error: {str(e)}",
210
+ "sentiment_level": "neutral",
211
+ "intents": ["general_discussion"]
212
+ }
213
+ except Exception as e:
214
+ self.log_processing(f"Analysis error: {e}", "error")
215
+ return {"success": False, "error": str(e)}
216
+
217
+ def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
218
+ """Validate single value against list, return canonical form or default."""
219
+ if value is None:
220
+ return default
221
+ if isinstance(value, str):
222
+ val_lower = value.lower()
223
+ for v in valid_list:
224
+ if v.lower() == val_lower:
225
+ return v
226
+ return default
227
+
228
+ def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
229
+ """Validate list values, return only valid items in canonical form."""
230
+ if not values:
231
+ return []
232
+ if not isinstance(values, list):
233
+ values = [values]
234
+
235
+ validated = []
236
+ valid_lower = {v.lower(): v for v in valid_list}
237
+ for val in values:
238
+ if isinstance(val, str) and val.lower() in valid_lower:
239
+ validated.append(valid_lower[val.lower()])
240
+ return validated
241
+
242
+ def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
243
+ """Validate all fields against predefined values and normalize."""
244
+ v = self._valid_values
245
+
246
+ normalized = {
247
+ # Classification
248
+ "author_role": self._validate_single(
249
+ result.get("author_role"), v["author_role"], "unknown"
250
+ ),
251
+ "sabian_mention_context": self._validate_single(
252
+ result.get("sabian_mention_context"), v["sabian_mention_context"], "casual_mention"
253
+ ),
254
+
255
+ # Sentiment
256
+ "sentiment_level": self._validate_single(
257
+ result.get("sentiment_level"), v["sentiment_level"], "neutral"
258
+ ),
259
+ "emotion_type": self._validate_single(
260
+ result.get("emotion_type"), v["emotion_type"], None
261
+ ),
262
+ "sentiment_confidence": result.get("sentiment_confidence", "medium"),
263
+ "sarcasm_detected": bool(result.get("sarcasm_detected", False)),
264
+
265
+ # Products
266
+ "products_mentioned": self._validate_list(
267
+ result.get("products_mentioned"), v["products"]
268
+ ),
269
+ "product_attributes": self._validate_list(
270
+ result.get("product_attributes"), v["product_attributes"]
271
+ ),
272
+
273
+ # Competitors
274
+ "competitors_mentioned": self._validate_list(
275
+ result.get("competitors_mentioned"), v["competitors"]
276
+ ),
277
+ "competitor_products_owned": self._validate_list(
278
+ result.get("competitor_products_owned"), v["competitors"]
279
+ ),
280
+ "comparison_type": self._validate_single(
281
+ result.get("comparison_type"), v["comparison_type"], None
282
+ ),
283
+
284
+ # Intents
285
+ "intents": self._validate_list(
286
+ result.get("intents"), v["intents"]
287
+ ) or ["general_discussion"],
288
+
289
+ # Author journey (null if advising others)
290
+ "purchase_stage": self._validate_single(
291
+ result.get("purchase_stage"), v["purchase_stage"], None
292
+ ),
293
+ "decision_drivers": self._validate_list(
294
+ result.get("decision_drivers"), v["decision_drivers"]
295
+ ),
296
+
297
+ # Feedback - both use feedback_aspects
298
+ "pain_points": self._validate_list(
299
+ result.get("pain_points"), v["feedback_aspects"]
300
+ ),
301
+ "delight_factors": self._validate_list(
302
+ result.get("delight_factors"), v["feedback_aspects"]
303
+ ),
304
+
305
+ # Notes
306
+ "analysis_notes": result.get("analysis_notes", ""),
307
+ }
308
+
309
+ # Log filtered values for debugging
310
+ for field in ["products_mentioned", "product_attributes", "pain_points", "delight_factors"]:
311
+ original = result.get(field, [])
312
+ if isinstance(original, list) and len(original) > len(normalized[field]):
313
+ filtered = set(str(x) for x in original) - set(normalized[field])
314
+ if filtered:
315
+ logger.debug(f"Filtered invalid {field}: {filtered}")
316
+
317
+ return normalized
318
+
319
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a post through brand analysis.

        Skips (with null analysis fields) posts that are not relevant or not
        English; otherwise runs `analyze_post` and merges its output over the
        input payload.

        Args:
            input_data: Post dict; must contain "cleaned_content" and
                "is_relevant" (enforced by validate_input).

        Returns:
            The input dict merged with analysis fields, plus "success" and
            "analysis_skipped" flags. On unexpected errors, whatever
            `handle_error` returns.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            # Skip non-relevant posts
            # NOTE(review): `**input_data` is spread LAST here, so any of
            # these literal keys (including "success") already present in
            # input_data will override the values set above them — the
            # opposite of the merge order used in the main path below.
            # Confirm this asymmetry is intentional.
            if not input_data.get("is_relevant", False):
                return {
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": "Post marked as not relevant",
                    "author_role": None,
                    "sabian_mention_context": None,
                    "sentiment_level": None,
                    "emotion_type": None,
                    "products_mentioned": [],
                    "competitors_mentioned": [],
                    "competitor_products_owned": [],
                    "intents": [],
                    "purchase_stage": None,
                    "decision_drivers": [],
                    "pain_points": [],
                    "delight_factors": [],
                    **input_data
                }

            # Skip non-English posts (language detected upstream;
            # defaults to English when the flag is absent)
            if not input_data.get("is_english", True):
                return {
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                    "author_role": None,
                    "sabian_mention_context": None,
                    "sentiment_level": None,
                    "emotion_type": None,
                    "intents": [],
                    "competitor_products_owned": [],
                    **input_data
                }

            # Perform analysis (LLM call + whitelist validation)
            analysis_result = self.analyze_post(
                content=input_data.get("cleaned_content", ""),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Main path: analysis output overrides input keys on collision.
            result = {
                **input_data,
                **analysis_result,
                "analysis_skipped": False
            }

            self.log_processing(
                f"Analyzed: sentiment={result.get('sentiment_level')}, "
                f"products={len(result.get('products_mentioned', []))}, "
                f"intents={result.get('intents', [])}",
                "debug"
            )

            return result

        except Exception as e:
            # Delegate uniform error shaping/logging to BaseAgent
            return self.handle_error(e, "brand analysis")
processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Relevance & Extraction Agent for brand sentiment analysis.
3
+
4
+ This agent performs two critical functions:
5
+ 1. Determines relevance with HIGH confidence using strict rules
6
+ 2. Extracts verifiable facts (products, author role, context summary)
7
+
8
+ Key Design Principles:
9
+ - Strict product matching: ONLY return products from predefined list
10
+ - Competitor awareness: Know what products belong to competitors
11
+ - Conservative relevance: When uncertain, mark as NOT relevant
12
+ - Thread context summarization: Provide clean, concise context for next agent
13
+ """
14
+
15
+ from typing import Dict, Any, List
16
+ import json
17
+ from langchain_openai import ChatOpenAI
18
+ from langchain.schema import HumanMessage, SystemMessage
19
+ import logging
20
+
21
+ from .base_agent import BaseAgent
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class SabianRelevanceExtractionAgent(BaseAgent):
27
+ """
28
+ Agent that validates relevance and extracts key facts from posts.
29
+
30
+ This agent is the first LLM call in the pipeline and serves as the
31
+ gatekeeper for relevance while also extracting structured information
32
+ for downstream analysis.
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ config: Dict[str, Any],
38
+ api_key: str,
39
+ brand_config: Dict[str, Any],
40
+ analysis_categories: Dict[str, Any]
41
+ ):
42
+ """
43
+ Initialize the Relevance & Extraction Agent.
44
+
45
+ Args:
46
+ config: Agent configuration
47
+ api_key: OpenAI API key
48
+ brand_config: Brand-specific configuration with products and competitors
49
+ analysis_categories: Category definitions for validation
50
+ """
51
+ super().__init__("SabianRelevanceExtractionAgent", config)
52
+ self.api_key = api_key
53
+ self.brand_config = brand_config
54
+ self.analysis_categories = analysis_categories
55
+
56
+ self.llm = ChatOpenAI(
57
+ model=self.model,
58
+ temperature=self.temperature,
59
+ api_key=self.api_key
60
+ )
61
+
62
+ # Pre-compute valid values
63
+ self._build_valid_values()
64
+ self._build_competitor_product_warnings()
65
+
66
+ logger.info("SabianRelevanceExtractionAgent initialized")
67
+
68
+ def _build_valid_values(self) -> None:
69
+ """Build valid value lists for validation."""
70
+ brand = self.brand_config.get("brand", {})
71
+
72
+ # Products
73
+ self.valid_products = brand.get("products", [])
74
+
75
+ # Competitors (brand names only)
76
+ self.valid_competitors = []
77
+ for comp in brand.get("competitors", []):
78
+ if isinstance(comp, dict):
79
+ self.valid_competitors.append(comp.get("name", ""))
80
+ else:
81
+ self.valid_competitors.append(str(comp))
82
+
83
+ # Author roles from categories
84
+ author_role_config = self.analysis_categories.get("author_role", {})
85
+ self.valid_author_roles = [
86
+ c["value"] for c in author_role_config.get("categories", [])
87
+ ]
88
+
89
+ # Sabian mention context from categories
90
+ mention_context_config = self.analysis_categories.get("sabian_mention_context", {})
91
+ self.valid_mention_contexts = [
92
+ c["value"] for c in mention_context_config.get("categories", [])
93
+ ]
94
+
95
+ def _build_competitor_product_warnings(self) -> None:
96
+ """Build list of competitor products to warn about in prompts."""
97
+ warnings = self.brand_config.get("brand", {}).get("competitor_products_warning", {})
98
+
99
+ self.competitor_products_by_brand = {}
100
+ for key, products in warnings.items():
101
+ if key == "description":
102
+ continue
103
+ # Extract brand name from key (e.g., "paiste_products" -> "Paiste")
104
+ brand_name = key.replace("_products", "").capitalize()
105
+ self.competitor_products_by_brand[brand_name] = products
106
+
107
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
108
+ """Validate input contains required fields."""
109
+ required = ["cleaned_content"]
110
+ return all(field in input_data for field in required)
111
+
112
    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance and extraction.

        Returns:
            A prompt embedding the brand product whitelist, competitor
            brands, per-brand competitor-product warnings, and the exact
            JSON schema the downstream validator expects. Template text is
            behavior-critical; do not edit casually.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build competitor product warnings (one bullet per competitor brand)
        competitor_warnings = []
        for brand, products in self.competitor_products_by_brand.items():
            products_str = ", ".join(f'"{p}"' for p in products[:5])  # Limit to 5 examples
            if len(products) > 5:
                products_str += f" (and {len(products)-5} more)"
            competitor_warnings.append(f"- {brand}: {products_str}")

        competitor_warnings_text = "\n".join(competitor_warnings) if competitor_warnings else "None specified"

        return f"""You are a brand mention extractor for {brand_name} cymbals. Your job is to:
1. Determine if the POST CONTENT discusses {brand_name} products or brand
2. Extract ONLY verifiable facts, not interpretations

## CRITICAL RULES

### Rule 1: Relevance Based on POST CONTENT Only
- The post is relevant ONLY if the POST CONTENT itself mentions {brand_name} brand or products
- Quoted/parent content mentioning {brand_name} does NOT make the post relevant
- Generic replies ("Thanks!", "Got it!", "Good point!") are NEVER relevant
- Posts can be relevant even without specific product mentions if they discuss the {brand_name} brand

### Rule 2: Strict Product Matching
{brand_name.upper()} PRODUCTS (use ONLY these exact values):
{self.valid_products}

CRITICAL:
- Return ONLY products from this exact list above
- If you see a product not in this list, do NOT include it
- Return empty list [] if no products from the list are mentioned
- It's OK to have empty products_mentioned if the post discusses {brand_name} brand generally

### Rule 3: Competitor Product Awareness
These products belong to COMPETITORS, NOT {brand_name}:
{competitor_warnings_text}

COMPETITOR BRANDS: {self.valid_competitors}
- Only return competitor BRAND names in competitors_mentioned (not their products)
- If you see "2002", "Signature", "Sound Edge", "Formula 602" - these are PAISTE, not {brand_name}
- If you see "K Custom", "A Custom" - these are ZILDJIAN, not {brand_name}

### Rule 4: Thread Context Summary
- Summarize thread context in 1-2 sentences MAXIMUM
- Focus only on what helps understand what the post is responding to
- If thread is about unrelated topics (pizza, general life), say so briefly
- Keep it factual and concise

### Rule 5: Author Role Classification
Determine the author's relationship to {brand_name}:
- current_owner: Currently owns/uses {brand_name} products
- past_owner: Previously owned but sold/replaced
- potential_buyer: Considering purchasing {brand_name}
- never_owned: Explicitly states they don't own {brand_name}
- unknown: Cannot determine from post content

### Rule 6: Mention Context Classification
How prominently is {brand_name} discussed IN THE POST CONTENT:
- primary_focus: {brand_name} is the main topic of the post
- significant_mention: {brand_name} discussed with some detail, but not main focus
- casual_mention: Brief mention among other topics
- comparison_context: Mentioned while comparing to competitors
- null: Not relevant (use when is_relevant=false)

## OUTPUT FORMAT
Return ONLY valid JSON with these exact fields:
```json
{{
    "is_relevant": true/false,
    "relevance_confidence": "high" | "medium" | "low",
    "relevance_reason": "1-2 sentences explaining your decision",
    "products_mentioned": [],
    "sabian_mention_context": "value from list" | null,
    "author_role": "value from list",
    "competitors_mentioned": [],
    "thread_context_summary": "1-2 sentence summary of thread context"
}}
```

IMPORTANT: Return ONLY the JSON object, no additional text."""
195
+
196
+ def _build_user_prompt(
197
+ self,
198
+ content: str,
199
+ quoted_content: str,
200
+ raw_thread_context: str,
201
+ keywords_found: List[str]
202
+ ) -> str:
203
+ """Build the user prompt with post content and context."""
204
+ brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
205
+
206
+ context_section = ""
207
+ if raw_thread_context:
208
+ # Truncate if too long
209
+ truncated_context = raw_thread_context[:1000] if len(raw_thread_context) > 1000 else raw_thread_context
210
+ context_section += f"THREAD CONTEXT (for understanding only):\n{truncated_context}\n\n"
211
+
212
+ if quoted_content:
213
+ truncated_quote = quoted_content[:500] if len(quoted_content) > 500 else quoted_content
214
+ context_section += f"QUOTED/PARENT CONTENT (for understanding only):\n{truncated_quote}\n\n"
215
+
216
+ keywords_info = ""
217
+ if keywords_found:
218
+ keywords_info = f"Keywords detected by preprocessor: {', '.join(keywords_found)}\n\n"
219
+
220
+ return f"""Analyze this post for {brand_name} relevance and extract facts.
221
+
222
+ {keywords_info}{context_section}POST CONTENT TO EVALUATE (base your decision ONLY on this):
223
+ \"\"\"{content}\"\"\"
224
+
225
+ Remember:
226
+ - is_relevant=true ONLY if POST CONTENT discusses {brand_name}
227
+ - products_mentioned must be from the exact product list provided
228
+ - competitors_mentioned should be brand names only (Zildjian, Paiste, etc.)
229
+ - thread_context_summary should be 1-2 sentences max
230
+
231
+ Return JSON only."""
232
+
233
+ def extract_and_validate(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
234
+ """
235
+ Perform relevance check and fact extraction.
236
+
237
+ Args:
238
+ input_data: Preprocessed post data
239
+
240
+ Returns:
241
+ Dictionary with extraction results
242
+ """
243
+ content = input_data.get("cleaned_content", "")
244
+ quoted_content = input_data.get("quoted_content", "")
245
+ raw_thread_context = input_data.get("raw_thread_context", "")
246
+ keywords_found = input_data.get("relevance_keywords_found", [])
247
+
248
+ try:
249
+ messages = [
250
+ SystemMessage(content=self._build_system_prompt()),
251
+ HumanMessage(content=self._build_user_prompt(
252
+ content, quoted_content, raw_thread_context, keywords_found
253
+ ))
254
+ ]
255
+
256
+ response = self.llm.invoke(messages)
257
+ result = self._parse_llm_json_response(response.content)
258
+
259
+ # Validate and normalize the response
260
+ validated = self._validate_response(result)
261
+
262
+ return {
263
+ "success": True,
264
+ **validated
265
+ }
266
+
267
+ except json.JSONDecodeError as e:
268
+ self.log_processing(f"JSON decode error: {e}", "warning")
269
+ return {
270
+ "success": False,
271
+ "error": f"JSON parse error: {str(e)}",
272
+ "is_relevant": False,
273
+ "relevance_confidence": "low",
274
+ "relevance_reason": "Failed to parse LLM response"
275
+ }
276
+
277
+ except Exception as e:
278
+ self.log_processing(f"Extraction error: {e}", "error")
279
+ return {
280
+ "success": False,
281
+ "error": str(e),
282
+ "is_relevant": False,
283
+ "relevance_confidence": "low",
284
+ "relevance_reason": f"Error during extraction: {str(e)}"
285
+ }
286
+
287
+ def _validate_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
288
+ """Validate and normalize LLM response against allowed values."""
289
+
290
+ # Validate products
291
+ products = result.get("products_mentioned", [])
292
+ if not isinstance(products, list):
293
+ products = []
294
+ valid_products = [
295
+ p for p in products
296
+ if any(p.lower() == vp.lower() for vp in self.valid_products)
297
+ ]
298
+ # Normalize to canonical case
299
+ normalized_products = []
300
+ for p in valid_products:
301
+ for vp in self.valid_products:
302
+ if p.lower() == vp.lower():
303
+ normalized_products.append(vp)
304
+ break
305
+
306
+ # Validate competitors
307
+ competitors = result.get("competitors_mentioned", [])
308
+ if not isinstance(competitors, list):
309
+ competitors = []
310
+ valid_competitors = [
311
+ c for c in competitors
312
+ if any(c.lower() == vc.lower() for vc in self.valid_competitors)
313
+ ]
314
+ # Normalize to canonical case
315
+ normalized_competitors = []
316
+ for c in valid_competitors:
317
+ for vc in self.valid_competitors:
318
+ if c.lower() == vc.lower():
319
+ normalized_competitors.append(vc)
320
+ break
321
+
322
+ # Validate author_role
323
+ author_role = result.get("author_role", "unknown")
324
+ if author_role not in self.valid_author_roles:
325
+ author_role = "unknown"
326
+
327
+ # Validate sabian_mention_context
328
+ mention_context = result.get("sabian_mention_context")
329
+ is_relevant = result.get("is_relevant", False)
330
+
331
+ if not is_relevant:
332
+ mention_context = None
333
+ elif mention_context and mention_context not in self.valid_mention_contexts:
334
+ mention_context = "casual_mention" # Default for relevant posts
335
+
336
+ # Validate confidence
337
+ confidence = result.get("relevance_confidence", "medium")
338
+ if confidence not in ["high", "medium", "low"]:
339
+ confidence = "medium"
340
+
341
+ return {
342
+ "is_relevant": bool(is_relevant),
343
+ "relevance_confidence": confidence,
344
+ "relevance_reason": result.get("relevance_reason", ""),
345
+ "products_mentioned": normalized_products,
346
+ "sabian_mention_context": mention_context,
347
+ "author_role": author_role,
348
+ "competitors_mentioned": normalized_competitors,
349
+ "thread_context_summary": result.get("thread_context_summary", "")
350
+ }
351
+
352
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post through relevance validation and fact extraction.

        Short-circuits (without an LLM call) when the preprocessor found
        nothing relevant and requested no validation, or when the post is
        not English; otherwise delegates to extract_and_validate.

        Args:
            input_data: Dictionary from preprocessor containing:
                - cleaned_content: Cleaned post text
                - quoted_content: Quoted content if any
                - raw_thread_context: Raw thread context
                - relevance_keywords_found: Keywords from preprocessor
                - preliminary_relevant: Preprocessor's relevance assessment
                - needs_relevance_validation: Whether LLM validation needed

        Returns:
            Dictionary with extraction results and original data
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": False,
                    **input_data
                }

            # Skip if already determined not relevant and no validation needed
            # NOTE(review): `**input_data` is spread LAST in these early
            # returns, so any of the literal keys above it (including
            # "success" and "is_relevant") that already exist in input_data
            # override the values written here — the opposite of the merge
            # order in the main path below. Confirm this is intentional.
            if (not input_data.get("preliminary_relevant", False) and
                not input_data.get("needs_relevance_validation", False)):
                return {
                    "success": True,
                    "is_relevant": False,
                    "relevance_confidence": "high",
                    "relevance_reason": "No Sabian-related keywords found in post",
                    "products_mentioned": [],
                    "sabian_mention_context": None,
                    "author_role": "unknown",
                    "competitors_mentioned": input_data.get("competitors_detected", []),
                    "thread_context_summary": "",
                    "extraction_performed": False,
                    **input_data
                }

            # Skip non-English posts (flag defaults to True when absent)
            if not input_data.get("is_english", True):
                return {
                    "success": True,
                    "is_relevant": False,
                    "relevance_confidence": "high",
                    "relevance_reason": f"Non-English post: {input_data.get('detected_language')}",
                    "products_mentioned": [],
                    "sabian_mention_context": None,
                    "author_role": "unknown",
                    "competitors_mentioned": [],
                    "thread_context_summary": "",
                    "extraction_performed": False,
                    **input_data
                }

            # Perform LLM extraction
            extraction_result = self.extract_and_validate(input_data)

            # Merge results: extraction output overrides input on collision.
            result = {
                **input_data,
                **extraction_result,
                "extraction_performed": True
            }

            # Log the result
            self.log_processing(
                f"Extraction complete: is_relevant={result.get('is_relevant')}, "
                f"products={result.get('products_mentioned')}, "
                f"context={result.get('sabian_mention_context')}",
                "debug"
            )

            return result

        except Exception as e:
            # Uniform error shaping/logging via BaseAgent
            return self.handle_error(e, "relevance extraction")
processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Sentiment & Intent Analyzer Agent for brand sentiment analysis.
3
+
4
+ This agent performs deep analysis on VERIFIED relevant posts with STRUCTURED input.
5
+ It receives pre-validated data from the Relevance Extraction Agent including:
6
+ - Products already extracted and validated
7
+ - Thread context already summarized
8
+ - Author role already determined
9
+
10
+ Key Design Principles:
11
+ - Focused analysis: Only sentiment, intents, and customer journey
12
+ - No re-extraction: Products are given, not re-detected
13
+ - Sabian-specific sentiment: How author feels about Sabian, not overall post tone
14
+ - Author perspective: Pain points/delights only from author's own experience
15
+ """
16
+
17
+ from typing import Dict, Any, List
18
+ import json
19
+ from langchain_openai import ChatOpenAI
20
+ from langchain.schema import HumanMessage, SystemMessage
21
+ import logging
22
+
23
+ from .base_agent import BaseAgent
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class SabianSentimentAnalyzerAgent(BaseAgent):
29
+ """
30
+ Agent that performs deep sentiment and intent analysis on relevant posts.
31
+
32
+ This agent is the second LLM call in the pipeline and focuses purely on
33
+ analysis, not extraction. It receives structured input from the extraction
34
+ agent and produces sentiment, intent, and customer journey insights.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ config: Dict[str, Any],
40
+ api_key: str,
41
+ brand_config: Dict[str, Any],
42
+ analysis_categories: Dict[str, Any]
43
+ ):
44
+ """
45
+ Initialize the Sentiment Analyzer Agent.
46
+
47
+ Args:
48
+ config: Agent configuration
49
+ api_key: OpenAI API key
50
+ brand_config: Brand-specific configuration
51
+ analysis_categories: Category definitions for analysis
52
+ """
53
+ super().__init__("SabianSentimentAnalyzerAgent", config)
54
+ self.api_key = api_key
55
+ self.brand_config = brand_config
56
+ self.analysis_categories = analysis_categories
57
+
58
+ self.llm = ChatOpenAI(
59
+ model=self.model,
60
+ temperature=self.temperature,
61
+ api_key=self.api_key
62
+ )
63
+
64
+ # Pre-compute valid values for validation
65
+ self._valid_values = self._compute_valid_values()
66
+
67
+ logger.info("SabianSentimentAnalyzerAgent initialized")
68
+
69
+ def _compute_valid_values(self) -> Dict[str, List[str]]:
70
+ """Pre-compute all valid values from config for validation."""
71
+ valid = {}
72
+
73
+ # Products from brand config
74
+ valid["products"] = self.brand_config.get("brand", {}).get("products", [])
75
+
76
+ # Competitors
77
+ competitor_names = []
78
+ for comp in self.brand_config.get("brand", {}).get("competitors", []):
79
+ if isinstance(comp, dict):
80
+ competitor_names.append(comp.get("name", ""))
81
+ valid["competitors"] = competitor_names
82
+
83
+ # Extract category values from analysis_categories
84
+ category_map = {
85
+ "sentiment_level": "sentiment",
86
+ "emotion_type": "emotions",
87
+ "intents": "intents",
88
+ "purchase_stage": "purchase_stage",
89
+ "comparison_type": "comparison_type",
90
+ "feedback_aspects": "feedback_aspects",
91
+ "decision_drivers": "decision_drivers",
92
+ "product_attributes": "product_attributes",
93
+ }
94
+
95
+ for key, config_key in category_map.items():
96
+ config_section = self.analysis_categories.get(config_key, {})
97
+ if "categories" in config_section:
98
+ valid[key] = [c["value"] for c in config_section["categories"]]
99
+ elif "levels" in config_section:
100
+ valid[key] = [c["value"] for c in config_section["levels"]]
101
+ else:
102
+ valid[key] = []
103
+
104
+ return valid
105
+
106
+ def _get_valid_list(self, key: str) -> List[str]:
107
+ """Get list of valid values for a category."""
108
+ return self._valid_values.get(key, [])
109
+
110
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
111
+ """Validate that input contains required fields."""
112
+ required = ["cleaned_content", "is_relevant"]
113
+ return all(field in input_data for field in required)
114
+
115
    def _build_system_prompt(self) -> str:
        """
        Build the system prompt for sentiment analysis.

        The prompt interpolates the allow-lists from self._valid_values so
        the model is constrained to exactly the vocabulary that
        _validate_and_normalize() later enforces.

        Returns:
            Fully rendered system prompt string.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        v = self._valid_values

        # NOTE: {{ / }} render as literal braces in the JSON example below.
        return f"""You are a sentiment analyst for {brand_name} cymbal discussions.

## YOUR TASK
Analyze the sentiment, emotions, and intents in posts about {brand_name}.
You will receive PRE-VALIDATED context (products, author role, etc.) - trust these values.

## CRITICAL RULES

### Rule 1: Neutral by Default
Sentiment defaults to NEUTRAL unless there is EXPLICIT positive or negative language toward {brand_name}.
- Factual statements = neutral
- Comparative statements ("sounds different", "not the same as") = neutral (different ≠ worse)
- Advice-giving without personal opinion = neutral

Only assign positive/negative sentiment when the author CLEARLY expresses satisfaction or dissatisfaction with {brand_name}.

### Rule 2: {brand_name}-Specific Sentiment
Sentiment MUST be about {brand_name} specifically, NOT overall post tone or other products.

EXAMPLE:
Post: "I have SBR cymbals and bought a Pearl crash. The Pearl sounds different from the SBR. Go with what feels best!"
- This is NEUTRAL toward {brand_name} - "different" is not criticism
- The author owns SBR (no complaint), is giving advice
- pain_points: [] (no negative experience expressed)
- delight_factors: [] (no positive experience expressed)

### Rule 3: Mutually Exclusive Feedback
pain_points and delight_factors CANNOT contain the same values.
- If an aspect is positive → delight_factors only
- If an aspect is negative → pain_points only
- Never both

### Rule 4: Author Perspective Only
These fields are ONLY for author's OWN experience, not advice to others:
- purchase_stage, decision_drivers, pain_points, delight_factors

If author is primarily giving ADVICE to someone else, these should be null/empty.

### Rule 5: Valid Values

| Field | Valid Values |
|-------|--------------|
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi-select) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |
| competitor brands | {v.get('competitors', [])} |

### Rule 6: Intent Classification
- seeking_information: Asking questions, seeking advice
- providing_information: Answering questions, giving advice
- sharing_experience: Personal experience, review, testimonial
- comparing: Comparing brands/products
- praising: Actively endorsing {brand_name}
- criticizing: Actively complaining about {brand_name}
- buying_selling: Listing gear for sale/trade
- general_discussion: General conversation

## OUTPUT FORMAT
```json
{{
  "sentiment_level": "neutral unless explicit positive/negative",
  "emotion_type": "value or null",
  "sentiment_confidence": "high" | "medium" | "low",
  "sarcasm_detected": false,
  "product_attributes": [],
  "competitor_products_owned": [],
  "comparison_type": "value or null",
  "intents": [],
  "purchase_stage": "value or null",
  "decision_drivers": [],
  "pain_points": [],
  "delight_factors": [],
  "analysis_notes": "1-2 sentences"
}}
```

Return ONLY valid JSON."""
+
203
    def _build_user_prompt(self, input_data: Dict[str, Any]) -> str:
        """
        Build the per-post user prompt with structured context.

        Args:
            input_data: Output of the extraction agent; expected keys include
                cleaned_content, products_mentioned, sabian_mention_context,
                author_role, thread_context_summary, competitors_mentioned.

        Returns:
            Rendered prompt string containing the pre-validated context block
            followed by the post content to analyze.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        content = input_data.get("cleaned_content", "")
        products_mentioned = input_data.get("products_mentioned", [])
        sabian_context = input_data.get("sabian_mention_context", "")
        author_role = input_data.get("author_role", "unknown")
        thread_summary = input_data.get("thread_context_summary", "")
        competitors_mentioned = input_data.get("competitors_mentioned", [])

        # Context extracted by the upstream agent is presented as trusted
        # facts so the model does not re-derive them from the raw text.
        context_section = f"""## PRE-VALIDATED CONTEXT (trust these values)
- Products mentioned: {products_mentioned if products_mentioned else 'None specific'}
- {brand_name} mention context: {sabian_context}
- Author role: {author_role}
- Competitors mentioned: {competitors_mentioned if competitors_mentioned else 'None'}
- Thread summary: {thread_summary if thread_summary else 'Not available'}
"""

        return f"""Analyze this post about {brand_name} for sentiment and intents.

{context_section}
## POST CONTENT TO ANALYZE:
\"\"\"{content}\"\"\"

Remember:
- Sentiment is about {brand_name} ONLY, not overall post tone
- pain_points/delight_factors only from author's OWN experience
- Use only values from the valid lists provided

Return JSON only."""
+
235
+ def analyze_post(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
236
+ """
237
+ Perform sentiment and intent analysis.
238
+
239
+ Args:
240
+ input_data: Structured data from extraction agent
241
+
242
+ Returns:
243
+ Dictionary with analysis results
244
+ """
245
+ try:
246
+ messages = [
247
+ SystemMessage(content=self._build_system_prompt()),
248
+ HumanMessage(content=self._build_user_prompt(input_data))
249
+ ]
250
+
251
+ response = self.llm.invoke(messages)
252
+ result = self._parse_llm_json_response(response.content)
253
+
254
+ # Validate and normalize
255
+ validated = self._validate_and_normalize(result)
256
+
257
+ return {"success": True, **validated}
258
+
259
+ except json.JSONDecodeError as e:
260
+ self.log_processing(f"JSON decode error: {e}", "warning")
261
+ return {
262
+ "success": False,
263
+ "error": f"JSON parse error: {str(e)}",
264
+ "sentiment_level": "neutral",
265
+ "intents": ["general_discussion"]
266
+ }
267
+
268
+ except Exception as e:
269
+ self.log_processing(f"Analysis error: {e}", "error")
270
+ return {"success": False, "error": str(e)}
271
+
272
+ def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
273
+ """Validate single value against list, return canonical form or default."""
274
+ if value is None:
275
+ return default
276
+ if isinstance(value, str):
277
+ val_lower = value.lower()
278
+ for v in valid_list:
279
+ if v.lower() == val_lower:
280
+ return v
281
+ return default
282
+
283
+ def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
284
+ """Validate list values, return only valid items in canonical form."""
285
+ if not values:
286
+ return []
287
+ if not isinstance(values, list):
288
+ values = [values]
289
+
290
+ validated = []
291
+ valid_lower = {v.lower(): v for v in valid_list}
292
+ for val in values:
293
+ if isinstance(val, str) and val.lower() in valid_lower:
294
+ validated.append(valid_lower[val.lower()])
295
+ return validated
296
+
297
+ def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
298
+ """Validate all fields against predefined values and normalize."""
299
+ v = self._valid_values
300
+
301
+ normalized = {
302
+ # Sentiment
303
+ "sentiment_level": self._validate_single(
304
+ result.get("sentiment_level"), v["sentiment_level"], "neutral"
305
+ ),
306
+ "emotion_type": self._validate_single(
307
+ result.get("emotion_type"), v["emotion_type"], None
308
+ ),
309
+ "sentiment_confidence": result.get("sentiment_confidence", "medium"),
310
+ "sarcasm_detected": bool(result.get("sarcasm_detected", False)),
311
+
312
+ # Product info
313
+ "product_attributes": self._validate_list(
314
+ result.get("product_attributes"), v["product_attributes"]
315
+ ),
316
+
317
+ # Competitors
318
+ "competitor_products_owned": self._validate_list(
319
+ result.get("competitor_products_owned"), v["competitors"]
320
+ ),
321
+ "comparison_type": self._validate_single(
322
+ result.get("comparison_type"), v["comparison_type"], None
323
+ ),
324
+
325
+ # Intents
326
+ "intents": self._validate_list(
327
+ result.get("intents"), v["intents"]
328
+ ) or ["general_discussion"],
329
+
330
+ # Author journey (null if advising others)
331
+ "purchase_stage": self._validate_single(
332
+ result.get("purchase_stage"), v["purchase_stage"], None
333
+ ),
334
+ "decision_drivers": self._validate_list(
335
+ result.get("decision_drivers"), v["decision_drivers"]
336
+ ),
337
+
338
+ # Feedback - both use feedback_aspects
339
+ "pain_points": self._validate_list(
340
+ result.get("pain_points"), v["feedback_aspects"]
341
+ ),
342
+ "delight_factors": self._validate_list(
343
+ result.get("delight_factors"), v["feedback_aspects"]
344
+ ),
345
+
346
+ # Notes
347
+ "analysis_notes": result.get("analysis_notes", ""),
348
+ }
349
+
350
+ # Validate confidence
351
+ if normalized["sentiment_confidence"] not in ["high", "medium", "low"]:
352
+ normalized["sentiment_confidence"] = "medium"
353
+
354
+ return normalized
355
+
356
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
357
+ """
358
+ Process a post through sentiment and intent analysis.
359
+
360
+ Args:
361
+ input_data: Dictionary from extraction agent containing:
362
+ - cleaned_content: Post text
363
+ - is_relevant: Relevance determination
364
+ - products_mentioned: Pre-validated products
365
+ - sabian_mention_context: How Sabian is discussed
366
+ - author_role: Author's relationship to Sabian
367
+ - thread_context_summary: Summarized context
368
+ - competitors_mentioned: Competitor brands
369
+
370
+ Returns:
371
+ Dictionary with analysis results and original data
372
+ """
373
+ try:
374
+ if not self.validate_input(input_data):
375
+ return {
376
+ "success": False,
377
+ "error": "Invalid input: missing required fields",
378
+ **input_data
379
+ }
380
+
381
+ # Skip non-relevant posts
382
+ if not input_data.get("is_relevant", False):
383
+ return {
384
+ "success": True,
385
+ "analysis_skipped": True,
386
+ "analysis_skip_reason": "Post marked as not relevant",
387
+ "sentiment_level": None,
388
+ "emotion_type": None,
389
+ "sentiment_confidence": None,
390
+ "sarcasm_detected": False,
391
+ "product_attributes": [],
392
+ "competitor_products_owned": [],
393
+ "comparison_type": None,
394
+ "intents": [],
395
+ "purchase_stage": None,
396
+ "decision_drivers": [],
397
+ "pain_points": [],
398
+ "delight_factors": [],
399
+ "analysis_notes": "",
400
+ **input_data
401
+ }
402
+
403
+ # Skip non-English posts (should already be filtered, but double-check)
404
+ if not input_data.get("is_english", True):
405
+ return {
406
+ "success": True,
407
+ "analysis_skipped": True,
408
+ "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
409
+ "sentiment_level": None,
410
+ "emotion_type": None,
411
+ "intents": [],
412
+ **input_data
413
+ }
414
+
415
+ # Perform analysis
416
+ analysis_result = self.analyze_post(input_data)
417
+
418
+ result = {
419
+ **input_data,
420
+ **analysis_result,
421
+ "analysis_skipped": False
422
+ }
423
+
424
+ self.log_processing(
425
+ f"Analyzed: sentiment={result.get('sentiment_level')}, "
426
+ f"intents={result.get('intents')}, "
427
+ f"pain_points={result.get('pain_points')}",
428
+ "debug"
429
+ )
430
+
431
+ return result
432
+
433
+ except Exception as e:
434
+ return self.handle_error(e, "sentiment analysis")
processing_brand_sentiment/workflow/comment_orchestrator.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comment Analysis Workflow Orchestrator using LangGraph.
3
+
4
+ Coordinates the 4-agent pipeline for social media comments:
5
+ 1. CommentPreprocessorAgent - Plain text cleaning, keyword detection (no LLM)
6
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) [shared]
7
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) [shared]
8
+ 4. OutputValidatorAgent - Rule-based validation (no LLM) [shared]
9
+
10
+ Architecture v4.0:
11
+ - Same analysis pipeline as forums, different preprocessing and state
12
+ - Plain text input (no HTML parsing)
13
+ - Context from social media content metadata and parent comments
14
+ - Comment-specific identifiers (comment_sk, comment_id, platform, etc.)
15
+ """
16
+
17
+ from typing import Dict, Any, List, TypedDict, Annotated, Optional
18
+ import operator
19
+ import json
20
+ import os
21
+ from langgraph.graph import StateGraph, END
22
+ import logging
23
+
24
+ from .agents.comment_preprocessor_agent import CommentPreprocessorAgent
25
+ from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
26
+ from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
27
+ from .agents.output_validator_agent import OutputValidatorAgent
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class CommentAnalysisState(TypedDict):
33
+ """
34
+ State definition for the comment analysis workflow v4.0.
35
+
36
+ Uses comment-specific identifiers but shares the same analysis fields
37
+ as the forum workflow for consistent output.
38
+ """
39
+ # ============== Source Identifiers (Comment-specific) ==============
40
+ comment_sk: int
41
+ comment_id: str
42
+ platform: str
43
+ comment_timestamp: Any
44
+ author_name: str
45
+ author_id: str
46
+ parent_comment_id: str
47
+ parent_comment_text: str
48
+
49
+ # Content metadata
50
+ content_sk: int
51
+ content_id: str
52
+ content_description: str
53
+ content_title: str
54
+ channel_sk: int
55
+ channel_name: str
56
+ channel_display_name: str
57
+
58
+ # ============== Original Content ==============
59
+ comment_text: str
60
+ original_text: str
61
+
62
+ # ============== Preprocessor Output ==============
63
+ cleaned_content: str
64
+ quoted_content: str
65
+ has_quote: bool
66
+ quoted_author: str
67
+ raw_thread_context: str # Comment context (reuses field name for agent compatibility)
68
+ is_empty: bool
69
+
70
+ # Language detection
71
+ detected_language: str
72
+ language_code: str
73
+ is_english: bool
74
+ language_confidence: str
75
+ language_detection_skipped: bool
76
+
77
+ # Preliminary relevance (keyword-based)
78
+ preliminary_relevant: bool
79
+ needs_relevance_validation: bool
80
+ relevance_keywords_found: List[str]
81
+ relevance_type: str
82
+ has_primary_keywords: bool
83
+
84
+ # Initial detections
85
+ products_detected: List[str]
86
+ competitors_detected: List[str]
87
+
88
+ # ============== Extraction Agent Output ==============
89
+ is_relevant: bool
90
+ relevance_confidence: str
91
+ relevance_reason: str
92
+ extraction_performed: bool
93
+
94
+ # Extracted facts
95
+ products_mentioned: List[str]
96
+ sabian_mention_context: str
97
+ author_role: str
98
+ competitors_mentioned: List[str]
99
+ thread_context_summary: str
100
+
101
+ # ============== Sentiment Analyzer Output ==============
102
+ sentiment_level: str
103
+ emotion_type: str
104
+ sentiment_confidence: str
105
+ sarcasm_detected: bool
106
+
107
+ # Product information
108
+ product_attributes: List[str]
109
+
110
+ # Competitive intelligence
111
+ competitor_products_owned: List[str]
112
+ comparison_type: str
113
+
114
+ # Customer journey (AUTHOR PERSPECTIVE ONLY)
115
+ intents: List[str]
116
+ purchase_stage: str
117
+ decision_drivers: List[str]
118
+ pain_points: List[str]
119
+ delight_factors: List[str]
120
+
121
+ # Analysis notes
122
+ analysis_notes: str
123
+ analysis_skipped: bool
124
+ analysis_skip_reason: str
125
+
126
+ # ============== Validator Output ==============
127
+ validation_passed: bool
128
+ validation_errors: List[str]
129
+ validation_warnings: List[str]
130
+ validation_flags: List[str]
131
+ processing_status: str
132
+
133
+ # ============== Processing Metadata ==============
134
+ processing_errors: Annotated[List[str], operator.add]
135
+ success: bool
136
+
137
+
138
+ class CommentAnalysisWorkflow:
139
+ """
140
+ LangGraph-based workflow for comment brand sentiment analysis v4.0.
141
+
142
+ Pipeline:
143
+ 1. Comment Preprocessor (no LLM) - plain text, comment context
144
+ 2. Relevance & Extraction Agent (LLM #1) - shared with forums
145
+ 3. Sentiment Analyzer Agent (LLM #2) - shared with forums
146
+ 4. Output Validator (no LLM) - shared with forums
147
+ """
148
+
149
+ def __init__(
150
+ self,
151
+ workflow_config: Dict[str, Any],
152
+ brand_config: Dict[str, Any],
153
+ analysis_categories: Dict[str, Any],
154
+ api_key: str
155
+ ):
156
+ """
157
+ Initialize the workflow with agents and configuration.
158
+
159
+ Args:
160
+ workflow_config: Workflow and agent configuration
161
+ brand_config: Brand-specific configuration
162
+ analysis_categories: Analysis category definitions
163
+ api_key: OpenAI API key
164
+ """
165
+ self.workflow_config = workflow_config
166
+ self.brand_config = brand_config
167
+ self.analysis_categories = analysis_categories
168
+ self.api_key = api_key
169
+
170
+ # Initialize agents
171
+ self._init_agents()
172
+
173
+ # Build the workflow graph
174
+ self.workflow = self._build_workflow()
175
+
176
+ logger.info("CommentAnalysisWorkflow v4.0 initialized successfully")
177
+
178
+ def _init_agents(self) -> None:
179
+ """Initialize all agents with their configurations."""
180
+ agents_config = self.workflow_config.get("agents", {})
181
+
182
+ # 1. Comment Preprocessor Agent (no LLM) - comment-specific
183
+ preprocessor_config = agents_config.get("preprocessor", {})
184
+ self.preprocessor = CommentPreprocessorAgent(
185
+ preprocessor_config,
186
+ self.brand_config
187
+ )
188
+
189
+ # 2. Relevance & Extraction Agent (LLM #1) - shared with forums
190
+ extraction_config = agents_config.get("relevance_extraction",
191
+ agents_config.get("relevance_validator", {})
192
+ )
193
+ self.extraction_agent = SabianRelevanceExtractionAgent(
194
+ extraction_config,
195
+ self.api_key,
196
+ self.brand_config,
197
+ self.analysis_categories
198
+ )
199
+
200
+ # 3. Sentiment Analyzer Agent (LLM #2) - shared with forums
201
+ analyzer_config = agents_config.get("sentiment_analyzer",
202
+ agents_config.get("brand_analyzer", {})
203
+ )
204
+ self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
205
+ analyzer_config,
206
+ self.api_key,
207
+ self.brand_config,
208
+ self.analysis_categories
209
+ )
210
+
211
+ # 4. Output Validator Agent (no LLM) - shared with forums
212
+ validator_config = agents_config.get("output_validator", {})
213
+ self.output_validator = OutputValidatorAgent(
214
+ validator_config,
215
+ self.brand_config,
216
+ self.analysis_categories
217
+ )
218
+
219
+ logger.info("All 4 agents initialized for comment processing")
220
+
221
+ def _build_workflow(self) -> StateGraph:
222
+ """
223
+ Build the LangGraph workflow.
224
+
225
+ Flow:
226
+ preprocessing -> extraction -> (analysis if relevant) -> validation -> END
227
+
228
+ Returns:
229
+ Compiled StateGraph workflow
230
+ """
231
+ workflow = StateGraph(CommentAnalysisState)
232
+
233
+ # Add nodes
234
+ workflow.add_node("preprocessing", self._preprocessing_node)
235
+ workflow.add_node("extraction", self._extraction_node)
236
+ workflow.add_node("analysis", self._analysis_node)
237
+ workflow.add_node("validation", self._validation_node)
238
+
239
+ # Set entry point
240
+ workflow.set_entry_point("preprocessing")
241
+
242
+ # Define edges
243
+ workflow.add_conditional_edges(
244
+ "preprocessing",
245
+ self._route_after_preprocessing,
246
+ {
247
+ "extract": "extraction",
248
+ "skip_to_validation": "validation"
249
+ }
250
+ )
251
+
252
+ workflow.add_conditional_edges(
253
+ "extraction",
254
+ self._route_after_extraction,
255
+ {
256
+ "analyze": "analysis",
257
+ "skip_to_validation": "validation"
258
+ }
259
+ )
260
+
261
+ workflow.add_edge("analysis", "validation")
262
+ workflow.add_edge("validation", END)
263
+
264
+ return workflow.compile()
265
+
266
+ def _preprocessing_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
267
+ """
268
+ Preprocessing node: Plain text cleaning, language detection, keyword check.
269
+ """
270
+ try:
271
+ input_data = {
272
+ "comment_sk": state.get("comment_sk"),
273
+ "comment_text": state.get("comment_text", ""),
274
+ "content_title": state.get("content_title"),
275
+ "content_description": state.get("content_description"),
276
+ "parent_comment_text": state.get("parent_comment_text")
277
+ }
278
+
279
+ result = self.preprocessor.process(input_data)
280
+
281
+ if result.get("success", False):
282
+ # Content
283
+ state["cleaned_content"] = result.get("cleaned_content", "")
284
+ state["quoted_content"] = result.get("quoted_content")
285
+ state["has_quote"] = result.get("has_quote", False)
286
+ state["quoted_author"] = result.get("quoted_author")
287
+ state["raw_thread_context"] = result.get("raw_thread_context", "")
288
+ state["is_empty"] = result.get("is_empty", False)
289
+ state["original_text"] = result.get("original_text", state.get("comment_text", ""))
290
+
291
+ # Language
292
+ state["detected_language"] = result.get("detected_language", "English")
293
+ state["language_code"] = result.get("language_code", "en")
294
+ state["is_english"] = result.get("is_english", True)
295
+ state["language_confidence"] = result.get("language_confidence", "low")
296
+ state["language_detection_skipped"] = result.get("language_detection_skipped", False)
297
+
298
+ # Relevance
299
+ state["preliminary_relevant"] = result.get("preliminary_relevant", False)
300
+ state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
301
+ state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
302
+ state["relevance_type"] = result.get("relevance_type", "none")
303
+ state["has_primary_keywords"] = result.get("has_primary_keywords", False)
304
+
305
+ # Detections
306
+ state["products_detected"] = result.get("products_detected", [])
307
+ state["competitors_detected"] = result.get("competitors_detected", [])
308
+
309
+ state["success"] = True
310
+ else:
311
+ error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
312
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
313
+ state["success"] = False
314
+
315
+ logger.debug(f"Preprocessing complete for comment {state.get('comment_sk')}")
316
+ return state
317
+
318
+ except Exception as e:
319
+ error_msg = f"Preprocessing node error: {str(e)}"
320
+ logger.error(error_msg)
321
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
322
+ state["success"] = False
323
+ return state
324
+
325
+ def _extraction_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
326
+ """
327
+ Extraction node: LLM-based relevance validation and fact extraction.
328
+ Reuses the same extraction agent as forums.
329
+ """
330
+ try:
331
+ input_data = {
332
+ "cleaned_content": state.get("cleaned_content", ""),
333
+ "quoted_content": state.get("quoted_content"),
334
+ "raw_thread_context": state.get("raw_thread_context", ""),
335
+ "relevance_keywords_found": state.get("relevance_keywords_found", []),
336
+ "preliminary_relevant": state.get("preliminary_relevant", False),
337
+ "needs_relevance_validation": state.get("needs_relevance_validation", True),
338
+ "products_detected": state.get("products_detected", []),
339
+ "competitors_detected": state.get("competitors_detected", []),
340
+ "is_english": state.get("is_english", True),
341
+ "detected_language": state.get("detected_language", "English")
342
+ }
343
+
344
+ result = self.extraction_agent.process(input_data)
345
+
346
+ # Update state with extraction results
347
+ state["is_relevant"] = result.get("is_relevant", False)
348
+ state["relevance_confidence"] = result.get("relevance_confidence", "low")
349
+ state["relevance_reason"] = result.get("relevance_reason", "")
350
+ state["extraction_performed"] = result.get("extraction_performed", True)
351
+
352
+ # Extracted facts
353
+ state["products_mentioned"] = result.get("products_mentioned", [])
354
+ state["sabian_mention_context"] = result.get("sabian_mention_context")
355
+ state["author_role"] = result.get("author_role", "unknown")
356
+ state["competitors_mentioned"] = result.get("competitors_mentioned", [])
357
+ state["thread_context_summary"] = result.get("thread_context_summary", "")
358
+
359
+ if not result.get("success", False) and result.get("error"):
360
+ state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]
361
+
362
+ logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
363
+ return state
364
+
365
+ except Exception as e:
366
+ error_msg = f"Extraction node error: {str(e)}"
367
+ logger.error(error_msg)
368
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
369
+ state["is_relevant"] = False
370
+ state["relevance_confidence"] = "low"
371
+ return state
372
+
373
+ def _analysis_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
374
+ """
375
+ Analysis node: Deep sentiment and intent analysis for relevant comments.
376
+ Reuses the same sentiment analyzer as forums.
377
+ """
378
+ try:
379
+ input_data = {
380
+ "cleaned_content": state.get("cleaned_content", ""),
381
+ "is_relevant": state.get("is_relevant", True),
382
+ "is_english": state.get("is_english", True),
383
+ "detected_language": state.get("detected_language", "English"),
384
+ "products_mentioned": state.get("products_mentioned", []),
385
+ "sabian_mention_context": state.get("sabian_mention_context"),
386
+ "author_role": state.get("author_role", "unknown"),
387
+ "competitors_mentioned": state.get("competitors_mentioned", []),
388
+ "thread_context_summary": state.get("thread_context_summary", "")
389
+ }
390
+
391
+ result = self.sentiment_analyzer.process(input_data)
392
+
393
+ if result.get("success", False):
394
+ # Sentiment
395
+ state["sentiment_level"] = result.get("sentiment_level")
396
+ state["emotion_type"] = result.get("emotion_type")
397
+ state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
398
+ state["sarcasm_detected"] = result.get("sarcasm_detected", False)
399
+
400
+ # Products
401
+ state["product_attributes"] = result.get("product_attributes", [])
402
+
403
+ # Competitive
404
+ state["competitor_products_owned"] = result.get("competitor_products_owned", [])
405
+ state["comparison_type"] = result.get("comparison_type")
406
+
407
+ # Journey
408
+ state["intents"] = result.get("intents", [])
409
+ state["purchase_stage"] = result.get("purchase_stage")
410
+ state["decision_drivers"] = result.get("decision_drivers", [])
411
+ state["pain_points"] = result.get("pain_points", [])
412
+ state["delight_factors"] = result.get("delight_factors", [])
413
+
414
+ # Notes
415
+ state["analysis_notes"] = result.get("analysis_notes", "")
416
+ state["analysis_skipped"] = result.get("analysis_skipped", False)
417
+ state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
418
+ else:
419
+ error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
420
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
421
+
422
+ logger.debug(f"Analysis complete for comment {state.get('comment_sk')}")
423
+ return state
424
+
425
+ except Exception as e:
426
+ error_msg = f"Analysis node error: {str(e)}"
427
+ logger.error(error_msg)
428
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
429
+ return state
430
+
431
+ def _validation_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
432
+ """
433
+ Validation node: Rule-based validation and anomaly detection.
434
+ Reuses the same output validator as forums.
435
+ """
436
+ try:
437
+ result = self.output_validator.process(dict(state))
438
+
439
+ state["validation_passed"] = result.get("validation_passed", True)
440
+ state["validation_errors"] = result.get("validation_errors", [])
441
+ state["validation_warnings"] = result.get("validation_warnings", [])
442
+ state["validation_flags"] = result.get("validation_flags", [])
443
+ state["processing_status"] = result.get("processing_status", "completed")
444
+
445
+ # Set overall success
446
+ has_errors = len(state.get("processing_errors", [])) > 0
447
+ state["success"] = not has_errors or state.get("is_relevant") is not None
448
+
449
+ logger.debug(f"Validation complete: status={state['processing_status']}")
450
+ return state
451
+
452
+ except Exception as e:
453
+ error_msg = f"Validation node error: {str(e)}"
454
+ logger.error(error_msg)
455
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
456
+ state["validation_passed"] = False
457
+ state["processing_status"] = "validation_failed"
458
+ state["success"] = False
459
+ return state
460
+
461
+ def _route_after_preprocessing(self, state: CommentAnalysisState) -> str:
462
+ """Determine routing after preprocessing."""
463
+ if state.get("is_empty", False):
464
+ state["is_relevant"] = False
465
+ state["relevance_reason"] = "Empty content"
466
+ return "skip_to_validation"
467
+
468
+ if not state.get("is_english", True):
469
+ state["is_relevant"] = False
470
+ state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
471
+ return "skip_to_validation"
472
+
473
+ if (not state.get("preliminary_relevant", False) and
474
+ not state.get("needs_relevance_validation", False)):
475
+ state["is_relevant"] = False
476
+ state["relevance_reason"] = "No relevant keywords found"
477
+ return "skip_to_validation"
478
+
479
+ return "extract"
480
+
481
+ def _route_after_extraction(self, state: CommentAnalysisState) -> str:
482
+ """Determine routing after extraction."""
483
+ if state.get("is_relevant", False):
484
+ return "analyze"
485
+ return "skip_to_validation"
486
+
487
+ def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
488
+ """
489
+ Process a single social media comment through the workflow.
490
+
491
+ Args:
492
+ comment_data: Dictionary containing comment data
493
+
494
+ Returns:
495
+ Dictionary with processed results
496
+ """
497
+ try:
498
+ initial_state = {
499
+ # Comment identifiers
500
+ "comment_sk": comment_data.get("comment_sk"),
501
+ "comment_id": comment_data.get("comment_id"),
502
+ "platform": comment_data.get("platform"),
503
+ "comment_timestamp": comment_data.get("comment_timestamp"),
504
+ "author_name": comment_data.get("author_name"),
505
+ "author_id": comment_data.get("author_id"),
506
+ "parent_comment_id": comment_data.get("parent_comment_id"),
507
+ "parent_comment_text": comment_data.get("parent_comment_text"),
508
+
509
+ # Content metadata
510
+ "content_sk": comment_data.get("content_sk"),
511
+ "content_id": comment_data.get("content_id"),
512
+ "content_description": comment_data.get("content_description"),
513
+ "content_title": comment_data.get("content_title"),
514
+ "channel_sk": comment_data.get("channel_sk"),
515
+ "channel_name": comment_data.get("channel_name"),
516
+ "channel_display_name": comment_data.get("channel_display_name"),
517
+
518
+ # Comment text
519
+ "comment_text": comment_data.get("comment_text", ""),
520
+
521
+ # Processing metadata
522
+ "processing_errors": [],
523
+ "success": True
524
+ }
525
+
526
+ final_state = self.workflow.invoke(initial_state)
527
+
528
+ return dict(final_state)
529
+
530
+ except Exception as e:
531
+ logger.error(f"Workflow execution error: {str(e)}")
532
+ return {
533
+ **comment_data,
534
+ "success": False,
535
+ "processing_errors": [str(e)],
536
+ "processing_status": "workflow_error"
537
+ }
538
+
539
+ def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
540
+ """
541
+ Process a batch of social media comments.
542
+
543
+ Args:
544
+ comments: List of comment dictionaries
545
+
546
+ Returns:
547
+ List of processed comment dictionaries
548
+ """
549
+ results = []
550
+ total = len(comments)
551
+
552
+ for idx, comment in enumerate(comments, 1):
553
+ logger.info(f"Processing comment {idx}/{total} (SK: {comment.get('comment_sk')})")
554
+ result = self.process_comment(comment)
555
+ results.append(result)
556
+
557
+ logger.info(f"Batch processing complete: {total} comments processed")
558
+ return results
processing_brand_sentiment/workflow/orchestrator.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Brand Analysis Workflow Orchestrator using LangGraph.
3
+
4
+ Coordinates the 4-agent pipeline:
5
+ 1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (no LLM)
6
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1)
7
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2)
8
+ 4. OutputValidatorAgent - Rule-based validation (no LLM)
9
+
10
+ Architecture v4.0:
11
+ - Separation of concerns: extraction vs analysis
12
+ - Strict validation at every step
13
+ - Structured data flow between agents
14
+ - Conservative relevance determination
15
+ """
16
+
17
+ from typing import Dict, Any, List, TypedDict, Annotated, Optional
18
+ import operator
19
+ import json
20
+ import os
21
+ from langgraph.graph import StateGraph, END
22
+ import logging
23
+
24
+ from .agents.content_preprocessor_agent import ContentPreprocessorAgent
25
+ from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
26
+ from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
27
+ from .agents.output_validator_agent import OutputValidatorAgent
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class BrandAnalysisState(TypedDict):
33
+ """
34
+ State definition for the brand analysis workflow v4.0.
35
+
36
+ This state flows through all agents, accumulating data at each step.
37
+ """
38
+ # ============== Source Identifiers ==============
39
+ post_id: int
40
+ thread_id: int
41
+ post_author_id: int
42
+
43
+ # ============== Original Content ==============
44
+ post_content: str
45
+ original_content: str
46
+
47
+ # ============== Thread Context ==============
48
+ thread_title: str
49
+ thread_first_post: str
50
+ thread_started_at: Any
51
+ category_title: str
52
+ category_topic: str
53
+
54
+ # ============== Timestamps ==============
55
+ post_created_at: Any
56
+
57
+ # ============== Preprocessor Output ==============
58
+ cleaned_content: str
59
+ quoted_content: str
60
+ has_quote: bool
61
+ quoted_author: str
62
+ raw_thread_context: str # Raw context for extraction agent
63
+ is_empty: bool
64
+
65
+ # Language detection
66
+ detected_language: str
67
+ language_code: str
68
+ is_english: bool
69
+ language_confidence: str
70
+ language_detection_skipped: bool
71
+
72
+ # Preliminary relevance (keyword-based)
73
+ preliminary_relevant: bool
74
+ needs_relevance_validation: bool
75
+ relevance_keywords_found: List[str]
76
+ relevance_type: str
77
+ has_primary_keywords: bool
78
+
79
+ # Initial detections
80
+ products_detected: List[str]
81
+ competitors_detected: List[str]
82
+
83
+ # ============== Extraction Agent Output ==============
84
+ is_relevant: bool
85
+ relevance_confidence: str
86
+ relevance_reason: str
87
+ extraction_performed: bool
88
+
89
+ # Extracted facts
90
+ products_mentioned: List[str]
91
+ sabian_mention_context: str # primary_focus, significant_mention, casual_mention, comparison_context
92
+ author_role: str # current_owner, past_owner, potential_buyer, never_owned, unknown
93
+ competitors_mentioned: List[str]
94
+ thread_context_summary: str # NEW: Summarized context for storage and analysis
95
+
96
+ # ============== Sentiment Analyzer Output ==============
97
+ sentiment_level: str
98
+ emotion_type: str
99
+ sentiment_confidence: str
100
+ sarcasm_detected: bool
101
+
102
+ # Product information
103
+ product_attributes: List[str]
104
+
105
+ # Competitive intelligence
106
+ competitor_products_owned: List[str]
107
+ comparison_type: str
108
+
109
+ # Customer journey (AUTHOR PERSPECTIVE ONLY)
110
+ intents: List[str]
111
+ purchase_stage: str
112
+ decision_drivers: List[str]
113
+ pain_points: List[str]
114
+ delight_factors: List[str]
115
+
116
+ # Analysis notes
117
+ analysis_notes: str
118
+ analysis_skipped: bool
119
+ analysis_skip_reason: str
120
+
121
+ # ============== Validator Output ==============
122
+ validation_passed: bool
123
+ validation_errors: List[str]
124
+ validation_warnings: List[str]
125
+ validation_flags: List[str]
126
+ processing_status: str # completed, completed_with_flags, validation_failed
127
+
128
+ # ============== Processing Metadata ==============
129
+ processing_errors: Annotated[List[str], operator.add]
130
+ success: bool
131
+
132
+
133
+ class BrandAnalysisWorkflow:
134
+ """
135
+ LangGraph-based workflow for brand sentiment analysis v4.0.
136
+
137
+ Pipeline:
138
+ 1. Content Preprocessor (no LLM)
139
+ 2. Relevance & Extraction Agent (LLM #1)
140
+ 3. Sentiment Analyzer Agent (LLM #2) - only for relevant posts
141
+ 4. Output Validator (no LLM)
142
+ """
143
+
144
+ def __init__(
145
+ self,
146
+ workflow_config: Dict[str, Any],
147
+ brand_config: Dict[str, Any],
148
+ analysis_categories: Dict[str, Any],
149
+ api_key: str
150
+ ):
151
+ """
152
+ Initialize the workflow with agents and configuration.
153
+
154
+ Args:
155
+ workflow_config: Workflow and agent configuration
156
+ brand_config: Brand-specific configuration
157
+ analysis_categories: Analysis category definitions
158
+ api_key: OpenAI API key
159
+ """
160
+ self.workflow_config = workflow_config
161
+ self.brand_config = brand_config
162
+ self.analysis_categories = analysis_categories
163
+ self.api_key = api_key
164
+
165
+ # Initialize agents
166
+ self._init_agents()
167
+
168
+ # Build the workflow graph
169
+ self.workflow = self._build_workflow()
170
+
171
+ logger.info("BrandAnalysisWorkflow v4.0 initialized successfully")
172
+
173
+ def _init_agents(self) -> None:
174
+ """Initialize all agents with their configurations."""
175
+ agents_config = self.workflow_config.get("agents", {})
176
+
177
+ # 1. Content Preprocessor Agent (no LLM)
178
+ preprocessor_config = agents_config.get("preprocessor", {})
179
+ self.preprocessor = ContentPreprocessorAgent(
180
+ preprocessor_config,
181
+ self.brand_config
182
+ )
183
+
184
+ # 2. Relevance & Extraction Agent (LLM #1)
185
+ extraction_config = agents_config.get("relevance_extraction",
186
+ agents_config.get("relevance_validator", {}) # Fallback to old config
187
+ )
188
+ self.extraction_agent = SabianRelevanceExtractionAgent(
189
+ extraction_config,
190
+ self.api_key,
191
+ self.brand_config,
192
+ self.analysis_categories
193
+ )
194
+
195
+ # 3. Sentiment Analyzer Agent (LLM #2)
196
+ analyzer_config = agents_config.get("sentiment_analyzer",
197
+ agents_config.get("brand_analyzer", {}) # Fallback to old config
198
+ )
199
+ self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
200
+ analyzer_config,
201
+ self.api_key,
202
+ self.brand_config,
203
+ self.analysis_categories
204
+ )
205
+
206
+ # 4. Output Validator Agent (no LLM)
207
+ validator_config = agents_config.get("output_validator", {})
208
+ self.output_validator = OutputValidatorAgent(
209
+ validator_config,
210
+ self.brand_config,
211
+ self.analysis_categories
212
+ )
213
+
214
+ logger.info("All 4 agents initialized")
215
+
216
+ def _build_workflow(self) -> StateGraph:
217
+ """
218
+ Build the LangGraph workflow.
219
+
220
+ Flow:
221
+ preprocessing -> extraction -> (analysis if relevant) -> validation -> END
222
+
223
+ Returns:
224
+ Compiled StateGraph workflow
225
+ """
226
+ workflow = StateGraph(BrandAnalysisState)
227
+
228
+ # Add nodes
229
+ workflow.add_node("preprocessing", self._preprocessing_node)
230
+ workflow.add_node("extraction", self._extraction_node)
231
+ workflow.add_node("analysis", self._analysis_node)
232
+ workflow.add_node("validation", self._validation_node)
233
+
234
+ # Set entry point
235
+ workflow.set_entry_point("preprocessing")
236
+
237
+ # Define edges
238
+ # Preprocessing -> conditional routing
239
+ workflow.add_conditional_edges(
240
+ "preprocessing",
241
+ self._route_after_preprocessing,
242
+ {
243
+ "extract": "extraction",
244
+ "skip_to_validation": "validation"
245
+ }
246
+ )
247
+
248
+ # Extraction -> conditional routing
249
+ workflow.add_conditional_edges(
250
+ "extraction",
251
+ self._route_after_extraction,
252
+ {
253
+ "analyze": "analysis",
254
+ "skip_to_validation": "validation"
255
+ }
256
+ )
257
+
258
+ # Analysis -> validation
259
+ workflow.add_edge("analysis", "validation")
260
+
261
+ # Validation -> END
262
+ workflow.add_edge("validation", END)
263
+
264
+ return workflow.compile()
265
+
266
+ def _preprocessing_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
267
+ """
268
+ Preprocessing node: HTML parsing, cleaning, language detection, keyword check.
269
+ """
270
+ try:
271
+ input_data = {
272
+ "post_id": state.get("post_id"),
273
+ "post_content": state.get("post_content", ""),
274
+ "thread_title": state.get("thread_title"),
275
+ "thread_first_post": state.get("thread_first_post"),
276
+ "category_title": state.get("category_title"),
277
+ "category_topic": state.get("category_topic")
278
+ }
279
+
280
+ result = self.preprocessor.process(input_data)
281
+
282
+ if result.get("success", False):
283
+ # Content
284
+ state["cleaned_content"] = result.get("cleaned_content", "")
285
+ state["quoted_content"] = result.get("quoted_content")
286
+ state["has_quote"] = result.get("has_quote", False)
287
+ state["quoted_author"] = result.get("quoted_author")
288
+ state["raw_thread_context"] = result.get("raw_thread_context", "")
289
+ state["is_empty"] = result.get("is_empty", False)
290
+ state["original_content"] = result.get("original_content", state.get("post_content", ""))
291
+
292
+ # Language
293
+ state["detected_language"] = result.get("detected_language", "English")
294
+ state["language_code"] = result.get("language_code", "en")
295
+ state["is_english"] = result.get("is_english", True)
296
+ state["language_confidence"] = result.get("language_confidence", "low")
297
+ state["language_detection_skipped"] = result.get("language_detection_skipped", False)
298
+
299
+ # Relevance
300
+ state["preliminary_relevant"] = result.get("preliminary_relevant", False)
301
+ state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
302
+ state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
303
+ state["relevance_type"] = result.get("relevance_type", "none")
304
+ state["has_primary_keywords"] = result.get("has_primary_keywords", False)
305
+
306
+ # Detections
307
+ state["products_detected"] = result.get("products_detected", [])
308
+ state["competitors_detected"] = result.get("competitors_detected", [])
309
+
310
+ state["success"] = True
311
+ else:
312
+ error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
313
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
314
+ state["success"] = False
315
+
316
+ logger.debug(f"Preprocessing complete for post {state.get('post_id')}")
317
+ return state
318
+
319
+ except Exception as e:
320
+ error_msg = f"Preprocessing node error: {str(e)}"
321
+ logger.error(error_msg)
322
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
323
+ state["success"] = False
324
+ return state
325
+
326
+ def _extraction_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
327
+ """
328
+ Extraction node: LLM-based relevance validation and fact extraction.
329
+ """
330
+ try:
331
+ input_data = {
332
+ "cleaned_content": state.get("cleaned_content", ""),
333
+ "quoted_content": state.get("quoted_content"),
334
+ "raw_thread_context": state.get("raw_thread_context", ""),
335
+ "relevance_keywords_found": state.get("relevance_keywords_found", []),
336
+ "preliminary_relevant": state.get("preliminary_relevant", False),
337
+ "needs_relevance_validation": state.get("needs_relevance_validation", True),
338
+ "products_detected": state.get("products_detected", []),
339
+ "competitors_detected": state.get("competitors_detected", []),
340
+ "is_english": state.get("is_english", True),
341
+ "detected_language": state.get("detected_language", "English")
342
+ }
343
+
344
+ result = self.extraction_agent.process(input_data)
345
+
346
+ # Update state with extraction results
347
+ state["is_relevant"] = result.get("is_relevant", False)
348
+ state["relevance_confidence"] = result.get("relevance_confidence", "low")
349
+ state["relevance_reason"] = result.get("relevance_reason", "")
350
+ state["extraction_performed"] = result.get("extraction_performed", True)
351
+
352
+ # Extracted facts
353
+ state["products_mentioned"] = result.get("products_mentioned", [])
354
+ state["sabian_mention_context"] = result.get("sabian_mention_context")
355
+ state["author_role"] = result.get("author_role", "unknown")
356
+ state["competitors_mentioned"] = result.get("competitors_mentioned", [])
357
+ state["thread_context_summary"] = result.get("thread_context_summary", "")
358
+
359
+ if not result.get("success", False) and result.get("error"):
360
+ state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]
361
+
362
+ logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
363
+ return state
364
+
365
+ except Exception as e:
366
+ error_msg = f"Extraction node error: {str(e)}"
367
+ logger.error(error_msg)
368
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
369
+ state["is_relevant"] = False
370
+ state["relevance_confidence"] = "low"
371
+ return state
372
+
373
+ def _analysis_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
374
+ """
375
+ Analysis node: Deep sentiment and intent analysis for relevant posts.
376
+ """
377
+ try:
378
+ input_data = {
379
+ "cleaned_content": state.get("cleaned_content", ""),
380
+ "is_relevant": state.get("is_relevant", True),
381
+ "is_english": state.get("is_english", True),
382
+ "detected_language": state.get("detected_language", "English"),
383
+ "products_mentioned": state.get("products_mentioned", []),
384
+ "sabian_mention_context": state.get("sabian_mention_context"),
385
+ "author_role": state.get("author_role", "unknown"),
386
+ "competitors_mentioned": state.get("competitors_mentioned", []),
387
+ "thread_context_summary": state.get("thread_context_summary", "")
388
+ }
389
+
390
+ result = self.sentiment_analyzer.process(input_data)
391
+
392
+ if result.get("success", False):
393
+ # Sentiment
394
+ state["sentiment_level"] = result.get("sentiment_level")
395
+ state["emotion_type"] = result.get("emotion_type")
396
+ state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
397
+ state["sarcasm_detected"] = result.get("sarcasm_detected", False)
398
+
399
+ # Products
400
+ state["product_attributes"] = result.get("product_attributes", [])
401
+
402
+ # Competitive
403
+ state["competitor_products_owned"] = result.get("competitor_products_owned", [])
404
+ state["comparison_type"] = result.get("comparison_type")
405
+
406
+ # Journey
407
+ state["intents"] = result.get("intents", [])
408
+ state["purchase_stage"] = result.get("purchase_stage")
409
+ state["decision_drivers"] = result.get("decision_drivers", [])
410
+ state["pain_points"] = result.get("pain_points", [])
411
+ state["delight_factors"] = result.get("delight_factors", [])
412
+
413
+ # Notes
414
+ state["analysis_notes"] = result.get("analysis_notes", "")
415
+ state["analysis_skipped"] = result.get("analysis_skipped", False)
416
+ state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
417
+ else:
418
+ error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
419
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
420
+
421
+ logger.debug(f"Analysis complete for post {state.get('post_id')}")
422
+ return state
423
+
424
+ except Exception as e:
425
+ error_msg = f"Analysis node error: {str(e)}"
426
+ logger.error(error_msg)
427
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
428
+ return state
429
+
430
+ def _validation_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
431
+ """
432
+ Validation node: Rule-based validation and anomaly detection.
433
+ """
434
+ try:
435
+ result = self.output_validator.process(dict(state))
436
+
437
+ state["validation_passed"] = result.get("validation_passed", True)
438
+ state["validation_errors"] = result.get("validation_errors", [])
439
+ state["validation_warnings"] = result.get("validation_warnings", [])
440
+ state["validation_flags"] = result.get("validation_flags", [])
441
+ state["processing_status"] = result.get("processing_status", "completed")
442
+
443
+ # Set overall success
444
+ has_errors = len(state.get("processing_errors", [])) > 0
445
+ state["success"] = not has_errors or state.get("is_relevant") is not None
446
+
447
+ logger.debug(f"Validation complete: status={state['processing_status']}")
448
+ return state
449
+
450
+ except Exception as e:
451
+ error_msg = f"Validation node error: {str(e)}"
452
+ logger.error(error_msg)
453
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
454
+ state["validation_passed"] = False
455
+ state["processing_status"] = "validation_failed"
456
+ state["success"] = False
457
+ return state
458
+
459
+ def _route_after_preprocessing(self, state: BrandAnalysisState) -> str:
460
+ """
461
+ Determine routing after preprocessing.
462
+ """
463
+ # If empty content, skip to validation
464
+ if state.get("is_empty", False):
465
+ state["is_relevant"] = False
466
+ state["relevance_reason"] = "Empty content"
467
+ return "skip_to_validation"
468
+
469
+ # If not English, skip to validation
470
+ if not state.get("is_english", True):
471
+ state["is_relevant"] = False
472
+ state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
473
+ return "skip_to_validation"
474
+
475
+ # If no keywords found and no need for validation, skip
476
+ if (not state.get("preliminary_relevant", False) and
477
+ not state.get("needs_relevance_validation", False)):
478
+ state["is_relevant"] = False
479
+ state["relevance_reason"] = "No relevant keywords found"
480
+ return "skip_to_validation"
481
+
482
+ # Otherwise, go to extraction
483
+ return "extract"
484
+
485
+ def _route_after_extraction(self, state: BrandAnalysisState) -> str:
486
+ """
487
+ Determine routing after extraction.
488
+ """
489
+ if state.get("is_relevant", False):
490
+ return "analyze"
491
+ return "skip_to_validation"
492
+
493
+ def process_post(self, post_data: Dict[str, Any]) -> Dict[str, Any]:
494
+ """
495
+ Process a single forum post through the workflow.
496
+
497
+ Args:
498
+ post_data: Dictionary containing post data
499
+
500
+ Returns:
501
+ Dictionary with processed results
502
+ """
503
+ try:
504
+ initial_state = {
505
+ "post_id": post_data.get("post_id"),
506
+ "thread_id": post_data.get("thread_id"),
507
+ "post_author_id": post_data.get("post_author_id"),
508
+ "post_content": post_data.get("post_content", ""),
509
+ "thread_title": post_data.get("thread_title"),
510
+ "thread_first_post": post_data.get("thread_first_post"),
511
+ "thread_started_at": post_data.get("thread_started_at"),
512
+ "category_title": post_data.get("category_title"),
513
+ "category_topic": post_data.get("category_topic"),
514
+ "post_created_at": post_data.get("post_created_at"),
515
+ "processing_errors": [],
516
+ "success": True
517
+ }
518
+
519
+ final_state = self.workflow.invoke(initial_state)
520
+
521
+ return dict(final_state)
522
+
523
+ except Exception as e:
524
+ logger.error(f"Workflow execution error: {str(e)}")
525
+ return {
526
+ **post_data,
527
+ "success": False,
528
+ "processing_errors": [str(e)],
529
+ "processing_status": "workflow_error"
530
+ }
531
+
532
+ def process_batch(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
533
+ """
534
+ Process a batch of forum posts.
535
+
536
+ Args:
537
+ posts: List of post dictionaries
538
+
539
+ Returns:
540
+ List of processed post dictionaries
541
+ """
542
+ results = []
543
+ total = len(posts)
544
+
545
+ for idx, post in enumerate(posts, 1):
546
+ logger.info(f"Processing post {idx}/{total} (ID: {post.get('post_id')})")
547
+ result = self.process_post(post)
548
+ results.append(result)
549
+
550
+ logger.info(f"Batch processing complete: {total} posts processed")
551
+ return results
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
visualization/README.md ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Musora Sentiment Analysis Dashboard
2
+
3
+ A comprehensive, interactive Streamlit dashboard for visualizing sentiment analysis results from **multiple data sources**: social media comments (Facebook, Instagram, YouTube, Twitter) and Musora internal app comments across Musora brands (Drumeo, Pianote, Guitareo, Singeo).
4
+
5
+ ## Features
6
+
7
+ ### Main Dashboard
8
+ - **Overall sentiment distribution** with interactive pie charts and gauge indicators
9
+ - **Sentiment analysis by brand** (Drumeo, Pianote, Musora) with stacked bar charts
10
+ - **Sentiment analysis by platform** (Facebook, Instagram, etc.) with percentage distributions
11
+ - **Intent analysis** showing multi-label intent distributions (praise, question, request, etc.)
12
+ - **Cross-dimensional heatmaps** showing negative sentiment by brand and platform
13
+ - **Reply requirements analysis** with urgency breakdown
14
+ - **Language distribution** analysis
15
+ - **Temporal trends** with customizable time granularity (daily, weekly, monthly)
16
+ - **Hierarchical sunburst** visualization for brand > platform > sentiment
17
+
18
+ ### Sentiment Analysis Page
19
+ - **Multi-sentiment filtering** - Filter by any combination of sentiments (positive, negative, neutral, etc.) to analyze both good and bad performance
20
+ - **Intent filtering** - Filter contents by specific user intents (question, praise, feedback_negative, etc.)
21
+ - **Dynamic severity scoring** - Ranks contents based on selected sentiments, adapts calculations to your filter choices
22
+ - **Advanced ranking controls** - Customize with minimum comment thresholds and multiple dynamic sort options
23
+ - **Sort options** - Severity Score (balanced), Sentiment %, Sentiment Count (absolute), or Total Comments (volume)
24
+ - **Engagement scatter plot** showing relationship between comment volume and sentiment
25
+ - **Thumbnail display** for Musora internal app content (visual content previews)
26
+ - **Detailed content analysis** with sentiment and intent distributions for each content
27
+ - **AI-Powered Analysis** - Optional AI-generated insights and recommendations for each content
28
+ - **View filtered comments** for each content with expandable sections
29
+ - **Actionable insights** and recommendations based on sentiment patterns
30
+ - **Export functionality** to download results as CSV with dynamic columns
31
+
32
+ ### Reply Required Page
33
+ - **Prioritized comment queue** with urgency indicators (Urgent, High, Medium, Low)
34
+ - **Smart filtering** by priority, platform, brand, and intent
35
+ - **Pagination** for easy navigation through large comment lists
36
+ - **Comment cards** showing full context (author, timestamp, sentiment, intent)
37
+ - **Original and translated text** with expandable view for non-English comments
38
+ - **Reply requirements by content** showing which contents need most attention
39
+ - **Export functionality** for team collaboration or CRM import
40
+
41
+ ## Architecture
42
+
43
+ ```
44
+ visualization/
45
+ ├── app.py # Main Streamlit application
46
+ ├── config/
47
+ │ └── viz_config.json # Configuration for colors, settings, queries
48
+ ├── data/
49
+ │ └── data_loader.py # Snowflake data loading with caching
50
+ ├── utils/
51
+ │ ├── data_processor.py # Data aggregation and processing
52
+ │ └── metrics.py # Metrics calculation (KPIs, scores)
53
+ ├── components/
54
+ │ ├── dashboard.py # Main dashboard page
55
+ │ ├── sentiment_analysis.py # Comprehensive sentiment analysis page
56
+ │ └── reply_required.py # Reply management page
57
+ ├── visualizations/
58
+ │ ├── sentiment_charts.py # Sentiment visualization functions
59
+ │ ├── distribution_charts.py # Distribution visualization functions
60
+ │ └── content_cards.py # Display components and cards
61
+ ├── requirements.txt # Python dependencies
62
+ └── README.md # This file
63
+ ```
64
+
65
+ ## Installation
66
+
67
+ ### Prerequisites
68
+ - Python 3.8+
69
+ - Snowflake account with access to sentiment analysis data
70
+ - Required environment variables in parent `.env` file:
71
+ - `SNOWFLAKE_USER`
72
+ - `SNOWFLAKE_PASSWORD`
73
+ - `SNOWFLAKE_ACCOUNT`
74
+ - `SNOWFLAKE_ROLE`
75
+ - `SNOWFLAKE_DATABASE`
76
+ - `SNOWFLAKE_WAREHOUSE`
77
+ - `SNOWFLAKE_SCHEMA`
78
+
79
+ ### Setup
80
+
81
+ 1. Navigate to the visualization directory:
82
+ ```bash
83
+ cd visualization
84
+ ```
85
+
86
+ 2. Install dependencies:
87
+ ```bash
88
+ pip install -r requirements.txt
89
+ ```
90
+
91
+ 3. Ensure parent `.env` file is properly configured with Snowflake credentials
92
+
93
+ ## Usage
94
+
95
+ ### Running the Dashboard
96
+
97
+ From the `visualization` directory:
98
+
99
+ ```bash
100
+ streamlit run app.py
101
+ ```
102
+
103
+ The dashboard will open in your default browser at `http://localhost:8501`
104
+
105
+ ### Navigation
106
+
107
+ Use the sidebar to:
108
+ - **Select pages** (Dashboard, Sentiment Analysis, Reply Required)
109
+ - **Apply global filters** by platform, brand, sentiment, and date range
110
+ - **Reload data** to fetch latest updates from Snowflake
111
+ - **View data information** (record count, last update time)
112
+
113
+ ### Filtering Data
114
+
115
+ 1. Select desired filters in the sidebar:
116
+ - **Platforms**: Filter by data source (Facebook, Instagram, YouTube, Twitter, musora_app)
117
+ - **Brands**: Filter by Musora brand (Drumeo, Pianote, Guitareo, Singeo, Musora)
118
+ - **Sentiments**: Filter by sentiment polarity
119
+ - **Date Range**: Filter by comment timestamp
120
+
121
+ 2. Click "Apply Filters" to update visualizations
122
+
123
+ 3. Click "Reset Filters" to clear all filters
124
+
125
+ ### Exporting Data
126
+
127
+ Each page provides export functionality:
128
+ - **Sentiment Analysis**: Download top N contents as CSV with dynamic columns based on active filters
129
+ - **Reply Required**: Download filtered comments as CSV
130
+
131
+ ## Configuration
132
+
133
+ ### Color Schemes
134
+
135
+ Edit `config/viz_config.json` to customize:
136
+ - **Sentiment colors**: Colors for each sentiment polarity
137
+ - **Intent colors**: Colors for each intent category
138
+ - **Platform colors**: Brand colors for each platform
139
+ - **Brand colors**: Colors for each Musora brand
140
+
141
+ ### Dashboard Settings
142
+
143
+ Configure in `viz_config.json`:
144
+ - `default_date_range_days`: Default date range for filtering
145
+ - `max_comments_display`: Maximum comments to display per page
146
+ - `chart_height`: Default height for charts
147
+ - `top_n_contents`: Number of contents to show in poor sentiment page
148
+
149
+ ### Data Query
150
+
151
+ The Snowflake query is configured in `viz_config.json`:
152
+ ```json
153
+ "snowflake": {
154
+ "query": "SELECT s.*, c.CHANNEL_NAME as BRAND, c.MESSAGE as CONTENT_DESCRIPTION, c.PERMALINK_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK"
155
+ }
156
+ ```
157
+
158
+ ## Extending the Dashboard
159
+
160
+ ### Adding New Pages
161
+
162
+ 1. Create a new component file in `components/`:
163
+ ```python
164
+ # components/new_page.py
165
+ def render_new_page(df):
166
+ st.title("New Page")
167
+ # Your page logic here
168
+ ```
169
+
170
+ 2. Import and add to navigation in `app.py`:
171
+ ```python
172
+ from components.new_page import render_new_page
173
+
174
+ # Add to page selection
175
+ page = st.radio("Select Page", [..., "New Page"])
176
+
177
+ # Add to page rendering
178
+ elif page == "New Page":
179
+ render_new_page(df)
180
+ ```
181
+
182
+ ### Adding New Visualizations
183
+
184
+ 1. Add visualization function to appropriate module:
185
+ - `visualizations/sentiment_charts.py` for sentiment-related charts
186
+ - `visualizations/distribution_charts.py` for distribution charts
187
+
188
+ 2. Use the function in page components
189
+
190
+ Example:
191
+ ```python
192
+ def create_new_chart(df, title="New Chart"):
193
+ fig = go.Figure(...)
194
+ return fig
195
+ ```
196
+
197
+ ### Adding New Metrics
198
+
199
+ Add calculation methods to `utils/metrics.py`:
200
+ ```python
201
+ @staticmethod
202
+ def calculate_new_metric(df):
203
+ # Your metric calculation
204
+ return metric_value
205
+ ```
206
+
207
+ ### Customizing Card Displays
208
+
209
+ Modify display methods in `visualizations/content_cards.py`:
210
+ ```python
211
+ @staticmethod
212
+ def display_custom_card(data):
213
+ # Your custom card layout
214
+ pass
215
+ ```
216
+
217
+ ## Data Schema
218
+
219
+ The dashboard expects the following columns from Snowflake:
220
+
221
+ ### Required Columns
222
+ - `comment_sk`: Unique comment identifier
223
+ - `comment_id`: Comment ID
224
+ - `original_text`: Original comment text
225
+ - `platform`: Social media platform
226
+ - `brand`: Musora brand name
227
+ - `sentiment_polarity`: Sentiment classification (very_positive, positive, neutral, negative, very_negative)
228
+ - `intent`: Comma-separated intent labels
229
+ - `requires_reply`: Boolean indicating if reply is needed
230
+ - `content_sk`: Content identifier
231
+ - `content_description`: Description of the content
232
+ - `permalink_url`: URL to the original content
233
+
234
+ ### Optional Columns
235
+ - `comment_timestamp`: When comment was posted
236
+ - `processed_at`: When sentiment analysis was performed
237
+ - `translated_text`: English translation for non-English comments
238
+ - `detected_language`: Detected language of comment
239
+ - `is_english`: Boolean indicating if comment is in English
240
+ - `sentiment_confidence`: Confidence level of sentiment analysis
241
+ - `author_name`: Comment author name
242
+ - `channel_name`: Channel name
243
+ - `thumbnail_url`: Content thumbnail URL (for Musora internal app content)
244
+ - `parent_comment_id`: ID of parent comment (for replies)
245
+ - `parent_comment_text`: Text of parent comment (for reply context)
246
+
247
+ ## Performance Optimization
248
+
249
+ ### Caching
250
+ - Data loading is cached for 5 minutes using `@st.cache_data`
251
+ - Clear cache using "Reload Data" button in sidebar
252
+
253
+ ### Pagination
254
+ - Comments requiring reply are paginated (10 per page)
255
+ - Reduces memory usage and improves rendering speed
256
+
257
+ ### Filtering
258
+ - Apply filters to reduce dataset size before visualization
259
+ - Filters are applied efficiently using pandas operations
260
+
261
+ ## Troubleshooting
262
+
263
+ ### Connection Issues
264
+ - Verify Snowflake credentials in parent `.env` file
265
+ - Check network connectivity to Snowflake
266
+ - Ensure correct database, schema, and table names
267
+
268
+ ### No Data Displayed
269
+ - Check if Snowflake query returns data
270
+ - Verify column names match expected schema
271
+ - Check applied filters - try resetting them
272
+
273
+ ### Slow Performance
274
+ - Reduce date range in filters
275
+ - Use "Apply Filters" to work with smaller datasets
276
+ - Consider adding database indexes on frequently filtered columns
277
+
278
+ ### Visualization Errors
279
+ - Check for missing or null values in data
280
+ - Verify data types match expected types (dates, booleans, etc.)
281
+ - Review browser console for JavaScript errors
282
+
283
+ ## Best Practices
284
+
285
+ 1. **Regular Data Updates**: Reload data periodically to see latest comments
286
+ 2. **Use Filters**: Apply filters to focus on specific segments
287
+ 3. **Export Insights**: Download CSV reports for offline analysis
288
+ 4. **Monitor Reply Queue**: Check "Reply Required" page daily
289
+ 5. **Track Trends**: Use temporal visualizations to identify patterns
290
+ 6. **Prioritize Urgent**: Address urgent replies (negative sentiment) first
291
+
292
+ ## Support
293
+
294
+ For issues or feature requests:
295
+ 1. Check the troubleshooting section
296
+ 2. Review configuration files for correct settings
297
+ 3. Consult the main project README for sentiment analysis pipeline details
298
+
299
+ ## Version History
300
+
301
+ ### v1.3 (Current)
302
+ - **Comprehensive Sentiment Analysis Page Redesign**
303
+ - Renamed "Poor Sentiment Contents" to "Sentiment Analysis" page
304
+ - **NEW: Multi-Sentiment Filtering** - Filter by any combination of sentiments (positive, negative, neutral, very_positive, very_negative)
305
+ - **NEW: Intent Filtering** - Filter contents by specific user intents (question, praise, feedback_negative, request, etc.)
306
+ - **Filter Status Indicator** - Visual feedback showing when filters are active
307
+
308
+ - **Dynamic Ranking & Calculations**
309
+ - **Dynamic severity scoring** - Automatically calculates based on selected sentiments (not just negative)
310
+ - **Dynamic metrics** - Sentiment percentages and counts adapt to your filter selection
311
+ - Sort options now work with any sentiment combination
312
+
313
+ - **Enhanced User Experience**
314
+ - Summary statistics that dynamically adapt to filters
315
+ - Contextual explanations that change based on selected sentiments
316
+ - Export with dynamic columns based on active filters
317
+ - Backward compatible - works like original when no filters selected
318
+
319
+ - **New Use Cases Enabled**
320
+ - Analyze high-performing content (filter by positive sentiments)
321
+ - Identify successful patterns (combine sentiment + intent filters)
322
+ - Compare sentiment types side-by-side
323
+ - Focus on specific user behaviors
324
+
325
+ ### v1.2
326
+ - **Multi-source data support** - Integrated Musora internal app comments alongside social media
327
+ - **Smart severity scoring** - Content ranking now balances sentiment % with comment volume
328
+ - **Advanced ranking controls** - Min comments filter and multiple sort options (severity, %, count, volume)
329
+ - **Thumbnail display** - Visual content previews for Musora internal app content
330
+ - **Platform disambiguation** - Renamed internal platform to "musora_app" to differentiate from "musora" brand
331
+ - **Improved chart stability** - Fixed duplicate chart ID errors with unique keys
332
+ - **Enhanced data schema** - Added support for thumbnail_url and parent comment fields
333
+
334
+ ### v1.1
335
+ - **AI-Powered Agents** - ContentSummaryAgent for intelligent comment analysis
336
+ - AI Analysis button on Sentiment Analysis page
337
+ - LLM Helper with OpenAI API integration
338
+ - Modular agent architecture ready for expansion
339
+
340
+ ### v1.0
341
+ - Initial release
342
+ - Main dashboard with comprehensive visualizations
343
+ - Sentiment contents analysis page
344
+ - Reply required management page
345
+ - Global filtering and export functionality
346
+ - Plotly-based interactive visualizations
347
+ - Modular, extensible architecture
visualization/SnowFlakeConnection.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This class create a connection to Snowflake, run queries (read and write)
3
+ """
4
+ import json
5
+ import os
6
+ from snowflake.snowpark import Session
7
+ from dotenv import load_dotenv
8
+ import logging
9
+ logger = logging.getLogger()
10
+ load_dotenv()
11
+
12
class SnowFlakeConn:
    """
    Wrapper around a Snowflake Snowpark session.

    Opens a session on construction using SNOWFLAKE_* environment variables
    and exposes helpers to run read queries, write dataframes, and execute
    ad-hoc SQL or SQL files.
    """

    def __init__(self):
        # Open the session eagerly; raises if credentials are missing/invalid.
        self.session = self.connect_to_snowflake()

    # =========================================================
    def connect_to_snowflake(self):
        """
        Build a Snowpark session from SNOWFLAKE_* environment variables.

        :return: An open snowflake.snowpark.Session
        :raises ValueError: If any required credential is missing
        """
        # Validate all required credentials exist so the error lists them all at once.
        required_credentials = [
            "SNOWFLAKE_USER",
            "SNOWFLAKE_PASSWORD",
            "SNOWFLAKE_ACCOUNT",
            "SNOWFLAKE_ROLE",
            "SNOWFLAKE_DATABASE",
            "SNOWFLAKE_WAREHOUSE",
            "SNOWFLAKE_SCHEMA",
        ]

        missing_credentials = [
            cred for cred in required_credentials if not self.get_credential(cred)
        ]

        if missing_credentials:
            error_msg = f"Missing required Snowflake credentials: {', '.join(missing_credentials)}"
            logger.error(error_msg)
            raise ValueError(error_msg)

        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        try:
            session = Session.builder.configs(conn).create()
            logger.info("Successfully connected to Snowflake")
            return session
        except Exception as e:
            logger.error(f"Failed to connect to Snowflake: {e}")
            raise

    # =========================================================
    def get_credential(self, key):
        """Return the value of environment variable *key* (None if unset)."""
        return os.getenv(key)

    # =========================================================
    def run_read_query(self, query, data):
        """
        Execute a SQL query on Snowflake that fetches data.

        :param query: SQL query string to execute
        :param data: Human-readable name of the table/dataset (logging only)
        :return: Pandas dataframe containing the query results, with
                 column names normalized to lower case
        :raises Exception: Propagated if the query fails
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Normalize column names so downstream code can rely on lower-case keys.
            dataframe.columns = dataframe.columns.str.lower()
            print(f"reading {data} table successfully")
            return dataframe
        except Exception as e:
            error_msg = f"Error reading {data}: {e}"
            print(error_msg)
            logger.error(error_msg)
            raise

    # =========================================================
    def store_df_to_snowflake(self, table_name, dataframe, database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False):
        """
        Write a pandas dataframe to a Snowflake table (auto-created if needed).

        Best-effort: failures are logged but not raised, preserving the
        pipeline-friendly behavior callers rely on.

        :param table_name: Target table name (normalized to upper case)
        :param dataframe: Pandas dataframe to store
        :param database: Target database name
        :param schema: Target schema name
        :param overwrite: If True, replace existing table contents
        :return: None
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            dataframe = dataframe.reset_index(drop=True)
            # Snowflake convention: upper-case column identifiers.
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(df=dataframe,
                                      table_name=table_name.strip().upper(),
                                      auto_create_table=True,
                                      overwrite=overwrite,
                                      use_logical_type=True)
            print(f"Data inserted into {table_name} successfully.")

        except Exception as e:
            # Log as well as print so failures are visible in log aggregation.
            error_msg = f"Error in creating/updating/inserting table: {e}"
            print(error_msg)
            logger.error(error_msg)

    # =========================================================
    def execute_sql_file(self, file_path):
        """
        Execute SQL queries from a file.

        :param file_path: Path to SQL file
        :return: Collected query result rows, or None on failure
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            print(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            error_msg = f"Error executing SQL file {file_path}: {e}"
            print(error_msg)
            logger.error(error_msg)
            return None

    # =========================================================
    def execute_query(self, query, description="query"):
        """
        Execute a SQL query and return the collected results.

        :param query: SQL query string
        :param description: Description of the query for logging
        :return: Collected query result rows, or None on failure
        """
        try:
            result = self.session.sql(query).collect()
            print(f"Successfully executed {description}")
            return result
        except Exception as e:
            error_msg = f"Error executing {description}: {e}"
            print(error_msg)
            logger.error(error_msg)
            return None

    # =========================================================
    def get_data(self, data):
        """Placeholder for requirement-driven fetches (comments, contents, etc.)."""
        # TODO: implement data-type-specific fetching.
        pass

    # =========================================================
    def close_connection(self):
        """Close the underlying Snowpark session."""
        self.session.close()
150
+
visualization/agents/README.md ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Visualization Agents
2
+
3
+ ## Overview
4
+ This folder contains AI-powered agents that enhance the sentiment analysis dashboard with intelligent, context-aware insights and analysis capabilities.
5
+
6
+ ## Architecture
7
+
8
+ ### Base Agent Pattern
9
+ All agents inherit from `BaseVisualizationAgent` which provides:
10
+ - Common interface (`process()`, `validate_input()`)
11
+ - Error handling
12
+ - Logging functionality
13
+ - Consistent configuration
14
+
15
+ ### LLM Helper
16
+ `utils/llm_helper.py` provides:
17
+ - OpenAI API integration
18
+ - Retry logic with exponential backoff
19
+ - JSON mode support
20
+ - Token usage tracking
21
+
22
+ ## Available Agents
23
+
24
+ ### 1. ContentSummaryAgent
25
+
26
+ **Purpose**: Analyze and summarize comments for content pieces
27
+
28
+ **Location**: `agents/content_summary_agent.py`
29
+
30
+ **Input**:
31
+ ```python
32
+ {
33
+ 'content_sk': str, # Content identifier
34
+ 'content_description': str, # Content title/description
35
+ 'comments': DataFrame or list # Comments data
36
+ }
37
+ ```
38
+
39
+ **Output**:
40
+ ```python
41
+ {
42
+ 'success': bool,
43
+ 'content_sk': str,
44
+ 'summary': {
45
+ 'executive_summary': str, # 2-3 sentence overview
46
+ 'main_themes': [ # Top themes discussed
47
+ {
48
+ 'theme': str,
49
+ 'sentiment': str, # positive/negative/mixed
50
+ 'description': str
51
+ }
52
+ ],
53
+ 'praise_points': [str], # What users love
54
+ 'key_complaints': [str], # Main concerns
55
+ 'frequently_asked_questions': [str], # Common questions
56
+ 'unexpected_insights': [str], # Surprising patterns
57
+ 'action_recommendations': [ # Suggested actions
58
+ {
59
+ 'priority': str, # high/medium/low
60
+ 'action': str
61
+ }
62
+ ]
63
+ },
64
+ 'metadata': {
65
+ 'total_comments_analyzed': int,
66
+ 'model_used': str,
67
+ 'tokens_used': int
68
+ }
69
+ }
70
+ ```
71
+
72
+ **Configuration**:
73
+ - Model: `gpt-5-nano` (configurable)
74
+ - Temperature: 0.3 (lower for focused summaries)
75
+ - Sampling: All negative comments + up to 50 positive/neutral (if >100 total)
76
+
77
+ **Features**:
78
+ - **Smart sampling**: Prioritizes negative comments, samples others
79
+ - **Context preservation**: Includes sentiment and intent metadata
80
+ - **Token optimization**: Truncates long comments to 300 chars
81
+ - **Structured output**: JSON format with guaranteed fields
82
+ - **Error handling**: Graceful failures with retry capability
83
+
84
+ ## UI Integration
85
+
86
+ ### Poor Sentiment Contents Page
87
+
88
+ **Location**: `components/poor_sentiment_contents.py`
89
+
90
+ **User Flow**:
91
+ 1. User views content cards on Poor Sentiment Contents page
92
+ 2. Clicks "🔍 Generate AI Analysis" button
93
+ 3. Agent processes comments (with spinner indicator)
94
+ 4. Summary displays in expandable section
95
+ 5. Result cached in session state
96
+
97
+ **Display Sections**:
98
+ - **Executive Summary**: High-level overview (info box)
99
+ - **Main Themes**: Key topics with sentiment indicators
100
+ - **Praise Points** ✅ & **Key Complaints** ⚠️ (side-by-side)
101
+ - **FAQs** ❓ & **Unexpected Insights** 💡 (side-by-side)
102
+ - **Recommended Actions** 🎯 (priority-coded)
103
+ - **Analysis Metadata** ℹ️ (expandable details)
104
+
105
+ **Session Caching**:
106
+ - Summaries stored in `st.session_state.content_summaries`
107
+ - Key: `content_sk`
108
+ - Persists during session, cleared on page reload
109
+ - Prevents redundant API calls
110
+
111
+ ## Usage Example
112
+
113
+ ```python
114
+ from agents.content_summary_agent import ContentSummaryAgent
115
+ import pandas as pd
116
+
117
+ # Initialize agent
118
+ agent = ContentSummaryAgent(model="gpt-5-nano", temperature=0.3)
119
+
120
+ # Prepare input
121
+ input_data = {
122
+ 'content_sk': '12345',
123
+ 'content_description': 'Advanced Drum Fills Tutorial',
124
+ 'comments': comments_df # DataFrame with comments
125
+ }
126
+
127
+ # Generate summary
128
+ result = agent.process(input_data)
129
+
130
+ if result['success']:
131
+ summary = result['summary']
132
+ print(summary['executive_summary'])
133
+
134
+ for theme in summary['main_themes']:
135
+ print(f"Theme: {theme['theme']} ({theme['sentiment']})")
136
+ print(f" {theme['description']}")
137
+ else:
138
+ print(f"Error: {result['error']}")
139
+ ```
140
+
141
+ ## Environment Setup
142
+
143
+ ### Required Environment Variables
144
+ Add to `.env` file (parent directory):
145
+ ```bash
146
+ OPENAI_API_KEY=your_openai_api_key_here
147
+ ```
148
+
149
+ ### Dependencies
150
+ All dependencies already in `visualization/requirements.txt`:
151
+ - `streamlit>=1.28.0`
152
+ - `pandas>=2.0.0`
153
+ - `python-dotenv>=1.0.0`
154
+ - OpenAI library (inherited from parent project)
155
+
156
+ ## Error Handling
157
+
158
+ ### Agent-Level Errors
159
+ - **Invalid input**: Returns `{'success': False, 'error': 'Invalid input data'}`
160
+ - **LLM API failure**: Retries up to 3 times with exponential backoff
161
+ - **JSON parsing error**: Returns error with raw content
162
+ - **Exception**: Catches all exceptions, logs, returns error dict
163
+
164
+ ### UI-Level Errors
165
+ - Displays error message in red box
166
+ - Provides "🔄 Retry Analysis" button
167
+ - Clears cache and regenerates on retry
168
+ - Logs errors to agent logger
169
+
170
+ ## Performance Considerations
171
+
172
+ ### API Costs
173
+ - Model: `gpt-5-nano` (cost-effective)
174
+ - Sampling strategy: Reduces tokens by up to 50% for large comment sets
175
+ - Comment truncation: Max 300 chars per comment
176
+ - Session caching: Eliminates duplicate API calls
177
+
178
+ ### Response Time
179
+ - Average: 5-10 seconds for 50-100 comments
180
+ - Depends on: Comment count, OpenAI API latency
181
+ - User feedback: Spinner shows "Analyzing comments with AI..."
182
+
183
+ ### Scalability
184
+ - Handles up to 100 comments per analysis (after sampling)
185
+ - Parallel requests: Each content analyzed independently
186
+ - Session state: Memory usage scales with number of analyzed contents
187
+
188
+ ## Extending Agents
189
+
190
+ ### Adding New Agents
191
+
192
+ 1. **Create agent file**:
193
+ ```python
194
+ # agents/new_agent.py
195
+ from agents.base_agent import BaseVisualizationAgent
196
+ from utils.llm_helper import LLMHelper
197
+
198
+ class NewAgent(BaseVisualizationAgent):
199
+ def __init__(self, model="gpt-5-nano", temperature=0.7):
200
+ super().__init__(name="NewAgent", model=model, temperature=temperature)
201
+ self.llm_helper = LLMHelper(model=model, temperature=temperature)
202
+
203
+ def validate_input(self, input_data):
204
+ # Validation logic
205
+ return True
206
+
207
+ def process(self, input_data):
208
+ # Processing logic
209
+ pass
210
+ ```
211
+
212
+ 2. **Update `__init__.py`**:
213
+ ```python
214
+ from .new_agent import NewAgent
215
+
216
+ __all__ = ['ContentSummaryAgent', 'NewAgent']
217
+ ```
218
+
219
+ 3. **Integrate in UI**:
220
+ - Import agent in component file
221
+ - Add UI controls (buttons, inputs)
222
+ - Display results
223
+ - Handle caching if needed
224
+
225
+ ### Best Practices
226
+
227
+ 1. **Input Validation**: Always validate required fields
228
+ 2. **Error Handling**: Use `handle_error()` method
229
+ 3. **Logging**: Use `log_processing()` for debugging
230
+ 4. **Structured Output**: Return consistent dict format
231
+ 5. **Caching**: Use session state for expensive operations
232
+ 6. **Token Optimization**: Sample/truncate data for large inputs
233
+ 7. **User Feedback**: Show spinners for async operations
234
+ 8. **Graceful Degradation**: Provide fallbacks for failures
235
+
236
+ ## Testing
237
+
238
+ ### Manual Testing
239
+ 1. Start dashboard: `streamlit run app.py`
240
+ 2. Navigate to "⚠️ Poor Sentiment Contents" page
241
+ 3. Click "🔍 Generate AI Analysis" for any content
242
+ 4. Verify summary displays correctly
243
+ 5. Check session caching (click button again)
244
+ 6. Test error handling (disconnect network)
245
+
246
+ ### Unit Testing
247
+ ```python
248
+ # tests/test_content_summary_agent.py
249
+ import pytest
250
+ from agents.content_summary_agent import ContentSummaryAgent
251
+
252
+ def test_validate_input():
253
+ agent = ContentSummaryAgent()
254
+
255
+ # Valid input
256
+ valid_input = {
257
+ 'content_sk': '123',
258
+ 'content_description': 'Test',
259
+ 'comments': []
260
+ }
261
+ assert agent.validate_input(valid_input) == True
262
+
263
+ # Missing field
264
+ invalid_input = {'content_sk': '123'}
265
+ assert agent.validate_input(invalid_input) == False
266
+ ```
267
+
268
+ ## Future Enhancements
269
+
270
+ ### Planned Features
271
+ 1. **Batch Analysis**: Analyze multiple contents at once
272
+ 2. **Trend Detection**: Compare with historical summaries
273
+ 3. **Export Summaries**: Download as PDF/CSV
274
+ 4. **Custom Prompts**: User-defined analysis focus
275
+ 5. **Multi-language Support**: Summaries in user's language
276
+
277
+ ### Additional Agents (Roadmap)
278
+ - **InsightsSummaryAgent**: Overall dataset insights
279
+ - **InteractiveChatbotAgent**: Conversational analysis
280
+ - **ComparativeContentAgent**: Content comparison
281
+ - **ReplySuggestionAgent**: Generate reply suggestions
282
+ - **TrendForecastingAgent**: Predict sentiment trends
283
+
284
+ ## Troubleshooting
285
+
286
+ ### Common Issues
287
+
288
+ **Issue**: `OPENAI_API_KEY not found`
289
+ - **Solution**: Add key to `.env` file in parent directory
290
+
291
+ **Issue**: Import error for `agents` module
292
+ - **Solution**: Ensure `__init__.py` exists in `visualization/agents/`
293
+
294
+ **Issue**: LLM timeout errors
295
+ - **Solution**: Reduce comment count or increase retry limit
296
+
297
+ **Issue**: JSON parsing errors
298
+ - **Solution**: Check LLM prompt format, ensure JSON mode enabled
299
+
300
+ **Issue**: Cached summaries not showing
301
+ - **Solution**: Check `st.session_state.content_summaries` initialization
302
+
303
+ ## Support
304
+
305
+ For issues or questions:
306
+ 1. Check this README
307
+ 2. Review agent logs in console
308
+ 3. Inspect session state in Streamlit
309
+ 4. Verify environment variables
310
+ 5. Check OpenAI API status
311
+
312
+ ## Version History
313
+
314
+ ### v1.0.0 (Current)
315
+ - Initial release
316
+ - ContentSummaryAgent implementation
317
+ - Poor Sentiment Contents page integration
318
+ - Session-based caching
319
+ - Error handling and retry logic
320
+ - Comprehensive UI display
visualization/agents/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visualization Agents Package
3
+ Contains AI agents for intelligent dashboard features
4
+ """
5
+
6
+ from .content_summary_agent import ContentSummaryAgent
7
+
8
+ __all__ = ['ContentSummaryAgent']
visualization/agents/base_agent.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Agent class for visualization agents
3
+ Provides common functionality and interface for all agents
4
+ """
5
+ from abc import ABC, abstractmethod
6
+ from typing import Dict, Any
7
+ import logging
8
+
9
+
10
+ class BaseVisualizationAgent(ABC):
11
+ """
12
+ Abstract base class for all visualization agents
13
+ """
14
+
15
+ def __init__(self, name: str, model: str = "gpt-5-nano", temperature: float = 0.7):
16
+ """
17
+ Initialize base agent
18
+
19
+ Args:
20
+ name: Agent name
21
+ model: LLM model to use
22
+ temperature: LLM temperature
23
+ """
24
+ self.name = name
25
+ self.model = model
26
+ self.temperature = temperature
27
+ self.logger = logging.getLogger(f"visualization.agents.{name}")
28
+
29
+ @abstractmethod
30
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
31
+ """
32
+ Process input data and return results
33
+
34
+ Args:
35
+ input_data: Input data dictionary
36
+
37
+ Returns:
38
+ Results dictionary
39
+ """
40
+ pass
41
+
42
+ @abstractmethod
43
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
44
+ """
45
+ Validate input data
46
+
47
+ Args:
48
+ input_data: Input data dictionary
49
+
50
+ Returns:
51
+ True if valid, False otherwise
52
+ """
53
+ pass
54
+
55
+ def log_processing(self, message: str, level: str = "info"):
56
+ """
57
+ Log processing information
58
+
59
+ Args:
60
+ message: Log message
61
+ level: Log level (info, warning, error)
62
+ """
63
+ log_func = getattr(self.logger, level.lower(), self.logger.info)
64
+ log_func(f"[{self.name}] {message}")
65
+
66
+ def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
67
+ """
68
+ Handle errors consistently
69
+
70
+ Args:
71
+ error: Exception that occurred
72
+ context: Additional context information
73
+
74
+ Returns:
75
+ Error response dictionary
76
+ """
77
+ error_msg = f"Error in {self.name}: {str(error)}"
78
+ if context:
79
+ error_msg += f" | Context: {context}"
80
+
81
+ self.log_processing(error_msg, level="error")
82
+
83
+ return {
84
+ 'success': False,
85
+ 'error': str(error),
86
+ 'error_type': type(error).__name__,
87
+ 'context': context
88
+ }
visualization/agents/content_summary_agent.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Summary Agent
3
+ Analyzes and summarizes comments for content pieces
4
+ """
5
+ import pandas as pd
6
+ from typing import Dict, Any, List
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add parent directory to path
11
+ parent_dir = Path(__file__).resolve().parent.parent
12
+ sys.path.append(str(parent_dir))
13
+
14
+ from agents.base_agent import BaseVisualizationAgent
15
+ from utils.llm_helper import LLMHelper
16
+
17
+
18
+ class ContentSummaryAgent(BaseVisualizationAgent):
19
+ """
20
+ Agent that analyzes and summarizes comments for content
21
+ Extracts themes, praise points, complaints, FAQs, and insights
22
+ """
23
+
24
+ def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
25
+ """
26
+ Initialize Content Summary Agent
27
+
28
+ Args:
29
+ model: LLM model to use
30
+ temperature: Temperature for generation (lower for more focused summaries)
31
+ """
32
+ super().__init__(name="ContentSummaryAgent", model=model, temperature=temperature)
33
+ self.llm_helper = LLMHelper(model=model, temperature=temperature)
34
+
35
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
36
+ """
37
+ Validate input data
38
+
39
+ Args:
40
+ input_data: Input dictionary
41
+
42
+ Returns:
43
+ True if valid, False otherwise
44
+ """
45
+ required_fields = ['content_sk', 'content_description', 'comments']
46
+
47
+ for field in required_fields:
48
+ if field not in input_data:
49
+ self.log_processing(f"Missing required field: {field}", level="error")
50
+ return False
51
+
52
+ if not isinstance(input_data['comments'], (list, pd.DataFrame)):
53
+ self.log_processing("Comments must be a list or DataFrame", level="error")
54
+ return False
55
+
56
+ return True
57
+
58
+ def _prepare_comments_context(self, comments: Any, sentiment_type: str = 'negative') -> str:
59
+ """
60
+ Prepare comments data for LLM analysis
61
+
62
+ Args:
63
+ comments: Comments as DataFrame or list of dicts
64
+ sentiment_type: Type of sentiment to analyze ('negative', 'positive', 'combined')
65
+
66
+ Returns:
67
+ Formatted string with comment data
68
+ """
69
+ # Convert to DataFrame if needed
70
+ if isinstance(comments, list):
71
+ comments_df = pd.DataFrame(comments)
72
+ else:
73
+ comments_df = comments.copy()
74
+
75
+ # Filter based on sentiment type
76
+ if sentiment_type == 'negative':
77
+ # Only negative comments
78
+ comments_df = comments_df[
79
+ comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])
80
+ ]
81
+ elif sentiment_type == 'positive':
82
+ # Only positive comments
83
+ comments_df = comments_df[
84
+ comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])
85
+ ]
86
+ # else: combined - use all comments
87
+
88
+ # Limit to reasonable number for API
89
+ if len(comments_df) > 100:
90
+ if sentiment_type == 'combined':
91
+ # For combined: sample from both positive and negative
92
+ negative_comments = comments_df[
93
+ comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])
94
+ ].sample(n=min(50, len(comments_df[comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])])), random_state=42)
95
+
96
+ positive_comments = comments_df[
97
+ comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])
98
+ ].sample(n=min(50, len(comments_df[comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])])), random_state=42)
99
+
100
+ comments_df = pd.concat([negative_comments, positive_comments])
101
+ else:
102
+ # For single sentiment type: just sample
103
+ comments_df = comments_df.sample(n=min(100, len(comments_df)), random_state=42)
104
+
105
+ # Format comments for analysis
106
+ comments_text = []
107
+ for idx, row in comments_df.iterrows():
108
+ text = row.get('display_text', row.get('original_text', ''))
109
+ sentiment = row.get('sentiment_polarity', 'unknown')
110
+ intent = row.get('intent', 'unknown')
111
+
112
+ comment_entry = f"""
113
+ Comment #{idx + 1}:
114
+ - Text: {text[:300]}{'...' if len(str(text)) > 300 else ''}
115
+ - Sentiment: {sentiment}
116
+ - Intent: {intent}
117
+ """
118
+ comments_text.append(comment_entry)
119
+
120
+ return "\n".join(comments_text)
121
+
122
+ def _generate_summary_prompt(
123
+ self,
124
+ content_description: str,
125
+ comments_context: str,
126
+ total_comments: int,
127
+ sentiment_type: str = 'negative'
128
+ ) -> str:
129
+ """
130
+ Generate prompt for LLM
131
+
132
+ Args:
133
+ content_description: Description of the content
134
+ comments_context: Formatted comments
135
+ total_comments: Total number of comments
136
+ sentiment_type: Type of sentiment being analyzed ('negative', 'positive', 'combined')
137
+
138
+ Returns:
139
+ Prompt string
140
+ """
141
+ # Customize prompt based on sentiment type
142
+ if sentiment_type == 'negative':
143
+ focus_instruction = "Focus on understanding negative feedback, complaints, and issues that need attention."
144
+ elif sentiment_type == 'positive':
145
+ focus_instruction = "Focus on understanding what users love, praise points, and successful elements that should be maintained or amplified."
146
+ else: # combined
147
+ focus_instruction = "Provide a balanced analysis covering both positive feedback and areas for improvement."
148
+
149
+ prompt = f"""Analyze the {sentiment_type} comments below for the following content and provide a brief executive summary.
150
+
151
+ **Content:** {content_description}
152
+
153
+ **Total Comments Analyzed:** {total_comments}
154
+
155
+ **Analysis Focus:** {focus_instruction}
156
+
157
+ **Comments to Analyze:**
158
+ {comments_context}
159
+
160
+ **Task:** Provide a concise executive summary in JSON format with the following structure:
161
+
162
+ {{
163
+ "executive_summary": "2-3 sentence high-level overview focusing on {sentiment_type} sentiment",
164
+ "main_themes": [
165
+ {{
166
+ "theme": "theme name",
167
+ "sentiment": "positive/negative/mixed",
168
+ "description": "brief description"
169
+ }}
170
+ ],
171
+ "praise_points": ["point 1", "point 2", "point 3"],
172
+ "key_complaints": ["complaint 1", "complaint 2", "complaint 3"],
173
+ "frequently_asked_questions": ["question 1", "question 2"],
174
+ "unexpected_insights": ["insight 1", "insight 2"],
175
+ "action_recommendations": [
176
+ {{
177
+ "priority": "high/medium/low",
178
+ "action": "recommended action"
179
+ }}
180
+ ]
181
+ }}
182
+
183
+ **Guidelines:**
184
+ - Be concise and actionable
185
+ - Focus on the most important insights from {sentiment_type} comments
186
+ - Limit each list to top 3-5 items
187
+ - If a section has no relevant items, use an empty list
188
+ - Executive summary should capture the overall patterns and key takeaways
189
+ """
190
+ return prompt
191
+
192
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
193
+ """
194
+ Process comments and generate summary
195
+
196
+ Args:
197
+ input_data: {
198
+ 'content_sk': content identifier,
199
+ 'content_description': content title/description,
200
+ 'comments': DataFrame or list of comment dicts,
201
+ 'sentiment_type': 'negative', 'positive', or 'combined' (optional, defaults to 'negative')
202
+ }
203
+
204
+ Returns:
205
+ {
206
+ 'success': bool,
207
+ 'content_sk': str,
208
+ 'sentiment_type': str,
209
+ 'summary': {
210
+ 'executive_summary': str,
211
+ 'main_themes': list,
212
+ 'praise_points': list,
213
+ 'key_complaints': list,
214
+ 'frequently_asked_questions': list,
215
+ 'unexpected_insights': list,
216
+ 'action_recommendations': list
217
+ },
218
+ 'metadata': {
219
+ 'total_comments_analyzed': int,
220
+ 'model_used': str,
221
+ 'tokens_used': int
222
+ }
223
+ }
224
+ """
225
+ try:
226
+ # Validate input
227
+ if not self.validate_input(input_data):
228
+ return {
229
+ 'success': False,
230
+ 'error': 'Invalid input data',
231
+ 'content_sk': input_data.get('content_sk', 'unknown')
232
+ }
233
+
234
+ content_sk = input_data['content_sk']
235
+ content_description = input_data['content_description']
236
+ comments = input_data['comments']
237
+ sentiment_type = input_data.get('sentiment_type', 'negative') # Default to negative for backward compatibility
238
+
239
+ self.log_processing(f"Starting {sentiment_type} analysis for content: {content_sk}")
240
+
241
+ # Convert to DataFrame if needed
242
+ if isinstance(comments, list):
243
+ comments_df = pd.DataFrame(comments)
244
+ else:
245
+ comments_df = comments.copy()
246
+
247
+ total_comments = len(comments_df)
248
+
249
+ if total_comments == 0:
250
+ return {
251
+ 'success': True,
252
+ 'content_sk': content_sk,
253
+ 'sentiment_type': sentiment_type,
254
+ 'summary': {
255
+ 'executive_summary': 'No comments available for analysis.',
256
+ 'main_themes': [],
257
+ 'praise_points': [],
258
+ 'key_complaints': [],
259
+ 'frequently_asked_questions': [],
260
+ 'unexpected_insights': [],
261
+ 'action_recommendations': []
262
+ },
263
+ 'metadata': {
264
+ 'total_comments_analyzed': 0,
265
+ 'model_used': self.model,
266
+ 'tokens_used': 0
267
+ }
268
+ }
269
+
270
+ # Prepare comments context based on sentiment type
271
+ comments_context = self._prepare_comments_context(comments_df, sentiment_type)
272
+
273
+ # Get count of comments after filtering
274
+ if sentiment_type == 'negative':
275
+ filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])])
276
+ elif sentiment_type == 'positive':
277
+ filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])])
278
+ else:
279
+ filtered_count = total_comments
280
+
281
+ if filtered_count == 0:
282
+ return {
283
+ 'success': True,
284
+ 'content_sk': content_sk,
285
+ 'sentiment_type': sentiment_type,
286
+ 'summary': {
287
+ 'executive_summary': f'No {sentiment_type} comments available for analysis.',
288
+ 'main_themes': [],
289
+ 'praise_points': [],
290
+ 'key_complaints': [],
291
+ 'frequently_asked_questions': [],
292
+ 'unexpected_insights': [],
293
+ 'action_recommendations': []
294
+ },
295
+ 'metadata': {
296
+ 'total_comments_analyzed': 0,
297
+ 'model_used': self.model,
298
+ 'tokens_used': 0
299
+ }
300
+ }
301
+
302
+ # Generate prompt
303
+ prompt = self._generate_summary_prompt(
304
+ content_description,
305
+ comments_context,
306
+ filtered_count,
307
+ sentiment_type
308
+ )
309
+
310
+ # System message
311
+ system_message = """You are an expert social media analyst specializing in
312
+ sentiment analysis and community insights. Provide concise, actionable summaries
313
+ that help content creators understand their audience feedback."""
314
+
315
+ # Get LLM response
316
+ self.log_processing(f"Calling LLM for {sentiment_type} summary generation")
317
+ response = self.llm_helper.get_structured_completion(
318
+ prompt=prompt,
319
+ system_message=system_message,
320
+ max_retries=3
321
+ )
322
+
323
+ if not response['success']:
324
+ return self.handle_error(
325
+ Exception(response.get('error', 'LLM call failed')),
326
+ context=f"content_sk={content_sk}, sentiment_type={sentiment_type}"
327
+ )
328
+
329
+ # Extract summary
330
+ summary = response['content']
331
+
332
+ # Ensure all expected fields exist
333
+ default_summary = {
334
+ 'executive_summary': '',
335
+ 'main_themes': [],
336
+ 'praise_points': [],
337
+ 'key_complaints': [],
338
+ 'frequently_asked_questions': [],
339
+ 'unexpected_insights': [],
340
+ 'action_recommendations': []
341
+ }
342
+
343
+ # Merge with defaults
344
+ for key in default_summary:
345
+ if key not in summary:
346
+ summary[key] = default_summary[key]
347
+
348
+ self.log_processing(f"Successfully generated {sentiment_type} summary for content: {content_sk}")
349
+
350
+ return {
351
+ 'success': True,
352
+ 'content_sk': content_sk,
353
+ 'sentiment_type': sentiment_type,
354
+ 'summary': summary,
355
+ 'metadata': {
356
+ 'total_comments_analyzed': filtered_count,
357
+ 'model_used': response['model'],
358
+ 'tokens_used': response['usage']['total_tokens']
359
+ }
360
+ }
361
+
362
+ except Exception as e:
363
+ return self.handle_error(
364
+ e,
365
+ context=f"content_sk={input_data.get('content_sk', 'unknown')}, sentiment_type={input_data.get('sentiment_type', 'negative')}"
366
+ )
visualization/app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Musora Sentiment Analysis Dashboard
3
+ Main Streamlit Application
4
+
5
+ Run with: streamlit run app.py
6
+ """
7
+ import streamlit as st
8
+ import sys
9
+ from pathlib import Path
10
+ import json
11
+
12
+ # Add parent directory to path
13
+ parent_dir = Path(__file__).resolve().parent
14
+ sys.path.append(str(parent_dir))
15
+
16
+ from data.data_loader import SentimentDataLoader
17
+ from components.dashboard import render_dashboard
18
+ from components.sentiment_analysis import render_sentiment_analysis
19
+ from components.reply_required import render_reply_required
20
+
21
+
22
+ # Load configuration
23
+ config_path = parent_dir / "config" / "viz_config.json"
24
+ with open(config_path, 'r') as f:
25
+ config = json.load(f)
26
+
27
+ # Page configuration
28
+ st.set_page_config(
29
+ page_title=config['page_config']['page_title'],
30
+ page_icon=config['page_config']['page_icon'],
31
+ layout=config['page_config']['layout'],
32
+ initial_sidebar_state=config['page_config']['initial_sidebar_state']
33
+ )
34
+
35
+
36
+ def main():
37
+ """
38
+ Main application function
39
+ """
40
+ # Sidebar
41
+ with st.sidebar:
42
+ st.image("visualization/img/musora.png", use_container_width=True)
43
+ st.title("Navigation")
44
+
45
+ # Page selection
46
+ page = st.radio(
47
+ "Select Page",
48
+ ["📊 Dashboard", "🔍 Sentiment Analysis", "💬 Reply Required"],
49
+ index=0
50
+ )
51
+
52
+ st.markdown("---")
53
+
54
+ # Filters section
55
+ st.markdown("### 🔍 Global Filters")
56
+
57
+ # Initialize session state for filters
58
+ if 'filters_applied' not in st.session_state:
59
+ st.session_state.filters_applied = False
60
+
61
+ # Load data first to get filter options
62
+ with st.spinner("Loading data..."):
63
+ data_loader = SentimentDataLoader()
64
+ df = data_loader.load_data()
65
+
66
+ if df.empty:
67
+ st.error("No data available. Please check your Snowflake connection.")
68
+ return
69
+
70
+ # Get filter options
71
+ filter_options = data_loader.get_filter_options(df)
72
+
73
+ # Platform filter
74
+ selected_platforms = st.multiselect(
75
+ "Platforms",
76
+ options=filter_options['platforms'],
77
+ default=[]
78
+ )
79
+
80
+ # Brand filter
81
+ selected_brands = st.multiselect(
82
+ "Brands",
83
+ options=filter_options['brands'],
84
+ default=[]
85
+ )
86
+
87
+ # Sentiment filter
88
+ selected_sentiments = st.multiselect(
89
+ "Sentiments",
90
+ options=filter_options['sentiments'],
91
+ default=[]
92
+ )
93
+
94
+ # Date range filter (if available)
95
+ if 'comment_timestamp' in df.columns and not df.empty:
96
+ min_date = df['comment_timestamp'].min().date()
97
+ max_date = df['comment_timestamp'].max().date()
98
+
99
+ date_range = st.date_input(
100
+ "Date Range",
101
+ value=(min_date, max_date),
102
+ min_value=min_date,
103
+ max_value=max_date
104
+ )
105
+ else:
106
+ date_range = None
107
+
108
+ # Apply filters button
109
+ if st.button("🔍 Apply Filters", use_container_width=True):
110
+ st.session_state.filters_applied = True
111
+
112
+ # Reset filters button
113
+ if st.button("🔄 Reset Filters", use_container_width=True):
114
+ st.session_state.filters_applied = False
115
+ st.rerun()
116
+
117
+ st.markdown("---")
118
+
119
+ # Data refresh
120
+ st.markdown("### 🔄 Data Management")
121
+
122
+ if st.button("♻️ Reload Data", use_container_width=True):
123
+ st.cache_data.clear()
124
+ st.rerun()
125
+
126
+ # Display data info
127
+ st.markdown("---")
128
+ st.markdown("### ℹ️ Data Info")
129
+ st.info(f"**Total Records:** {len(df):,}")
130
+
131
+ if 'processed_at' in df.columns and not df.empty:
132
+ last_update = df['processed_at'].max()
133
+ st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}")
134
+
135
+ # Apply filters if needed
136
+ if st.session_state.filters_applied:
137
+ df = data_loader.apply_filters(
138
+ df,
139
+ platforms=selected_platforms if selected_platforms else None,
140
+ brands=selected_brands if selected_brands else None,
141
+ sentiments=selected_sentiments if selected_sentiments else None,
142
+ date_range=date_range if date_range and len(date_range) == 2 else None
143
+ )
144
+
145
+ # Show filter summary
146
+ if df.empty:
147
+ st.warning("No data matches the selected filters. Please adjust your filters.")
148
+ return
149
+ else:
150
+ st.info(f"Showing {len(df):,} records after applying filters")
151
+
152
+ # Main content area - render selected page
153
+ if page == "📊 Dashboard":
154
+ render_dashboard(df)
155
+
156
+ elif page == "🔍 Sentiment Analysis":
157
+ render_sentiment_analysis(df)
158
+
159
+ elif page == "💬 Reply Required":
160
+ render_reply_required(df)
161
+
162
+ # Footer
163
+ st.markdown("---")
164
+ st.markdown(
165
+ """
166
+ <div style='text-align: center; color: gray; padding: 20px;'>
167
+ <p>Musora Sentiment Analysis Dashboard v1.0</p>
168
+ <p>Powered by Streamlit | Data from Snowflake</p>
169
+ </div>
170
+ """,
171
+ unsafe_allow_html=True
172
+ )
173
+
174
+
175
+ if __name__ == "__main__":
176
+ try:
177
+ main()
178
+ except Exception as e:
179
+ st.error(f"An error occurred: {str(e)}")
180
+ st.exception(e)
visualization/components/dashboard.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Dashboard Page
3
+ Displays overall sentiment distributions by brand and platform
4
+ """
5
+ import streamlit as st
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path
10
+ parent_dir = Path(__file__).resolve().parent.parent
11
+ sys.path.append(str(parent_dir))
12
+
13
+ from utils.data_processor import SentimentDataProcessor
14
+ from utils.metrics import SentimentMetrics
15
+ from visualizations.sentiment_charts import SentimentCharts
16
+ from visualizations.distribution_charts import DistributionCharts
17
+ from visualizations.demographic_charts import DemographicCharts
18
+ from visualizations.content_cards import ContentCards
19
+
20
+
21
+ def render_dashboard(df):
22
+ """
23
+ Render the main dashboard page
24
+
25
+ Args:
26
+ df: Sentiment dataframe
27
+ """
28
+ st.title("📊 Sentiment Analysis Dashboard")
29
+
30
+ # Performance tip
31
+ if len(df) > 10000:
32
+ st.info(f"💡 **Performance Tip**: Loaded {len(df):,} comments. Use the global filters in the sidebar to narrow down your analysis for faster performance.")
33
+
34
+ st.markdown("---")
35
+
36
+ # Initialize components
37
+ sentiment_charts = SentimentCharts()
38
+ distribution_charts = DistributionCharts()
39
+ processor = SentimentDataProcessor()
40
+
41
+ # Display overall summary statistics
42
+ ContentCards.display_summary_stats(df)
43
+
44
+ st.markdown("---")
45
+
46
+ # Calculate overall metrics
47
+ overall_metrics = SentimentMetrics.calculate_overall_metrics(df)
48
+
49
+ # Display health indicator
50
+ col1, col2, col3 = st.columns([1, 2, 1])
51
+ with col2:
52
+ ContentCards.display_health_indicator(overall_metrics['negative_pct'])
53
+
54
+ st.markdown("---")
55
+
56
+ # Overall sentiment distribution
57
+ st.markdown("## 🎯 Overall Sentiment Distribution")
58
+
59
+ col1, col2 = st.columns(2)
60
+
61
+ with col1:
62
+ # Sentiment pie chart
63
+ sentiment_pie = sentiment_charts.create_sentiment_pie_chart(df, title="Overall Sentiment Distribution")
64
+ st.plotly_chart(sentiment_pie, use_container_width=True)
65
+
66
+ with col2:
67
+ # Sentiment score gauge
68
+ sentiment_gauge = sentiment_charts.create_sentiment_score_gauge(
69
+ overall_metrics['avg_sentiment_score'],
70
+ title="Overall Sentiment Score"
71
+ )
72
+ st.plotly_chart(sentiment_gauge, use_container_width=True)
73
+
74
+ # Additional metrics
75
+ metric_col1, metric_col2 = st.columns(2)
76
+ with metric_col1:
77
+ st.metric("Positive %", f"{overall_metrics['positive_pct']:.1f}%")
78
+ with metric_col2:
79
+ st.metric("Reply Rate %", f"{overall_metrics['reply_required_pct']:.1f}%")
80
+
81
+ st.markdown("---")
82
+
83
+ # Sentiment by Brand
84
+ st.markdown("## 🏢 Sentiment Analysis by Brand")
85
+
86
+ col1, col2 = st.columns(2)
87
+
88
+ with col1:
89
+ # Stacked bar chart
90
+ brand_sentiment_bar = sentiment_charts.create_sentiment_bar_chart(
91
+ df, group_by='brand', title="Sentiment Distribution by Brand"
92
+ )
93
+ st.plotly_chart(brand_sentiment_bar, use_container_width=True)
94
+
95
+ with col2:
96
+ # Percentage bar chart
97
+ brand_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart(
98
+ df, group_by='brand', title="Sentiment Distribution by Brand (%)"
99
+ )
100
+ st.plotly_chart(brand_sentiment_pct, use_container_width=True)
101
+
102
+ # Brand metrics table
103
+ with st.expander("📈 Detailed Brand Metrics"):
104
+ brand_metrics = SentimentMetrics.calculate_brand_metrics(df)
105
+
106
+ brand_data = []
107
+ for brand, metrics in brand_metrics.items():
108
+ brand_data.append({
109
+ 'Brand': brand.title(),
110
+ 'Total Comments': metrics['total_comments'],
111
+ 'Replies Needed': metrics['total_reply_required'],
112
+ 'Negative %': f"{metrics['negative_pct']:.1f}%",
113
+ 'Positive %': f"{metrics['positive_pct']:.1f}%",
114
+ 'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}"
115
+ })
116
+
117
+ st.table(brand_data)
118
+
119
+ st.markdown("---")
120
+
121
+ # Sentiment by Platform
122
+ st.markdown("## 🌐 Sentiment Analysis by Platform")
123
+
124
+ col1, col2 = st.columns(2)
125
+
126
+ with col1:
127
+ # Stacked bar chart
128
+ platform_sentiment_bar = sentiment_charts.create_sentiment_bar_chart(
129
+ df, group_by='platform', title="Sentiment Distribution by Platform"
130
+ )
131
+ st.plotly_chart(platform_sentiment_bar, use_container_width=True)
132
+
133
+ with col2:
134
+ # Percentage bar chart
135
+ platform_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart(
136
+ df, group_by='platform', title="Sentiment Distribution by Platform (%)"
137
+ )
138
+ st.plotly_chart(platform_sentiment_pct, use_container_width=True)
139
+
140
+ # Platform metrics table
141
+ with st.expander("📈 Detailed Platform Metrics"):
142
+ platform_metrics = SentimentMetrics.calculate_platform_metrics(df)
143
+
144
+ platform_data = []
145
+ for platform, metrics in platform_metrics.items():
146
+ platform_data.append({
147
+ 'Platform': platform.title(),
148
+ 'Total Comments': metrics['total_comments'],
149
+ 'Replies Needed': metrics['total_reply_required'],
150
+ 'Negative %': f"{metrics['negative_pct']:.1f}%",
151
+ 'Positive %': f"{metrics['positive_pct']:.1f}%",
152
+ 'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}"
153
+ })
154
+
155
+ st.table(platform_data)
156
+
157
+ st.markdown("---")
158
+
159
+ # Intent Analysis
160
+ st.markdown("## 🎭 Intent Analysis")
161
+
162
+ col1, col2 = st.columns(2)
163
+
164
+ with col1:
165
+ # Intent bar chart
166
+ intent_bar = distribution_charts.create_intent_bar_chart(
167
+ df, title="Intent Distribution", orientation='h'
168
+ )
169
+ st.plotly_chart(intent_bar, use_container_width=True)
170
+
171
+ with col2:
172
+ # Intent pie chart
173
+ intent_pie = distribution_charts.create_intent_pie_chart(df, title="Intent Distribution")
174
+ st.plotly_chart(intent_pie, use_container_width=True)
175
+
176
+ st.markdown("---")
177
+
178
+ # Brand-Platform Matrix
179
+ st.markdown("## 🔀 Cross-Dimensional Analysis")
180
+
181
+ col1, col2 = st.columns(2)
182
+
183
+ with col1:
184
+ # Heatmap showing comment distribution
185
+ brand_platform_matrix = distribution_charts.create_brand_platform_matrix(
186
+ df, title="Brand-Platform Comment Matrix"
187
+ )
188
+ st.plotly_chart(brand_platform_matrix, use_container_width=True)
189
+
190
+ with col2:
191
+ # Sentiment heatmap
192
+ sentiment_heatmap = sentiment_charts.create_sentiment_heatmap(
193
+ df, row_dimension='brand', col_dimension='platform', title="Negative Sentiment Heatmap"
194
+ )
195
+ st.plotly_chart(sentiment_heatmap, use_container_width=True)
196
+
197
+ st.markdown("---")
198
+
199
+ # Platform and Brand Distribution
200
+ st.markdown("## 📊 Volume Analysis")
201
+
202
+ col1, col2 = st.columns(2)
203
+
204
+ with col1:
205
+ # Platform distribution
206
+ platform_dist = distribution_charts.create_platform_distribution(df, title="Comments by Platform")
207
+ st.plotly_chart(platform_dist, use_container_width=True)
208
+
209
+ with col2:
210
+ # Brand distribution
211
+ brand_dist = distribution_charts.create_brand_distribution(df, title="Comments by Brand")
212
+ st.plotly_chart(brand_dist, use_container_width=True)
213
+
214
+ st.markdown("---")
215
+
216
+ # Reply Requirements
217
+ st.markdown("## ⚠️ Reply Requirements Analysis")
218
+
219
+ col1, col2 = st.columns(2)
220
+
221
+ with col1:
222
+ # Reply required by brand
223
+ reply_brand = distribution_charts.create_reply_required_chart(
224
+ df, group_by='brand', title="Comments Requiring Reply by Brand"
225
+ )
226
+ st.plotly_chart(reply_brand, use_container_width=True)
227
+
228
+ with col2:
229
+ # Reply required by platform
230
+ reply_platform = distribution_charts.create_reply_required_chart(
231
+ df, group_by='platform', title="Comments Requiring Reply by Platform"
232
+ )
233
+ st.plotly_chart(reply_platform, use_container_width=True)
234
+
235
+ # Response urgency metrics
236
+ urgency_metrics = SentimentMetrics.calculate_response_urgency(df)
237
+
238
+ st.markdown("### 🚨 Response Urgency Breakdown")
239
+ urgency_col1, urgency_col2, urgency_col3, urgency_col4 = st.columns(4)
240
+
241
+ with urgency_col1:
242
+ st.metric("🔴 Urgent", urgency_metrics['urgent_count'], help="Negative sentiment + requires reply")
243
+
244
+ with urgency_col2:
245
+ st.metric("🟠 High Priority", urgency_metrics['high_priority_count'], help="Neutral with feedback/request")
246
+
247
+ with urgency_col3:
248
+ st.metric("🟡 Medium Priority", urgency_metrics['medium_priority_count'], help="Positive requiring reply")
249
+
250
+ with urgency_col4:
251
+ st.metric("🟢 Low Priority", urgency_metrics['low_priority_count'], help="Very positive requiring reply")
252
+
253
+ st.markdown("---")
254
+
255
+ st.markdown("---")
256
+
257
+ # Demographics Analysis (for musora_app only)
258
+ # Check if we have musora_app data and demographic fields
259
+ has_musora_app = 'platform' in df.columns and 'musora_app' in df['platform'].values
260
+ has_demographics = (
261
+ has_musora_app and
262
+ 'age_group' in df.columns and
263
+ 'timezone' in df.columns and
264
+ 'experience_level' in df.columns
265
+ )
266
+
267
+ if has_demographics:
268
+ # Filter for musora_app data only
269
+ df_musora = df[df['platform'] == 'musora_app'].copy()
270
+
271
+ # Check if we have any demographic data (not all Unknown)
272
+ has_valid_demographics = (
273
+ (df_musora['age_group'] != 'Unknown').any() or
274
+ (df_musora['timezone_region'] != 'Unknown').any() or
275
+ (df_musora['experience_group'] != 'Unknown').any()
276
+ )
277
+
278
+ if has_valid_demographics and len(df_musora) > 0:
279
+ st.markdown("## 👥 Demographics Analysis (Musora App)")
280
+ st.info(f"📊 Analyzing demographics for **{len(df_musora):,}** Musora App comments")
281
+
282
+ # Initialize demographic charts
283
+ demographic_charts = DemographicCharts()
284
+
285
+ # Get demographic summary
286
+ demo_summary = processor.get_demographics_summary(df_musora)
287
+
288
+ # Display summary metrics
289
+ demo_col1, demo_col2, demo_col3, demo_col4 = st.columns(4)
290
+
291
+ with demo_col1:
292
+ st.metric(
293
+ "Comments with Demographics",
294
+ f"{demo_summary['users_with_demographics']:,}",
295
+ f"{demo_summary['coverage_percentage']:.1f}% coverage"
296
+ )
297
+
298
+ with demo_col2:
299
+ if demo_summary['avg_age'] is not None:
300
+ st.metric("Average Age", f"{demo_summary['avg_age']:.1f} years")
301
+ else:
302
+ st.metric("Average Age", "N/A")
303
+
304
+ with demo_col3:
305
+ st.metric("Most Common Region", demo_summary['most_common_region'])
306
+
307
+ with demo_col4:
308
+ if demo_summary['avg_experience'] is not None:
309
+ st.metric("Avg Experience", f"{demo_summary['avg_experience']:.1f}/10")
310
+ else:
311
+ st.metric("Avg Experience", "N/A")
312
+
313
+ st.markdown("---")
314
+
315
+ # Age Analysis
316
+ st.markdown("### 🎂 Age Distribution")
317
+
318
+ age_dist = processor.get_demographics_distribution(df_musora, 'age_group')
319
+ age_sentiment = processor.get_demographics_by_sentiment(df_musora, 'age_group')
320
+
321
+ if not age_dist.empty:
322
+ col1, col2 = st.columns(2)
323
+
324
+ with col1:
325
+ age_chart = demographic_charts.create_age_distribution_chart(
326
+ age_dist,
327
+ title="Comments by Age Group"
328
+ )
329
+ st.plotly_chart(age_chart, use_container_width=True)
330
+
331
+ with col2:
332
+ age_sent_chart = demographic_charts.create_age_sentiment_chart(
333
+ age_sentiment,
334
+ title="Sentiment Distribution by Age Group"
335
+ )
336
+ st.plotly_chart(age_sent_chart, use_container_width=True)
337
+
338
+ # Insights
339
+ with st.expander("💡 Age Insights"):
340
+ if len(age_dist) > 0:
341
+ top_age_group = age_dist.iloc[0]['age_group']
342
+ top_age_count = age_dist.iloc[0]['count']
343
+ top_age_pct = age_dist.iloc[0]['percentage']
344
+
345
+ st.write(f"**Most Active Age Group:** {top_age_group} ({top_age_count:,} comments, {top_age_pct:.1f}%)")
346
+
347
+ # Find age group with most negative sentiment
348
+ if not age_sentiment.empty:
349
+ negative_sentiments = age_sentiment[
350
+ age_sentiment['sentiment_polarity'].isin(['negative', 'very_negative'])
351
+ ].groupby('age_group')['percentage'].sum().reset_index()
352
+
353
+ if len(negative_sentiments) > 0:
354
+ negative_sentiments = negative_sentiments.sort_values('percentage', ascending=False)
355
+ most_negative_age = negative_sentiments.iloc[0]['age_group']
356
+ most_negative_pct = negative_sentiments.iloc[0]['percentage']
357
+ st.write(f"**Highest Negative Sentiment:** {most_negative_age} ({most_negative_pct:.1f}% negative)")
358
+ else:
359
+ st.info("No age data available for visualization")
360
+
361
+ st.markdown("---")
362
+
363
+ # Timezone Analysis
364
+ st.markdown("### 🌍 Geographic Distribution")
365
+
366
+ # Get timezone data
367
+ top_timezones = processor.get_top_timezones(df_musora, top_n=15)
368
+ region_dist = processor.get_timezone_regions_distribution(df_musora)
369
+ region_sentiment = processor.get_demographics_by_sentiment(df_musora, 'timezone_region')
370
+
371
+ if not top_timezones.empty or not region_dist.empty:
372
+ # Top timezones
373
+ if not top_timezones.empty:
374
+ st.markdown("#### Top 15 Timezones")
375
+ timezone_chart = demographic_charts.create_timezone_chart(
376
+ top_timezones,
377
+ title="Most Common Timezones",
378
+ top_n=15
379
+ )
380
+ st.plotly_chart(timezone_chart, use_container_width=True)
381
+
382
+ # Regional distribution
383
+ if not region_dist.empty:
384
+ st.markdown("#### Regional Distribution")
385
+ col1, col2 = st.columns(2)
386
+
387
+ with col1:
388
+ region_chart = demographic_charts.create_region_distribution_chart(
389
+ region_dist,
390
+ title="Comments by Region"
391
+ )
392
+ st.plotly_chart(region_chart, use_container_width=True)
393
+
394
+ with col2:
395
+ if not region_sentiment.empty:
396
+ region_sent_chart = demographic_charts.create_region_sentiment_chart(
397
+ region_sentiment,
398
+ title="Sentiment Distribution by Region"
399
+ )
400
+ st.plotly_chart(region_sent_chart, use_container_width=True)
401
+
402
+ # Insights
403
+ with st.expander("💡 Geographic Insights"):
404
+ if not top_timezones.empty:
405
+ top_tz = top_timezones.iloc[0]['timezone']
406
+ top_tz_count = top_timezones.iloc[0]['count']
407
+ top_tz_pct = top_timezones.iloc[0]['percentage']
408
+ st.write(f"**Most Common Timezone:** {top_tz} ({top_tz_count:,} comments, {top_tz_pct:.1f}%)")
409
+
410
+ if not region_dist.empty:
411
+ top_region = region_dist.iloc[0]['timezone_region']
412
+ top_region_count = region_dist.iloc[0]['count']
413
+ top_region_pct = region_dist.iloc[0]['percentage']
414
+ st.write(f"**Most Active Region:** {top_region} ({top_region_count:,} comments, {top_region_pct:.1f}%)")
415
+
416
+ # Find region with most negative sentiment
417
+ if not region_sentiment.empty:
418
+ negative_regions = region_sentiment[
419
+ region_sentiment['sentiment_polarity'].isin(['negative', 'very_negative'])
420
+ ].groupby('timezone_region')['percentage'].sum().reset_index()
421
+
422
+ if len(negative_regions) > 0:
423
+ negative_regions = negative_regions.sort_values('percentage', ascending=False)
424
+ most_negative_region = negative_regions.iloc[0]['timezone_region']
425
+ most_negative_region_pct = negative_regions.iloc[0]['percentage']
426
+ st.write(f"**Highest Negative Sentiment:** {most_negative_region} ({most_negative_region_pct:.1f}% negative)")
427
+ else:
428
+ st.info("No timezone/region data available for visualization")
429
+
430
+ st.markdown("---")
431
+
432
+ # Experience Level Analysis
433
+ st.markdown("### 🎯 Experience Level Distribution")
434
+
435
+ # Get both detailed and grouped experience data
436
+ exp_dist_detailed = processor.get_experience_level_distribution(df_musora, use_groups=False)
437
+ exp_dist_grouped = processor.get_experience_level_distribution(df_musora, use_groups=True)
438
+ exp_sentiment_grouped = processor.get_demographics_by_sentiment(df_musora, 'experience_group')
439
+
440
+ if not exp_dist_detailed.empty or not exp_dist_grouped.empty:
441
+ # Tabs for detailed vs grouped view
442
+ tab1, tab2 = st.tabs(["📊 Detailed (0-10)", "📊 Grouped (Beginner/Intermediate/Advanced)"])
443
+
444
+ with tab1:
445
+ if not exp_dist_detailed.empty:
446
+ exp_chart_detailed = demographic_charts.create_experience_distribution_chart(
447
+ exp_dist_detailed,
448
+ title="Comments by Experience Level (0-10 Scale)",
449
+ use_groups=False
450
+ )
451
+ st.plotly_chart(exp_chart_detailed, use_container_width=True)
452
+ else:
453
+ st.info("No detailed experience level data available")
454
+
455
+ with tab2:
456
+ if not exp_dist_grouped.empty:
457
+ col1, col2 = st.columns(2)
458
+
459
+ with col1:
460
+ exp_chart_grouped = demographic_charts.create_experience_distribution_chart(
461
+ exp_dist_grouped,
462
+ title="Comments by Experience Group",
463
+ use_groups=True
464
+ )
465
+ st.plotly_chart(exp_chart_grouped, use_container_width=True)
466
+
467
+ with col2:
468
+ if not exp_sentiment_grouped.empty:
469
+ exp_sent_chart = demographic_charts.create_experience_sentiment_chart(
470
+ exp_sentiment_grouped,
471
+ title="Sentiment by Experience Group",
472
+ use_groups=True
473
+ )
474
+ st.plotly_chart(exp_sent_chart, use_container_width=True)
475
+ else:
476
+ st.info("No grouped experience level data available")
477
+
478
+ # Insights
479
+ with st.expander("💡 Experience Insights"):
480
+ if not exp_dist_grouped.empty:
481
+ top_exp_group = exp_dist_grouped.iloc[0]['experience_group']
482
+ top_exp_count = exp_dist_grouped.iloc[0]['count']
483
+ top_exp_pct = exp_dist_grouped.iloc[0]['percentage']
484
+ st.write(f"**Most Active Group:** {top_exp_group} ({top_exp_count:,} comments, {top_exp_pct:.1f}%)")
485
+
486
+ # Find experience group with most negative sentiment
487
+ if not exp_sentiment_grouped.empty:
488
+ negative_exp = exp_sentiment_grouped[
489
+ exp_sentiment_grouped['sentiment_polarity'].isin(['negative', 'very_negative'])
490
+ ].groupby('experience_group')['percentage'].sum().reset_index()
491
+
492
+ if len(negative_exp) > 0:
493
+ negative_exp = negative_exp.sort_values('percentage', ascending=False)
494
+ most_negative_exp = negative_exp.iloc[0]['experience_group']
495
+ most_negative_exp_pct = negative_exp.iloc[0]['percentage']
496
+ st.write(f"**Highest Negative Sentiment:** {most_negative_exp} ({most_negative_exp_pct:.1f}% negative)")
497
+
498
+ if demo_summary['avg_experience'] is not None:
499
+ st.write(f"**Average Experience Level:** {demo_summary['avg_experience']:.2f}/10")
500
+ st.write(f"**Most Common Experience Group:** {demo_summary.get('most_common_experience', 'Unknown')}")
501
+ else:
502
+ st.info("No experience level data available for visualization")
503
+
504
+ st.markdown("---")
505
+
506
+ # Language Distribution (if available)
507
+ if 'detected_language' in df.columns:
508
+ st.markdown("## 🌍 Language Distribution")
509
+
510
+ lang_dist = distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages")
511
+ st.plotly_chart(lang_dist, use_container_width=True)
512
+
513
+ st.markdown("---")
514
+
515
+ # Temporal trends (if timestamp available)
516
+ if 'comment_timestamp' in df.columns and not df.empty:
517
+ with st.expander("📈 Temporal Trends", expanded=False):
518
+ # Frequency selector
519
+ freq_col1, freq_col2 = st.columns([1, 3])
520
+
521
+ with freq_col1:
522
+ freq = st.selectbox(
523
+ "Time Granularity",
524
+ options=['D', 'W', 'M'],
525
+ format_func=lambda x: {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}[x],
526
+ index=1 # Default to Weekly
527
+ )
528
+
529
+ sentiment_timeline = sentiment_charts.create_sentiment_timeline(df, freq=freq, title="Sentiment Trends Over Time")
530
+ st.plotly_chart(sentiment_timeline, use_container_width=True)
531
+
532
+ # Hierarchical sunburst
533
+ with st.expander("🌟 Hierarchical View", expanded=False):
534
+ st.markdown("**Interactive Brand > Platform > Sentiment Distribution**")
535
+ sunburst = distribution_charts.create_combined_distribution_sunburst(
536
+ df, title="Brand > Platform > Sentiment Distribution"
537
+ )
538
+ st.plotly_chart(sunburst, use_container_width=True)
visualization/components/reply_required.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reply Required Page
3
+ Displays comments that require replies with filtering and prioritization
4
+ """
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add parent directory to path
11
+ parent_dir = Path(__file__).resolve().parent.parent
12
+ sys.path.append(str(parent_dir))
13
+
14
+ from utils.data_processor import SentimentDataProcessor
15
+ from utils.metrics import SentimentMetrics
16
+ from visualizations.sentiment_charts import SentimentCharts
17
+ from visualizations.distribution_charts import DistributionCharts
18
+ from visualizations.content_cards import ContentCards
19
+
20
+
21
def render_reply_required(df):
    """
    Render the "Comments Requiring Reply" page.

    Displays summary metrics, a response-urgency breakdown, filter controls
    (priority / platform / brand / intent), sentiment and intent charts, a
    paginated list of comment cards, a CSV export button, and a per-content
    roll-up of how many replies each piece of content needs.

    Args:
        df: Sentiment dataframe. Expected columns include
            'sentiment_polarity', 'intent', 'platform' and 'brand';
            'content_sk', 'comment_sk', 'content_description' and
            'permalink_url' are used when present.
    """
    st.title("⚠️ Comments Requiring Reply")
    st.markdown("Manage and prioritize comments that need responses")
    st.markdown("---")

    # Initialize components
    processor = SentimentDataProcessor()
    metrics = SentimentMetrics()

    # Get comments requiring reply
    reply_comments = processor.get_comments_requiring_reply(df)

    if reply_comments.empty:
        st.success("🎉 Great news! No comments currently require replies.")
        return

    # ------------------------------------------------------------------
    # Summary statistics
    # ------------------------------------------------------------------
    st.markdown("### 📊 Summary")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Total Replies Needed", len(reply_comments))

    with col2:
        # Urgency is computed over the full dataset (not just reply_comments)
        # so this number matches the breakdown section below.
        urgency = metrics.calculate_response_urgency(df)
        st.metric("🔴 Urgent", urgency['urgent_count'], help="Negative sentiment")

    with col3:
        unique_contents = reply_comments['content_sk'].nunique() if 'content_sk' in reply_comments.columns else 0
        st.metric("Affected Contents", unique_contents)

    with col4:
        negative_count = reply_comments['sentiment_polarity'].isin(['negative', 'very_negative']).sum()
        negative_pct = (negative_count / len(reply_comments) * 100) if len(reply_comments) > 0 else 0
        st.metric("Negative %", f"{negative_pct:.1f}%")

    st.markdown("---")

    # ------------------------------------------------------------------
    # Urgency breakdown
    # ------------------------------------------------------------------
    st.markdown("### 🚨 Response Urgency Breakdown")

    urgency_metrics = metrics.calculate_response_urgency(df)

    urgency_col1, urgency_col2, urgency_col3, urgency_col4 = st.columns(4)

    with urgency_col1:
        st.metric(
            "🔴 Urgent",
            urgency_metrics['urgent_count'],
            help="Negative sentiment requiring reply - immediate action needed"
        )

    with urgency_col2:
        st.metric(
            "🟠 High Priority",
            urgency_metrics['high_priority_count'],
            help="Neutral with feedback/request - respond within 24 hours"
        )

    with urgency_col3:
        st.metric(
            "🟡 Medium Priority",
            urgency_metrics['medium_priority_count'],
            help="Positive requiring reply - respond within 48 hours"
        )

    with urgency_col4:
        st.metric(
            "🟢 Low Priority",
            urgency_metrics['low_priority_count'],
            help="Very positive requiring reply - respond when convenient"
        )

    st.markdown("---")

    # ------------------------------------------------------------------
    # Filters
    # ------------------------------------------------------------------
    st.markdown("### 🔍 Filters")

    filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)

    with filter_col1:
        # Priority filter
        priority_options = ['All', '🔴 Urgent', '🟠 High', '🟡 Medium', '🟢 Low']
        selected_priority = st.selectbox("Priority", priority_options, index=0)

    with filter_col2:
        # Platform filter
        platform_options = ['All'] + sorted(reply_comments['platform'].unique().tolist())
        selected_platform = st.selectbox("Platform", platform_options, index=0)

    with filter_col3:
        # Brand filter
        brand_options = ['All'] + sorted(reply_comments['brand'].unique().tolist())
        selected_brand = st.selectbox("Brand", brand_options, index=0)

    with filter_col4:
        # Intent filter. Intents are stored as comma-separated strings, so
        # split and explode to get individual values.
        # BUGFIX: drop NaN intents first — sorting a mix of float NaN and
        # strings raises TypeError.
        intent_list = (
            reply_comments['intent']
            .dropna()
            .str.split(',')
            .explode()
            .str.strip()
            .unique()
            .tolist()
        )
        intent_options = ['All'] + sorted(intent_list)
        selected_intent = st.selectbox("Intent", intent_options, index=0)

    # Apply filters
    filtered_comments = reply_comments.copy()

    # Priority filtering — tiers mirror the urgency breakdown above.
    if selected_priority != 'All':
        if selected_priority == '🔴 Urgent':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'].isin(['negative', 'very_negative'])
            ]
        elif selected_priority == '🟠 High':
            filtered_comments = filtered_comments[
                (filtered_comments['sentiment_polarity'] == 'neutral') &
                (filtered_comments['intent'].str.contains('feedback_negative|request', na=False))
            ]
        elif selected_priority == '🟡 Medium':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'] == 'positive'
            ]
        elif selected_priority == '🟢 Low':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'] == 'very_positive'
            ]

    # Platform filtering
    if selected_platform != 'All':
        filtered_comments = filtered_comments[filtered_comments['platform'] == selected_platform]

    # Brand filtering
    if selected_brand != 'All':
        filtered_comments = filtered_comments[filtered_comments['brand'] == selected_brand]

    # Intent filtering (substring match against the comma-separated field)
    if selected_intent != 'All':
        filtered_comments = filtered_comments[
            filtered_comments['intent'].str.contains(selected_intent, na=False)
        ]

    st.markdown(f"**Showing {len(filtered_comments)} comments after filtering**")

    st.markdown("---")

    # ------------------------------------------------------------------
    # Visualizations
    # ------------------------------------------------------------------
    if not filtered_comments.empty:
        st.markdown("### 📈 Analysis")

        viz_col1, viz_col2 = st.columns(2)

        with viz_col1:
            # Sentiment distribution
            sentiment_charts = SentimentCharts()
            sentiment_pie = sentiment_charts.create_sentiment_pie_chart(
                filtered_comments, title="Sentiment Distribution"
            )
            st.plotly_chart(sentiment_pie, use_container_width=True)

        with viz_col2:
            # Intent distribution
            distribution_charts = DistributionCharts()
            intent_bar = distribution_charts.create_intent_bar_chart(
                filtered_comments, title="Intent Distribution", orientation='h'
            )
            st.plotly_chart(intent_bar, use_container_width=True)

        st.markdown("---")

    # ------------------------------------------------------------------
    # Paginated comment list
    # ------------------------------------------------------------------
    st.markdown("### 💬 Comments Requiring Reply")

    items_per_page = 10
    # max(1, ...) keeps total_pages sane when the filtered set is empty
    # (the old formula yielded 0 pages for 0 rows).
    total_pages = max(1, (len(filtered_comments) - 1) // items_per_page + 1)

    if 'reply_page' not in st.session_state:
        st.session_state.reply_page = 1

    # BUGFIX: clamp the stored page number. Changing filters can shrink the
    # result set, which previously left the user stranded on a page past the
    # end, showing an empty slice.
    st.session_state.reply_page = min(max(st.session_state.reply_page, 1), total_pages)

    # Pagination controls at top
    if total_pages > 1:
        page_col1, page_col2, page_col3 = st.columns([1, 2, 1])

        with page_col1:
            if st.button("⬅️ Previous", key="prev_top", disabled=(st.session_state.reply_page <= 1)):
                st.session_state.reply_page -= 1
                st.rerun()

        with page_col2:
            st.markdown(f"<center>Page {st.session_state.reply_page} of {total_pages}</center>", unsafe_allow_html=True)

        with page_col3:
            if st.button("Next ➡️", key="next_top", disabled=(st.session_state.reply_page >= total_pages)):
                st.session_state.reply_page += 1
                st.rerun()

        st.markdown("---")

    # Get paginated comments
    start_idx = (st.session_state.reply_page - 1) * items_per_page
    end_idx = start_idx + items_per_page
    paginated_comments = filtered_comments.iloc[start_idx:end_idx]

    # Display comments
    if paginated_comments.empty:
        st.info("No comments match the selected filters")
    else:
        for idx, (_, comment) in enumerate(paginated_comments.iterrows(), start=start_idx + 1):
            # Priority badge. BUGFIX: guard against NaN intents before the
            # substring tests below ('in' on a float raises TypeError).
            intent_value = comment['intent'] if isinstance(comment['intent'], str) else ''
            priority_emoji = "🟢"
            if comment['sentiment_polarity'] in ['negative', 'very_negative']:
                priority_emoji = "🔴"
            elif comment['sentiment_polarity'] == 'neutral' and any(
                intent in intent_value for intent in ['feedback_negative', 'request']
            ):
                priority_emoji = "🟠"
            elif comment['sentiment_polarity'] == 'positive':
                priority_emoji = "🟡"

            st.markdown(f"#### {priority_emoji} Comment #{idx}")

            # Display comment card
            ContentCards.display_comment_card(comment, show_original=True)

    # Pagination controls at bottom
    if total_pages > 1:
        st.markdown("---")

        page_col1, page_col2, page_col3 = st.columns([1, 2, 1])

        with page_col1:
            if st.button("⬅️ Previous", key="prev_bottom", disabled=(st.session_state.reply_page <= 1)):
                st.session_state.reply_page -= 1
                st.rerun()

        with page_col2:
            st.markdown(f"<center>Page {st.session_state.reply_page} of {total_pages}</center>", unsafe_allow_html=True)

        with page_col3:
            if st.button("Next ➡️", key="next_bottom", disabled=(st.session_state.reply_page >= total_pages)):
                st.session_state.reply_page += 1
                st.rerun()

    st.markdown("---")

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    st.markdown("### 💾 Export Data")

    col1, col2 = st.columns([1, 3])

    with col1:
        # Prepare export data — only include columns that actually exist.
        export_columns = [
            'comment_id', 'author_name', 'platform', 'brand', 'comment_timestamp',
            'display_text', 'original_text', 'detected_language', 'sentiment_polarity',
            'intent', 'sentiment_confidence', 'content_description', 'permalink_url'
        ]

        available_columns = [col for col in export_columns if col in filtered_comments.columns]
        export_data = filtered_comments[available_columns]

        csv = export_data.to_csv(index=False)

        st.download_button(
            label="📥 Download as CSV",
            data=csv,
            file_name="comments_requiring_reply.csv",  # was a no-op f-string
            mime="text/csv"
        )

    with col2:
        st.info("Download the filtered comments for team collaboration or CRM import")

    st.markdown("---")

    # ------------------------------------------------------------------
    # Per-content reply summary
    # ------------------------------------------------------------------
    st.markdown("### 📋 Reply Requirements by Content")

    # ROBUSTNESS: this roll-up needs content metadata columns. Earlier code
    # already treats 'content_sk' as optional, so skip gracefully instead of
    # raising KeyError when they are absent.
    required_cols = {'content_sk', 'comment_sk', 'content_description', 'permalink_url'}
    if not required_cols.issubset(filtered_comments.columns):
        st.info("Content metadata not available for this dataset")
        return

    content_reply_summary = filtered_comments.groupby('content_sk').agg({
        'comment_sk': 'count',
        'content_description': 'first',
        'permalink_url': 'first'
    }).reset_index()

    content_reply_summary.columns = ['content_sk', 'replies_needed', 'content_description', 'permalink_url']
    content_reply_summary = content_reply_summary.sort_values('replies_needed', ascending=False).head(10)

    for idx, (_, content) in enumerate(content_reply_summary.iterrows(), 1):
        with st.expander(f"📝 Content #{idx} - {content['replies_needed']} replies needed"):
            st.markdown(f"**Description:** {content['content_description']}")
            if pd.notna(content['permalink_url']):
                st.markdown(f"**Link:** [View Content]({content['permalink_url']})")

            # Show up to three example comments for this content
            content_comments = filtered_comments[filtered_comments['content_sk'] == content['content_sk']].head(3)

            st.markdown(f"**Top {len(content_comments)} comments:**")
            for _, comment in content_comments.iterrows():
                ContentCards.display_comment_card(comment, show_original=True)
visualization/components/sentiment_analysis.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sentiment Analysis Page
3
+ Analyze content performance across all sentiment types with advanced filtering
4
+ """
5
+ import streamlit as st
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path
10
+ parent_dir = Path(__file__).resolve().parent.parent
11
+ sys.path.append(str(parent_dir))
12
+
13
+ from utils.data_processor import SentimentDataProcessor
14
+ from visualizations.sentiment_charts import SentimentCharts
15
+ from visualizations.distribution_charts import DistributionCharts
16
+ from visualizations.content_cards import ContentCards
17
+ from agents.content_summary_agent import ContentSummaryAgent
18
+
19
+
20
+ def render_sentiment_analysis(df):
21
+ """
22
+ Render the sentiment analysis page
23
+
24
+ Args:
25
+ df: Sentiment dataframe (full dataset from app.py)
26
+ """
27
+ st.title("🔍 Sentiment Analysis")
28
+ st.markdown("Analyze content performance based on sentiment patterns and user feedback")
29
+ st.markdown("---")
30
+
31
+ # Initialize components
32
+ processor = SentimentDataProcessor()
33
+ sentiment_charts = SentimentCharts()
34
+ distribution_charts = DistributionCharts()
35
+
36
+ # Initialize AI agent
37
+ summary_agent = ContentSummaryAgent(model="gpt-5-nano", temperature=1)
38
+
39
+ # Initialize session state for caching summaries
40
+ if 'content_summaries' not in st.session_state:
41
+ st.session_state.content_summaries = {}
42
+
43
+ # Page-Specific Filters (Platform and Brand Selection)
44
+ st.markdown("### 🎯 Select Platform and Brand")
45
+ st.info("⚡ **Performance Optimization**: Select a specific platform and brand to analyze. This filters the data and makes the page load faster.")
46
+
47
+ filter_col1, filter_col2 = st.columns(2)
48
+
49
+ with filter_col1:
50
+ # Get available platforms
51
+ available_platforms = sorted(df['platform'].unique().tolist())
52
+ selected_platform = st.selectbox(
53
+ "Platform *",
54
+ options=[''] + available_platforms,
55
+ index=0,
56
+ help="Select the platform to analyze"
57
+ )
58
+
59
+ with filter_col2:
60
+ # Get available brands
61
+ available_brands = sorted(df['brand'].unique().tolist())
62
+ selected_brand = st.selectbox(
63
+ "Brand *",
64
+ options=[''] + available_brands,
65
+ index=0,
66
+ help="Select the brand to analyze"
67
+ )
68
+
69
+ # Check if both platform and brand are selected
70
+ if not selected_platform or not selected_brand:
71
+ st.warning("⚠️ Please select both **Platform** and **Brand** to view sentiment analysis.")
72
+ st.markdown("---")
73
+
74
+ # Show summary of available data
75
+ st.markdown("### 📊 Available Data Summary")
76
+ col1, col2, col3 = st.columns(3)
77
+
78
+ with col1:
79
+ st.metric("Total Comments", f"{len(df):,}")
80
+
81
+ with col2:
82
+ st.metric("Platforms", len(available_platforms))
83
+ with st.expander("View Platforms"):
84
+ for platform in available_platforms:
85
+ count = len(df[df['platform'] == platform])
86
+ st.write(f"- **{platform}**: {count:,} comments")
87
+
88
+ with col3:
89
+ st.metric("Brands", len(available_brands))
90
+ with st.expander("View Brands"):
91
+ for brand in available_brands:
92
+ count = len(df[df['brand'] == brand])
93
+ st.write(f"- **{brand}**: {count:,} comments")
94
+
95
+ return
96
+
97
+ # Filter dataframe by selected platform and brand
98
+ df_filtered = df[
99
+ (df['platform'] == selected_platform) &
100
+ (df['brand'] == selected_brand)
101
+ ].copy()
102
+
103
+ if df_filtered.empty:
104
+ st.error(f"❌ No data found for **{selected_platform}** + **{selected_brand}** combination.")
105
+ return
106
+
107
+ # Show data info
108
+ st.success(f"✅ Loaded **{len(df_filtered):,}** comments for **{selected_platform}** + **{selected_brand}**")
109
+ st.markdown("---")
110
+
111
+ # Filters Section
112
+ st.markdown("### 🔍 Content Filters")
113
+ st.markdown("Filter contents by sentiment and intent to focus your analysis")
114
+
115
+ filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)
116
+
117
+ with filter_col1:
118
+ # Sentiment filter
119
+ sentiment_options = df_filtered['sentiment_polarity'].unique().tolist()
120
+ selected_sentiments = st.multiselect(
121
+ "Sentiment",
122
+ options=sorted(sentiment_options),
123
+ default=[],
124
+ help="Filter by dominant sentiment. Leave empty to show all sentiments."
125
+ )
126
+
127
+ with filter_col2:
128
+ # Intent filter
129
+ intent_list = df_filtered['intent'].str.split(',').explode().str.strip().unique().tolist()
130
+ selected_intents = st.multiselect(
131
+ "Intent",
132
+ options=sorted([i for i in intent_list if i]),
133
+ default=[],
134
+ help="Filter contents that have comments with these intents"
135
+ )
136
+
137
+ with filter_col3:
138
+ # Top N selector
139
+ top_n = st.selectbox(
140
+ "Top N Contents",
141
+ options=[5, 10, 15, 20, 25],
142
+ index=1, # Default to 10
143
+ help="Number of contents to display"
144
+ )
145
+
146
+ with filter_col4:
147
+ # Show filter status
148
+ filter_active = bool(selected_sentiments or selected_intents)
149
+ if filter_active:
150
+ st.metric("Filters Active", "✓ Yes", help="Sentiment or intent filters are applied")
151
+ else:
152
+ st.metric("Filters Active", "✗ No", help="Showing all sentiments")
153
+
154
+ st.markdown("---")
155
+
156
+ # Advanced controls (defaults optimized for balanced analysis)
157
+ with st.expander("⚙️ Advanced Ranking Controls", expanded=False):
158
+ st.markdown("**Customize how contents are ranked and filtered**")
159
+
160
+ adv_col1, adv_col2 = st.columns(2)
161
+
162
+ with adv_col1:
163
+ min_comments = st.slider(
164
+ "Minimum Comments Required",
165
+ min_value=1,
166
+ max_value=50,
167
+ value=10,
168
+ step=1,
169
+ help="Filter out contents with fewer comments. Default: 10 (excludes low-volume contents)"
170
+ )
171
+
172
+ with adv_col2:
173
+ sort_by = st.selectbox(
174
+ "Sort By",
175
+ options=[
176
+ ('severity_score', '🎯 Severity Score (Balanced) - Recommended'),
177
+ ('sentiment_percentage', '📊 Sentiment Percentage'),
178
+ ('sentiment_count', '🔢 Sentiment Count (Absolute)'),
179
+ ('total_comments', '💬 Total Comments (Volume)')
180
+ ],
181
+ format_func=lambda x: x[1],
182
+ index=0,
183
+ help="Severity Score balances sentiment % with comment volume for smarter ranking"
184
+ )
185
+ sort_by_value = sort_by[0]
186
+
187
+ # Show explanation of current sort method
188
+ sentiment_label = "selected sentiments" if selected_sentiments else "all sentiments"
189
+ if sort_by_value == 'severity_score':
190
+ st.info(f"📘 **Severity Score** = Sentiment % × √(Total Comments). Balances {sentiment_label} percentage with volume for high-impact ranking.")
191
+ elif sort_by_value == 'sentiment_percentage':
192
+ st.info(f"📘 Ranks by highest % of {sentiment_label}. May include low-volume contents.")
193
+ elif sort_by_value == 'sentiment_count':
194
+ st.info(f"📘 Ranks by absolute number of comments with {sentiment_label}. Prioritizes volume over percentage.")
195
+ else:
196
+ st.info("📘 Ranks by total comment volume, regardless of sentiment.")
197
+
198
+ # Get filtered contents using the new method
199
+ filtered_contents = processor.get_sentiment_filtered_contents(
200
+ df_filtered,
201
+ selected_sentiments=selected_sentiments if selected_sentiments else None,
202
+ selected_intents=selected_intents if selected_intents else None,
203
+ top_n=top_n,
204
+ min_comments=min_comments,
205
+ sort_by=sort_by_value
206
+ )
207
+
208
+ # Reset pagination when filters change
209
+ filter_key = f"{selected_platform}_{selected_brand}_{top_n}_{min_comments}_{sort_by_value}_{str(selected_sentiments)}_{str(selected_intents)}"
210
+ if 'last_filter_key' not in st.session_state or st.session_state.last_filter_key != filter_key:
211
+ st.session_state.sentiment_page = 1
212
+ st.session_state.last_filter_key = filter_key
213
+
214
+ if filtered_contents.empty:
215
+ st.warning("No content data available with the selected filters. Try adjusting your filters.")
216
+ return
217
+
218
+ # Display summary statistics
219
+ st.markdown("### 📊 Summary")
220
+
221
+ col1, col2, col3, col4 = st.columns(4)
222
+
223
+ with col1:
224
+ st.metric("Contents Analyzed", len(filtered_contents))
225
+
226
+ with col2:
227
+ # Dynamic metric based on selected sentiments
228
+ if 'selected_sentiment_percentage' in filtered_contents.columns:
229
+ avg_sentiment_pct = filtered_contents['selected_sentiment_percentage'].mean()
230
+ sentiment_label = "Selected Sentiment %" if selected_sentiments else "All Sentiment %"
231
+ st.metric(sentiment_label, f"{avg_sentiment_pct:.1f}%")
232
+ else:
233
+ avg_negative_pct = filtered_contents['negative_percentage'].mean()
234
+ st.metric("Avg Negative %", f"{avg_negative_pct:.1f}%")
235
+
236
+ with col3:
237
+ total_comments = filtered_contents['total_comments'].sum()
238
+ st.metric("Total Comments", int(total_comments))
239
+
240
+ with col4:
241
+ total_replies_needed = filtered_contents['reply_required_count'].sum()
242
+ st.metric("Total Replies Needed", int(total_replies_needed))
243
+
244
+ st.markdown("---")
245
+
246
+ # Engagement scatter plot
247
+ st.markdown("### 📈 Content Engagement Analysis")
248
+ engagement_scatter = distribution_charts.create_engagement_scatter(
249
+ filtered_contents, title="Content Engagement vs. Sentiment"
250
+ )
251
+ st.plotly_chart(engagement_scatter, use_container_width=True, key="engagement_scatter_chart")
252
+
253
+ st.markdown("---")
254
+
255
+ # Display each content with detailed analysis
256
+ st.markdown("### 🔍 Detailed Content Analysis")
257
+
258
+ # Add pagination controls for better performance
259
+ if 'sentiment_page' not in st.session_state:
260
+ st.session_state.sentiment_page = 1
261
+
262
+ items_per_page = 5 # Show 5 contents per page
263
+ total_contents = len(filtered_contents)
264
+ total_pages = (total_contents + items_per_page - 1) // items_per_page # Ceiling division
265
+
266
+ if total_contents > items_per_page:
267
+ # Display pagination info
268
+ st.info(f"📄 Showing page {st.session_state.sentiment_page} of {total_pages} ({total_contents} total contents)")
269
+
270
+ # Pagination controls at top
271
+ col_prev, col_info, col_next = st.columns([1, 2, 1])
272
+
273
+ with col_prev:
274
+ if st.button("⬅️ Previous", key="prev_top", disabled=st.session_state.sentiment_page == 1):
275
+ st.session_state.sentiment_page -= 1
276
+ st.rerun()
277
+
278
+ with col_info:
279
+ st.markdown(f"<div style='text-align: center; padding-top: 8px;'>Page {st.session_state.sentiment_page} / {total_pages}</div>", unsafe_allow_html=True)
280
+
281
+ with col_next:
282
+ if st.button("Next ➡️", key="next_top", disabled=st.session_state.sentiment_page >= total_pages):
283
+ st.session_state.sentiment_page += 1
284
+ st.rerun()
285
+
286
+ st.markdown("---")
287
+
288
+ # Calculate pagination indices
289
+ start_idx = (st.session_state.sentiment_page - 1) * items_per_page
290
+ end_idx = min(start_idx + items_per_page, total_contents)
291
+
292
+ # Get paginated contents
293
+ paginated_contents = filtered_contents.iloc[start_idx:end_idx]
294
+
295
+ for idx, (_, content_row) in enumerate(paginated_contents.iterrows(), start_idx + 1):
296
+ # Display content card
297
+ ContentCards.display_content_card(content_row, rank=idx)
298
+
299
+ # Get comments for this content
300
+ content_comments = df_filtered[df_filtered['content_sk'] == content_row['content_sk']]
301
+
302
+ if content_comments.empty:
303
+ st.info("No comment details available for this content")
304
+ continue
305
+
306
+ # Create columns for visualizations
307
+ viz_col1, viz_col2 = st.columns(2)
308
+
309
+ with viz_col1:
310
+ # Sentiment distribution for this content
311
+ content_sentiment_pie = sentiment_charts.create_sentiment_pie_chart(
312
+ content_comments, title=f"Sentiment Distribution"
313
+ )
314
+ st.plotly_chart(content_sentiment_pie, use_container_width=True, key=f"sentiment_pie_{content_row['content_sk']}")
315
+
316
+ with viz_col2:
317
+ # Intent distribution for this content
318
+ content_intent_bar = distribution_charts.create_intent_bar_chart(
319
+ content_comments, title=f"Intent Distribution", orientation='h'
320
+ )
321
+ st.plotly_chart(content_intent_bar, use_container_width=True, key=f"intent_bar_{content_row['content_sk']}")
322
+
323
+ # AI Analysis Section
324
+ st.markdown("#### 🤖 AI-Powered Analysis")
325
+
326
+ content_sk = content_row['content_sk']
327
+
328
+ # Three buttons in a row for different analysis types
329
+ st.markdown("**Select analysis type:**")
330
+ btn_col1, btn_col2, btn_col3 = st.columns(3)
331
+
332
+ with btn_col1:
333
+ generate_negative = st.button(
334
+ "📉 Negative Summary",
335
+ key=f"ai_negative_{content_sk}",
336
+ help="Analyze negative comments only",
337
+ use_container_width=True
338
+ )
339
+
340
+ with btn_col2:
341
+ generate_combined = st.button(
342
+ "📊 Combined Summary",
343
+ key=f"ai_combined_{content_sk}",
344
+ help="Analyze both positive and negative comments",
345
+ use_container_width=True
346
+ )
347
+
348
+ with btn_col3:
349
+ generate_positive = st.button(
350
+ "📈 Positive Summary",
351
+ key=f"ai_positive_{content_sk}",
352
+ help="Analyze positive comments only",
353
+ use_container_width=True
354
+ )
355
+
356
+ # Determine which summary to generate/display
357
+ summary_type = None
358
+ if generate_negative:
359
+ summary_type = 'negative'
360
+ elif generate_positive:
361
+ summary_type = 'positive'
362
+ elif generate_combined:
363
+ summary_type = 'combined'
364
+
365
+ # Check if any summary already exists in session state
366
+ summary_key_negative = f"{content_sk}_negative"
367
+ summary_key_positive = f"{content_sk}_positive"
368
+ summary_key_combined = f"{content_sk}_combined"
369
+
370
+ # Display existing summaries or generate new ones
371
+ if summary_type or summary_key_negative in st.session_state.content_summaries or \
372
+ summary_key_positive in st.session_state.content_summaries or \
373
+ summary_key_combined in st.session_state.content_summaries:
374
+
375
+ # Generate new summary if button was clicked
376
+ if summary_type:
377
+ summary_key = f"{content_sk}_{summary_type}"
378
+ with st.spinner(f"Analyzing {summary_type} comments with AI..."):
379
+ # Prepare input for agent
380
+ agent_input = {
381
+ 'content_sk': content_sk,
382
+ 'content_description': content_row['content_description'],
383
+ 'comments': content_comments,
384
+ 'sentiment_type': summary_type
385
+ }
386
+
387
+ # Generate summary
388
+ result = summary_agent.process(agent_input)
389
+
390
+ # Cache the result
391
+ st.session_state.content_summaries[summary_key] = result
392
+
393
+ # Display all available summaries
394
+ available_summaries = []
395
+ if summary_key_negative in st.session_state.content_summaries:
396
+ available_summaries.append(('Negative', summary_key_negative))
397
+ if summary_key_combined in st.session_state.content_summaries:
398
+ available_summaries.append(('Combined', summary_key_combined))
399
+ if summary_key_positive in st.session_state.content_summaries:
400
+ available_summaries.append(('Positive', summary_key_positive))
401
+
402
+ # Display each available summary
403
+ for summary_label, summary_key in available_summaries:
404
+ result = st.session_state.content_summaries[summary_key]
405
+
406
+ # Display the summary
407
+ if result['success']:
408
+ summary = result['summary']
409
+
410
+ with st.expander(f"📊 AI Analysis Report - {summary_label}", expanded=True):
411
+ # Executive Summary
412
+ st.markdown("### Executive Summary")
413
+ st.info(summary['executive_summary'])
414
+
415
+ # Main Themes
416
+ if summary['main_themes']:
417
+ st.markdown("### 🎯 Main Themes")
418
+ for theme in summary['main_themes']:
419
+ sentiment_emoji = {
420
+ 'positive': '😊',
421
+ 'negative': '😟',
422
+ 'mixed': '🤔'
423
+ }.get(theme.get('sentiment', 'mixed'), '🤔')
424
+
425
+ st.markdown(f"""
426
+ **{sentiment_emoji} {theme.get('theme', 'Unknown')}** ({theme.get('sentiment', 'mixed').title()})
427
+ - {theme.get('description', 'No description')}
428
+ """)
429
+
430
+ # Two-column layout for praise and complaints
431
+ col_praise, col_complaints = st.columns(2)
432
+
433
+ with col_praise:
434
+ st.markdown("### ✅ Praise Points")
435
+ if summary['praise_points']:
436
+ for point in summary['praise_points']:
437
+ st.markdown(f"- {point}")
438
+ else:
439
+ st.markdown("*No significant praise points identified*")
440
+
441
+ with col_complaints:
442
+ st.markdown("### ⚠️ Key Complaints")
443
+ if summary['key_complaints']:
444
+ for complaint in summary['key_complaints']:
445
+ st.markdown(f"- {complaint}")
446
+ else:
447
+ st.markdown("*No significant complaints identified*")
448
+
449
+ # FAQs and Insights
450
+ col_faq, col_insights = st.columns(2)
451
+
452
+ with col_faq:
453
+ st.markdown("### ❓ Frequently Asked Questions")
454
+ if summary['frequently_asked_questions']:
455
+ for faq in summary['frequently_asked_questions']:
456
+ st.markdown(f"- {faq}")
457
+ else:
458
+ st.markdown("*No frequent questions identified*")
459
+
460
+ with col_insights:
461
+ st.markdown("### 💡 Unexpected Insights")
462
+ if summary['unexpected_insights']:
463
+ for insight in summary['unexpected_insights']:
464
+ st.markdown(f"- {insight}")
465
+ else:
466
+ st.markdown("*No unexpected insights identified*")
467
+
468
+ # Action Recommendations
469
+ if summary['action_recommendations']:
470
+ st.markdown("### 🎯 Recommended Actions")
471
+ for action in summary['action_recommendations']:
472
+ priority = action.get('priority', 'medium').upper()
473
+ priority_color = {
474
+ 'HIGH': '🔴',
475
+ 'MEDIUM': '🟡',
476
+ 'LOW': '🟢'
477
+ }.get(priority, '🟡')
478
+
479
+ st.markdown(f"{priority_color} **[{priority}]** {action.get('action', 'No action specified')}")
480
+
481
+ # Metadata
482
+ with st.expander("ℹ️ Analysis Metadata"):
483
+ metadata = result.get('metadata', {})
484
+ meta_col1, meta_col2, meta_col3 = st.columns(3)
485
+
486
+ with meta_col1:
487
+ st.metric("Comments Analyzed", metadata.get('total_comments_analyzed', 0))
488
+
489
+ with meta_col2:
490
+ st.metric("Model Used", metadata.get('model_used', 'N/A'))
491
+
492
+ with meta_col3:
493
+ st.metric("Tokens Used", metadata.get('tokens_used', 0))
494
+
495
+ else:
496
+ # Display error
497
+ st.error(f"❌ Failed to generate AI analysis: {result.get('error', 'Unknown error')}")
498
+
499
+ # Option to retry
500
+ if st.button("🔄 Retry Analysis", key=f"retry_{summary_key}"):
501
+ # Clear from cache and rerun
502
+ if summary_key in st.session_state.content_summaries:
503
+ del st.session_state.content_summaries[summary_key]
504
+ st.rerun()
505
+
506
+ # Show comments using expandable sections - one for negative, one for positive
507
+ st.markdown("#### 💬 View Comments by Sentiment")
508
+
509
+ # Get negative and positive comments
510
+ negative_comments = content_comments[
511
+ content_comments['sentiment_polarity'].isin(['negative', 'very_negative'])
512
+ ]
513
+ positive_comments = content_comments[
514
+ content_comments['sentiment_polarity'].isin(['positive', 'very_positive'])
515
+ ]
516
+
517
+ negative_count = len(negative_comments)
518
+ positive_count = len(positive_comments)
519
+
520
+ # Create two columns for side-by-side expandable sections
521
+ col_neg, col_pos = st.columns(2)
522
+
523
+ with col_neg:
524
+ # Negative comments expandable section (collapsed by default)
525
+ with st.expander(f"📉 Negative Comments ({negative_count})", expanded=False):
526
+ if not negative_comments.empty:
527
+ st.markdown(f"**Showing all {negative_count} negative comments:**")
528
+ for _, comment in negative_comments.iterrows():
529
+ ContentCards.display_comment_card(comment, show_original=True)
530
+ else:
531
+ st.info("No negative comments found for this content")
532
+
533
+ with col_pos:
534
+ # Positive comments expandable section (collapsed by default)
535
+ with st.expander(f"📈 Positive Comments ({positive_count})", expanded=False):
536
+ if not positive_comments.empty:
537
+ st.markdown(f"**Showing all {positive_count} positive comments:**")
538
+ for _, comment in positive_comments.iterrows():
539
+ ContentCards.display_comment_card(comment, show_original=True)
540
+ else:
541
+ st.info("No positive comments found for this content")
542
+
543
+ st.markdown("---")
544
+
545
+ # Pagination controls at bottom
546
+ if total_contents > items_per_page:
547
+ st.markdown("---")
548
+
549
+ col_prev_bottom, col_info_bottom, col_next_bottom = st.columns([1, 2, 1])
550
+
551
+ with col_prev_bottom:
552
+ if st.button("⬅️ Previous", key="prev_bottom", disabled=st.session_state.sentiment_page == 1):
553
+ st.session_state.sentiment_page -= 1
554
+ st.rerun()
555
+
556
+ with col_info_bottom:
557
+ st.markdown(f"<div style='text-align: center; padding-top: 8px;'>Page {st.session_state.sentiment_page} / {total_pages}</div>", unsafe_allow_html=True)
558
+
559
+ with col_next_bottom:
560
+ if st.button("Next ➡️", key="next_bottom", disabled=st.session_state.sentiment_page >= total_pages):
561
+ st.session_state.sentiment_page += 1
562
+ st.rerun()
563
+
564
+ st.markdown("---")
565
+
566
+ # Additional insights
567
+ st.markdown("### 💡 Insights & Recommendations")
568
+
569
+ # Find common patterns
570
+ all_filtered_comments = df_filtered[df_filtered['content_sk'].isin(filtered_contents['content_sk'])]
571
+
572
+ insight_col1, insight_col2 = st.columns(2)
573
+
574
+ with insight_col1:
575
+ st.markdown("#### 🎯 Common Intent Patterns")
576
+
577
+ # Get intent distribution for filtered contents
578
+ intent_distribution = processor.get_intent_distribution(all_filtered_comments)
579
+ intent_distribution = intent_distribution.sort_values('count', ascending=False).head(5)
580
+
581
+ for _, intent_row in intent_distribution.iterrows():
582
+ st.markdown(f"- **{intent_row['intent']}**: {intent_row['count']} occurrences ({intent_row['percentage']:.1f}%)")
583
+
584
+ with insight_col2:
585
+ st.markdown("#### 🌐 Platform Breakdown")
586
+
587
+ # Get platform distribution
588
+ platform_dist = all_filtered_comments['platform'].value_counts()
589
+
590
+ for platform, count in platform_dist.items():
591
+ pct = (count / len(all_filtered_comments) * 100)
592
+ st.markdown(f"- **{platform.title()}**: {count} comments ({pct:.1f}%)")
593
+
594
+ st.markdown("---")
595
+
596
+ # Action items
597
+ st.markdown("### ✅ Recommended Actions")
598
+
599
+ action_items = []
600
+
601
+ # Check for high reply requirement
602
+ if filtered_contents['reply_required_count'].sum() > 0:
603
+ action_items.append(
604
+ f"🔴 **High Priority**: {int(filtered_contents['reply_required_count'].sum())} comments require immediate response"
605
+ )
606
+
607
+ # Check for critical negative percentage
608
+ critical_contents = filtered_contents[filtered_contents['negative_percentage'] > 50]
609
+ if not critical_contents.empty:
610
+ action_items.append(
611
+ f"🚨 **Critical**: {len(critical_contents)} content(s) have over 50% negative sentiment - investigate root causes"
612
+ )
613
+
614
+ # Check for feedback patterns
615
+ feedback_comments = all_filtered_comments[
616
+ all_filtered_comments['intent'].str.contains('feedback_negative', na=False)
617
+ ]
618
+ if not feedback_comments.empty:
619
+ action_items.append(
620
+ f"💬 **Feedback**: {len(feedback_comments)} comments contain negative feedback - consider product improvements"
621
+ )
622
+
623
+ # Check for questions
624
+ question_comments = all_filtered_comments[
625
+ all_filtered_comments['intent'].str.contains('question', na=False)
626
+ ]
627
+ if not question_comments.empty:
628
+ action_items.append(
629
+ f"❓ **Questions**: {len(question_comments)} unanswered questions - improve FAQ or support documentation"
630
+ )
631
+
632
+ if action_items:
633
+ for action in action_items:
634
+ st.markdown(action)
635
+ else:
636
+ st.success("No critical action items at this time")
637
+
638
+ st.markdown("---")
639
+
640
+ # Export option
641
+ st.markdown("### 💾 Export Data")
642
+
643
+ col1, col2 = st.columns([1, 3])
644
+
645
+ with col1:
646
+ # Prepare export data columns dynamically
647
+ base_columns = ['content_sk', 'content_description', 'permalink_url',
648
+ 'total_comments', 'reply_required_count', 'dominant_sentiment']
649
+
650
+ # Add sentiment-specific columns if they exist
651
+ if 'selected_sentiment_count' in filtered_contents.columns:
652
+ base_columns.extend(['selected_sentiment_count', 'selected_sentiment_percentage'])
653
+
654
+ if 'negative_count' in filtered_contents.columns:
655
+ base_columns.extend(['negative_count', 'negative_percentage'])
656
+
657
+ # Filter to only include existing columns
658
+ export_columns = [col for col in base_columns if col in filtered_contents.columns]
659
+ export_data = filtered_contents[export_columns]
660
+
661
+ csv = export_data.to_csv(index=False)
662
+
663
+ st.download_button(
664
+ label="📥 Download as CSV",
665
+ data=csv,
666
+ file_name=f"sentiment_analysis_top{top_n}.csv",
667
+ mime="text/csv"
668
+ )
669
+
670
+ with col2:
671
+ st.info("Download the data for further analysis or reporting")
visualization/config/viz_config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "color_schemes": {
3
+ "sentiment_polarity": {
4
+ "very_positive": "#00C851",
5
+ "positive": "#7CB342",
6
+ "neutral": "#FFB300",
7
+ "negative": "#FF6F00",
8
+ "very_negative": "#D32F2F"
9
+ },
10
+ "intent": {
11
+ "praise": "#4CAF50",
12
+ "question": "#2196F3",
13
+ "request": "#9C27B0",
14
+ "feedback_negative": "#FF5722",
15
+ "suggestion": "#00BCD4",
16
+ "humor_sarcasm": "#FFC107",
17
+ "off_topic": "#9E9E9E",
18
+ "spam_selfpromo": "#795548"
19
+ },
20
+ "platform": {
21
+ "facebook": "#1877F2",
22
+ "instagram": "#E4405F",
23
+ "youtube": "#FF0000",
24
+ "twitter": "#1DA1F2",
25
+ "musora_app": "#1982C4",
26
+ "default": "#607D8B"
27
+ },
28
+ "brand": {
29
+ "drumeo": "#FF6B35",
30
+ "pianote": "#6A4C93",
31
+ "musora": "#1982C4",
32
+ "default": "#8AC926"
33
+ }
34
+ },
35
+ "sentiment_order": [
36
+ "very_positive",
37
+ "positive",
38
+ "neutral",
39
+ "negative",
40
+ "very_negative"
41
+ ],
42
+ "intent_order": [
43
+ "praise",
44
+ "question",
45
+ "request",
46
+ "feedback_negative",
47
+ "suggestion",
48
+ "humor_sarcasm",
49
+ "off_topic",
50
+ "spam_selfpromo"
51
+ ],
52
+ "negative_sentiments": [
53
+ "negative",
54
+ "very_negative"
55
+ ],
56
+ "dashboard": {
57
+ "default_date_range_days": 30,
58
+ "max_comments_display": 100,
59
+ "chart_height": 400,
60
+ "top_n_contents": 10
61
+ },
62
+ "page_config": {
63
+ "page_title": "Musora Sentiment Analysis Dashboard",
64
+ "page_icon": "📊",
65
+ "layout": "wide",
66
+ "initial_sidebar_state": "expanded"
67
+ },
68
+ "snowflake": {
69
+ "query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
70
+ "demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id"
71
+ },
72
+ "demographics": {
73
+ "age_groups": {
74
+ "18-24": [18, 24],
75
+ "25-34": [25, 34],
76
+ "35-44": [35, 44],
77
+ "45-54": [45, 54],
78
+ "55+": [55, 150]
79
+ },
80
+ "experience_groups": {
81
+ "Beginner (0-3)": [0, 3],
82
+ "Intermediate (4-7)": [4, 7],
83
+ "Advanced (8-10)": [8, 10]
84
+ },
85
+ "top_timezones_count": 15
86
+ }
87
+ }
visualization/data/data_loader.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loader module for Sentiment Analysis Visualization
3
+ Handles Snowflake connection and data loading with caching
4
+ """
5
+ import sys
6
+ import os
7
+ import pandas as pd
8
+ import streamlit as st
9
+ from pathlib import Path
10
+ import json
11
+ from datetime import datetime, timedelta
12
+ from dateutil.relativedelta import relativedelta
13
+
14
+ # Add parent directory to path to import SnowFlakeConnection
15
+ parent_dir = Path(__file__).resolve().parent.parent.parent
16
+ sys.path.append(str(parent_dir))
17
+
18
+ from visualization.SnowFlakeConnection import SnowFlakeConn
19
+
20
+
21
class SentimentDataLoader:
    """
    Loads sentiment analysis data from Snowflake with caching.

    SQL queries and demographic grouping rules are read from the
    visualization config JSON. Results are cached through Streamlit's
    ``st.cache_data``; cached methods take ``_self`` (leading underscore)
    so Streamlit does not attempt to hash the loader instance itself.
    """

    def __init__(self, config_path=None):
        """
        Initialize the data loader.

        Args:
            config_path: Path to the configuration JSON file. Defaults to
                ``../config/viz_config.json`` relative to this module.
        """
        if config_path is None:
            config_path = Path(__file__).parent.parent / "config" / "viz_config.json"

        with open(config_path, 'r') as f:
            self.config = json.load(f)

        self.query = self.config['snowflake']['query']
        # Demographics query is optional; when absent, demographic
        # enrichment is skipped entirely in load_data().
        self.demographics_query = self.config['snowflake'].get('demographics_query', None)

    @st.cache_data(ttl=300)  # Cache for 5 minutes
    def load_data(_self, reload=False):
        """
        Load sentiment data from Snowflake.

        Args:
            reload: Not read by this method; kept for interface
                compatibility. Because it is part of the cache key,
                passing a different value effectively forces a reload.

        Returns:
            pd.DataFrame: Processed sentiment analysis data, or an empty
                DataFrame when the query fails or returns no rows.
        """
        try:
            # Connect, run the configured query, then release the
            # connection before any (potentially slow) post-processing.
            conn = SnowFlakeConn()
            df = conn.run_read_query(_self.query, "sentiment features")
            conn.close_connection()

            if df is None or df.empty:
                st.error("No data returned from Snowflake")
                return pd.DataFrame()

            df = _self._process_dataframe(df)

            # Enrich with user demographics (musora_app users) when a
            # demographics query is configured.
            if _self.demographics_query:
                demographics_df = _self.load_demographics_data()
                df = _self.merge_demographics_with_comments(df, demographics_df)

            return df

        except Exception as e:
            # Boundary handler: surface the failure in the UI and return
            # an empty frame so the dashboard can render a friendly state.
            st.error(f"Error loading data from Snowflake: {e}")
            return pd.DataFrame()

    def _process_dataframe(self, df):
        """
        Process and clean the raw Snowflake dataframe.

        Args:
            df: Raw dataframe from Snowflake.

        Returns:
            pd.DataFrame: Processed dataframe with normalized columns,
                parsed timestamps, and ``display_text`` helpers added.
        """
        # Normalize column names (run_read_query should already do this,
        # but normalizing again keeps downstream lookups safe).
        df.columns = df.columns.str.lower()

        # Parse datetime columns; invalid values become NaT.
        if 'comment_timestamp' in df.columns:
            df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce')

        if 'processed_at' in df.columns:
            df['processed_at'] = pd.to_datetime(df['processed_at'], errors='coerce')

        # Backfill nulls in the key categorical columns so filters and
        # group-bys never see NaN.
        df['sentiment_polarity'] = df['sentiment_polarity'].fillna('unknown')
        df['intent'] = df['intent'].fillna('unknown')
        df['platform'] = df['platform'].fillna('unknown').str.lower()
        df['brand'] = df['brand'].fillna('unknown').str.lower()

        if 'requires_reply' in df.columns:
            df['requires_reply'] = df['requires_reply'].astype(bool)

        # Prefer the translated text for non-English comments, otherwise
        # fall back to the original text.
        df['display_text'] = df.apply(
            lambda row: row['translated_text'] if pd.notna(row.get('translated_text')) and row.get('is_english') == False
            else row.get('original_text', ''),
            axis=1
        )

        # Truncated variant for compact display in tables/cards.
        df['display_text_short'] = df['display_text'].apply(
            lambda x: x[:100] + '...' if isinstance(x, str) and len(x) > 100 else x
        )

        return df

    @staticmethod
    def get_filter_options(df):
        """
        Get unique values for sidebar filters.

        Args:
            df: Sentiment dataframe.

        Returns:
            dict: Sorted unique platforms, brands, sentiments and languages.
        """
        return {
            'platforms': sorted(df['platform'].unique().tolist()),
            'brands': sorted(df['brand'].unique().tolist()),
            'sentiments': sorted(df['sentiment_polarity'].unique().tolist()),
            'languages': sorted(df['detected_language'].dropna().unique().tolist()) if 'detected_language' in df.columns else []
        }

    @staticmethod
    def apply_filters(df, platforms=None, brands=None, sentiments=None,
                      date_range=None, languages=None):
        """
        Apply filters to the sentiment dataframe.

        Args:
            df: Sentiment dataframe.
            platforms: Platforms to include (None/empty = all).
            brands: Brands to include (None/empty = all).
            sentiments: Sentiment polarities to include (None/empty = all).
            date_range: Tuple of (start_date, end_date), inclusive.
            languages: Detected languages to include (None/empty = all).

        Returns:
            pd.DataFrame: Filtered copy of the dataframe.
        """
        filtered_df = df.copy()

        if platforms and len(platforms) > 0:
            filtered_df = filtered_df[filtered_df['platform'].isin(platforms)]

        if brands and len(brands) > 0:
            filtered_df = filtered_df[filtered_df['brand'].isin(brands)]

        if sentiments and len(sentiments) > 0:
            filtered_df = filtered_df[filtered_df['sentiment_polarity'].isin(sentiments)]

        if languages and len(languages) > 0:
            filtered_df = filtered_df[filtered_df['detected_language'].isin(languages)]

        if date_range and len(date_range) == 2 and 'comment_timestamp' in filtered_df.columns:
            start_date, end_date = date_range
            filtered_df = filtered_df[
                (filtered_df['comment_timestamp'] >= pd.Timestamp(start_date)) &
                (filtered_df['comment_timestamp'] <= pd.Timestamp(end_date))
            ]

        return filtered_df

    @staticmethod
    def get_date_range(df, default_days=30):
        """
        Get the default date range for filtering.

        Args:
            df: Sentiment dataframe.
            default_days: Window size in days, anchored at the most
                recent comment timestamp.

        Returns:
            tuple: (min_date, max_date). Falls back to "now minus
                default_days .. now" when no timestamps are available.
        """
        if 'comment_timestamp' in df.columns and not df.empty:
            max_date = df['comment_timestamp'].max()
            min_date = max_date - timedelta(days=default_days)
            return (min_date, max_date)
        else:
            return (datetime.now() - timedelta(days=default_days), datetime.now())

    @st.cache_data(ttl=600)  # Cache for 10 minutes (demographics change less often)
    def load_demographics_data(_self):
        """
        Load user demographic data from Snowflake.

        Returns:
            pd.DataFrame: user_id, birthday, timezone, experience_level
                plus derived age/region/experience groupings; empty frame
                when no query is configured or the load fails.
        """
        if not _self.demographics_query:
            return pd.DataFrame()

        try:
            conn = SnowFlakeConn()

            # Cast birthday to a string in SQL to avoid Snowflake
            # timestamp-precision issues; pandas re-parses it afterwards.
            query_with_cast = _self.demographics_query.replace(
                "u.birthday as BIRTHDAY",
                "TO_VARCHAR(u.birthday, 'YYYY-MM-DD HH24:MI:SS.FF6 TZHTZM') as BIRTHDAY"
            )

            df = conn.run_read_query(query_with_cast, "user demographics")
            conn.close_connection()

            if df is None or df.empty:
                return pd.DataFrame()

            return _self._process_demographics_dataframe(df)

        except Exception as e:
            # Demographics are optional enrichment: warn with the short
            # message, show the traceback for debugging, and continue.
            st.warning(f"Could not load demographic data: {str(e)}")
            import traceback
            st.error(f"Error reading user demographics: {traceback.format_exc()}")
            return pd.DataFrame()

    def _process_demographics_dataframe(self, df):
        """
        Process and enrich the demographic dataframe.

        Args:
            df: Raw demographics dataframe.

        Returns:
            pd.DataFrame: Demographics with derived age, age_group,
                timezone_region, and experience_group columns.
        """
        df.columns = df.columns.str.lower()

        if 'birthday' in df.columns:
            # Normalize to string first so mixed string/timestamp inputs
            # parse consistently, then drop tz info to keep naive datetimes.
            df['birthday'] = df['birthday'].astype(str)
            df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce', utc=True)
            df['birthday'] = df['birthday'].dt.tz_localize(None)

            df['age'] = df['birthday'].apply(self._calculate_age)
            df['age_group'] = df['age'].apply(self._categorize_age)

        if 'timezone' in df.columns:
            df['timezone_region'] = df['timezone'].apply(self._extract_timezone_region)

        if 'experience_level' in df.columns:
            df['experience_group'] = df['experience_level'].apply(self._categorize_experience)

        # Records without a user_id cannot be joined to comments.
        if 'user_id' in df.columns:
            df = df[df['user_id'].notna()]

        return df

    @staticmethod
    def _calculate_age(birthday):
        """
        Calculate age in years from a birthday.

        Args:
            birthday: datetime object (may be NaT).

        Returns:
            int: Age in years, or None when the birthday is missing or
                yields an implausible age (outside 0-120).
        """
        if pd.isna(birthday):
            return None

        try:
            today = datetime.now()
            age = relativedelta(today, birthday).years
        except (TypeError, ValueError, OverflowError):
            # Narrowed from a bare except: only arithmetic/parse failures
            # are expected here.
            return None

        # Sanity check: reject obviously bad birthdays.
        if 0 <= age <= 120:
            return age
        return None

    def _categorize_age(self, age):
        """
        Categorize an age into the config-defined age groups.

        Args:
            age: Age in years.

        Returns:
            str: Age group label, or 'Unknown' when age is missing or
                falls outside every configured bucket.
        """
        if pd.isna(age) or age is None:
            return 'Unknown'

        age_groups = self.config.get('demographics', {}).get('age_groups', {})

        for group_name, (min_age, max_age) in age_groups.items():
            if min_age <= age <= max_age:
                return group_name

        return 'Unknown'

    @staticmethod
    def _extract_timezone_region(timezone):
        """
        Extract the region prefix from an IANA timezone string
        (e.g. 'America/New_York' -> 'America').

        Args:
            timezone: Timezone string.

        Returns:
            str: Region name, or 'Unknown' for missing/non-string input.
        """
        if pd.isna(timezone) or not isinstance(timezone, str):
            return 'Unknown'

        # str.split always returns at least one element, so this is safe
        # even for strings without a '/'.
        return timezone.split('/')[0]

    def _categorize_experience(self, experience_level):
        """
        Categorize a numeric experience level into config-defined groups.

        Args:
            experience_level: Numeric experience level.

        Returns:
            str: Experience group label, or 'Unknown' when missing,
                non-numeric, or outside every configured bucket.
        """
        if pd.isna(experience_level):
            return 'Unknown'

        try:
            exp_level = float(experience_level)
        except (TypeError, ValueError):
            # Narrowed from a bare except: only conversion failures expected.
            return 'Unknown'

        exp_groups = self.config.get('demographics', {}).get('experience_groups', {})

        for group_name, (min_exp, max_exp) in exp_groups.items():
            if min_exp <= exp_level <= max_exp:
                return group_name

        return 'Unknown'

    def merge_demographics_with_comments(self, comments_df, demographics_df):
        """
        Merge demographic data with comment data (keyed on author_id).

        Note: when demographics are empty this adds placeholder columns
        to ``comments_df`` in place and returns it; callers should use
        the returned frame rather than relying on the input.

        Args:
            comments_df: Main comments dataframe.
            demographics_df: Demographics dataframe.

        Returns:
            pd.DataFrame: Comments with demographic fields attached where
                a matching user exists; 'Unknown' group labels elsewhere.
        """
        if demographics_df.empty:
            # No demographics available: add empty placeholder columns so
            # downstream code can rely on their presence.
            comments_df['age'] = None
            comments_df['age_group'] = 'Unknown'
            comments_df['timezone'] = None
            comments_df['timezone_region'] = 'Unknown'
            comments_df['experience_level'] = None
            comments_df['experience_group'] = 'Unknown'
            return comments_df

        if 'author_id' in comments_df.columns and 'user_id' in demographics_df.columns:
            # Stringify both keys so int/str id representations still match.
            comments_df['author_id_str'] = comments_df['author_id'].astype(str)
            demographics_df['user_id_str'] = demographics_df['user_id'].astype(str)

            merged_df = comments_df.merge(
                demographics_df[['user_id_str', 'age', 'age_group', 'timezone', 'timezone_region',
                                 'experience_level', 'experience_group']],
                left_on='author_id_str',
                right_on='user_id_str',
                how='left'
            )

            # Drop the temporary join keys.
            merged_df = merged_df.drop(columns=['author_id_str', 'user_id_str'], errors='ignore')

            # Comments from platforms without demographics (non-musora_app)
            # get 'Unknown' group labels instead of NaN.
            demographic_cols = ['age_group', 'timezone_region', 'experience_group']
            for col in demographic_cols:
                if col in merged_df.columns:
                    merged_df[col] = merged_df[col].fillna('Unknown')

            return merged_df

        # Key columns missing on either side: return comments unchanged.
        return comments_df
visualization/img/musora.png ADDED
visualization/requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Visualization Requirements
2
+ # Install with: pip install -r requirements.txt
3
+
4
+ # Core visualization
5
+ streamlit>=1.28.0
6
+ plotly>=5.17.0
7
+
8
+ # Data processing
9
+ pandas>=2.0.0
10
+ numpy>=1.24.0
11
+ python-dateutil>=2.8.0
12
+
13
+ # Snowflake connectivity (inherited from parent project)
14
+ snowflake-snowpark-python>=1.8.0
15
+
16
+ # Environment management (inherited from parent project)
17
+ python-dotenv>=1.0.0
visualization/utils/data_processor.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data processing utilities for sentiment analysis
3
+ Handles aggregation, grouping, and transformation operations
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ from typing import List, Dict, Tuple
8
+
9
+
10
+ class SentimentDataProcessor:
11
+ """
12
+ Processes sentiment data for visualization
13
+ """
14
+
15
+ @staticmethod
16
+ def aggregate_by_dimensions(df, group_by_cols, agg_cols=None):
17
+ """
18
+ Aggregate data by specified dimensions
19
+
20
+ Args:
21
+ df: Sentiment dataframe
22
+ group_by_cols: List of columns to group by
23
+ agg_cols: Dictionary of columns and aggregation functions
24
+
25
+ Returns:
26
+ pd.DataFrame: Aggregated dataframe
27
+ """
28
+ if agg_cols is None:
29
+ agg_cols = {
30
+ 'comment_sk': 'count',
31
+ 'requires_reply': 'sum'
32
+ }
33
+
34
+ return df.groupby(group_by_cols, as_index=False).agg(agg_cols)
35
+
36
+ @staticmethod
37
+ def get_sentiment_distribution(df, group_by=None):
38
+ """
39
+ Calculate sentiment distribution
40
+
41
+ Args:
42
+ df: Sentiment dataframe
43
+ group_by: Optional column(s) to group by
44
+
45
+ Returns:
46
+ pd.DataFrame: Sentiment distribution
47
+ """
48
+ if group_by:
49
+ # Group by specified columns and sentiment
50
+ if isinstance(group_by, str):
51
+ group_by = [group_by]
52
+
53
+ sentiment_counts = df.groupby(
54
+ group_by + ['sentiment_polarity'],
55
+ as_index=False
56
+ ).size().rename(columns={'size': 'count'})
57
+
58
+ # Calculate percentages within each group
59
+ sentiment_counts['percentage'] = sentiment_counts.groupby(group_by)['count'].transform(
60
+ lambda x: (x / x.sum() * 100).round(2)
61
+ )
62
+
63
+ else:
64
+ # Overall sentiment distribution
65
+ sentiment_counts = df['sentiment_polarity'].value_counts().reset_index()
66
+ sentiment_counts.columns = ['sentiment_polarity', 'count']
67
+ sentiment_counts['percentage'] = (
68
+ sentiment_counts['count'] / sentiment_counts['count'].sum() * 100
69
+ ).round(2)
70
+
71
+ return sentiment_counts
72
+
73
+ @staticmethod
74
+ def get_intent_distribution(df, group_by=None):
75
+ """
76
+ Calculate intent distribution (handles multi-label)
77
+
78
+ Args:
79
+ df: Sentiment dataframe
80
+ group_by: Optional column(s) to group by
81
+
82
+ Returns:
83
+ pd.DataFrame: Intent distribution
84
+ """
85
+ # Explode intents (split comma-separated values)
86
+ df_exploded = df.copy()
87
+ df_exploded['intent'] = df_exploded['intent'].str.split(',')
88
+ df_exploded = df_exploded.explode('intent')
89
+ df_exploded['intent'] = df_exploded['intent'].str.strip()
90
+
91
+ if group_by:
92
+ # Group by specified columns and intent
93
+ if isinstance(group_by, str):
94
+ group_by = [group_by]
95
+
96
+ intent_counts = df_exploded.groupby(
97
+ group_by + ['intent'],
98
+ as_index=False
99
+ ).size().rename(columns={'size': 'count'})
100
+
101
+ # Calculate percentages within each group
102
+ intent_counts['percentage'] = intent_counts.groupby(group_by)['count'].transform(
103
+ lambda x: (x / x.sum() * 100).round(2)
104
+ )
105
+
106
+ else:
107
+ # Overall intent distribution
108
+ intent_counts = df_exploded['intent'].value_counts().reset_index()
109
+ intent_counts.columns = ['intent', 'count']
110
+ intent_counts['percentage'] = (
111
+ intent_counts['count'] / intent_counts['count'].sum() * 100
112
+ ).round(2)
113
+
114
+ return intent_counts
115
+
116
    @staticmethod
    def get_content_summary(df):
        """
        Get summary statistics for each content

        Produces one row per content with comment totals, reply counts, the
        dominant (modal) sentiment, negative-sentiment count/percentage and a
        volume-weighted severity score.

        Args:
            df: Sentiment dataframe

        Returns:
            pd.DataFrame: Content summary with statistics
        """
        # Group by content (dropna=False to include records with NULL permalink_url, e.g., YouTube)
        content_summary = df.groupby(['content_sk', 'content_description', 'permalink_url'], dropna=False).agg({
            'comment_sk': 'count',
            'requires_reply': 'sum',
            # most frequent polarity; mode() can be empty (all-NaN group), hence the fallback
            'sentiment_polarity': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'unknown'
        }).reset_index()

        # NOTE(review): positional rename — relies on the column order produced
        # by the groupby/agg above staying stable.
        content_summary.columns = [
            'content_sk', 'content_description', 'permalink_url',
            'total_comments', 'reply_required_count', 'dominant_sentiment'
        ]

        # Calculate negative sentiment percentage for each content
        negative_sentiments = ['negative', 'very_negative']
        content_negative = df[df['sentiment_polarity'].isin(negative_sentiments)].groupby(
            'content_sk'
        ).size().reset_index(name='negative_count')

        # Left merge: contents with no negative comments get NaN, filled to 0 below
        content_summary = content_summary.merge(content_negative, on='content_sk', how='left')
        content_summary['negative_count'] = content_summary['negative_count'].fillna(0)
        content_summary['negative_percentage'] = (
            content_summary['negative_count'] / content_summary['total_comments'] * 100
        ).round(2)

        # Calculate severity score (balances percentage and volume)
        # Formula: negative_percentage * sqrt(total_comments)
        # This gives weight to both high negative % and high comment volume
        content_summary['severity_score'] = (
            content_summary['negative_percentage'] *
            (content_summary['total_comments'] ** 0.5)
        ).round(2)

        return content_summary
160
+
161
+ @staticmethod
162
+ def get_top_poor_sentiment_contents(df, top_n=10, min_comments=1, sort_by='severity_score'):
163
+ """
164
+ Get contents with highest poor sentiment based on selected criteria
165
+
166
+ Args:
167
+ df: Sentiment dataframe
168
+ top_n: Number of top contents to return
169
+ min_comments: Minimum number of comments a content must have to be included
170
+ sort_by: Sorting criteria - 'severity_score', 'negative_percentage', 'negative_count', 'total_comments'
171
+
172
+ Returns:
173
+ pd.DataFrame: Top contents with poor sentiment
174
+ """
175
+ content_summary = SentimentDataProcessor.get_content_summary(df)
176
+
177
+ # Filter by minimum comments
178
+ content_summary = content_summary[content_summary['total_comments'] >= min_comments]
179
+
180
+ # Determine sort columns based on sort_by parameter
181
+ if sort_by == 'severity_score':
182
+ # Sort by severity score (balanced), then by negative percentage as tie-breaker
183
+ sort_columns = ['severity_score', 'negative_percentage']
184
+ elif sort_by == 'negative_percentage':
185
+ # Sort by negative percentage, then by total comments
186
+ sort_columns = ['negative_percentage', 'total_comments']
187
+ elif sort_by == 'negative_count':
188
+ # Sort by absolute negative count, then by negative percentage
189
+ sort_columns = ['negative_count', 'negative_percentage']
190
+ elif sort_by == 'total_comments':
191
+ # Sort by total comments volume
192
+ sort_columns = ['total_comments', 'negative_count']
193
+ else:
194
+ # Default to severity score
195
+ sort_columns = ['severity_score', 'negative_percentage']
196
+
197
+ # Sort and get top N
198
+ top_poor = content_summary.sort_values(
199
+ by=sort_columns,
200
+ ascending=[False, False]
201
+ ).head(top_n)
202
+
203
+ return top_poor
204
+
205
+ @staticmethod
206
+ def get_comments_requiring_reply(df):
207
+ """
208
+ Get all comments that require reply
209
+
210
+ Args:
211
+ df: Sentiment dataframe
212
+
213
+ Returns:
214
+ pd.DataFrame: Comments requiring reply
215
+ """
216
+ reply_df = df[df['requires_reply'] == True].copy()
217
+
218
+ # Sort by timestamp (most recent first)
219
+ if 'comment_timestamp' in reply_df.columns:
220
+ reply_df = reply_df.sort_values('comment_timestamp', ascending=False)
221
+
222
+ return reply_df
223
+
224
+ @staticmethod
225
+ def get_platform_brand_summary(df):
226
+ """
227
+ Get summary statistics by platform and brand
228
+
229
+ Args:
230
+ df: Sentiment dataframe
231
+
232
+ Returns:
233
+ pd.DataFrame: Platform and brand summary
234
+ """
235
+ summary = df.groupby(['platform', 'brand']).agg({
236
+ 'comment_sk': 'count',
237
+ 'requires_reply': 'sum'
238
+ }).reset_index()
239
+
240
+ summary.columns = ['platform', 'brand', 'total_comments', 'reply_required']
241
+
242
+ # Add sentiment distribution
243
+ sentiment_dist = SentimentDataProcessor.get_sentiment_distribution(
244
+ df, group_by=['platform', 'brand']
245
+ )
246
+
247
+ # Pivot sentiment distribution
248
+ sentiment_pivot = sentiment_dist.pivot_table(
249
+ index=['platform', 'brand'],
250
+ columns='sentiment_polarity',
251
+ values='count',
252
+ fill_value=0
253
+ ).reset_index()
254
+
255
+ # Merge with summary
256
+ summary = summary.merge(sentiment_pivot, on=['platform', 'brand'], how='left')
257
+
258
+ return summary
259
+
260
+ @staticmethod
261
+ def get_temporal_trends(df, freq='D'):
262
+ """
263
+ Get temporal trends of sentiment over time
264
+
265
+ Args:
266
+ df: Sentiment dataframe
267
+ freq: Frequency for aggregation ('D'=daily, 'W'=weekly, 'M'=monthly)
268
+
269
+ Returns:
270
+ pd.DataFrame: Temporal sentiment trends
271
+ """
272
+ if 'comment_timestamp' not in df.columns:
273
+ return pd.DataFrame()
274
+
275
+ df_temporal = df.copy()
276
+ df_temporal['date'] = pd.to_datetime(df_temporal['comment_timestamp']).dt.to_period(freq)
277
+
278
+ # Aggregate by date and sentiment
279
+ trends = df_temporal.groupby(['date', 'sentiment_polarity']).size().reset_index(name='count')
280
+ trends['date'] = trends['date'].dt.to_timestamp()
281
+
282
+ return trends
283
+
284
+ @staticmethod
285
+ def calculate_sentiment_score(df):
286
+ """
287
+ Calculate weighted sentiment score
288
+
289
+ Args:
290
+ df: Sentiment dataframe
291
+
292
+ Returns:
293
+ float: Average sentiment score (-2 to +2)
294
+ """
295
+ sentiment_weights = {
296
+ 'very_negative': -2,
297
+ 'negative': -1,
298
+ 'neutral': 0,
299
+ 'positive': 1,
300
+ 'very_positive': 2
301
+ }
302
+
303
+ df['sentiment_score'] = df['sentiment_polarity'].map(sentiment_weights)
304
+ return df['sentiment_score'].mean()
305
+
306
+ @staticmethod
307
+ def get_language_distribution(df):
308
+ """
309
+ Get distribution of detected languages
310
+
311
+ Args:
312
+ df: Sentiment dataframe
313
+
314
+ Returns:
315
+ pd.DataFrame: Language distribution
316
+ """
317
+ if 'detected_language' not in df.columns:
318
+ return pd.DataFrame()
319
+
320
+ lang_dist = df['detected_language'].value_counts().reset_index()
321
+ lang_dist.columns = ['language', 'count']
322
+ lang_dist['percentage'] = (lang_dist['count'] / lang_dist['count'].sum() * 100).round(2)
323
+
324
+ return lang_dist
325
+
326
+ @staticmethod
327
+ def get_sentiment_filtered_contents(df, selected_sentiments=None, selected_intents=None,
328
+ top_n=10, min_comments=1, sort_by='severity_score'):
329
+ """
330
+ Get contents filtered by selected sentiments and intents with dynamic sorting
331
+
332
+ Args:
333
+ df: Sentiment dataframe
334
+ selected_sentiments: List of sentiments to filter by (filters by dominant sentiment)
335
+ selected_intents: List of intents to filter by (content must have at least one comment with these intents)
336
+ top_n: Number of top contents to return
337
+ min_comments: Minimum number of comments a content must have
338
+ sort_by: Sorting criteria - 'severity_score', 'sentiment_percentage', 'sentiment_count', 'total_comments'
339
+
340
+ Returns:
341
+ pd.DataFrame: Filtered and sorted contents
342
+ """
343
+ content_summary = SentimentDataProcessor.get_content_summary(df)
344
+
345
+ # Filter by minimum comments
346
+ content_summary = content_summary[content_summary['total_comments'] >= min_comments]
347
+
348
+ # If no sentiments selected, default to all sentiments
349
+ if not selected_sentiments:
350
+ selected_sentiments = df['sentiment_polarity'].unique().tolist()
351
+
352
+ # Filter by dominant sentiment
353
+ content_summary = content_summary[content_summary['dominant_sentiment'].isin(selected_sentiments)]
354
+
355
+ # Filter by intents if specified
356
+ if selected_intents:
357
+ # Get content_sks that have at least one comment with the selected intents
358
+ content_sks_with_intent = set()
359
+ for intent in selected_intents:
360
+ matching_contents = df[df['intent'].str.contains(intent, na=False, case=False)]['content_sk'].unique()
361
+ content_sks_with_intent.update(matching_contents)
362
+
363
+ content_summary = content_summary[content_summary['content_sk'].isin(content_sks_with_intent)]
364
+
365
+ # Calculate percentage and count for selected sentiments
366
+ sentiment_counts = df[df['sentiment_polarity'].isin(selected_sentiments)].groupby(
367
+ 'content_sk'
368
+ ).size().reset_index(name='selected_sentiment_count')
369
+
370
+ content_summary = content_summary.merge(sentiment_counts, on='content_sk', how='left')
371
+ content_summary['selected_sentiment_count'] = content_summary['selected_sentiment_count'].fillna(0)
372
+ content_summary['selected_sentiment_percentage'] = (
373
+ content_summary['selected_sentiment_count'] / content_summary['total_comments'] * 100
374
+ ).round(2)
375
+
376
+ # Calculate dynamic severity score based on selected sentiments
377
+ content_summary['dynamic_severity_score'] = (
378
+ content_summary['selected_sentiment_percentage'] *
379
+ (content_summary['total_comments'] ** 0.5)
380
+ ).round(2)
381
+
382
+ # Determine sort columns based on sort_by parameter
383
+ if sort_by == 'severity_score':
384
+ sort_columns = ['dynamic_severity_score', 'selected_sentiment_percentage']
385
+ elif sort_by == 'sentiment_percentage':
386
+ sort_columns = ['selected_sentiment_percentage', 'total_comments']
387
+ elif sort_by == 'sentiment_count':
388
+ sort_columns = ['selected_sentiment_count', 'selected_sentiment_percentage']
389
+ elif sort_by == 'total_comments':
390
+ sort_columns = ['total_comments', 'selected_sentiment_count']
391
+ else:
392
+ sort_columns = ['dynamic_severity_score', 'selected_sentiment_percentage']
393
+
394
+ # Sort and get top N
395
+ filtered_contents = content_summary.sort_values(
396
+ by=sort_columns,
397
+ ascending=[False, False]
398
+ ).head(top_n)
399
+
400
+ return filtered_contents
401
+
402
+ @staticmethod
403
+ def get_demographics_distribution(df, demographic_field, filter_platform='musora_app'):
404
+ """
405
+ Get distribution of a demographic field (only for specified platform)
406
+
407
+ Args:
408
+ df: Sentiment dataframe with demographic fields
409
+ demographic_field: Field to analyze ('age_group', 'timezone', 'timezone_region', 'experience_level', 'experience_group')
410
+ filter_platform: Platform to filter (default: 'musora_app')
411
+
412
+ Returns:
413
+ pd.DataFrame: Distribution with count and percentage
414
+ """
415
+ # Filter for specified platform only
416
+ if filter_platform and 'platform' in df.columns:
417
+ df_filtered = df[df['platform'] == filter_platform].copy()
418
+ else:
419
+ df_filtered = df.copy()
420
+
421
+ if df_filtered.empty or demographic_field not in df_filtered.columns:
422
+ return pd.DataFrame()
423
+
424
+ # Remove 'Unknown' and null values
425
+ df_filtered = df_filtered[
426
+ (df_filtered[demographic_field].notna()) &
427
+ (df_filtered[demographic_field] != 'Unknown')
428
+ ]
429
+
430
+ if df_filtered.empty:
431
+ return pd.DataFrame()
432
+
433
+ # Count distribution
434
+ distribution = df_filtered[demographic_field].value_counts().reset_index()
435
+ distribution.columns = [demographic_field, 'count']
436
+
437
+ # Calculate percentage
438
+ distribution['percentage'] = (
439
+ distribution['count'] / distribution['count'].sum() * 100
440
+ ).round(2)
441
+
442
+ # Sort by count descending
443
+ distribution = distribution.sort_values('count', ascending=False)
444
+
445
+ return distribution
446
+
447
+ @staticmethod
448
+ def get_demographics_by_sentiment(df, demographic_field, filter_platform='musora_app'):
449
+ """
450
+ Get sentiment distribution for each demographic group
451
+
452
+ Args:
453
+ df: Sentiment dataframe with demographic fields
454
+ demographic_field: Field to analyze
455
+ filter_platform: Platform to filter (default: 'musora_app')
456
+
457
+ Returns:
458
+ pd.DataFrame: Sentiment distribution per demographic group
459
+ """
460
+ # Filter for specified platform only
461
+ if filter_platform and 'platform' in df.columns:
462
+ df_filtered = df[df['platform'] == filter_platform].copy()
463
+ else:
464
+ df_filtered = df.copy()
465
+
466
+ if df_filtered.empty or demographic_field not in df_filtered.columns:
467
+ return pd.DataFrame()
468
+
469
+ # Remove 'Unknown' and null values
470
+ df_filtered = df_filtered[
471
+ (df_filtered[demographic_field].notna()) &
472
+ (df_filtered[demographic_field] != 'Unknown')
473
+ ]
474
+
475
+ if df_filtered.empty:
476
+ return pd.DataFrame()
477
+
478
+ # Group by demographic field and sentiment
479
+ sentiment_by_demo = df_filtered.groupby(
480
+ [demographic_field, 'sentiment_polarity'],
481
+ as_index=False
482
+ ).size().rename(columns={'size': 'count'})
483
+
484
+ # Calculate percentage within each demographic group
485
+ sentiment_by_demo['percentage'] = sentiment_by_demo.groupby(demographic_field)['count'].transform(
486
+ lambda x: (x / x.sum() * 100).round(2)
487
+ )
488
+
489
+ return sentiment_by_demo
490
+
491
+ @staticmethod
492
+ def get_top_timezones(df, top_n=15, filter_platform='musora_app'):
493
+ """
494
+ Get top N timezones with most comments
495
+
496
+ Args:
497
+ df: Sentiment dataframe with timezone field
498
+ top_n: Number of top timezones to return
499
+ filter_platform: Platform to filter (default: 'musora_app')
500
+
501
+ Returns:
502
+ pd.DataFrame: Top timezones with counts
503
+ """
504
+ return SentimentDataProcessor.get_demographics_distribution(
505
+ df, 'timezone', filter_platform
506
+ ).head(top_n)
507
+
508
+ @staticmethod
509
+ def get_timezone_regions_distribution(df, filter_platform='musora_app'):
510
+ """
511
+ Get distribution of timezone regions
512
+
513
+ Args:
514
+ df: Sentiment dataframe with timezone_region field
515
+ filter_platform: Platform to filter (default: 'musora_app')
516
+
517
+ Returns:
518
+ pd.DataFrame: Region distribution with counts
519
+ """
520
+ return SentimentDataProcessor.get_demographics_distribution(
521
+ df, 'timezone_region', filter_platform
522
+ )
523
+
524
+ @staticmethod
525
+ def get_experience_level_distribution(df, filter_platform='musora_app', use_groups=False):
526
+ """
527
+ Get distribution of experience levels
528
+
529
+ Args:
530
+ df: Sentiment dataframe with experience fields
531
+ filter_platform: Platform to filter (default: 'musora_app')
532
+ use_groups: If True, use grouped experience levels, otherwise use raw values
533
+
534
+ Returns:
535
+ pd.DataFrame: Experience distribution
536
+ """
537
+ field = 'experience_group' if use_groups else 'experience_level'
538
+ return SentimentDataProcessor.get_demographics_distribution(
539
+ df, field, filter_platform
540
+ )
541
+
542
+ @staticmethod
543
+ def get_demographics_summary(df, filter_platform='musora_app'):
544
+ """
545
+ Get summary statistics for demographic data
546
+
547
+ Args:
548
+ df: Sentiment dataframe with demographic fields
549
+ filter_platform: Platform to filter (default: 'musora_app')
550
+
551
+ Returns:
552
+ dict: Summary statistics
553
+ """
554
+ # Filter for specified platform only
555
+ if filter_platform and 'platform' in df.columns:
556
+ df_filtered = df[df['platform'] == filter_platform].copy()
557
+ else:
558
+ df_filtered = df.copy()
559
+
560
+ if df_filtered.empty:
561
+ return {
562
+ 'total_comments': 0,
563
+ 'users_with_demographics': 0,
564
+ 'avg_age': None,
565
+ 'most_common_age_group': 'Unknown',
566
+ 'most_common_region': 'Unknown',
567
+ 'avg_experience': None
568
+ }
569
+
570
+ # Remove records without demographic data
571
+ df_with_demo = df_filtered[
572
+ (df_filtered['age'].notna()) |
573
+ (df_filtered['timezone'].notna()) |
574
+ (df_filtered['experience_level'].notna())
575
+ ].copy()
576
+
577
+ summary = {
578
+ 'total_comments': len(df_filtered),
579
+ 'users_with_demographics': len(df_with_demo),
580
+ 'coverage_percentage': round(len(df_with_demo) / len(df_filtered) * 100, 2) if len(df_filtered) > 0 else 0
581
+ }
582
+
583
+ # Age statistics
584
+ if 'age' in df_with_demo.columns:
585
+ valid_ages = df_with_demo['age'].dropna()
586
+ summary['avg_age'] = round(valid_ages.mean(), 1) if len(valid_ages) > 0 else None
587
+
588
+ age_groups = df_with_demo['age_group'].value_counts()
589
+ summary['most_common_age_group'] = age_groups.index[0] if len(age_groups) > 0 else 'Unknown'
590
+
591
+ # Timezone statistics
592
+ if 'timezone_region' in df_with_demo.columns:
593
+ regions = df_with_demo[df_with_demo['timezone_region'] != 'Unknown']['timezone_region'].value_counts()
594
+ summary['most_common_region'] = regions.index[0] if len(regions) > 0 else 'Unknown'
595
+
596
+ # Experience statistics
597
+ if 'experience_level' in df_with_demo.columns:
598
+ valid_exp = df_with_demo['experience_level'].dropna()
599
+ summary['avg_experience'] = round(valid_exp.mean(), 2) if len(valid_exp) > 0 else None
600
+
601
+ exp_groups = df_with_demo['experience_group'].value_counts()
602
+ summary['most_common_experience'] = exp_groups.index[0] if len(exp_groups) > 0 else 'Unknown'
603
+
604
+ return summary
visualization/utils/llm_helper.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Helper for visualization agents
3
+ Handles OpenAI API calls with retry logic and error handling
4
+ """
5
+ import os
6
+ import json
7
+ from typing import Dict, Any, Optional
8
+ from openai import OpenAI
9
+ from dotenv import load_dotenv
10
+ import time
11
+
12
# Load environment variables from root directory (parent of visualization)
# Path resolution: utils/ -> visualization/ -> repo root, then <root>/.env.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # .../visualization/utils
VISUALIZATION_DIR = os.path.dirname(SCRIPT_DIR)          # .../visualization
ROOT_DIR = os.path.dirname(VISUALIZATION_DIR)            # repository root
load_dotenv(os.path.join(ROOT_DIR, '.env'))              # silently a no-op if .env is absent
17
+
18
+
19
class LLMHelper:
    """
    Helper class for LLM interactions

    Thin wrapper around the OpenAI chat-completions client that adds retry
    with exponential backoff and optional JSON-mode response parsing.
    """

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
        """
        Initialize LLM helper

        Args:
            model: Model name to use
            temperature: Temperature for generation

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment.
        """
        self.model = model
        self.temperature = temperature
        self.api_key = os.getenv('OPENAI_API_KEY')

        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        self.client = OpenAI(api_key=self.api_key)

    def get_completion(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_retries: int = 3,
        json_mode: bool = False
    ) -> Dict[str, Any]:
        """
        Get completion from LLM with retry logic

        Args:
            prompt: User prompt
            system_message: Optional system message
            max_retries: Maximum number of retries
            json_mode: Whether to force JSON response

        Returns:
            Dictionary with response data: on success {'success': True,
            'content', 'model', 'usage'}; on failure {'success': False,
            'error', ...} (plus 'raw_content' for JSON parse failures).
        """
        messages = []

        if system_message:
            messages.append({"role": "system", "content": system_message})

        messages.append({"role": "user", "content": prompt})

        for attempt in range(max_retries):
            try:
                # Prepare API call parameters
                # NOTE(review): 'reasoning_effort' requires a recent OpenAI SDK
                # and a reasoning-capable model — confirm against the pinned
                # SDK version before changing the model default.
                api_params = {
                    "model": self.model,
                    "messages": messages,
                    "temperature": self.temperature,
                    "reasoning_effort": "low",
                    "n": 1
                }

                # Add response format if JSON mode requested
                if json_mode:
                    api_params["response_format"] = {"type": "json_object"}

                # Make API call
                response = self.client.chat.completions.create(**api_params)

                # Extract response
                content = response.choices[0].message.content

                # Parse JSON if requested
                if json_mode:
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError as e:
                        # NOTE(review): a malformed JSON reply returns
                        # immediately rather than consuming another retry.
                        return {
                            'success': False,
                            'error': f"Failed to parse JSON response: {str(e)}",
                            'raw_content': content
                        }

                return {
                    'success': True,
                    'content': content,
                    'model': response.model,
                    'usage': {
                        'prompt_tokens': response.usage.prompt_tokens,
                        'completion_tokens': response.usage.completion_tokens,
                        'total_tokens': response.usage.total_tokens
                    }
                }

            except Exception as e:
                if attempt < max_retries - 1:
                    # Wait before retry (exponential backoff: 1s, 2s, 4s, ...)
                    time.sleep(2 ** attempt)
                    continue
                else:
                    # Out of retries: surface the last error to the caller
                    return {
                        'success': False,
                        'error': str(e),
                        'error_type': type(e).__name__
                    }

        # Only reachable when max_retries <= 0
        return {
            'success': False,
            'error': f"Failed after {max_retries} attempts"
        }

    def get_structured_completion(
        self,
        prompt: str,
        system_message: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Get structured JSON completion

        Convenience wrapper around get_completion with json_mode enabled.

        Args:
            prompt: User prompt
            system_message: System message
            max_retries: Maximum retries

        Returns:
            Structured response dictionary
        """
        return self.get_completion(
            prompt=prompt,
            system_message=system_message,
            max_retries=max_retries,
            json_mode=True
        )