Commit ·
d61f3de
1
Parent(s): dac4ffa
Adding files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +0 -35
- Dockerfile +0 -20
- README.md +0 -19
- processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md +437 -0
- processing_brand_sentiment/README.md +402 -0
- processing_brand_sentiment/config_files/analysis_categories.json +123 -0
- processing_brand_sentiment/config_files/brand_config.json +111 -0
- processing_brand_sentiment/config_files/workflow_config.json +60 -0
- processing_brand_sentiment/database/__init__.py +8 -0
- processing_brand_sentiment/database/snowflake_connection.py +240 -0
- processing_brand_sentiment/database/sql/create_comments_output_table.sql +161 -0
- processing_brand_sentiment/database/sql/create_output_table.sql +250 -0
- processing_brand_sentiment/database/sql/fetch_comments.sql +82 -0
- processing_brand_sentiment/database/sql/fetch_forum_posts.sql +106 -0
- processing_brand_sentiment/database/sql/init_comments_output_table.sql +78 -0
- processing_brand_sentiment/database/sql/init_output_table.sql +89 -0
- processing_brand_sentiment/main.py +1088 -0
- processing_brand_sentiment/utils/__init__.py +8 -0
- processing_brand_sentiment/utils/html_parser.py +253 -0
- processing_brand_sentiment/workflow/__init__.py +10 -0
- processing_brand_sentiment/workflow/agents/__init__.py +39 -0
- processing_brand_sentiment/workflow/agents/base_agent.py +169 -0
- processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py +211 -0
- processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py +570 -0
- processing_brand_sentiment/workflow/agents/output_validator_agent.py +408 -0
- processing_brand_sentiment/workflow/agents/preprocessor_agent.py +408 -0
- processing_brand_sentiment/workflow/agents/relevance_validator_agent.py +289 -0
- processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py +388 -0
- processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py +431 -0
- processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py +434 -0
- processing_brand_sentiment/workflow/comment_orchestrator.py +558 -0
- processing_brand_sentiment/workflow/orchestrator.py +551 -0
- requirements.txt +0 -0
- src/streamlit_app.py +0 -40
- visualization/README.md +347 -0
- visualization/SnowFlakeConnection.py +150 -0
- visualization/agents/README.md +320 -0
- visualization/agents/__init__.py +8 -0
- visualization/agents/base_agent.py +88 -0
- visualization/agents/content_summary_agent.py +366 -0
- visualization/app.py +180 -0
- visualization/components/dashboard.py +538 -0
- visualization/components/reply_required.py +324 -0
- visualization/components/sentiment_analysis.py +671 -0
- visualization/config/viz_config.json +87 -0
- visualization/data/data_loader.py +427 -0
- visualization/img/musora.png +0 -0
- visualization/requirements.txt +17 -0
- visualization/utils/data_processor.py +604 -0
- visualization/utils/llm_helper.py +149 -0
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
FROM python:3.13.5-slim
|
| 2 |
-
|
| 3 |
-
WORKDIR /app
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
git \
|
| 9 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
-
|
| 11 |
-
COPY requirements.txt ./
|
| 12 |
-
COPY src/ ./src/
|
| 13 |
-
|
| 14 |
-
RUN pip3 install -r requirements.txt
|
| 15 |
-
|
| 16 |
-
EXPOSE 8501
|
| 17 |
-
|
| 18 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
-
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Brand Sentiment Analysis
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: Interactive UI for brand sentiment Analysis
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
# Welcome to Streamlit!
|
| 15 |
-
|
| 16 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 17 |
-
|
| 18 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 19 |
-
forums](https://discuss.streamlit.io).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Brand Sentiment Analysis - Architecture Redesign Proposal
|
| 2 |
+
|
| 3 |
+
## Executive Summary
|
| 4 |
+
|
| 5 |
+
This document proposes a redesigned multi-agent architecture to address accuracy issues identified during manual evaluation. The new design separates **fact extraction** from **analysis**, adds strict validation, and improves content preprocessing.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Current Issues Analysis
|
| 10 |
+
|
| 11 |
+
| Issue | Root Cause | Impact |
|
| 12 |
+
|-------|------------|--------|
|
| 13 |
+
| **B8X/B8 variation** | Word-boundary matching misses aliases | Missing relevant posts |
|
| 14 |
+
| **Competitor products attributed to Sabian** | LLM lacks competitor awareness, no strict list enforcement | False positives, wrong product attribution |
|
| 15 |
+
| **Short text language detection** | Lingua fails on short brand-heavy text | Skipping valid English posts |
|
| 16 |
+
| **False positive relevance** | Single-pass relevance + no verification | Pizza oven marked as Sabian discussion |
|
| 17 |
+
| **Long posts with overlapping content** | Poor quote separation, raw thread context | Confusing LLM, extraction from wrong content |
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Proposed Architecture
|
| 22 |
+
|
| 23 |
+
### Design Principles
|
| 24 |
+
|
| 25 |
+
1. **Separation of Concerns**: Fact extraction vs. interpretation/analysis
|
| 26 |
+
2. **Strict Validation**: Enforce predefined value lists at every step
|
| 27 |
+
3. **Structured Data Flow**: Each agent receives clean, relevant input
|
| 28 |
+
4. **Fail-Safe Defaults**: Conservative approach - when uncertain, mark as not relevant
|
| 29 |
+
|
| 30 |
+
### New Workflow
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 34 |
+
│ 1. CONTENT PREPROCESSOR │
|
| 35 |
+
│ (No LLM) │
|
| 36 |
+
│ • Enhanced HTML parsing (better quote separation) │
|
| 37 |
+
│ • Text cleaning and normalization │
|
| 38 |
+
│ • Language detection (skip for short texts < 50 chars) │
|
| 39 |
+
│ • Keyword screening with aliases (B8 → B8X) │
|
| 40 |
+
│ • Extract: cleaned_content, quoted_content, raw_thread_context │
|
| 41 |
+
└─────────────────────────────┬───────────────────────────────────┘
|
| 42 |
+
│
|
| 43 |
+
▼
|
| 44 |
+
┌───────────────────────────────┐
|
| 45 |
+
│ Has any Sabian-related │
|
| 46 |
+
│ keywords (primary/contextual)?│
|
| 47 |
+
└───────────────┬───────────────┘
|
| 48 |
+
│ │
|
| 49 |
+
YES NO
|
| 50 |
+
│ │
|
| 51 |
+
▼ ▼
|
| 52 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 53 |
+
│ 2. RELEVANCE & EXTRACTION │ │ Mark as │
|
| 54 |
+
│ AGENT (LLM #1) │ │ NOT RELEVANT │
|
| 55 |
+
│ │ │ (0 LLM calls) │
|
| 56 |
+
│ INPUT: │ └──────────────────┘
|
| 57 |
+
│ • cleaned_content │
|
| 58 |
+
│ • quoted_content │
|
| 59 |
+
│ • raw_thread_context │
|
| 60 |
+
│ • keywords_found │
|
| 61 |
+
│ │
|
| 62 |
+
│ OUTPUT: │
|
| 63 |
+
│ • IS_RELEVANT: boolean │
|
| 64 |
+
│ • RELEVANCE_CONFIDENCE: h/m/l │
|
| 65 |
+
│ • RELEVANCE_REASON: string │
|
| 66 |
+
│ • PRODUCTS_MENTIONED: [] │ ← STRICT: only from predefined list
|
| 67 |
+
│ • SABIAN_MENTION_CONTEXT │
|
| 68 |
+
│ • AUTHOR_ROLE │
|
| 69 |
+
│ • COMPETITORS_MENTIONED: [] │ ← Brand names only, no products
|
| 70 |
+
│ • THREAD_CONTEXT_SUMMARY │ ← 1-2 sentence summary
|
| 71 |
+
└─────────────────┬───────────────┘
|
| 72 |
+
│
|
| 73 |
+
▼
|
| 74 |
+
┌─────────────────┐
|
| 75 |
+
│ IS_RELEVANT? │
|
| 76 |
+
└────────┬────────┘
|
| 77 |
+
│ │
|
| 78 |
+
YES NO
|
| 79 |
+
│ │
|
| 80 |
+
▼ ▼
|
| 81 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 82 |
+
│ 3. SENTIMENT & INTENT │ │ Store with │
|
| 83 |
+
│ ANALYZER (LLM #2) │ │ is_relevant=F │
|
| 84 |
+
│ │ │ (1 LLM call) │
|
| 85 |
+
│ INPUT (structured): │ └──────────────────┘
|
| 86 |
+
│ • cleaned_content │
|
| 87 |
+
│ • PRODUCTS_MENTIONED │ ← Pre-validated list
|
| 88 |
+
│ • SABIAN_MENTION_CONTEXT │
|
| 89 |
+
│ • AUTHOR_ROLE │
|
| 90 |
+
│ • COMPETITORS_MENTIONED │
|
| 91 |
+
│ • THREAD_CONTEXT_SUMMARY │ ← Clean, concise context
|
| 92 |
+
│ │
|
| 93 |
+
│ OUTPUT: │
|
| 94 |
+
│ • SENTIMENT_LEVEL │
|
| 95 |
+
│ • EMOTION_TYPE │
|
| 96 |
+
│ • SENTIMENT_CONFIDENCE │
|
| 97 |
+
│ • SARCASM_DETECTED │
|
| 98 |
+
│ • PRODUCT_ATTRIBUTES: [] │
|
| 99 |
+
│ • COMPETITOR_PRODUCTS_OWNED: []│
|
| 100 |
+
│ • COMPARISON_TYPE │
|
| 101 |
+
│ • INTENTS: [] │
|
| 102 |
+
│ • PURCHASE_STAGE │
|
| 103 |
+
│ • DECISION_DRIVERS: [] │
|
| 104 |
+
│ • PAIN_POINTS: [] │
|
| 105 |
+
│ • DELIGHT_FACTORS: [] │
|
| 106 |
+
│ • ANALYSIS_NOTES │
|
| 107 |
+
└─────────────────┬───────────────┘
|
| 108 |
+
│
|
| 109 |
+
▼
|
| 110 |
+
┌─────────────────────────────────┐
|
| 111 |
+
│ 4. OUTPUT VALIDATOR │
|
| 112 |
+
│ (No LLM - Rule-based) │
|
| 113 |
+
│ │
|
| 114 |
+
│ • Verify all values from lists │
|
| 115 |
+
│ • Check logical consistency │
|
| 116 |
+
│ • Flag anomalies for review │
|
| 117 |
+
│ • Set processing_status │
|
| 118 |
+
└─────────────────────────────────┘
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
---
|
| 122 |
+
|
| 123 |
+
## API Call Summary
|
| 124 |
+
|
| 125 |
+
| Scenario | Current Calls | New Calls | Notes |
|
| 126 |
+
|----------|--------------|-----------|-------|
|
| 127 |
+
| No keywords found | 0 | 0 | Same |
|
| 128 |
+
| Primary keywords, relevant | 1 | 2 | +1 for better extraction |
|
| 129 |
+
| Primary keywords, not relevant | 1 | 1 | Extraction determines not relevant |
|
| 130 |
+
| Ambiguous keywords, relevant | 2 | 2 | Same |
|
| 131 |
+
| Ambiguous keywords, not relevant | 2 | 1 | Early exit after extraction |
|
| 132 |
+
|
| 133 |
+
**Net Impact**: Slight increase for some cases, but significantly better accuracy.
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
|
| 137 |
+
## Agent Specifications
|
| 138 |
+
|
| 139 |
+
### Agent 1: Content Preprocessor (No LLM)
|
| 140 |
+
|
| 141 |
+
**File**: `workflow/agents/content_preprocessor_agent.py`
|
| 142 |
+
|
| 143 |
+
**Improvements over current**:
|
| 144 |
+
1. Enhanced HTML parsing with better quote/reply separation
|
| 145 |
+
2. Product alias mapping (B8 → B8X, etc.)
|
| 146 |
+
3. Skip language detection for texts < 50 characters
|
| 147 |
+
4. Always process if primary Sabian keywords found (regardless of language detection)
|
| 148 |
+
|
| 149 |
+
**Product Aliases** (add to brand_config.json):
|
| 150 |
+
```json
|
| 151 |
+
"product_aliases": {
|
| 152 |
+
"B8": "B8X",
|
| 153 |
+
"sbrs": "SBR",
|
| 154 |
+
"hand hammered": "HH",
|
| 155 |
+
"hand-hammered": "HH"
|
| 156 |
+
}
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
### Agent 2: Relevance & Extraction Agent (LLM #1)
|
| 162 |
+
|
| 163 |
+
**File**: `workflow/agents/relevance_extraction_agent.py`
|
| 164 |
+
|
| 165 |
+
**Purpose**: Determine relevance with HIGH confidence and extract verifiable facts.
|
| 166 |
+
|
| 167 |
+
**Key Design Decisions**:
|
| 168 |
+
|
| 169 |
+
1. **Strict Product Matching**:
|
| 170 |
+
- Provide explicit product list in prompt
|
| 171 |
+
- Instruction: "ONLY return products that EXACTLY match items in this list"
|
| 172 |
+
- Return empty list if no exact matches (not hallucinated guesses)
|
| 173 |
+
|
| 174 |
+
2. **Competitor Awareness**:
|
| 175 |
+
- List competitor BRAND names (not products)
|
| 176 |
+
- Instruction: "Products like '2002', 'Signature', 'K Custom' belong to competitors, NOT Sabian"
|
| 177 |
+
- Prevent cross-brand attribution
|
| 178 |
+
|
| 179 |
+
3. **Thread Context Summarization**:
|
| 180 |
+
- Summarize in 1-2 sentences maximum
|
| 181 |
+
- Focus only on information relevant to understanding the post's context
|
| 182 |
+
|
| 183 |
+
4. **Conservative Relevance**:
|
| 184 |
+
- When uncertain, mark as NOT relevant
|
| 185 |
+
- Require explicit Sabian product/brand mention IN THE POST CONTENT
|
| 186 |
+
- Quoted content mentioning Sabian does NOT make post relevant
|
| 187 |
+
|
| 188 |
+
**System Prompt Structure**:
|
| 189 |
+
```
|
| 190 |
+
You are a brand mention extractor for Sabian cymbals. Your job is to:
|
| 191 |
+
1. Determine if the POST CONTENT discusses Sabian products
|
| 192 |
+
2. Extract ONLY facts, not interpretations
|
| 193 |
+
|
| 194 |
+
## CRITICAL RULES
|
| 195 |
+
|
| 196 |
+
### Rule 1: Relevance Based on POST CONTENT Only
|
| 197 |
+
- The post is relevant ONLY if the POST CONTENT itself mentions Sabian products
|
| 198 |
+
- Quoted/parent content mentioning Sabian does NOT make the post relevant
|
| 199 |
+
- Generic replies ("Thanks!", "Got it!") are NEVER relevant
|
| 200 |
+
|
| 201 |
+
### Rule 2: Strict Product Matching
|
| 202 |
+
SABIAN PRODUCTS (use ONLY these exact values):
|
| 203 |
+
[HHX, HH, AAX, AA, Artisan, FRX, Omni, Chopper, Stratus, XSR, B8X, SBR]
|
| 204 |
+
|
| 205 |
+
- Return ONLY products from this list
|
| 206 |
+
- If you see a product not in this list, do NOT include it
|
| 207 |
+
- "2002", "Signature", "Sound Edge", "Formula 602" are PAISTE products, NOT Sabian
|
| 208 |
+
- "K Custom", "A Custom", "K Zildjian" are ZILDJIAN products, NOT Sabian
|
| 209 |
+
- When uncertain, return empty list []
|
| 210 |
+
|
| 211 |
+
### Rule 3: Competitor Brand Awareness
|
| 212 |
+
COMPETITOR BRANDS: [Zildjian, Paiste, Meinl, Dream Cymbals, Istanbul Agop, Bosphorus]
|
| 213 |
+
|
| 214 |
+
- Only return competitor BRAND names in competitors_mentioned
|
| 215 |
+
- Do NOT guess competitor products
|
| 216 |
+
|
| 217 |
+
### Rule 4: Thread Context Summary
|
| 218 |
+
- Summarize thread context in 1-2 sentences maximum
|
| 219 |
+
- Focus on what helps understand the post's topic
|
| 220 |
+
- If thread is about pizza ovens, say "Thread discusses pizza ovens and cooking"
|
| 221 |
+
|
| 222 |
+
## OUTPUT FORMAT
|
| 223 |
+
Return ONLY valid JSON:
|
| 224 |
+
{
|
| 225 |
+
"is_relevant": boolean,
|
| 226 |
+
"relevance_confidence": "high" | "medium" | "low",
|
| 227 |
+
"relevance_reason": "1-2 sentences explaining decision",
|
| 228 |
+
"products_mentioned": [], // ONLY from Sabian list above
|
| 229 |
+
"sabian_mention_context": "primary_focus" | "significant_mention" | "casual_mention" | "comparison_context" | null,
|
| 230 |
+
"author_role": "current_owner" | "past_owner" | "potential_buyer" | "never_owned" | "unknown",
|
| 231 |
+
"competitors_mentioned": [], // Brand names only
|
| 232 |
+
"thread_context_summary": "1-2 sentence summary"
|
| 233 |
+
}
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
### Agent 3: Sentiment & Intent Analyzer (LLM #2)
|
| 239 |
+
|
| 240 |
+
**File**: `workflow/agents/sentiment_analyzer_agent.py`
|
| 241 |
+
|
| 242 |
+
**Purpose**: Deep analysis on VERIFIED relevant posts with STRUCTURED input.
|
| 243 |
+
|
| 244 |
+
**Key Design Decisions**:
|
| 245 |
+
|
| 246 |
+
1. **Receives Pre-Validated Input**:
|
| 247 |
+
- Products already extracted and validated
|
| 248 |
+
- Thread context already summarized
|
| 249 |
+
- Author role already determined
|
| 250 |
+
|
| 251 |
+
2. **Focused Analysis**:
|
| 252 |
+
- Sentiment TOWARDS SABIAN ONLY
|
| 253 |
+
- Intent classification
|
| 254 |
+
- Pain points / Delights (author's own experience only)
|
| 255 |
+
- Purchase journey (author's own journey only)
|
| 256 |
+
|
| 257 |
+
3. **No Hallucination on Products**:
|
| 258 |
+
- Products are GIVEN in input, not re-extracted
|
| 259 |
+
- Can only discuss attributes of provided products
|
| 260 |
+
|
| 261 |
+
**System Prompt Structure**:
|
| 262 |
+
```
|
| 263 |
+
You are a sentiment analyst for Sabian cymbal discussions.
|
| 264 |
+
|
| 265 |
+
## INPUT CONTEXT (Pre-validated, trust these values)
|
| 266 |
+
- Products mentioned: {products_mentioned}
|
| 267 |
+
- Sabian mention context: {sabian_mention_context}
|
| 268 |
+
- Author role: {author_role}
|
| 269 |
+
- Thread summary: {thread_context_summary}
|
| 270 |
+
- Competitors mentioned: {competitors_mentioned}
|
| 271 |
+
|
| 272 |
+
## YOUR TASK
|
| 273 |
+
Analyze the sentiment, emotions, and intents in this post about Sabian.
|
| 274 |
+
|
| 275 |
+
## CRITICAL RULES
|
| 276 |
+
|
| 277 |
+
### Rule 1: Sabian-Specific Sentiment
|
| 278 |
+
- Sentiment MUST be about Sabian, NOT overall post tone
|
| 279 |
+
- Example: "Love my new kit! The SBR cymbals sound terrible."
|
| 280 |
+
- Overall: positive | Sabian sentiment: NEGATIVE
|
| 281 |
+
|
| 282 |
+
### Rule 2: Author Perspective Only
|
| 283 |
+
These fields are ONLY for author's OWN experience:
|
| 284 |
+
- purchase_stage, decision_drivers, pain_points, delight_factors
|
| 285 |
+
- If author is giving ADVICE to others, these should be null/empty
|
| 286 |
+
|
| 287 |
+
### Rule 3: Use Only Valid Values
|
| 288 |
+
[List all valid values for each field]
|
| 289 |
+
|
| 290 |
+
## OUTPUT FORMAT
|
| 291 |
+
{
|
| 292 |
+
"sentiment_level": "...",
|
| 293 |
+
"emotion_type": "..." or null,
|
| 294 |
+
"sentiment_confidence": "high" | "medium" | "low",
|
| 295 |
+
"sarcasm_detected": boolean,
|
| 296 |
+
"product_attributes": [],
|
| 297 |
+
"competitor_products_owned": [],
|
| 298 |
+
"comparison_type": "..." or null,
|
| 299 |
+
"intents": [],
|
| 300 |
+
"purchase_stage": "..." or null,
|
| 301 |
+
"decision_drivers": [],
|
| 302 |
+
"pain_points": [],
|
| 303 |
+
"delight_factors": [],
|
| 304 |
+
"analysis_notes": "1-2 sentences"
|
| 305 |
+
}
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
---
|
| 309 |
+
|
| 310 |
+
### Agent 4: Output Validator (No LLM)
|
| 311 |
+
|
| 312 |
+
**File**: `workflow/agents/output_validator_agent.py`
|
| 313 |
+
|
| 314 |
+
**Purpose**: Final validation and anomaly detection.
|
| 315 |
+
|
| 316 |
+
**Validation Rules**:
|
| 317 |
+
|
| 318 |
+
1. **List Validation**:
|
| 319 |
+
- All products_mentioned are in Sabian product list
|
| 320 |
+
- All competitors_mentioned are in competitor list
|
| 321 |
+
- All categorical values are from predefined lists
|
| 322 |
+
|
| 323 |
+
2. **Logical Consistency**:
|
| 324 |
+
- If is_relevant=True, products_mentioned should not be empty (flag if empty)
|
| 325 |
+
- If sabian_mention_context="primary_focus", products_mentioned should have items
|
| 326 |
+
- If sentiment_level="very_negative", pain_points should not be empty (warn)
|
| 327 |
+
|
| 328 |
+
3. **Anomaly Flagging**:
|
| 329 |
+
- Flag for manual review if inconsistencies detected
|
| 330 |
+
- Add `validation_flags` field to output
|
| 331 |
+
|
| 332 |
+
---
|
| 333 |
+
|
| 334 |
+
## Configuration Changes
|
| 335 |
+
|
| 336 |
+
### brand_config.json Updates
|
| 337 |
+
|
| 338 |
+
```json
|
| 339 |
+
{
|
| 340 |
+
"brand": {
|
| 341 |
+
"name": "Sabian",
|
| 342 |
+
"products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
|
| 343 |
+
"product_aliases": {
|
| 344 |
+
"B8": "B8X",
|
| 345 |
+
"sbrs": "SBR",
|
| 346 |
+
"hhx's": "HHX",
|
| 347 |
+
"aax's": "AAX"
|
| 348 |
+
},
|
| 349 |
+
"competitor_products_warning": [
|
| 350 |
+
"2002", "Signature", "Sound Edge", "Formula 602", "Giant Beat",
|
| 351 |
+
"K Custom", "A Custom", "K Zildjian", "A Zildjian", "S Family",
|
| 352 |
+
"Byzance", "Pure Alloy", "HCS",
|
| 353 |
+
"Bliss", "Contact", "Energy"
|
| 354 |
+
],
|
| 355 |
+
"competitors": [...]
|
| 356 |
+
},
|
| 357 |
+
"preprocessing": {
|
| 358 |
+
"min_length_for_language_detection": 50,
|
| 359 |
+
"always_process_if_primary_keyword": true
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
+
---
|
| 365 |
+
|
| 366 |
+
## File Structure
|
| 367 |
+
|
| 368 |
+
```
|
| 369 |
+
processing_brand_sentiment/
|
| 370 |
+
├── config_files/
|
| 371 |
+
│ ├── brand_config.json # Updated with aliases, warnings
|
| 372 |
+
│ ├── workflow_config.json # Agent configurations
|
| 373 |
+
│ └── analysis_categories.json # Category definitions (unchanged)
|
| 374 |
+
├── workflow/
|
| 375 |
+
│ ├── orchestrator.py # Updated workflow graph
|
| 376 |
+
│ └── agents/
|
| 377 |
+
│ ├── base_agent.py # Base class (unchanged)
|
| 378 |
+
│ ├── content_preprocessor_agent.py # Enhanced preprocessing
|
| 379 |
+
│ ├── relevance_extraction_agent.py # NEW: Extraction + relevance
|
| 380 |
+
│ ├── sentiment_analyzer_agent.py # NEW: Focused analysis
|
| 381 |
+
│ └── output_validator_agent.py # NEW: Validation
|
| 382 |
+
```
|
| 383 |
+
|
| 384 |
+
---
|
| 385 |
+
|
| 386 |
+
## Migration Path
|
| 387 |
+
|
| 388 |
+
### Phase 1: Configuration Updates
|
| 389 |
+
1. Update brand_config.json with product aliases
|
| 390 |
+
2. Add competitor product warnings
|
| 391 |
+
3. Update preprocessing settings
|
| 392 |
+
|
| 393 |
+
### Phase 2: New Agents
|
| 394 |
+
1. Create relevance_extraction_agent.py
|
| 395 |
+
2. Create sentiment_analyzer_agent.py
|
| 396 |
+
3. Create output_validator_agent.py
|
| 397 |
+
4. Update content_preprocessor_agent.py
|
| 398 |
+
|
| 399 |
+
### Phase 3: Orchestrator Update
|
| 400 |
+
1. Update workflow graph with new flow
|
| 401 |
+
2. Update state definition
|
| 402 |
+
3. Add new routing logic
|
| 403 |
+
|
| 404 |
+
### Phase 4: Testing & Validation
|
| 405 |
+
1. Run on test batch with known issues
|
| 406 |
+
2. Compare accuracy metrics
|
| 407 |
+
3. Fine-tune prompts based on results
|
| 408 |
+
|
| 409 |
+
---
|
| 410 |
+
|
| 411 |
+
## Expected Improvements
|
| 412 |
+
|
| 413 |
+
| Issue | Current Behavior | Expected After |
|
| 414 |
+
|-------|------------------|----------------|
|
| 415 |
+
| B8/B8X | Missed | Caught via alias mapping |
|
| 416 |
+
| Paiste products as Sabian | Attributed to Sabian | Correctly identified as competitor |
|
| 417 |
+
| Short text language | Marked as Latin | Processed as English |
|
| 418 |
+
| False positive (pizza) | Marked relevant | Marked not relevant |
|
| 419 |
+
| Long confusing context | Raw text confuses LLM | Summarized 1-2 sentences |
|
| 420 |
+
|
| 421 |
+
---
|
| 422 |
+
|
| 423 |
+
## Success Metrics
|
| 424 |
+
|
| 425 |
+
1. **Relevance Accuracy**: >99% (currently ~90%)
|
| 426 |
+
2. **Product Attribution Accuracy**: >99% (currently ~85%)
|
| 427 |
+
3. **Sentiment Accuracy**: >95% (currently unknown)
|
| 428 |
+
4. **False Positive Rate**: <1%
|
| 429 |
+
5. **False Negative Rate**: <1%
|
| 430 |
+
|
| 431 |
+
---
|
| 432 |
+
|
| 433 |
+
## Questions for Review
|
| 434 |
+
|
| 435 |
+
1. Should we add a manual review queue for flagged posts?
|
| 436 |
+
2. Should thread_context_summary be stored in output for debugging?
|
| 437 |
+
3. Preferred batch size for re-processing existing data?
|
processing_brand_sentiment/README.md
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Brand Sentiment Analysis Pipeline
|
| 2 |
+
|
| 3 |
+
A modular, scalable system for analyzing forum discussions and social media comments about specific brands using an agentic workflow with LLMs. The initial implementation focuses on **Sabian** (a cymbal manufacturer), but the architecture supports easy addition of new brands through configuration.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The pipeline fetches data from Snowflake (forum posts and/or social media comments), preprocesses them (parsing HTML for forums or cleaning plain text for comments), detects language, validates brand relevance, performs comprehensive sentiment and intelligence extraction using OpenAI's API, and stores enriched results back to Snowflake.
|
| 8 |
+
|
| 9 |
+
## Data Sources
|
| 10 |
+
|
| 11 |
+
| Source | Table | Output Table | Description |
|
| 12 |
+
|--------|-------|--------------|-------------|
|
| 13 |
+
| **Forums** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS` | `SABIAN_BRAND_ANALYSIS` | Forum posts with thread context |
|
| 14 |
+
| **Comments** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` | `SABIAN_BRAND_ANALYSIS_COMMENTS` | Social media comments with content context |
|
| 15 |
+
|
| 16 |
+
## Architecture v4.0
|
| 17 |
+
|
| 18 |
+
The system uses a 4-agent pipeline that separates **fact extraction** from **analysis** for improved accuracy. Both data sources share the same extraction, analysis, and validation agents - only the preprocessor differs.
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 22 |
+
│ 1a. CONTENT PREPROCESSOR (Forums) │
|
| 23 |
+
│ (No LLM) │
|
| 24 |
+
│ - HTML parsing with quote/reply separation │
|
| 25 |
+
│ - Product alias mapping (B8 → B8X) │
|
| 26 |
+
│ - Smart language detection │
|
| 27 |
+
│ - Keyword-based relevance screening │
|
| 28 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 29 |
+
│ 1b. COMMENT PREPROCESSOR (Comments) │
|
| 30 |
+
│ (No LLM) │
|
| 31 |
+
│ - Plain text cleaning (no HTML) │
|
| 32 |
+
│ - Product alias mapping (B8 → B8X) │
|
| 33 |
+
│ - Smart language detection │
|
| 34 |
+
│ - Keyword-based relevance screening │
|
| 35 |
+
│ - Context: content title + description + parent comment │
|
| 36 |
+
└─────────────────────────────┬───────────────────────────────────┘
|
| 37 |
+
│
|
| 38 |
+
▼
|
| 39 |
+
┌───────────────────────────────┐
|
| 40 |
+
│ Has Sabian-related keywords? │
|
| 41 |
+
└───────────────┬───────────────┘
|
| 42 |
+
│ │
|
| 43 |
+
YES NO
|
| 44 |
+
│ │
|
| 45 |
+
▼ ▼
|
| 46 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 47 |
+
│ 2. RELEVANCE & EXTRACTION │ │ Mark as │
|
| 48 |
+
│ AGENT (LLM #1) │ │ NOT RELEVANT │
|
| 49 |
+
│ [SHARED] │ │ (0 LLM calls) │
|
| 50 |
+
│ - Validates relevance │ └──────────────────┘
|
| 51 |
+
│ - Extracts products (strict) │
|
| 52 |
+
│ - Identifies author role │
|
| 53 |
+
│ - Summarizes context │
|
| 54 |
+
│ - Detects competitors │
|
| 55 |
+
└─────────────────┬───────────────┘
|
| 56 |
+
│
|
| 57 |
+
▼
|
| 58 |
+
┌─────────────────┐
|
| 59 |
+
│ IS_RELEVANT? │
|
| 60 |
+
└────────┬────────┘
|
| 61 |
+
│ │
|
| 62 |
+
YES NO
|
| 63 |
+
│ │
|
| 64 |
+
▼ ▼
|
| 65 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 66 |
+
│ 3. SENTIMENT & INTENT │ │ Store with │
|
| 67 |
+
│ ANALYZER (LLM #2) │ │ is_relevant=F │
|
| 68 |
+
│ [SHARED] │ │ (1 LLM call) │
|
| 69 |
+
│ - Sabian-specific sentiment │ └──────────────────┘
|
| 70 |
+
│ - Intent classification │
|
| 71 |
+
│ - Pain points / Delights │
|
| 72 |
+
│   - Purchase journey (author)  │
|
| 73 |
+
│ - Competitor products owned │
|
| 74 |
+
└─────────────────┬───────────────┘
|
| 75 |
+
│
|
| 76 |
+
▼
|
| 77 |
+
┌─────────────────────────────────┐
|
| 78 |
+
│ 4. OUTPUT VALIDATOR │
|
| 79 |
+
│ (No LLM - Rule-based) │
|
| 80 |
+
│ [SHARED] │
|
| 81 |
+
│ - Validates all values │
|
| 82 |
+
│ - Checks logical consistency │
|
| 83 |
+
│ - Flags anomalies for review │
|
| 84 |
+
└─────────────────────────────────┘
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Features
|
| 88 |
+
|
| 89 |
+
- **Multi-Source Support**: Process forums, social media comments, or both
|
| 90 |
+
- **4-Agent Pipeline**: Separation of extraction and analysis for improved accuracy
|
| 91 |
+
- **Strict Product Matching**: Only returns products from predefined list, preventing hallucination
|
| 92 |
+
- **Competitor Awareness**: Knows which products belong to competitors
|
| 93 |
+
- **Smart Language Detection**: Skips detection for short texts, always processes if primary keywords found
|
| 94 |
+
- **Product Alias Mapping**: Handles variations (B8 → B8X, "hhxs" → HHX)
|
| 95 |
+
- **Thread/Comment Context**: LLM summarizes context for clarity
|
| 96 |
+
- **Validation & Anomaly Detection**: Rule-based validator catches errors and flags edge cases
|
| 97 |
+
- **Author Perspective Tracking**: Distinguishes author's own experience from advice to others
|
| 98 |
+
- **Platform Tracking**: Records source platform for each processed item
|
| 99 |
+
|
| 100 |
+
## Project Structure
|
| 101 |
+
|
| 102 |
+
```
|
| 103 |
+
processing_brand_sentiment/
|
| 104 |
+
├── config_files/
|
| 105 |
+
│ ├── brand_config.json # Brand products, aliases, competitors, keywords, data sources
|
| 106 |
+
│ ├── workflow_config.json # LLM settings, batch sizes, output config (forums + comments)
|
| 107 |
+
│ └── analysis_categories.json # Sentiment, intent, pain point categories
|
| 108 |
+
├── database/
|
| 109 |
+
│ ├── __init__.py
|
| 110 |
+
│ ├── snowflake_connection.py # Snowflake connection handler
|
| 111 |
+
│ └── sql/
|
| 112 |
+
│ ├── fetch_forum_posts.sql # Query for forum posts with thread context
|
| 113 |
+
│ ├── fetch_comments.sql # Query for social media comments with content context
|
| 114 |
+
│ ├── create_output_table.sql # Forum output schema with views
|
| 115 |
+
│ ├── init_output_table.sql # Forum table initialization
|
| 116 |
+
│ ├── create_comments_output_table.sql # Comment output schema with views
|
| 117 |
+
│ └── init_comments_output_table.sql # Comment table initialization
|
| 118 |
+
├── workflow/
|
| 119 |
+
│ ├── __init__.py
|
| 120 |
+
│ ├── orchestrator.py # Forum LangGraph workflow coordinator
|
| 121 |
+
│ ├── comment_orchestrator.py # Comment LangGraph workflow coordinator
|
| 122 |
+
│ └── agents/
|
| 123 |
+
│ ├── __init__.py
|
| 124 |
+
│ ├── base_agent.py # Abstract base class
|
| 125 |
+
│ ├── content_preprocessor_agent.py # Forum: HTML parsing, alias mapping
|
| 126 |
+
│ ├── comment_preprocessor_agent.py # Comments: plain text, comment context
|
| 127 |
+
│ ├── sabian_relevance_extraction_agent.py # Shared: relevance + extraction
|
| 128 |
+
│   ├── sabian_analyzer_agent.py # Shared: sentiment analysis
|
| 129 |
+
│ └── output_validator_agent.py # Shared: rule-based validation
|
| 130 |
+
├── utils/
|
| 131 |
+
│ ├── __init__.py
|
| 132 |
+
│ └── html_parser.py # HTML content extraction (forums only)
|
| 133 |
+
├── logs/ # Processing logs (auto-created)
|
| 134 |
+
├── main.py # Main execution script (multi-source)
|
| 135 |
+
├── .env # Environment variables
|
| 136 |
+
└── README.md # This file
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Setup
|
| 140 |
+
|
| 141 |
+
### 1. Install Dependencies
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
pip install langchain-openai langgraph snowflake-snowpark-python python-dotenv pandas beautifulsoup4 lingua-language-detector
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
### 2. Configure Environment Variables
|
| 148 |
+
|
| 149 |
+
Ensure `.env` file contains:
|
| 150 |
+
|
| 151 |
+
```env
|
| 152 |
+
# Snowflake
|
| 153 |
+
SNOWFLAKE_USER=your_user
|
| 154 |
+
SNOWFLAKE_PASSWORD=your_password
|
| 155 |
+
SNOWFLAKE_ACCOUNT=your_account
|
| 156 |
+
SNOWFLAKE_ROLE=your_role
|
| 157 |
+
SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
|
| 158 |
+
SNOWFLAKE_WAREHOUSE=your_warehouse
|
| 159 |
+
SNOWFLAKE_SCHEMA=ML_FEATURES
|
| 160 |
+
|
| 161 |
+
# OpenAI
|
| 162 |
+
OPENAI_API_KEY=your_openai_key
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
### 3. Initialize Snowflake Tables
|
| 166 |
+
|
| 167 |
+
Run the initialization scripts before first processing:
|
| 168 |
+
|
| 169 |
+
```sql
|
| 170 |
+
-- For forums
|
| 171 |
+
database/sql/init_output_table.sql
|
| 172 |
+
|
| 173 |
+
-- For social media comments
|
| 174 |
+
database/sql/init_comments_output_table.sql
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
## Usage
|
| 178 |
+
|
| 179 |
+
### Process All Sources (Default)
|
| 180 |
+
|
| 181 |
+
```bash
|
| 182 |
+
python main.py
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
### Process Forums Only
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
python main.py --data-source forums
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
### Process Social Media Comments Only
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
python main.py --data-source comments
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
### Process Limited Number
|
| 198 |
+
|
| 199 |
+
```bash
|
| 200 |
+
python main.py --limit 100
|
| 201 |
+
python main.py --data-source comments --limit 50
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
### Sequential Processing (Debug Mode)
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
python main.py --limit 50 --sequential
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
### First Run (Overwrite Mode)
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
python main.py --overwrite --limit 100
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### Command-Line Arguments
|
| 217 |
+
|
| 218 |
+
| Argument | Description | Default |
|
| 219 |
+
|----------|-------------|---------|
|
| 220 |
+
| `--limit N` | Process only N items per source | All unprocessed |
|
| 221 |
+
| `--overwrite` | Overwrite existing table | Append mode |
|
| 222 |
+
| `--sequential` | Single-threaded processing | Parallel |
|
| 223 |
+
| `--config-dir PATH` | Custom config directory | config_files/ |
|
| 224 |
+
| `--data-source SOURCE` | Source to process: `forums`, `comments`, `all` | `all` |
|
| 225 |
+
|
| 226 |
+
## Configuration
|
| 227 |
+
|
| 228 |
+
### brand_config.json
|
| 229 |
+
|
| 230 |
+
Key sections:
|
| 231 |
+
|
| 232 |
+
```json
|
| 233 |
+
{
|
| 234 |
+
"brand": {
|
| 235 |
+
"name": "Sabian",
|
| 236 |
+
"products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
|
| 237 |
+
"product_aliases": {
|
| 238 |
+
"b8": "B8X",
|
| 239 |
+
"hand hammered": "HH"
|
| 240 |
+
},
|
| 241 |
+
"competitor_products_warning": {
|
| 242 |
+
"paiste_products": ["2002", "signature", "sound edge", "formula 602"],
|
| 243 |
+
"zildjian_products": ["k custom", "a custom", "k zildjian"]
|
| 244 |
+
},
|
| 245 |
+
"competitors": [...]
|
| 246 |
+
},
|
| 247 |
+
"data_sources": {
|
| 248 |
+
"forums": {
|
| 249 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
|
| 250 |
+
"platform": "musora_forums"
|
| 251 |
+
},
|
| 252 |
+
"comments": {
|
| 253 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
|
| 254 |
+
"platform_column": "PLATFORM"
|
| 255 |
+
}
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
### analysis_categories.json
|
| 261 |
+
|
| 262 |
+
Defines valid values for all categorical fields:
|
| 263 |
+
|
| 264 |
+
- `author_role`: current_owner, past_owner, potential_buyer, never_owned, unknown
|
| 265 |
+
- `sabian_mention_context`: primary_focus, significant_mention, casual_mention, comparison_context
|
| 266 |
+
- `sentiment_level`: very_negative, negative, neutral, positive, very_positive
|
| 267 |
+
- `intents`: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
|
| 268 |
+
- `feedback_aspects`: sound_quality, price_value, durability, playability, versatility, customer_service, availability, aesthetics
|
| 269 |
+
|
| 270 |
+
## Output Tables
|
| 271 |
+
|
| 272 |
+
### Forum Output: `SABIAN_BRAND_ANALYSIS`
|
| 273 |
+
|
| 274 |
+
| Category | Key Columns |
|
| 275 |
+
|----------|-------------|
|
| 276 |
+
| **Identifiers** | POST_ID, THREAD_ID, POST_AUTHOR_ID, PLATFORM |
|
| 277 |
+
| **Content** | ORIGINAL_CONTENT, CLEANED_CONTENT, QUOTED_CONTENT, THREAD_CONTEXT_SUMMARY |
|
| 278 |
+
| **Thread** | THREAD_TITLE, THREAD_FIRST_POST, POST_CREATED_AT, THREAD_STARTED_AT |
|
| 279 |
+
| **Category** | CATEGORY_TITLE, CATEGORY_TOPIC |
|
| 280 |
+
|
| 281 |
+
### Comment Output: `SABIAN_BRAND_ANALYSIS_COMMENTS`
|
| 282 |
+
|
| 283 |
+
| Category | Key Columns |
|
| 284 |
+
|----------|-------------|
|
| 285 |
+
| **Identifiers** | COMMENT_SK, COMMENT_ID, PLATFORM, AUTHOR_NAME, AUTHOR_ID |
|
| 286 |
+
| **Content** | ORIGINAL_TEXT, COMMENT_TIMESTAMP |
|
| 287 |
+
| **Context** | CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT |
|
| 288 |
+
| **Channel** | CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME |
|
| 289 |
+
|
| 290 |
+
### Shared Analysis Columns (Both Tables)
|
| 291 |
+
|
| 292 |
+
| Category | Fields | Notes |
|
| 293 |
+
|----------|--------|-------|
|
| 294 |
+
| **Language** | DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH | Language detection |
|
| 295 |
+
| **Relevance** | IS_RELEVANT, RELEVANCE_CONFIDENCE, RELEVANCE_REASON | Brand relevance |
|
| 296 |
+
| **Extraction** | PRODUCTS_MENTIONED, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT | From Agent 1 |
|
| 297 |
+
| **Sentiment** | SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_CONFIDENCE | Sabian-specific |
|
| 298 |
+
| **Intents** | INTENTS (multi-label) | What author is trying to accomplish |
|
| 299 |
+
| **Journey** | PURCHASE_STAGE, DECISION_DRIVERS | Author perspective only |
|
| 300 |
+
| **Feedback** | PAIN_POINTS, DELIGHT_FACTORS | Author's own experience |
|
| 301 |
+
| **Competitive** | COMPETITORS_MENTIONED, COMPETITOR_PRODUCTS_OWNED, COMPARISON_TYPE | Competitive intel |
|
| 302 |
+
| **Validation** | VALIDATION_FLAGS, PROCESSING_STATUS | Anomaly detection |
|
| 303 |
+
|
| 304 |
+
### Processing Status Values
|
| 305 |
+
|
| 306 |
+
| Status | Description |
|
| 307 |
+
|--------|-------------|
|
| 308 |
+
| `completed` | Successfully processed, no issues |
|
| 309 |
+
| `completed_with_flags` | Processed but has anomalies to review |
|
| 310 |
+
| `validation_failed` | Validation errors detected |
|
| 311 |
+
| `workflow_error` | Unexpected error during processing |
|
| 312 |
+
|
| 313 |
+
### Available Views
|
| 314 |
+
|
| 315 |
+
#### Forum Views
|
| 316 |
+
|
| 317 |
+
| View | Description |
|
| 318 |
+
|------|-------------|
|
| 319 |
+
| `VW_SABIAN_RELEVANT_ANALYSIS` | Only relevant, successfully processed posts |
|
| 320 |
+
| `VW_SABIAN_FLAGGED_POSTS` | Posts with validation flags for review |
|
| 321 |
+
| `VW_SABIAN_SENTIMENT_DISTRIBUTION` | Sentiment breakdown statistics |
|
| 322 |
+
| `VW_SABIAN_PRODUCT_MENTIONS` | Product mention summary |
|
| 323 |
+
| `VW_SABIAN_COMPETITOR_ANALYSIS` | Competitor comparison analysis |
|
| 324 |
+
| `VW_SABIAN_PAIN_POINTS` | Pain point frequency analysis |
|
| 325 |
+
| `VW_SABIAN_AUTHOR_ROLES` | Author role distribution |
|
| 326 |
+
| `VW_SABIAN_COMPETITOR_OWNERSHIP` | Competitor brands owned by authors |
|
| 327 |
+
| `VW_SABIAN_VALIDATION_SUMMARY` | Processing status breakdown |
|
| 328 |
+
|
| 329 |
+
#### Comment Views
|
| 330 |
+
|
| 331 |
+
| View | Description |
|
| 332 |
+
|------|-------------|
|
| 333 |
+
| `VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS` | Relevant, successful comments |
|
| 334 |
+
| `VW_SABIAN_COMMENTS_FLAGGED` | Comments with validation flags |
|
| 335 |
+
| `VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION` | Sentiment by platform |
|
| 336 |
+
| `VW_SABIAN_COMMENTS_PRODUCT_MENTIONS` | Product mentions by platform |
|
| 337 |
+
| `VW_SABIAN_COMMENTS_VALIDATION_SUMMARY` | Processing status by platform |
|
| 338 |
+
|
| 339 |
+
## API Call Efficiency
|
| 340 |
+
|
| 341 |
+
| Scenario | LLM Calls | Notes |
|
| 342 |
+
|----------|-----------|-------|
|
| 343 |
+
| No keywords found | 0 | Early exit in preprocessor |
|
| 344 |
+
| Primary keywords, relevant | 2 | Extraction + Analysis |
|
| 345 |
+
| Primary keywords, not relevant | 1 | Only Extraction |
|
| 346 |
+
| Non-English content | 0 | Skipped |
|
| 347 |
+
|
| 348 |
+
## Key Design Decisions
|
| 349 |
+
|
| 350 |
+
### Why Separate Forum and Comment Preprocessors?
|
| 351 |
+
|
| 352 |
+
1. **Different input formats**: Forums use HTML (quotes, blockquotes), comments are plain text
|
| 353 |
+
2. **Different context sources**: Forums have thread title + first post + category; comments have content title + description + parent comment
|
| 354 |
+
3. **Shared analysis**: Both feed into the same extraction and analysis agents
|
| 355 |
+
|
| 356 |
+
### Why Separate Output Tables?
|
| 357 |
+
|
| 358 |
+
1. **Different identifiers**: Forums use POST_ID/THREAD_ID; comments use COMMENT_SK/COMMENT_ID/PLATFORM
|
| 359 |
+
2. **Different metadata**: Forums have thread context; comments have content/channel metadata
|
| 360 |
+
3. **Clean separation**: Avoids NULL columns and schema confusion
|
| 361 |
+
4. **Shared analysis columns**: All extracted intelligence fields are identical
|
| 362 |
+
|
| 363 |
+
### Why Platform Column for Forums?
|
| 364 |
+
|
| 365 |
+
The `PLATFORM` column was added to `SABIAN_BRAND_ANALYSIS` (defaulting to `musora_forums`) to enable cross-source analysis and maintain consistency with the comments table which uses the dynamic platform value from the source data.
|
| 366 |
+
|
| 367 |
+
## Troubleshooting
|
| 368 |
+
|
| 369 |
+
### "Table does not exist" on First Run
|
| 370 |
+
|
| 371 |
+
Run the appropriate init SQL in Snowflake first:
|
| 372 |
+
- Forums: `database/sql/init_output_table.sql`
|
| 373 |
+
- Comments: `database/sql/init_comments_output_table.sql`
|
| 374 |
+
|
| 375 |
+
### No Comments Being Processed
|
| 376 |
+
|
| 377 |
+
Check that `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` table exists and contains data. The query joins with `DIM_CONTENT` and `DIM_CHANNEL` - verify these dimension tables have matching records.
|
| 378 |
+
|
| 379 |
+
### Competitor Products Attributed to Sabian
|
| 380 |
+
|
| 381 |
+
Check `brand_config.json` for `competitor_products_warning` section. Add any missing competitor products.
|
| 382 |
+
|
| 383 |
+
### API Rate Limits
|
| 384 |
+
|
| 385 |
+
Use `--sequential` mode or reduce `--limit`:
|
| 386 |
+
```bash
|
| 387 |
+
python main.py --sequential --limit 50
|
| 388 |
+
```
|
| 389 |
+
|
| 390 |
+
## Schema Version History
|
| 391 |
+
|
| 392 |
+
| Version | Changes |
|
| 393 |
+
|---------|---------|
|
| 394 |
+
| 1.0 | Initial release |
|
| 395 |
+
| 2.0 | Added author_role, post_type, sabian_mention_context |
|
| 396 |
+
| 3.0 | Removed post_type (merged into intents), unified feedback_aspects |
|
| 397 |
+
| 4.0 | 4-agent pipeline, thread_context_summary, validation flags, product aliases |
|
| 398 |
+
| 4.0+ | Added social media comments support, PLATFORM column, separate comment output table |
|
| 399 |
+
|
| 400 |
+
## License
|
| 401 |
+
|
| 402 |
+
Internal use only - Brand sentiment analysis project.
|
processing_brand_sentiment/config_files/analysis_categories.json
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"author_role": {
|
| 3 |
+
"description": "Author's relationship to Sabian products",
|
| 4 |
+
"categories": [
|
| 5 |
+
{"value": "current_owner", "description": "Currently owns/uses Sabian"},
|
| 6 |
+
{"value": "past_owner", "description": "Previously owned, sold/replaced"},
|
| 7 |
+
{"value": "potential_buyer", "description": "Considering purchasing Sabian"},
|
| 8 |
+
{"value": "never_owned", "description": "Explicitly doesn't own Sabian"},
|
| 9 |
+
{"value": "unknown", "description": "Cannot determine from post"}
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
"sabian_mention_context": {
|
| 13 |
+
"description": "How prominently Sabian is discussed",
|
| 14 |
+
"categories": [
|
| 15 |
+
{"value": "primary_focus", "description": "Sabian is the main topic"},
|
| 16 |
+
{"value": "significant_mention", "description": "Discussed with detail, not main focus"},
|
| 17 |
+
{"value": "casual_mention", "description": "Brief mention among other topics"},
|
| 18 |
+
{"value": "comparison_context", "description": "Mentioned while comparing to competitors"}
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"sentiment": {
|
| 22 |
+
"brand_specific": true,
|
| 23 |
+
"description": "Sentiment TOWARDS SABIAN ONLY (not overall post tone)",
|
| 24 |
+
"levels": [
|
| 25 |
+
{"value": "very_negative", "description": "Strong criticism, anger, severe disappointment"},
|
| 26 |
+
{"value": "negative", "description": "Complaints, dissatisfaction, mild criticism"},
|
| 27 |
+
{"value": "neutral", "description": "Factual mention, balanced, no clear sentiment"},
|
| 28 |
+
{"value": "positive", "description": "Satisfaction, appreciation, mild praise"},
|
| 29 |
+
{"value": "very_positive", "description": "Enthusiasm, strong praise, highly recommend"}
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
"emotions": {
|
| 33 |
+
"brand_specific": true,
|
| 34 |
+
"description": "Emotion towards SABIAN specifically",
|
| 35 |
+
"categories": [
|
| 36 |
+
{"value": "frustration", "description": "Annoyance with product issues"},
|
| 37 |
+
{"value": "disappointment", "description": "Unmet expectations"},
|
| 38 |
+
{"value": "anger", "description": "Strong negative emotion"},
|
| 39 |
+
{"value": "satisfaction", "description": "Expectations met, content"},
|
| 40 |
+
{"value": "excitement", "description": "Eagerness, anticipation"},
|
| 41 |
+
{"value": "curiosity", "description": "Interest, wanting to know more"},
|
| 42 |
+
{"value": "indifference", "description": "No strong feelings"}
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
"intents": {
|
| 46 |
+
"multi_label": true,
|
| 47 |
+
"description": "What the author is trying to accomplish (can select multiple)",
|
| 48 |
+
"categories": [
|
| 49 |
+
{"value": "seeking_information", "description": "Asking questions, seeking advice/recommendations"},
|
| 50 |
+
{"value": "providing_information", "description": "Answering questions, giving advice, helping others"},
|
| 51 |
+
{"value": "sharing_experience", "description": "Personal experience, review, testimonial, purchase announcement"},
|
| 52 |
+
{"value": "comparing", "description": "Comparing brands/products against each other"},
|
| 53 |
+
{"value": "praising", "description": "Actively endorsing, recommending, advocating for Sabian"},
|
| 54 |
+
{"value": "criticizing", "description": "Actively complaining, warning others, reporting issues"},
|
| 55 |
+
{"value": "buying_selling", "description": "Listing gear for sale, looking to buy/trade"},
|
| 56 |
+
{"value": "general_discussion", "description": "General conversation not fitting above"}
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
"purchase_stage": {
|
| 60 |
+
"author_perspective_only": true,
|
| 61 |
+
"description": "Author's own purchase journey stage (null if giving advice to others)",
|
| 62 |
+
"categories": [
|
| 63 |
+
{"value": "researching", "description": "Gathering info before buying"},
|
| 64 |
+
{"value": "deciding", "description": "Actively comparing, about to decide"},
|
| 65 |
+
{"value": "recently_purchased", "description": "Just bought the product"},
|
| 66 |
+
{"value": "long_term_owner", "description": "Owned for extended period"},
|
| 67 |
+
{"value": "selling_replacing", "description": "Selling or replacing gear"}
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
"comparison_type": {
|
| 71 |
+
"description": "Type of competitive comparison (if comparing)",
|
| 72 |
+
"categories": [
|
| 73 |
+
{"value": "direct_comparison", "description": "Side-by-side evaluation"},
|
| 74 |
+
{"value": "preference_statement", "description": "Stating brand preference"},
|
| 75 |
+
{"value": "switching_to_sabian", "description": "Moving or Moved from competitor to Sabian"},
|
| 76 |
+
{"value": "switching_from_sabian", "description": "Moving or Moved from Sabian to competitor"}
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
"feedback_aspects": {
|
| 80 |
+
"description": "Product/brand aspects discussed. Used for BOTH pain_points (negative) and delight_factors (positive)",
|
| 81 |
+
"categories": [
|
| 82 |
+
{"value": "sound_quality", "description": "Sound, tone, character, audio qualities"},
|
| 83 |
+
{"value": "price_value", "description": "Cost, value for money, deals"},
|
| 84 |
+
{"value": "durability", "description": "Build quality, longevity, cracking/wear"},
|
| 85 |
+
{"value": "playability", "description": "Feel, response, ease of playing"},
|
| 86 |
+
{"value": "versatility", "description": "Range of genres/applications, flexibility"},
|
| 87 |
+
{"value": "customer_service", "description": "Support, warranty, brand interaction"},
|
| 88 |
+
{"value": "availability", "description": "Stock, ease of finding/purchasing"},
|
| 89 |
+
{"value": "aesthetics", "description": "Appearance, finish, visual appeal"}
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
"decision_drivers": {
|
| 93 |
+
"author_perspective_only": true,
|
| 94 |
+
"description": "What influenced AUTHOR's own purchase decision (empty if giving advice)",
|
| 95 |
+
"categories": [
|
| 96 |
+
{"value": "sound_quality", "description": "Sound characteristics"},
|
| 97 |
+
{"value": "price", "description": "Cost/budget considerations"},
|
| 98 |
+
{"value": "durability", "description": "Build quality, longevity"},
|
| 99 |
+
{"value": "artist_endorsement", "description": "Influenced by endorsed artists"},
|
| 100 |
+
{"value": "peer_recommendation", "description": "Friends/community recommended"},
|
| 101 |
+
{"value": "hands_on_testing", "description": "Tried before buying"},
|
| 102 |
+
{"value": "brand_loyalty", "description": "Previous positive experience"},
|
| 103 |
+
{"value": "versatility", "description": "Multi-genre/application use"},
|
| 104 |
+
{"value": "online_reviews", "description": "Read reviews that influenced"}
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
"product_attributes": {
|
| 108 |
+
"description": "Attributes being discussed about Sabian products",
|
| 109 |
+
"categories": [
|
| 110 |
+
{"value": "sound_quality", "description": "Tone, character, audio qualities"},
|
| 111 |
+
{"value": "durability", "description": "Build quality, longevity"},
|
| 112 |
+
{"value": "price", "description": "Cost and value"},
|
| 113 |
+
{"value": "playability", "description": "Feel, response"},
|
| 114 |
+
{"value": "aesthetics", "description": "Appearance, finish"},
|
| 115 |
+
{"value": "volume", "description": "Loudness, projection"},
|
| 116 |
+
{"value": "sustain", "description": "How long sound lasts"},
|
| 117 |
+
{"value": "versatility", "description": "Range of applications"}
|
| 118 |
+
]
|
| 119 |
+
},
|
| 120 |
+
"analysis_notes_guidelines": {
|
| 121 |
+
"description": "Keep to 1-2 sentences. Focus on Sabian-specific insights not captured by other fields."
|
| 122 |
+
}
|
| 123 |
+
}
|
processing_brand_sentiment/config_files/brand_config.json
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"brand": {
|
| 3 |
+
"name": "Sabian",
|
| 4 |
+
"description": "Sabian is a Canadian manufacturer of cymbals founded in 1981",
|
| 5 |
+
"products": [
|
| 6 |
+
"HHX",
|
| 7 |
+
"AAX",
|
| 8 |
+
"Artisan",
|
| 9 |
+
"FRX",
|
| 10 |
+
"Omni",
|
| 11 |
+
"Chopper",
|
| 12 |
+
"Stratus",
|
| 13 |
+
"XSR",
|
| 14 |
+
"B8X",
|
| 15 |
+
"SBR"
|
| 16 |
+
],
|
| 17 |
+
"product_aliases": {
|
| 18 |
+
"b8": "B8X",
|
| 19 |
+
"sbrs": "SBR",
|
| 20 |
+
"hhxs": "HHX",
|
| 21 |
+
"aaxs": "AAX",
|
| 22 |
+
"hhx's": "HHX",
|
| 23 |
+
"aax's": "AAX"
|
| 24 |
+
},
|
| 25 |
+
"product_descriptions": {
|
| 26 |
+
"HHX": "Hand Hammered Xtreme - Professional series with dark, complex tones",
|
| 27 |
+
"AAX": "Bright, cutting cymbals for modern music",
|
| 28 |
+
"Artisan": "Premium hand-crafted cymbals with unique character",
|
| 29 |
+
"FRX": "Frequency Reduced Xtreme - Lower volume cymbals",
|
| 30 |
+
"Omni": "Multi-purpose cymbals for various playing styles",
|
| 31 |
+
"Chopper": "Effect cymbals with unique sound",
|
| 32 |
+
"Stratus": "Dark, complex sounds for jazz and fusion",
|
| 33 |
+
"XSR": "Entry-level professional cymbals",
|
| 34 |
+
"B8X": "Bronze entry-level cymbals",
|
| 35 |
+
"SBR": "Entry-level brass cymbals"
|
| 36 |
+
},
|
| 37 |
+
"competitor_products_warning": {
|
| 38 |
+
"description": "Products that belong to competitors - DO NOT attribute to Sabian",
|
| 39 |
+
"paiste_products": ["2002", "signature", "sound edge", "formula 602", "giant beat", "pst", "rude", "masters", "traditionals", "twenty", "dark energy"],
|
| 40 |
+
"zildjian_products": ["k custom", "a custom", "k zildjian", "a zildjian", "s family", "i family", "l80", "kerope", "constantinople", "k sweet"],
|
| 41 |
+
"meinl_products": ["byzance", "pure alloy", "hcs", "classics custom", "mb20", "mb10", "soundcaster"],
|
| 42 |
+
"dream_products": ["bliss", "contact", "energy", "dark matter", "vintage bliss", "eclipse"],
|
| 43 |
+
"istanbul_products": ["agop", "xist", "traditional", "sultan", "mehmet"]
|
| 44 |
+
},
|
| 45 |
+
"competitors": [
|
| 46 |
+
{
|
| 47 |
+
"name": "Zildjian",
|
| 48 |
+
"aliases": ["zildjian", "zil", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "Meinl",
|
| 52 |
+
"aliases": ["meinl", "byzance", "classics"]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "Paiste",
|
| 56 |
+
"aliases": ["paiste", "2002", "signature", "formula 602", "sound edge"]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"name": "Dream Cymbals",
|
| 60 |
+
"aliases": ["dream", "dream cymbals", "bliss"]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "Istanbul Agop",
|
| 64 |
+
"aliases": ["istanbul", "agop", "istanbul agop", "istanbul mehmet"]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "Bosphorus",
|
| 68 |
+
"aliases": ["bosphorus"]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"relevance_keywords": {
|
| 73 |
+
"primary": {
|
| 74 |
+
"description": "Keywords that definitively indicate Sabian content",
|
| 75 |
+
"keywords": ["sabian", "hhx", "aax", "artisan", "frx", "omni", "chopper", "stratus", "xsr", "b8x", "sbr"]
|
| 76 |
+
},
|
| 77 |
+
"contextual": {
|
| 78 |
+
"description": "Ambiguous keywords that need context verification",
|
| 79 |
+
"keywords": ["b8"]
|
| 80 |
+
},
|
| 81 |
+
"cymbal_context": {
|
| 82 |
+
"description": "Keywords that provide cymbal-related context for disambiguation",
|
| 83 |
+
"keywords": ["cymbal", "cymbals", "crash", "ride", "hi-hat", "hihat", "hi hat", "splash", "china", "bell", "stack", "effects"]
|
| 84 |
+
}
|
| 85 |
+
},
|
| 86 |
+
"preprocessing": {
|
| 87 |
+
"min_length_for_language_detection": 50,
|
| 88 |
+
"default_language_for_short_text": "English",
|
| 89 |
+
"always_process_if_primary_keyword": true,
|
| 90 |
+
"min_content_length": 3
|
| 91 |
+
},
|
| 92 |
+
"filter_conditions": {
|
| 93 |
+
"exclude_access_levels": ["team", "house-coach"],
|
| 94 |
+
"exclude_post_states": ["deleted", "spam"],
|
| 95 |
+
"require_content_length_min": 3
|
| 96 |
+
},
|
| 97 |
+
"data_sources": {
|
| 98 |
+
"forums": {
|
| 99 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
|
| 100 |
+
"description": "Forum posts mentioning Sabian and their products",
|
| 101 |
+
"sql_query_file": "database/sql/fetch_forum_posts.sql",
|
| 102 |
+
"platform": "musora_forums"
|
| 103 |
+
},
|
| 104 |
+
"comments": {
|
| 105 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
|
| 106 |
+
"description": "Social media comments potentially related to Sabian brand",
|
| 107 |
+
"sql_query_file": "database/sql/fetch_comments.sql",
|
| 108 |
+
"platform_column": "PLATFORM"
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
processing_brand_sentiment/config_files/workflow_config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm": {
|
| 3 |
+
"default_model": "gpt-5-nano",
|
| 4 |
+
"default_temperature": 0.2,
|
| 5 |
+
"max_retries": 3,
|
| 6 |
+
"timeout": 60
|
| 7 |
+
},
|
| 8 |
+
"agents": {
|
| 9 |
+
"preprocessor": {
|
| 10 |
+
"name": "PreprocessorAgent",
|
| 11 |
+
"description": "Deterministic agent for HTML parsing, text cleaning, language detection",
|
| 12 |
+
"model": "gpt-5-nano",
|
| 13 |
+
"temperature": 0.0,
|
| 14 |
+
"uses_llm": false
|
| 15 |
+
},
|
| 16 |
+
"relevance_validator": {
|
| 17 |
+
"name": "RelevanceValidatorAgent",
|
| 18 |
+
"description": "Lightweight LLM for disambiguation of ambiguous terms (HH, AA)",
|
| 19 |
+
"model": "gpt-5-nano",
|
| 20 |
+
"temperature": 0.0,
|
| 21 |
+
"max_retries": 2
|
| 22 |
+
},
|
| 23 |
+
"brand_analyzer": {
|
| 24 |
+
"name": "SabianAnalyzerAgent",
|
| 25 |
+
"description": "Comprehensive brand analysis for Sabian products",
|
| 26 |
+
"model": "gpt-5-nano",
|
| 27 |
+
"temperature": 0.2,
|
| 28 |
+
"max_retries": 3
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"workflow": {
|
| 32 |
+
"parallel_processing": {
|
| 33 |
+
"enabled": true,
|
| 34 |
+
"worker_calculation": "CPU count - 2, max 5 workers",
|
| 35 |
+
"max_workers": 5,
|
| 36 |
+
"min_batch_size": 20,
|
| 37 |
+
"max_batch_size": 500
|
| 38 |
+
},
|
| 39 |
+
"thread_context": {
|
| 40 |
+
"enabled": true,
|
| 41 |
+
"include_thread_title": true,
|
| 42 |
+
"include_first_post": true
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"output": {
|
| 46 |
+
"table_name": "SABIAN_BRAND_ANALYSIS",
|
| 47 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 48 |
+
"schema": "ML_FEATURES"
|
| 49 |
+
},
|
| 50 |
+
"comments_output": {
|
| 51 |
+
"table_name": "SABIAN_BRAND_ANALYSIS_COMMENTS",
|
| 52 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 53 |
+
"schema": "ML_FEATURES"
|
| 54 |
+
},
|
| 55 |
+
"logging": {
|
| 56 |
+
"level": "INFO",
|
| 57 |
+
"log_directory": "logs",
|
| 58 |
+
"log_file_prefix": "brand_sentiment_processing"
|
| 59 |
+
}
|
| 60 |
+
}
|
processing_brand_sentiment/database/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database module for brand sentiment analysis.
|
| 3 |
+
Contains Snowflake connection handler and SQL query utilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .snowflake_connection import SnowFlakeConn
|
| 7 |
+
|
| 8 |
+
__all__ = ['SnowFlakeConn']
|
processing_brand_sentiment/database/snowflake_connection.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Snowflake connection handler for brand sentiment analysis.
|
| 3 |
+
Provides methods for reading data, executing queries, and storing results.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from snowflake.snowpark import Session
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import logging
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from typing import Optional, List, Any
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# Load environment variables
|
| 16 |
+
load_dotenv()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SnowFlakeConn:
    """
    Handles Snowflake database connections and operations for brand sentiment analysis.

    A single Snowpark session is created at construction time from environment
    variables (loaded via dotenv at module import) and is reused by every
    method until close_connection() is called.
    """

    def __init__(self):
        """Initialize Snowflake connection."""
        # One long-lived session shared by all methods of this instance.
        self.session = self.connect_to_snowflake()

    def connect_to_snowflake(self) -> "Session":
        """
        Create a connection to Snowflake using environment variables.

        Returns:
            Snowflake Session object

        Raises:
            Exception: Propagated from the Snowpark session builder when
                credentials are missing or invalid.
        """
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        logger.info("Successfully connected to Snowflake")
        return session

    def get_credential(self, key: str) -> Optional[str]:
        """
        Get credential from environment variables.

        Args:
            key: Environment variable name

        Returns:
            Credential value, or None if the variable is unset.
        """
        value = os.getenv(key)
        if value is None:
            # Surface misconfiguration here instead of failing opaquely
            # inside the Snowpark session builder.
            logger.warning(f"Environment variable '{key}' is not set")
        return value

    def run_read_query(self, query: str, description: str = "data") -> pd.DataFrame:
        """
        Execute a SQL query that fetches data.

        Args:
            query: SQL query string
            description: Description of what data is being fetched

        Returns:
            Pandas DataFrame containing query results, with column names
            lowercased for consistent downstream access.

        Raises:
            Exception: Re-raised after logging if the query fails.
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Snowflake returns upper-case column names; normalize to lower case.
            dataframe.columns = dataframe.columns.str.lower()
            logger.info(f"Successfully read {len(dataframe)} rows for {description}")
            return dataframe
        except Exception as e:
            logger.error(f"Error reading {description}: {e}")
            raise

    def store_df_to_snowflake(
        self,
        table_name: str,
        dataframe: pd.DataFrame,
        database: str = "SOCIAL_MEDIA_DB",
        schema: str = "ML_FEATURES",
        overwrite: bool = False
    ) -> None:
        """
        Store a DataFrame to Snowflake.

        Args:
            table_name: Target table name
            dataframe: DataFrame to store
            database: Target database
            schema: Target schema
            overwrite: If True, overwrite existing data; if False, append

        Raises:
            Exception: Re-raised after logging if the write fails.
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            # Drop any stale index and upper-case columns: Snowflake
            # identifiers are upper-case by convention.
            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(
                df=dataframe,
                table_name=table_name.strip().upper(),
                auto_create_table=True,
                overwrite=overwrite,
                use_logical_type=True
            )
            logger.info(f"Successfully stored {len(dataframe)} rows to {table_name}")

        except Exception as e:
            logger.error(f"Error storing data to {table_name}: {e}")
            raise

    def execute_sql_file(self, file_path: str) -> Optional[List[Any]]:
        """
        Execute SQL queries from a file.

        NOTE(review): session.sql() executes a single statement; a file
        containing multiple ;-separated statements may fail — confirm that
        callers only pass single-statement files.

        Args:
            file_path: Path to SQL file

        Returns:
            Query result rows, or None if execution failed (best-effort:
            errors are logged, not raised).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            logger.info(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            logger.error(f"Error executing SQL file {file_path}: {e}")
            return None

    def execute_query(self, query: str, description: str = "query") -> Optional[List[Any]]:
        """
        Execute a SQL query and return results.

        Args:
            query: SQL query string
            description: Description of the query for logging

        Returns:
            Query result rows, or None if execution failed (best-effort:
            errors are logged, not raised).
        """
        try:
            result = self.session.sql(query).collect()
            logger.info(f"Successfully executed {description}")
            return result
        except Exception as e:
            logger.error(f"Error executing {description}: {e}")
            return None

    def _read_query_file(self, sql_file_path: str, limit: Optional[int] = None) -> str:
        """
        Read a SQL query from a file, optionally appending a LIMIT clause.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional row limit to append to the query

        Returns:
            Query text, with "LIMIT {limit};" appended when limit is truthy.
        """
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            query = f.read()

        if limit:
            # Strip whitespace first, then semicolon, to handle Windows line endings
            query = query.strip().rstrip(';') + f"\nLIMIT {limit};"
        return query

    def _fetch_from_sql_file(
        self,
        sql_file_path: str,
        description: str,
        required_cols: List[str],
        error_label: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Shared implementation for the fetch_* helpers: load a query file,
        run it, and warn about missing expected columns.

        Args:
            sql_file_path: Path to the SQL query file
            description: Label used in read-success logging
            required_cols: Lower-case column names the result should contain
            error_label: Label used in error logging
            limit: Optional limit on number of rows to fetch

        Returns:
            DataFrame of query results (columns lowercased).

        Raises:
            Exception: Re-raised after logging on any failure.
        """
        try:
            query = self._read_query_file(sql_file_path, limit)
            df = self.run_read_query(query, description)

            # Missing columns are logged but not fatal, so partial schemas
            # can still be inspected downstream.
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                logger.warning(f"Missing expected columns: {missing_cols}")

            return df

        except Exception as e:
            logger.error(f"Error fetching {error_label}: {e}")
            raise

    def fetch_forum_posts_with_context(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch forum posts with thread context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of posts to fetch

        Returns:
            DataFrame containing forum posts with context
        """
        return self._fetch_from_sql_file(
            sql_file_path,
            description="forum posts with context",
            required_cols=['post_id', 'post_content', 'thread_id'],
            error_label="forum posts",
            limit=limit,
        )

    def fetch_comments(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch social media comments with context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of comments to fetch

        Returns:
            DataFrame containing comments with context
        """
        return self._fetch_from_sql_file(
            sql_file_path,
            description="social media comments with context",
            required_cols=['comment_sk', 'comment_id', 'comment_text', 'platform'],
            error_label="comments",
            limit=limit,
        )

    def close_connection(self) -> None:
        """Close the Snowflake session (errors are logged, not raised)."""
        try:
            self.session.close()
            logger.info("Snowflake connection closed")
        except Exception as e:
            logger.error(f"Error closing connection: {e}")
|
processing_brand_sentiment/database/sql/create_comments_output_table.sql
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Create the output table for Sabian brand sentiment analysis on social media comments
-- Stores processed comments with extracted brand intelligence
-- Schema Version 4.0: Same analysis fields as forum table, different source identifiers
-- NOTE: JSON-array fields are stored as VARCHAR; consumers parse them with
-- TRY_PARSE_JSON (see the companion views below in this file).
-- NOTE(review): no PRIMARY KEY is declared — COMMENT_SK is presumably unique
-- upstream; confirm before relying on it for joins/dedup.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(), -- set at insert time
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
|
| 79 |
+
|
| 80 |
+
-- Indexes intentionally omitted.
-- Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes exist only for hybrid tables), so the former per-column
-- "CREATE INDEX IF NOT EXISTS ..." statements here failed at runtime.
-- Snowflake prunes micro-partitions automatically for the common filter
-- patterns (COMMENT_SK, PLATFORM, IS_RELEVANT, SENTIMENT_LEVEL,
-- PROCESSED_AT, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT, PROCESSING_STATUS).
-- If scan performance on large volumes becomes an issue, define a
-- clustering key instead, e.g.:
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
--       CLUSTER BY (PLATFORM, PROCESSED_AT);
|
| 89 |
+
|
| 90 |
+
-- Create view for relevant comments only
-- Downstream consumers should read from this view rather than re-stating
-- the relevance/success predicates themselves.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
AND PROCESSING_SUCCESS = TRUE;

-- Create view for comments needing review (flagged by validator)
-- Includes rows either explicitly flagged (PROCESSING_STATUS) or that
-- failed validation outright (VALIDATION_PASSED = FALSE).
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_FLAGGED AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    PLATFORM,
    ORIGINAL_TEXT,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE PROCESSING_STATUS = 'completed_with_flags'
OR VALIDATION_PASSED = FALSE
ORDER BY PROCESSED_AT DESC;

-- Create view for sentiment distribution
-- Aggregated over relevant, successfully processed comments only.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION AS
SELECT
    PLATFORM,
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
AND PROCESSING_SUCCESS = TRUE
GROUP BY PLATFORM, SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY COMMENT_COUNT DESC;

-- Create view for product mentions summary
-- PRODUCTS_MENTIONED is a VARCHAR holding a JSON array; TRY_PARSE_JSON
-- yields NULL for malformed JSON, so such rows produce no FLATTEN output
-- and are silently excluded from the counts.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_PRODUCT_MENTIONS AS
SELECT
    PLATFORM,
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS,
LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
AND PROCESSING_SUCCESS = TRUE
AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY PLATFORM, TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;

-- Create view for validation summary
-- Unlike the views above, this one covers ALL rows (no relevance/success
-- filter) so failures are visible in the summary.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_VALIDATION_SUMMARY AS
SELECT
    PLATFORM,
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = TRUE THEN 1 END) AS RELEVANT_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = FALSE THEN 1 END) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
GROUP BY PLATFORM, PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY COMMENT_COUNT DESC;
|
processing_brand_sentiment/database/sql/create_output_table.sql
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Create the output table for Sabian brand sentiment analysis
|
| 2 |
+
-- Stores processed forum posts with extracted brand intelligence
|
| 3 |
+
-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status
|
| 4 |
+
|
| 5 |
+
CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
|
| 6 |
+
-- Source identifiers
|
| 7 |
+
POST_ID NUMBER(38,0) PRIMARY KEY,
|
| 8 |
+
THREAD_ID NUMBER(38,0),
|
| 9 |
+
POST_AUTHOR_ID NUMBER(38,0),
|
| 10 |
+
|
| 11 |
+
-- Original and processed content
|
| 12 |
+
ORIGINAL_CONTENT VARCHAR(16777216),
|
| 13 |
+
CLEANED_CONTENT VARCHAR(16777216),
|
| 14 |
+
QUOTED_CONTENT VARCHAR(16777216),
|
| 15 |
+
THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
|
| 16 |
+
THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context for analysis
|
| 17 |
+
|
| 18 |
+
-- Thread metadata
|
| 19 |
+
THREAD_TITLE VARCHAR(16777216),
|
| 20 |
+
THREAD_FIRST_POST VARCHAR(16777216),
|
| 21 |
+
|
| 22 |
+
-- Timestamps
|
| 23 |
+
POST_CREATED_AT TIMESTAMP_LTZ(9),
|
| 24 |
+
THREAD_STARTED_AT TIMESTAMP_LTZ(9),
|
| 25 |
+
|
| 26 |
+
-- Category information
|
| 27 |
+
CATEGORY_TITLE VARCHAR(16777216),
|
| 28 |
+
CATEGORY_TOPIC VARCHAR(16777216),
|
| 29 |
+
|
| 30 |
+
-- Language detection
|
| 31 |
+
DETECTED_LANGUAGE VARCHAR(100),
|
| 32 |
+
LANGUAGE_CODE VARCHAR(10),
|
| 33 |
+
IS_ENGLISH BOOLEAN,
|
| 34 |
+
|
| 35 |
+
-- Relevance assessment
|
| 36 |
+
IS_RELEVANT BOOLEAN,
|
| 37 |
+
RELEVANCE_CONFIDENCE VARCHAR(20),
|
| 38 |
+
RELEVANCE_REASON VARCHAR(500),
|
| 39 |
+
|
| 40 |
+
-- Author classification
|
| 41 |
+
AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
|
| 42 |
+
SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context
|
| 43 |
+
|
| 44 |
+
-- Sentiment analysis
|
| 45 |
+
SENTIMENT_LEVEL VARCHAR(20),
|
| 46 |
+
EMOTION_TYPE VARCHAR(50),
|
| 47 |
+
SENTIMENT_TARGET VARCHAR(50),
|
| 48 |
+
SENTIMENT_CONFIDENCE VARCHAR(20),
|
| 49 |
+
|
| 50 |
+
-- Product information (stored as JSON arrays)
|
| 51 |
+
PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
|
| 52 |
+
PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
|
| 53 |
+
PURCHASE_STAGE VARCHAR(50),
|
| 54 |
+
|
| 55 |
+
-- Competitive intelligence
|
| 56 |
+
COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
|
| 57 |
+
COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
|
| 58 |
+
COMPARISON_TYPE VARCHAR(50),
|
| 59 |
+
COMPETITIVE_POSITIONING VARCHAR(500),
|
| 60 |
+
BRAND_SWITCHING VARCHAR(100),
|
| 61 |
+
|
| 62 |
+
-- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
|
| 63 |
+
INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
|
| 64 |
+
DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
|
| 65 |
+
PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
|
| 66 |
+
DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
|
| 67 |
+
|
| 68 |
+
-- Analysis notes
|
| 69 |
+
ANALYSIS_NOTES VARCHAR(16777216),
|
| 70 |
+
SARCASM_DETECTED BOOLEAN,
|
| 71 |
+
|
| 72 |
+
-- Validation results (NEW v4.0)
|
| 73 |
+
VALIDATION_PASSED BOOLEAN,
|
| 74 |
+
VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
|
| 75 |
+
VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
|
| 76 |
+
VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags (e.g., "sarcasm_detected", "low_confidence_relevant")
|
| 77 |
+
|
| 78 |
+
-- Platform identifier
|
| 79 |
+
PLATFORM VARCHAR(50) DEFAULT 'musora_forums',
|
| 80 |
+
|
| 81 |
+
-- Processing metadata
|
| 82 |
+
PROCESSING_SUCCESS BOOLEAN,
|
| 83 |
+
PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
|
| 84 |
+
PROCESSING_ERRORS VARCHAR(16777216),
|
| 85 |
+
PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
|
| 86 |
+
WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
|
| 87 |
+
)
|
| 88 |
+
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: 4-agent pipeline with extraction/analysis separation, thread context summarization, and validation.';
|
| 89 |
+
|
| 90 |
+
-- Indexes intentionally omitted.
-- Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes exist only for hybrid tables), so the former per-column
-- "CREATE INDEX IF NOT EXISTS ..." statements here failed at runtime.
-- Snowflake prunes micro-partitions automatically for the common filter
-- patterns (THREAD_ID, IS_RELEVANT, SENTIMENT_LEVEL, PROCESSED_AT,
-- AUTHOR_ROLE, SABIAN_MENTION_CONTEXT, PROCESSING_STATUS,
-- VALIDATION_PASSED). If scan performance on large volumes becomes an
-- issue, define a clustering key instead, e.g.:
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
--       CLUSTER BY (THREAD_ID, PROCESSED_AT);
|
| 99 |
+
|
| 100 |
+
-- Create view for relevant posts only
|
| 101 |
+
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_RELEVANT_ANALYSIS AS
|
| 102 |
+
SELECT *
|
| 103 |
+
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
|
| 104 |
+
WHERE IS_RELEVANT = TRUE
|
| 105 |
+
AND PROCESSING_SUCCESS = TRUE;
|
| 106 |
+
|
| 107 |
+
-- Create view for posts needing review (flagged by validator)
|
| 108 |
+
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_FLAGGED_POSTS AS
|
| 109 |
+
SELECT
|
| 110 |
+
POST_ID,
|
| 111 |
+
THREAD_ID,
|
| 112 |
+
CLEANED_CONTENT,
|
| 113 |
+
THREAD_CONTEXT_SUMMARY,
|
| 114 |
+
IS_RELEVANT,
|
| 115 |
+
RELEVANCE_CONFIDENCE,
|
| 116 |
+
RELEVANCE_REASON,
|
| 117 |
+
PRODUCTS_MENTIONED,
|
| 118 |
+
SABIAN_MENTION_CONTEXT,
|
| 119 |
+
SENTIMENT_LEVEL,
|
| 120 |
+
VALIDATION_FLAGS,
|
| 121 |
+
VALIDATION_WARNINGS,
|
| 122 |
+
PROCESSING_STATUS
|
| 123 |
+
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
|
| 124 |
+
WHERE PROCESSING_STATUS = 'completed_with_flags'
|
| 125 |
+
OR VALIDATION_PASSED = FALSE
|
| 126 |
+
ORDER BY PROCESSED_AT DESC;
|
| 127 |
+
|
| 128 |
+
-- Create view for sentiment distribution
|
| 129 |
+
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_SENTIMENT_DISTRIBUTION AS
|
| 130 |
+
SELECT
|
| 131 |
+
SENTIMENT_LEVEL,
|
| 132 |
+
EMOTION_TYPE,
|
| 133 |
+
SENTIMENT_TARGET,
|
| 134 |
+
COUNT(*) AS POST_COUNT,
|
| 135 |
+
COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT
|
| 136 |
+
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
|
| 137 |
+
WHERE IS_RELEVANT = TRUE
|
| 138 |
+
AND PROCESSING_SUCCESS = TRUE
|
| 139 |
+
GROUP BY SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
|
| 140 |
+
ORDER BY POST_COUNT DESC;
|
| 141 |
+
|
| 142 |
+
-- Create view for product mentions summary
-- Explodes the PRODUCTS_MENTIONED JSON array so each mentioned product gets its
-- own row, then aggregates mention counts per (product, sentiment level).
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PRODUCT_MENTIONS AS
SELECT
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    -- NOTE(review): because SENTIMENT_LEVEL is also in the GROUP BY, each of
    -- these conditional counts is either 0 or equal to MENTION_COUNT on any
    -- given row; they only become meaningful totals if consumers re-aggregate
    -- by PRODUCT. Confirm this is the intended grain.
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
    -- TRY_PARSE_JSON yields NULL for malformed JSON, so FLATTEN simply emits no rows
    LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
    AND PROCESSING_SUCCESS = TRUE
    AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;
|
| 157 |
+
|
| 158 |
+
-- Create view for competitor analysis
-- Explodes COMPETITORS_MENTIONED so each competitor brand gets its own row,
-- then breaks mentions down by comparison type and brand-switching signal,
-- with sentiment-toward-Sabian counts for each bucket.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_ANALYSIS AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR,
    COMPARISON_TYPE,
    BRAND_SWITCHING,
    COUNT(*) AS MENTION_COUNT,
    -- Sentiment level of the post the competitor was mentioned in
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_SENTIMENT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_SENTIMENT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
    -- TRY_PARSE_JSON yields NULL for malformed JSON, so FLATTEN emits no rows
    LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITORS_MENTIONED)) AS competitor
WHERE IS_RELEVANT = TRUE
    AND PROCESSING_SUCCESS = TRUE
    AND COMPETITORS_MENTIONED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), COMPARISON_TYPE, BRAND_SWITCHING
ORDER BY MENTION_COUNT DESC;
|
| 174 |
+
|
| 175 |
+
-- Create view for pain points analysis
-- Explodes the PAIN_POINTS JSON array so each pain point gets its own row,
-- counting occurrences and collecting the distinct sentiment levels of the
-- posts it appeared in.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PAIN_POINTS AS
SELECT
    TRIM(pain_point.VALUE::STRING) AS PAIN_POINT,
    COUNT(*) AS OCCURRENCE_COUNT,
    -- ARRAY output: set of sentiment levels observed alongside this pain point
    ARRAY_AGG(DISTINCT SENTIMENT_LEVEL) AS SENTIMENT_LEVELS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
    -- TRY_PARSE_JSON yields NULL for malformed JSON, so FLATTEN emits no rows
    LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PAIN_POINTS)) AS pain_point
WHERE IS_RELEVANT = TRUE
    AND PROCESSING_SUCCESS = TRUE
    AND PAIN_POINTS IS NOT NULL
GROUP BY TRIM(pain_point.VALUE::STRING)
ORDER BY OCCURRENCE_COUNT DESC;
|
| 188 |
+
|
| 189 |
+
-- Author-role analysis: breaks relevant, successfully processed posts down by
-- the author's relationship to the brand and how central Sabian was to the
-- post, with per-bucket positive / negative / neutral sentiment counts.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_AUTHOR_ROLES AS
SELECT
    AUTHOR_ROLE,
    SABIAN_MENTION_CONTEXT,
    COUNT(*) AS POST_COUNT,
    -- COUNT_IF counts only rows where the predicate is TRUE, equivalent to
    -- COUNT(CASE WHEN ... THEN 1 END)
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL = 'neutral') AS NEUTRAL_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY
    AUTHOR_ROLE,
    SABIAN_MENTION_CONTEXT
ORDER BY POST_COUNT DESC;
|
| 203 |
+
|
| 204 |
+
-- Create view for competitor ownership analysis
-- Explodes COMPETITOR_PRODUCTS_OWNED (competitor brands the post AUTHOR owns)
-- so each owned brand gets its own row, cross-tabbed by author role, with the
-- author's sentiment toward Sabian.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_OWNERSHIP AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR_OWNED,
    AUTHOR_ROLE,
    -- Row count per (owned brand, role); an author owning N brands contributes N rows
    COUNT(*) AS AUTHOR_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_TOWARD_SABIAN,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_TOWARD_SABIAN
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
    -- TRY_PARSE_JSON yields NULL for malformed JSON, so FLATTEN emits no rows
    LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITOR_PRODUCTS_OWNED)) AS competitor
WHERE IS_RELEVANT = TRUE
    AND PROCESSING_SUCCESS = TRUE
    AND COMPETITOR_PRODUCTS_OWNED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), AUTHOR_ROLE
ORDER BY AUTHOR_COUNT DESC;
|
| 219 |
+
|
| 220 |
+
-- Create view for mention context by sentiment
-- Maps the ordinal SENTIMENT_LEVEL labels onto a -2..+2 numeric scale so a
-- mean sentiment score can be reported per bucket.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_MENTION_DEPTH AS
SELECT
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    COUNT(*) AS POST_COUNT,
    -- NOTE(review): SENTIMENT_LEVEL is in the GROUP BY, so this AVG collapses
    -- to the fixed mapped value of that level on every row (all inputs within
    -- a group are identical). A per-context average would require dropping
    -- SENTIMENT_LEVEL from the grouping. Confirm intent.
    AVG(CASE
        WHEN SENTIMENT_LEVEL = 'very_positive' THEN 2
        WHEN SENTIMENT_LEVEL = 'positive' THEN 1
        WHEN SENTIMENT_LEVEL = 'neutral' THEN 0
        WHEN SENTIMENT_LEVEL = 'negative' THEN -1
        WHEN SENTIMENT_LEVEL = 'very_negative' THEN -2
        ELSE 0
    END) AS AVG_SENTIMENT_SCORE
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
    AND PROCESSING_SUCCESS = TRUE
GROUP BY SABIAN_MENTION_CONTEXT, SENTIMENT_LEVEL
ORDER BY SABIAN_MENTION_CONTEXT, POST_COUNT DESC;
|
| 239 |
+
|
| 240 |
+
-- Validation summary (v4.0): post counts per (processing status, validation
-- outcome), split by relevance. Covers ALL rows, including failed ones, so it
-- deliberately has no IS_RELEVANT / PROCESSING_SUCCESS filter.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_VALIDATION_SUMMARY AS
SELECT
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS POST_COUNT,
    -- COUNT_IF counts only rows where the predicate is TRUE, equivalent to
    -- COUNT(CASE WHEN ... THEN 1 END); NULL relevance rows fall in neither bucket
    COUNT_IF(IS_RELEVANT = TRUE) AS RELEVANT_COUNT,
    COUNT_IF(IS_RELEVANT = FALSE) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
GROUP BY
    PROCESSING_STATUS,
    VALIDATION_PASSED
ORDER BY POST_COUNT DESC;
|
processing_brand_sentiment/database/sql/fetch_comments.sql
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Query to fetch social media comments with context for brand sentiment analysis
-- Source: SOCIAL_MEDIA_DB.brand_sentiment.SABIAN_comments (same structure as CORE.FACT_COMMENTS)
-- Includes: comment content, parent comment text, content metadata, channel info
-- Excludes: official accounts, already-processed comments, empty comments
--
-- Fix: removed the redundant LENGTH(TRIM(...)) > 0 predicate, which was an
-- exact duplicate of TRIM(...) != '' (same rows pass either test).

SELECT
    -- Comment identifiers
    fc.COMMENT_SK,
    fc.COMMENT_ID,
    fc.PLATFORM,
    fc.MESSAGE AS COMMENT_TEXT,
    fc.CREATED_TIME AS COMMENT_TIMESTAMP,
    fc.AUTHOR_NAME,
    fc.AUTHOR_ID,
    fc.LIKE_COUNT,
    fc.PARENT_COMMENT_ID,
    fc.REPLIES_COUNT,
    fc.COMMENT_LENGTH,
    fc.IS_ACTIVE AS COMMENT_IS_ACTIVE,

    -- Parent comment information (self-join to get parent comment text)
    parent_fc.MESSAGE AS PARENT_COMMENT_TEXT,

    -- Content information
    dc.CONTENT_SK,
    dc.CONTENT_ID,
    dc.CONTENT_TYPE,
    dc.MESSAGE AS CONTENT_DESCRIPTION,
    dc.TITLE AS CONTENT_TITLE,
    dc.PERMALINK_URL,
    dc.CREATED_TIME AS CONTENT_TIMESTAMP,

    -- Channel information
    dch.CHANNEL_SK,
    dch.CHANNEL_NAME,
    dch.CHANNEL_DISPLAY_NAME

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS fc

-- Left join to get parent comment text if it exists
LEFT JOIN
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS parent_fc
    ON fc.PARENT_COMMENT_ID = parent_fc.COMMENT_ID
    AND fc.PLATFORM = parent_fc.PLATFORM

INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CONTENT dc
    ON fc.CONTENT_SK = dc.CONTENT_SK

INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL dch
    ON dc.CHANNEL_NAME = dch.CHANNEL_NAME
    AND dc.PLATFORM = dch.PLATFORM

-- Left join with output table to exclude already-processed comments (anti-join)
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS sba
    ON fc.COMMENT_SK = sba.COMMENT_SK

WHERE
    -- Active records only
    fc.IS_ACTIVE = TRUE
    AND dc.IS_ACTIVE = TRUE
    AND dch.IS_ACTIVE = TRUE

    -- Exclude official accounts (NULL author names are kept)
    AND (fc.AUTHOR_NAME IS NULL OR fc.AUTHOR_NAME NOT IN (
        'Musora', 'Drumeo', 'Pianote',
        '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial'
    ))

    -- Exclude already-processed comments
    AND sba.COMMENT_SK IS NULL

    -- Ensure comment has non-blank content
    AND fc.MESSAGE IS NOT NULL
    AND TRIM(fc.MESSAGE) != ''

ORDER BY
    fc.CREATED_TIME DESC;
|
processing_brand_sentiment/database/sql/fetch_forum_posts.sql
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Query to fetch forum posts with thread context for brand sentiment analysis
-- Includes: post content, thread context (title, first post), parent relationships
-- Excludes: team/house-coach posts, already-processed posts, deleted posts
--
-- Fixes:
--   * ROW_NUMBER now tie-breaks on POST_ID so the "first post" of a thread is
--     deterministic when multiple posts share the same POST_CREATED_AT.
--   * Removed the redundant LENGTH(TRIM(...)) > 0 predicate (exact duplicate
--     of TRIM(...) != '').

WITH thread_first_posts AS (
    -- Get the first post (by creation date) for each thread to use as context
    -- Using ROW_NUMBER for reliable first-post identification
    SELECT
        THREAD_ID,
        POST_CONTENT AS FIRST_POST_CONTENT,
        POST_AUTHOR_ID AS FIRST_POST_AUTHOR_ID,
        POST_CREATED_AT AS FIRST_POST_CREATED_AT
    FROM (
        SELECT
            THREAD_ID,
            POST_CONTENT,
            POST_AUTHOR_ID,
            POST_CREATED_AT,
            -- POST_ID tiebreaker makes the pick deterministic on timestamp ties
            ROW_NUMBER() OVER (
                PARTITION BY THREAD_ID
                ORDER BY POST_CREATED_AT ASC, POST_ID ASC
            ) AS rn
        FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS
        WHERE POST_CONTENT IS NOT NULL
            AND TRIM(POST_CONTENT) != ''
    ) ranked
    WHERE rn = 1
)

SELECT
    -- Post identifiers
    fp.POST_ID,
    fp.POST_AUTHOR_ID,
    fp.THREAD_ID,

    -- Post content (may contain HTML with quoted parent)
    fp.POST_CONTENT,

    -- Post timestamps
    fp.POST_CREATED_AT,
    fp.POST_EDITED_ON,
    fp.POST_PUBLISHED_ON,
    fp.POST_STATE,

    -- Parent/Child relationships (for context)
    fp.PROMPTING_POST_ID,
    fp.PARENT_ID,
    fp.PARENT_CONTENT,
    fp.PARENT_AUTHOR_ID,
    fp.PARENT_CREATED_AT,
    fp.CHILD_ID,
    fp.CHILD_CONTENT,

    -- Thread context
    fp.THREAD_TITLE,
    fp.THREAD_SLUG,
    fp.THREAD_STATE,
    fp.THREAD_LOCKED,
    fp.THREAD_PINNED,
    fp.THREAD_POST_COUNT,
    fp.THREAD_PUBLISHED_ON,

    -- First post of the thread (for context)
    tfp.FIRST_POST_CONTENT AS THREAD_FIRST_POST,
    tfp.FIRST_POST_CREATED_AT AS THREAD_STARTED_AT,

    -- Category information
    fp.CATEGORY_ID,
    fp.CATEGORY_BRAND,
    fp.CATEGORY_DESCRIPTION,
    fp.CATEGORY_TITLE,
    fp.CATEGORY_TOPIC,
    fp.CATEGORY_SLUG,

    -- Access levels (for filtering)
    fp.POST_AUTHOR_ACCESS_LEVEL,
    fp.PARENT_AUTHOR_ACCESS_LEVEL,
    fp.CHILD_AUTHOR_ACCESS_LEVEL

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS fp

-- Join to get thread's first post for context
LEFT JOIN
    thread_first_posts tfp ON fp.THREAD_ID = tfp.THREAD_ID

-- Left join with output table to exclude already-processed posts (anti-join)
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS sba
    ON fp.POST_ID = sba.POST_ID

WHERE
    -- Exclude team and house-coach posts (internal comments); NULL access kept
    (fp.POST_AUTHOR_ACCESS_LEVEL IS NULL OR fp.POST_AUTHOR_ACCESS_LEVEL NOT IN ('team', 'house-coach'))

    -- Exclude deleted posts
    AND (fp.POST_STATE IS NULL OR fp.POST_STATE != 'deleted')
    AND fp.POST_DELETED_AT IS NULL

    -- Exclude already-processed posts
    AND sba.POST_ID IS NULL

    -- Ensure post has non-blank content
    AND fp.POST_CONTENT IS NOT NULL
    AND TRIM(fp.POST_CONTENT) != ''

ORDER BY
    fp.POST_CREATED_AT DESC;
|
processing_brand_sentiment/database/sql/init_comments_output_table.sql
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Initialize empty output table for Sabian brand sentiment analysis on social media comments
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed comments

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    -- NOTE(review): unlike SABIAN_BRAND_ANALYSIS (which declares POST_ID as
    -- PRIMARY KEY), no key is declared on COMMENT_SK even though the fetch
    -- query deduplicates on it; confirm whether that omission is intentional.
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays in VARCHAR columns)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
|
processing_brand_sentiment/database/sql/init_output_table.sql
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Initialize empty output table for Sabian brand sentiment analysis
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed posts
-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    -- Note: Snowflake PRIMARY KEY is informational (not enforced); uniqueness
    -- relies on the fetch query's anti-join against this table.
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Category information
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays in VARCHAR columns)
    PRODUCTS_MENTIONED VARCHAR(16777216),
    PRODUCT_ATTRIBUTES VARCHAR(16777216),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216),
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- Multi-label: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
    PURCHASE_STAGE VARCHAR(50), -- AUTHOR's own stage only
    DECISION_DRIVERS VARCHAR(16777216), -- AUTHOR's own decision drivers only
    PAIN_POINTS VARCHAR(16777216), -- AUTHOR's negative feedback aspects (uses feedback_aspects categories)
    DELIGHT_FACTORS VARCHAR(16777216), -- AUTHOR's positive feedback aspects (uses feedback_aspects categories)

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results (NEW v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: Added thread_context_summary, validation fields, and processing status.';
|
processing_brand_sentiment/main.py
ADDED
|
@@ -0,0 +1,1088 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main execution script for brand sentiment analysis workflow.
|
| 3 |
+
Orchestrates data fetching, processing, and storage using an agentic workflow.
|
| 4 |
+
Supports parallel processing with multiprocessing for improved performance.
|
| 5 |
+
Supports multiple data sources: forums, social media comments, or both.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import logging
|
| 11 |
+
import argparse
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
from multiprocessing import Pool, cpu_count
|
| 16 |
+
import traceback
|
| 17 |
+
from typing import Dict, Any, List
|
| 18 |
+
|
| 19 |
+
from database.snowflake_connection import SnowFlakeConn
|
| 20 |
+
from workflow.orchestrator import BrandAnalysisWorkflow
|
| 21 |
+
from workflow.comment_orchestrator import CommentAnalysisWorkflow
|
| 22 |
+
|
| 23 |
+
# ------------------------------------------------------------------
# Module-level setup. Order matters: .env must be loaded before any
# code reads environment variables, and the logs directory must exist
# before logging.FileHandler opens its file.
# ------------------------------------------------------------------

# Get the directory where this script is located (absolute, so relative
# paths work regardless of the current working directory)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Load environment variables from a .env file next to this script
load_dotenv(os.path.join(SCRIPT_DIR, '.env'))

# Ensure logs directory exists (FileHandler below would fail otherwise)
LOGS_DIR = os.path.join(SCRIPT_DIR, 'logs')
os.makedirs(LOGS_DIR, exist_ok=True)

# Configure logging: one timestamped file per run, plus console output
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            os.path.join(LOGS_DIR, f'brand_sentiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ============================================================
|
| 48 |
+
# Configuration Loading
|
| 49 |
+
# ============================================================
|
| 50 |
+
|
| 51 |
+
def load_configs(config_dir: str = None) -> Dict[str, Dict]:
    """
    Load all configuration files.

    Args:
        config_dir: Directory containing config files. Defaults to the
            'config_files' folder next to this script.

    Returns:
        Dictionary with keys 'workflow', 'brand', and 'categories', each
        holding the parsed JSON of the corresponding file.

    Raises:
        FileNotFoundError: If any expected config file is missing.
        json.JSONDecodeError: If any config file contains invalid JSON.
    """
    if config_dir is None:
        config_dir = os.path.join(SCRIPT_DIR, 'config_files')

    # Map of output key -> config filename. Keeps loading DRY and makes
    # adding a new config file a one-line change.
    config_files = {
        'workflow': 'workflow_config.json',
        'brand': 'brand_config.json',
        'categories': 'analysis_categories.json',
    }

    configs = {}
    for key, filename in config_files.items():
        with open(os.path.join(config_dir, filename), 'r') as f:
            configs[key] = json.load(f)

    return configs
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ============================================================
|
| 82 |
+
# Batch Processing Utilities
|
| 83 |
+
# ============================================================
|
| 84 |
+
|
| 85 |
+
def calculate_optimal_batch_size(
    total_posts: int,
    num_workers: int,
    min_batch: int = 20,
    max_batch: int = 500
) -> int:
    """
    Calculate optimal batch size based on total posts and workers.

    Splits the workload evenly across workers, then clamps the result to
    the [min_batch, max_batch] range. Workloads no larger than min_batch
    are processed as a single batch.

    Args:
        total_posts: Total number of posts to process
        num_workers: Number of parallel workers
        min_batch: Minimum batch size
        max_batch: Maximum batch size

    Returns:
        Optimal batch size
    """
    # Tiny workloads: one batch holding everything.
    if total_posts <= min_batch:
        return total_posts

    per_worker = total_posts // num_workers
    if per_worker < min_batch:
        return min_batch
    if per_worker > max_batch:
        return max_batch
    return per_worker
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def safe_to_json(value: Any) -> Any:
    """
    Safely convert a value to JSON string.
    Handles None, NaN, lists, and already-string values.

    Args:
        value: Value to convert

    Returns:
        JSON string if list, None if null/empty, original value otherwise
    """
    # Lists serialize to a JSON array string; an empty list becomes None.
    if isinstance(value, list):
        return json.dumps(value) if value else None
    # Strings pass through, with the empty string normalized to None.
    if isinstance(value, str):
        return value or None
    # Null-ish values (None or float NaN) map to None.
    if value is None:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    # Anything else is returned untouched.
    return value
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def safe_json_list_length(value: Any) -> int:
    """
    Safely get the length of a JSON array string.
    Handles None, NaN, empty strings, and invalid JSON.

    Args:
        value: Value to parse (expected JSON string of array)

    Returns:
        Length of the array, or 0 if invalid/empty
    """
    # Anything that is not a string (None, NaN, numbers, ...) has no length.
    if not isinstance(value, str):
        return 0
    # Short-circuit the common empty representations without parsing.
    if value in ('', '[]', 'null'):
        return 0
    # Parse and count only when the payload is an actual JSON array.
    try:
        parsed = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return 0
    return len(parsed) if isinstance(parsed, list) else 0
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def calculate_batch_stats(df: pd.DataFrame) -> Dict[str, int]:
    """
    Calculate statistics from batch results.
    Handles null values safely for all fields.

    Args:
        df: DataFrame with processed results

    Returns:
        Dictionary with statistics (all counts default to 0)
    """
    stats = {
        'relevant_count': 0,
        'not_relevant_count': 0,
        'products_mentioned_count': 0,
        'competitors_mentioned_count': 0,
        'positive_sentiment_count': 0,
        'negative_sentiment_count': 0,
        # Author role stats
        'current_owner_count': 0,
        'potential_buyer_count': 0,
        'primary_focus_count': 0
    }

    # Nothing to count in an empty frame.
    if df.empty:
        return stats

    # Relevant / not relevant split, ignoring nulls.
    if 'IS_RELEVANT' in df.columns:
        flags = df['IS_RELEVANT'].dropna()
        if not flags.empty:
            as_bool = flags.astype(bool)
            stats['relevant_count'] = int(as_bool.sum())
            stats['not_relevant_count'] = int((~as_bool).sum())

    # Total product/competitor mentions: sum of JSON-array lengths per row.
    for column, key in (
        ('PRODUCTS_MENTIONED', 'products_mentioned_count'),
        ('COMPETITORS_MENTIONED', 'competitors_mentioned_count'),
    ):
        if column in df.columns:
            stats[key] = int(df[column].apply(safe_json_list_length).sum())

    # Sentiment polarity distribution (positive vs negative buckets).
    if 'SENTIMENT_LEVEL' in df.columns:
        sentiments = df['SENTIMENT_LEVEL'].dropna()
        if not sentiments.empty:
            stats['positive_sentiment_count'] = int(
                sentiments.isin(['positive', 'very_positive']).sum()
            )
            stats['negative_sentiment_count'] = int(
                sentiments.isin(['negative', 'very_negative']).sum()
            )

    # Author role breakdown.
    if 'AUTHOR_ROLE' in df.columns:
        roles = df['AUTHOR_ROLE'].dropna()
        if not roles.empty:
            stats['current_owner_count'] = int((roles == 'current_owner').sum())
            stats['potential_buyer_count'] = int((roles == 'potential_buyer').sum())

    # How often the brand was the primary focus of the post.
    if 'SABIAN_MENTION_CONTEXT' in df.columns:
        contexts = df['SABIAN_MENTION_CONTEXT'].dropna()
        if not contexts.empty:
            stats['primary_focus_count'] = int((contexts == 'primary_focus').sum())

    return stats
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate results from multiple batches.

    Sums every per-batch counter and adds a 'failed_batches' count;
    failures are also logged with their batch number and error message.

    Args:
        results: List of batch result dictionaries

    Returns:
        Aggregated statistics dictionary
    """
    # All counters are simple sums across batches (missing keys count as 0).
    counter_keys = (
        'total_processed', 'total_stored', 'failed_count',
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        'current_owner_count', 'potential_buyer_count',
        'primary_focus_count',
    )
    aggregated = {
        key: sum(batch.get(key, 0) for batch in results)
        for key in counter_keys
    }

    failed = [batch for batch in results if not batch.get('success', False)]
    aggregated['failed_batches'] = len(failed)

    # Surface failed batches in the log for operator follow-up.
    if failed:
        logger.error(f"{len(failed)} batch(es) failed:")
        for batch in failed:
            logger.error(f"  Batch {batch.get('batch_num')}: {batch.get('error')}")

    return aggregated
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ============================================================
|
| 276 |
+
# Forum Processing (existing functionality)
|
| 277 |
+
# ============================================================
|
| 278 |
+
|
| 279 |
+
# Columns that should be converted from lists to JSON strings before
# storage (Snowflake columns receive serialized JSON arrays rather than
# Python list reprs — see safe_to_json).
FORUM_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from forum workflow state to output table:
# lowercase workflow-state key -> UPPERCASE Snowflake column name.
FORUM_COLUMN_MAPPING = {
    # Post / thread identifiers and raw content
    'post_id': 'POST_ID',
    'thread_id': 'THREAD_ID',
    'post_author_id': 'POST_AUTHOR_ID',
    'original_content': 'ORIGINAL_CONTENT',
    'cleaned_content': 'CLEANED_CONTENT',
    'quoted_content': 'QUOTED_CONTENT',
    'raw_thread_context': 'THREAD_CONTEXT',
    'thread_context_summary': 'THREAD_CONTEXT_SUMMARY',
    'thread_title': 'THREAD_TITLE',
    'thread_first_post': 'THREAD_FIRST_POST',
    'post_created_at': 'POST_CREATED_AT',
    'thread_started_at': 'THREAD_STARTED_AT',
    'category_title': 'CATEGORY_TITLE',
    'category_topic': 'CATEGORY_TOPIC',
    # Language detection
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    # Relevance assessment
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    # Sentiment analysis
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    # Product and competitor extraction
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    # Intent and decision signals
    'intents': 'INTENTS',
    'purchase_stage': 'PURCHASE_STAGE',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    # Validation results
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Processing metadata
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def prepare_forum_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare forum output DataFrame with proper column mapping.

    Renames workflow-state columns to their Snowflake counterparts,
    serializes list-valued columns to JSON strings, fills missing source
    columns with None, and stamps platform/processing metadata.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    output_df = pd.DataFrame()

    for source_col, target_col in FORUM_COLUMN_MAPPING.items():
        # Absent source columns still get an output column (all-None).
        if source_col not in df.columns:
            output_df[target_col] = None
            continue
        series = df[source_col].copy()
        # List-valued fields are stored as JSON array strings.
        if source_col in FORUM_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        output_df[target_col] = series

    # Add metadata
    output_df['PLATFORM'] = 'musora_forums'
    output_df['PROCESSED_AT'] = datetime.now()
    output_df['WORKFLOW_VERSION'] = '4.0'

    return output_df
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def process_forum_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of forum posts.
    Runs in a separate process.

    Args:
        batch_data: Tuple containing (batch_num, posts, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. On failure, 'success' is False
        and 'error' carries the failure message.
    """
    batch_num, posts, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"ForumWorker-{batch_num}")

    snowflake = None
    try:
        worker_logger.info(f"Forum Batch {batch_num}: Starting processing of {len(posts)} posts")

        # Each worker process needs its own Snowflake connection.
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = BrandAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process posts
        results = workflow.process_batch(posts)
        results_df = pd.DataFrame(results)

        # Filter successful results. Guard against an empty result list:
        # pd.DataFrame([]) has no 'success' column and indexing it would
        # raise a KeyError.
        initial_count = len(results_df)
        if initial_count > 0 and 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Forum Batch {batch_num}: Processed {initial_count} posts, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_forum_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the very first batch may overwrite the table; all
            # subsequent batches append.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Forum Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Forum Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Forum Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(posts),
            'total_stored': 0,
            'failed_count': len(posts),
            'error': error_msg
        }
    finally:
        # Close the connection on success AND failure; closing only on the
        # happy path would leak a connection whenever processing throws.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(f"Forum Batch {batch_num}: Failed to close Snowflake connection")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
# ============================================================
|
| 461 |
+
# Comment Processing (new functionality)
|
| 462 |
+
# ============================================================
|
| 463 |
+
|
| 464 |
+
# Columns that should be converted from lists to JSON strings (same
# analysis fields as the forum pipeline — see safe_to_json).
COMMENT_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from comment workflow state to output table:
# lowercase workflow-state key -> UPPERCASE Snowflake column name.
COMMENT_COLUMN_MAPPING = {
    # Comment-specific identifiers
    'comment_sk': 'COMMENT_SK',
    'comment_id': 'COMMENT_ID',
    'original_text': 'ORIGINAL_TEXT',
    'platform': 'PLATFORM',
    'comment_timestamp': 'COMMENT_TIMESTAMP',
    'author_name': 'AUTHOR_NAME',
    'author_id': 'AUTHOR_ID',
    'content_sk': 'CONTENT_SK',
    'content_id': 'CONTENT_ID',
    'content_description': 'CONTENT_DESCRIPTION',
    'channel_sk': 'CHANNEL_SK',
    'channel_name': 'CHANNEL_NAME',
    'channel_display_name': 'CHANNEL_DISPLAY_NAME',
    'parent_comment_id': 'PARENT_COMMENT_ID',
    'parent_comment_text': 'PARENT_COMMENT_TEXT',
    # Analysis fields (same as forums)
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'purchase_stage': 'PURCHASE_STAGE',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    # Validation results
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Processing metadata
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def prepare_comment_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare comment output DataFrame with proper column mapping.

    Renames workflow-state columns to their Snowflake counterparts,
    serializes list-valued columns to JSON strings, fills missing source
    columns with None, and stamps processing metadata.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    output_df = pd.DataFrame()

    for source_col, target_col in COMMENT_COLUMN_MAPPING.items():
        # Absent source columns still get an output column (all-None).
        if source_col not in df.columns:
            output_df[target_col] = None
            continue
        series = df[source_col].copy()
        # List-valued fields are stored as JSON array strings.
        if source_col in COMMENT_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        output_df[target_col] = series

    # Add metadata (PLATFORM comes from the source data itself here).
    output_df['PROCESSED_AT'] = datetime.now()
    output_df['WORKFLOW_VERSION'] = '4.0'

    return output_df
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
def process_comment_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of social media comments.
    Runs in a separate process.

    Args:
        batch_data: Tuple containing (batch_num, comments, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. On failure, 'success' is False
        and 'error' carries the failure message.
    """
    batch_num, comments, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"CommentWorker-{batch_num}")

    snowflake = None
    try:
        worker_logger.info(f"Comment Batch {batch_num}: Starting processing of {len(comments)} comments")

        # Each worker process needs its own Snowflake connection.
        snowflake = SnowFlakeConn()

        # Initialize comment workflow for this worker
        workflow = CommentAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process comments
        results = workflow.process_batch(comments)
        results_df = pd.DataFrame(results)

        # Filter successful results. Guard against an empty result list:
        # pd.DataFrame([]) has no 'success' column and indexing it would
        # raise a KeyError.
        initial_count = len(results_df)
        if initial_count > 0 and 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Comment Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_comment_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the very first batch may overwrite the table; all
            # subsequent batches append.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Comment Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Comment Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Comment Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(comments),
            'total_stored': 0,
            'failed_count': len(comments),
            'error': error_msg
        }
    finally:
        # Close the connection on success AND failure; closing only on the
        # happy path would leak a connection whenever processing throws.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(f"Comment Batch {batch_num}: Failed to close Snowflake connection")
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
# ============================================================
|
| 648 |
+
# Main Processor Class
|
| 649 |
+
# ============================================================
|
| 650 |
+
|
| 651 |
+
class BrandSentimentProcessor:
|
| 652 |
+
"""
|
| 653 |
+
Main processor class that orchestrates the entire workflow.
|
| 654 |
+
Supports processing forums, social media comments, or both.
|
| 655 |
+
"""
|
| 656 |
+
|
| 657 |
+
def __init__(self, config_dir: str = None):
|
| 658 |
+
"""
|
| 659 |
+
Initialize the processor.
|
| 660 |
+
|
| 661 |
+
Args:
|
| 662 |
+
config_dir: Directory containing configuration files
|
| 663 |
+
"""
|
| 664 |
+
# Load configurations
|
| 665 |
+
self.configs = load_configs(config_dir)
|
| 666 |
+
|
| 667 |
+
# Initialize Snowflake connection
|
| 668 |
+
self.snowflake = SnowFlakeConn()
|
| 669 |
+
|
| 670 |
+
# Get OpenAI API key
|
| 671 |
+
self.api_key = os.getenv("OPENAI_API_KEY")
|
| 672 |
+
if not self.api_key:
|
| 673 |
+
raise ValueError("OPENAI_API_KEY not found in environment variables")
|
| 674 |
+
|
| 675 |
+
# Get output configurations
|
| 676 |
+
self.forum_output_config = self.configs['workflow'].get('output', {
|
| 677 |
+
'table_name': 'SABIAN_BRAND_ANALYSIS',
|
| 678 |
+
'database': 'SOCIAL_MEDIA_DB',
|
| 679 |
+
'schema': 'ML_FEATURES'
|
| 680 |
+
})
|
| 681 |
+
|
| 682 |
+
self.comment_output_config = self.configs['workflow'].get('comments_output', {
|
| 683 |
+
'table_name': 'SABIAN_BRAND_ANALYSIS_COMMENTS',
|
| 684 |
+
'database': 'SOCIAL_MEDIA_DB',
|
| 685 |
+
'schema': 'ML_FEATURES'
|
| 686 |
+
})
|
| 687 |
+
|
| 688 |
+
logger.info("BrandSentimentProcessor initialized successfully")
|
| 689 |
+
|
| 690 |
+
def fetch_forum_posts(self, limit: int = None) -> pd.DataFrame:
|
| 691 |
+
"""
|
| 692 |
+
Fetch forum posts from Snowflake.
|
| 693 |
+
|
| 694 |
+
Args:
|
| 695 |
+
limit: Optional limit on number of posts
|
| 696 |
+
|
| 697 |
+
Returns:
|
| 698 |
+
DataFrame containing post data
|
| 699 |
+
"""
|
| 700 |
+
logger.info("Fetching forum posts...")
|
| 701 |
+
|
| 702 |
+
sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_forum_posts.sql')
|
| 703 |
+
df = self.snowflake.fetch_forum_posts_with_context(sql_path, limit)
|
| 704 |
+
|
| 705 |
+
logger.info(f"Fetched {len(df)} forum posts")
|
| 706 |
+
return df
|
| 707 |
+
|
| 708 |
+
def fetch_comments(self, limit: int = None) -> pd.DataFrame:
|
| 709 |
+
"""
|
| 710 |
+
Fetch social media comments from Snowflake.
|
| 711 |
+
|
| 712 |
+
Args:
|
| 713 |
+
limit: Optional limit on number of comments
|
| 714 |
+
|
| 715 |
+
Returns:
|
| 716 |
+
DataFrame containing comment data
|
| 717 |
+
"""
|
| 718 |
+
logger.info("Fetching social media comments...")
|
| 719 |
+
|
| 720 |
+
sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_comments.sql')
|
| 721 |
+
df = self.snowflake.fetch_comments(sql_path, limit)
|
| 722 |
+
|
| 723 |
+
logger.info(f"Fetched {len(df)} social media comments")
|
| 724 |
+
return df
|
| 725 |
+
|
| 726 |
+
def calculate_num_workers(self) -> int:
|
| 727 |
+
"""
|
| 728 |
+
Calculate number of parallel workers.
|
| 729 |
+
|
| 730 |
+
Returns:
|
| 731 |
+
Number of workers
|
| 732 |
+
"""
|
| 733 |
+
parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
|
| 734 |
+
max_workers = parallel_config.get('max_workers', 5)
|
| 735 |
+
|
| 736 |
+
num_cpus = cpu_count()
|
| 737 |
+
num_workers = max(1, min(max_workers, num_cpus - 2))
|
| 738 |
+
|
| 739 |
+
logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
|
| 740 |
+
return num_workers
|
| 741 |
+
|
| 742 |
+
# ---- Forum Processing ----
|
| 743 |
+
|
| 744 |
+
def process_forums_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
|
| 745 |
+
"""
|
| 746 |
+
Process forum posts using parallel workers.
|
| 747 |
+
|
| 748 |
+
Args:
|
| 749 |
+
df: DataFrame containing posts
|
| 750 |
+
overwrite: Whether to overwrite existing table
|
| 751 |
+
|
| 752 |
+
Returns:
|
| 753 |
+
Dictionary with aggregated statistics
|
| 754 |
+
"""
|
| 755 |
+
posts = df.to_dict('records')
|
| 756 |
+
total_posts = len(posts)
|
| 757 |
+
|
| 758 |
+
logger.info(f"Processing {total_posts} forum posts using parallel processing...")
|
| 759 |
+
|
| 760 |
+
num_workers = self.calculate_num_workers()
|
| 761 |
+
|
| 762 |
+
parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
|
| 763 |
+
min_batch = parallel_config.get('min_batch_size', 20)
|
| 764 |
+
max_batch = parallel_config.get('max_batch_size', 400)
|
| 765 |
+
|
| 766 |
+
batch_size = calculate_optimal_batch_size(total_posts, num_workers, min_batch, max_batch)
|
| 767 |
+
logger.info(f"Forum batch size: {batch_size}")
|
| 768 |
+
|
| 769 |
+
# Create batches
|
| 770 |
+
batches = []
|
| 771 |
+
for i in range(0, total_posts, batch_size):
|
| 772 |
+
batch = posts[i:i + batch_size]
|
| 773 |
+
batch_num = (i // batch_size) + 1
|
| 774 |
+
batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.forum_output_config))
|
| 775 |
+
|
| 776 |
+
total_batches = len(batches)
|
| 777 |
+
logger.info(f"Split into {total_batches} forum batches")
|
| 778 |
+
|
| 779 |
+
# Process in parallel
|
| 780 |
+
with Pool(processes=num_workers) as pool:
|
| 781 |
+
results = pool.map(process_forum_batch_worker, batches)
|
| 782 |
+
|
| 783 |
+
return aggregate_results(results)
|
| 784 |
+
|
| 785 |
+
def process_forums_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
|
| 786 |
+
"""
|
| 787 |
+
Process forum posts sequentially (for debugging).
|
| 788 |
+
|
| 789 |
+
Args:
|
| 790 |
+
df: DataFrame containing posts
|
| 791 |
+
overwrite: Whether to overwrite existing table
|
| 792 |
+
|
| 793 |
+
Returns:
|
| 794 |
+
Dictionary with statistics
|
| 795 |
+
"""
|
| 796 |
+
logger.info(f"Processing {len(df)} forum posts using sequential processing...")
|
| 797 |
+
|
| 798 |
+
posts = df.to_dict('records')
|
| 799 |
+
batch_data = (1, posts, self.configs, self.api_key, overwrite, self.forum_output_config)
|
| 800 |
+
result = process_forum_batch_worker(batch_data)
|
| 801 |
+
|
| 802 |
+
return {
|
| 803 |
+
'total_processed': result.get('total_processed', 0),
|
| 804 |
+
'total_stored': result.get('total_stored', 0),
|
| 805 |
+
'failed_count': result.get('failed_count', 0),
|
| 806 |
+
'relevant_count': result.get('relevant_count', 0),
|
| 807 |
+
'not_relevant_count': result.get('not_relevant_count', 0),
|
| 808 |
+
'products_mentioned_count': result.get('products_mentioned_count', 0),
|
| 809 |
+
'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
|
| 810 |
+
'positive_sentiment_count': result.get('positive_sentiment_count', 0),
|
| 811 |
+
'negative_sentiment_count': result.get('negative_sentiment_count', 0),
|
| 812 |
+
'current_owner_count': result.get('current_owner_count', 0),
|
| 813 |
+
'potential_buyer_count': result.get('potential_buyer_count', 0),
|
| 814 |
+
'primary_focus_count': result.get('primary_focus_count', 0),
|
| 815 |
+
'failed_batches': 0 if result.get('success', False) else 1
|
| 816 |
+
}
|
| 817 |
+
|
| 818 |
+
# ---- Comment Processing ----
|
| 819 |
+
|
| 820 |
+
def process_comments_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
|
| 821 |
+
"""
|
| 822 |
+
Process social media comments using parallel workers.
|
| 823 |
+
|
| 824 |
+
Args:
|
| 825 |
+
df: DataFrame containing comments
|
| 826 |
+
overwrite: Whether to overwrite existing table
|
| 827 |
+
|
| 828 |
+
Returns:
|
| 829 |
+
Dictionary with aggregated statistics
|
| 830 |
+
"""
|
| 831 |
+
comments = df.to_dict('records')
|
| 832 |
+
total_comments = len(comments)
|
| 833 |
+
|
| 834 |
+
logger.info(f"Processing {total_comments} comments using parallel processing...")
|
| 835 |
+
|
| 836 |
+
num_workers = self.calculate_num_workers()
|
| 837 |
+
|
| 838 |
+
parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
|
| 839 |
+
min_batch = parallel_config.get('min_batch_size', 20)
|
| 840 |
+
max_batch = parallel_config.get('max_batch_size', 400)
|
| 841 |
+
|
| 842 |
+
batch_size = calculate_optimal_batch_size(total_comments, num_workers, min_batch, max_batch)
|
| 843 |
+
logger.info(f"Comment batch size: {batch_size}")
|
| 844 |
+
|
| 845 |
+
# Create batches
|
| 846 |
+
batches = []
|
| 847 |
+
for i in range(0, total_comments, batch_size):
|
| 848 |
+
batch = comments[i:i + batch_size]
|
| 849 |
+
batch_num = (i // batch_size) + 1
|
| 850 |
+
batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.comment_output_config))
|
| 851 |
+
|
| 852 |
+
total_batches = len(batches)
|
| 853 |
+
logger.info(f"Split into {total_batches} comment batches")
|
| 854 |
+
|
| 855 |
+
# Process in parallel
|
| 856 |
+
with Pool(processes=num_workers) as pool:
|
| 857 |
+
results = pool.map(process_comment_batch_worker, batches)
|
| 858 |
+
|
| 859 |
+
return aggregate_results(results)
|
| 860 |
+
|
| 861 |
+
def process_comments_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
|
| 862 |
+
"""
|
| 863 |
+
Process social media comments sequentially (for debugging).
|
| 864 |
+
|
| 865 |
+
Args:
|
| 866 |
+
df: DataFrame containing comments
|
| 867 |
+
overwrite: Whether to overwrite existing table
|
| 868 |
+
|
| 869 |
+
Returns:
|
| 870 |
+
Dictionary with statistics
|
| 871 |
+
"""
|
| 872 |
+
logger.info(f"Processing {len(df)} comments using sequential processing...")
|
| 873 |
+
|
| 874 |
+
comments = df.to_dict('records')
|
| 875 |
+
batch_data = (1, comments, self.configs, self.api_key, overwrite, self.comment_output_config)
|
| 876 |
+
result = process_comment_batch_worker(batch_data)
|
| 877 |
+
|
| 878 |
+
return {
|
| 879 |
+
'total_processed': result.get('total_processed', 0),
|
| 880 |
+
'total_stored': result.get('total_stored', 0),
|
| 881 |
+
'failed_count': result.get('failed_count', 0),
|
| 882 |
+
'relevant_count': result.get('relevant_count', 0),
|
| 883 |
+
'not_relevant_count': result.get('not_relevant_count', 0),
|
| 884 |
+
'products_mentioned_count': result.get('products_mentioned_count', 0),
|
| 885 |
+
'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
|
| 886 |
+
'positive_sentiment_count': result.get('positive_sentiment_count', 0),
|
| 887 |
+
'negative_sentiment_count': result.get('negative_sentiment_count', 0),
|
| 888 |
+
'current_owner_count': result.get('current_owner_count', 0),
|
| 889 |
+
'potential_buyer_count': result.get('potential_buyer_count', 0),
|
| 890 |
+
'primary_focus_count': result.get('primary_focus_count', 0),
|
| 891 |
+
'failed_batches': 0 if result.get('success', False) else 1
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
# ---- Unified Processing ----
|
| 895 |
+
|
| 896 |
+
def _log_source_summary(self, source_name: str, stats: Dict[str, Any], processing_time: float) -> None:
|
| 897 |
+
"""
|
| 898 |
+
Log processing summary for a data source.
|
| 899 |
+
|
| 900 |
+
Args:
|
| 901 |
+
source_name: Name of the data source
|
| 902 |
+
stats: Processing statistics
|
| 903 |
+
processing_time: Time taken in seconds
|
| 904 |
+
"""
|
| 905 |
+
logger.info(f" --- {source_name} ---")
|
| 906 |
+
logger.info(f" Total processed: {stats.get('total_processed', 0)}")
|
| 907 |
+
logger.info(f" Successfully stored: {stats.get('total_stored', 0)}")
|
| 908 |
+
logger.info(f" Failed: {stats.get('failed_count', 0)}")
|
| 909 |
+
logger.info(f" Relevant: {stats.get('relevant_count', 0)}")
|
| 910 |
+
logger.info(f" Not relevant: {stats.get('not_relevant_count', 0)}")
|
| 911 |
+
logger.info(f" Product mentions: {stats.get('products_mentioned_count', 0)}")
|
| 912 |
+
logger.info(f" Competitor mentions: {stats.get('competitors_mentioned_count', 0)}")
|
| 913 |
+
logger.info(f" Positive sentiment: {stats.get('positive_sentiment_count', 0)}")
|
| 914 |
+
logger.info(f" Negative sentiment: {stats.get('negative_sentiment_count', 0)}")
|
| 915 |
+
logger.info(f" Current owners: {stats.get('current_owner_count', 0)}")
|
| 916 |
+
logger.info(f" Potential buyers: {stats.get('potential_buyer_count', 0)}")
|
| 917 |
+
logger.info(f" Primary focus: {stats.get('primary_focus_count', 0)}")
|
| 918 |
+
if stats.get('failed_batches', 0) > 0:
|
| 919 |
+
logger.info(f" Failed batches: {stats['failed_batches']}")
|
| 920 |
+
logger.info(f" Processing time: {processing_time:.2f} seconds")
|
| 921 |
+
if stats.get('total_processed', 0) > 0:
|
| 922 |
+
logger.info(f" Average per item: {processing_time / stats['total_processed']:.2f} seconds")
|
| 923 |
+
|
| 924 |
+
def run(
|
| 925 |
+
self,
|
| 926 |
+
limit: int = None,
|
| 927 |
+
overwrite: bool = False,
|
| 928 |
+
sequential: bool = False,
|
| 929 |
+
data_source: str = 'all'
|
| 930 |
+
):
|
| 931 |
+
"""
|
| 932 |
+
Run the complete processing pipeline.
|
| 933 |
+
|
| 934 |
+
Args:
|
| 935 |
+
limit: Optional limit on items to process per source
|
| 936 |
+
overwrite: Whether to overwrite existing table
|
| 937 |
+
sequential: Use sequential processing instead of parallel
|
| 938 |
+
data_source: Which data source to process ('forums', 'comments', 'all')
|
| 939 |
+
"""
|
| 940 |
+
try:
|
| 941 |
+
logger.info("=" * 80)
|
| 942 |
+
logger.info("Starting Brand Sentiment Analysis Workflow")
|
| 943 |
+
logger.info(f"Brand: {self.configs['brand'].get('brand', {}).get('name', 'Unknown')}")
|
| 944 |
+
logger.info(f"Mode: {'SEQUENTIAL' if sequential else 'PARALLEL'}")
|
| 945 |
+
logger.info(f"Data source: {data_source}")
|
| 946 |
+
logger.info("=" * 80)
|
| 947 |
+
|
| 948 |
+
process_forums = data_source in ('forums', 'all')
|
| 949 |
+
process_comments = data_source in ('comments', 'all')
|
| 950 |
+
|
| 951 |
+
# Track results for summary
|
| 952 |
+
forum_stats = None
|
| 953 |
+
forum_time = 0.0
|
| 954 |
+
comment_stats = None
|
| 955 |
+
comment_time = 0.0
|
| 956 |
+
|
| 957 |
+
# ---- Process Forums ----
|
| 958 |
+
if process_forums:
|
| 959 |
+
logger.info("-" * 40)
|
| 960 |
+
logger.info("Processing FORUMS")
|
| 961 |
+
logger.info("-" * 40)
|
| 962 |
+
|
| 963 |
+
df_posts = self.fetch_forum_posts(limit)
|
| 964 |
+
|
| 965 |
+
if df_posts.empty:
|
| 966 |
+
logger.warning("No forum posts to process")
|
| 967 |
+
else:
|
| 968 |
+
start_time = datetime.now()
|
| 969 |
+
|
| 970 |
+
if sequential:
|
| 971 |
+
forum_stats = self.process_forums_sequential(df_posts, overwrite)
|
| 972 |
+
else:
|
| 973 |
+
forum_stats = self.process_forums_parallel(df_posts, overwrite)
|
| 974 |
+
|
| 975 |
+
forum_time = (datetime.now() - start_time).total_seconds()
|
| 976 |
+
|
| 977 |
+
# ---- Process Comments ----
|
| 978 |
+
if process_comments:
|
| 979 |
+
logger.info("-" * 40)
|
| 980 |
+
logger.info("Processing SOCIAL MEDIA COMMENTS")
|
| 981 |
+
logger.info("-" * 40)
|
| 982 |
+
|
| 983 |
+
df_comments = self.fetch_comments(limit)
|
| 984 |
+
|
| 985 |
+
if df_comments.empty:
|
| 986 |
+
logger.warning("No social media comments to process")
|
| 987 |
+
else:
|
| 988 |
+
start_time = datetime.now()
|
| 989 |
+
|
| 990 |
+
if sequential:
|
| 991 |
+
comment_stats = self.process_comments_sequential(df_comments, overwrite)
|
| 992 |
+
else:
|
| 993 |
+
comment_stats = self.process_comments_parallel(df_comments, overwrite)
|
| 994 |
+
|
| 995 |
+
comment_time = (datetime.now() - start_time).total_seconds()
|
| 996 |
+
|
| 997 |
+
# ---- Summary ----
|
| 998 |
+
logger.info("=" * 80)
|
| 999 |
+
logger.info("Processing Summary:")
|
| 1000 |
+
logger.info(f" Mode: {'Sequential' if sequential else 'Parallel'}")
|
| 1001 |
+
logger.info(f" Data source: {data_source}")
|
| 1002 |
+
|
| 1003 |
+
if forum_stats is not None:
|
| 1004 |
+
self._log_source_summary("Forums", forum_stats, forum_time)
|
| 1005 |
+
|
| 1006 |
+
if comment_stats is not None:
|
| 1007 |
+
self._log_source_summary("Social Media Comments", comment_stats, comment_time)
|
| 1008 |
+
|
| 1009 |
+
logger.info("=" * 80)
|
| 1010 |
+
|
| 1011 |
+
except Exception as e:
|
| 1012 |
+
logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
|
| 1013 |
+
raise
|
| 1014 |
+
|
| 1015 |
+
finally:
|
| 1016 |
+
self.snowflake.close_connection()
|
| 1017 |
+
logger.info("Snowflake connection closed")
|
| 1018 |
+
|
| 1019 |
+
|
| 1020 |
+
# ============================================================
|
| 1021 |
+
# Legacy compatibility - keep old function names working
|
| 1022 |
+
# ============================================================
|
| 1023 |
+
|
| 1024 |
+
def prepare_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Legacy wrapper for forum output preparation.

    Deprecated alias kept so existing callers keep working; new code
    should call prepare_forum_output_dataframe() directly.
    """
    return prepare_forum_output_dataframe(df)
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def process_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """Legacy wrapper for forum batch worker.

    Deprecated alias kept so existing callers keep working; new code
    should call process_forum_batch_worker() directly.
    """
    return process_forum_batch_worker(batch_data)
|
| 1032 |
+
|
| 1033 |
+
|
| 1034 |
+
# ============================================================
|
| 1035 |
+
# Main Entry Point
|
| 1036 |
+
# ============================================================
|
| 1037 |
+
|
| 1038 |
+
def main():
    """Command-line entry point.

    Parses CLI options, builds a BrandSentimentProcessor, and runs the
    full pipeline. Run with ``--help`` for option details.
    """
    parser = argparse.ArgumentParser(
        description="Brand Sentiment Analysis - Analyze forum posts and social media comments for brand intelligence"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of items to process per source (default: all unprocessed)'
    )
    # action='store_true' already implies default=False, so no explicit
    # default is passed for the boolean flags.
    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Overwrite existing Snowflake table (default: append)'
    )
    parser.add_argument(
        '--sequential',
        action='store_true',
        help='Use sequential processing instead of parallel (for debugging)'
    )
    parser.add_argument(
        '--config-dir',
        type=str,
        default=None,
        help='Path to configuration directory (default: config_files/)'
    )
    parser.add_argument(
        '--data-source',
        type=str,
        choices=['forums', 'comments', 'all'],
        default='all',
        help='Data source to process: forums, comments, or all (default: all)'
    )

    args = parser.parse_args()

    # Initialize and run
    processor = BrandSentimentProcessor(config_dir=args.config_dir)
    processor.run(
        limit=args.limit,
        overwrite=args.overwrite,
        sequential=args.sequential,
        data_source=args.data_source
    )
|
| 1085 |
+
|
| 1086 |
+
|
| 1087 |
+
if __name__ == "__main__":
|
| 1088 |
+
main()
|
processing_brand_sentiment/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities module for brand sentiment analysis.
|
| 3 |
+
Contains HTML parsing and other helper functions.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .html_parser import HTMLParser
|
| 7 |
+
|
| 8 |
+
__all__ = ['HTMLParser']
|
processing_brand_sentiment/utils/html_parser.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HTML Parser utility for extracting content from forum posts.
|
| 3 |
+
Handles the complex HTML structure where replies contain quoted parent content.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import html
|
| 8 |
+
from typing import Dict, Optional, Tuple
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class HTMLParser:
    """
    Parses HTML content from forum posts to extract actual reply content
    and quoted parent content separately.
    """

    def __init__(self):
        """Initialize the HTML parser."""
        pass

    def parse_post_content(self, html_content: str) -> Dict[str, Optional[str]]:
        """
        Parse HTML post content to extract reply and quoted content.

        The forum posts have a structure where:
        - <blockquote> contains the quoted parent post
        - Content outside blockquote is the actual reply

        Example input:
        <blockquote><span class="post-id">125015</span>
        <p class="quote-heading"><strong>JackO</strong><em> - Feb 3, 2015</em></p>
        <br /><p>Parent content here...</p></blockquote>
        <br /><p>Actual reply content here...</p>

        Args:
            html_content: Raw HTML content from POST_CONTENT field

        Returns:
            Dictionary with:
            - reply_content: The actual reply text (cleaned)
            - quoted_content: The quoted parent text (cleaned), if any
            - quoted_author: Author of the quoted post, if any
            - quoted_date: Date of the quoted post, if any
            - has_quote: Boolean indicating if post contains a quote
        """
        if not html_content or not html_content.strip():
            return {
                "reply_content": "",
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            quoted_content = None
            quoted_author = None
            quoted_date = None
            has_quote = False

            blockquotes = soup.find_all('blockquote')

            if blockquotes:
                has_quote = True
                quote_parts = []

                for blockquote in blockquotes:
                    # Extract quote heading info (author and date), then drop
                    # the heading so it doesn't pollute the quote text.
                    quote_heading = blockquote.find('p', class_='quote-heading')
                    if quote_heading:
                        author_tag = quote_heading.find('strong')
                        if author_tag:
                            quoted_author = author_tag.get_text(strip=True)

                        date_tag = quote_heading.find('em')
                        if date_tag:
                            # Remove the leading " - " separator (e.g. "- Feb 3, 2015").
                            # str.lstrip(' - ') would treat its argument as a
                            # character SET, not a prefix, so anchor a regex instead.
                            quoted_date = re.sub(r'^[\s-]+', '', date_tag.get_text(strip=True))

                        quote_heading.decompose()

                    # Remove post-id spans so only the quoted body remains.
                    for post_id_span in blockquote.find_all('span', class_='post-id'):
                        post_id_span.decompose()

                    quote_text = self._clean_text(blockquote.get_text())
                    if quote_text:
                        quote_parts.append(quote_text)

                    # Remove the blockquote from the soup so the remaining
                    # text is just the actual reply.
                    blockquote.decompose()

                quoted_content = " ".join(quote_parts) if quote_parts else None

            # Whatever text survived the decompose() calls is the reply.
            reply_content = self._clean_text(soup.get_text())

            return {
                "reply_content": reply_content,
                "quoted_content": quoted_content,
                "quoted_author": quoted_author,
                "quoted_date": quoted_date,
                "has_quote": has_quote
            }

        except Exception as e:
            logger.warning(f"Error parsing HTML content: {e}")
            # Fallback: regex-strip tags so we still return usable text.
            return {
                "reply_content": self._clean_text(self._strip_html_tags(html_content)),
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

    def _clean_text(self, text: str) -> str:
        """
        Clean extracted text by removing extra whitespace and normalizing.

        Decodes HTML entities and collapses all whitespace runs (including
        newlines) into single spaces.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Decode HTML entities
        text = html.unescape(text)

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

    def _strip_html_tags(self, html_content: str) -> str:
        """
        Fallback method to strip HTML tags if BeautifulSoup fails.

        Args:
            html_content: HTML content

        Returns:
            Text without HTML tags
        """
        # Remove HTML tags
        clean = re.sub(r'<[^>]+>', ' ', html_content)
        # Decode entities
        clean = html.unescape(clean)
        # Clean whitespace
        clean = re.sub(r'\s+', ' ', clean)
        return clean.strip()

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extract plain text from HTML content, preserving readability.

        NOTE: the <br>/<p> newlines inserted below are collapsed to single
        spaces by _clean_text; they only guarantee word separation between
        adjacent blocks, not visible line breaks in the output.

        Args:
            html_content: HTML content

        Returns:
            Plain text version
        """
        if not html_content:
            return ""

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Add newlines for block elements
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for p in soup.find_all('p'):
                p.append('\n')

            text = soup.get_text()
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Error extracting plain text: {e}")
            return self._clean_text(self._strip_html_tags(html_content))

    def build_thread_context(
        self,
        thread_title: Optional[str],
        first_post_content: Optional[str],
        category_title: Optional[str] = None,
        category_topic: Optional[str] = None
    ) -> str:
        """
        Build a context string from thread information.

        Args:
            thread_title: Title of the discussion thread
            first_post_content: Content of the first post in the thread
            category_title: Category title
            category_topic: Category topic

        Returns:
            Formatted context string (parts joined with " | "), empty if
            no information was provided
        """
        context_parts = []

        if category_title:
            context_parts.append(f"Category: {category_title}")

        if category_topic:
            context_parts.append(f"Topic: {category_topic}")

        if thread_title:
            context_parts.append(f"Thread: {thread_title}")

        if first_post_content:
            # Parse and clean the first post content
            parsed = self.parse_post_content(first_post_content)
            first_post_text = parsed.get("reply_content", "")
            if first_post_text:
                # Truncate if too long to keep prompt context bounded
                if len(first_post_text) > 500:
                    first_post_text = first_post_text[:500] + "..."
                context_parts.append(f"Original discussion: {first_post_text}")

        return " | ".join(context_parts) if context_parts else ""

    def is_empty_content(self, html_content: str) -> bool:
        """
        Check if HTML content is effectively empty.

        Args:
            html_content: HTML content to check

        Returns:
            True if content is empty or contains no meaningful text
        """
        if not html_content:
            return True

        text = self.extract_plain_text(html_content)
        return len(text.strip()) == 0
|
processing_brand_sentiment/workflow/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Workflow module for brand sentiment analysis.
|
| 3 |
+
Contains the LangGraph orchestrators and agent implementations.
|
| 4 |
+
Supports both forum posts and social media comments.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .orchestrator import BrandAnalysisWorkflow
|
| 8 |
+
from .comment_orchestrator import CommentAnalysisWorkflow
|
| 9 |
+
|
| 10 |
+
__all__ = ['BrandAnalysisWorkflow', 'CommentAnalysisWorkflow']
|
processing_brand_sentiment/workflow/agents/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agents module for brand sentiment analysis v4.0.
|
| 3 |
+
|
| 4 |
+
Contains specialized agents for the 4-stage pipeline:
|
| 5 |
+
1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (forums)
|
| 6 |
+
CommentPreprocessorAgent - Plain text cleaning, keyword detection (comments)
|
| 7 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction
|
| 8 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis
|
| 9 |
+
4. OutputValidatorAgent - Rule-based validation
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from .base_agent import BaseAgent
|
| 13 |
+
from .content_preprocessor_agent import ContentPreprocessorAgent
|
| 14 |
+
from .comment_preprocessor_agent import CommentPreprocessorAgent
|
| 15 |
+
from .sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 16 |
+
from .sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 17 |
+
from .output_validator_agent import OutputValidatorAgent
|
| 18 |
+
|
| 19 |
+
# Legacy imports for backward compatibility
|
| 20 |
+
from .preprocessor_agent import PreprocessorAgent
|
| 21 |
+
from .relevance_validator_agent import RelevanceValidatorAgent
|
| 22 |
+
from .sabian_analyzer_agent import SabianAnalyzerAgent
|
| 23 |
+
|
| 24 |
+
__all__ = [
|
| 25 |
+
# Base
|
| 26 |
+
'BaseAgent',
|
| 27 |
+
|
| 28 |
+
# New agents (v4.0)
|
| 29 |
+
'ContentPreprocessorAgent',
|
| 30 |
+
'CommentPreprocessorAgent',
|
| 31 |
+
'SabianRelevanceExtractionAgent',
|
| 32 |
+
'SabianSentimentAnalyzerAgent',
|
| 33 |
+
'OutputValidatorAgent',
|
| 34 |
+
|
| 35 |
+
# Legacy agents (for backward compatibility)
|
| 36 |
+
'PreprocessorAgent',
|
| 37 |
+
'RelevanceValidatorAgent',
|
| 38 |
+
'SabianAnalyzerAgent'
|
| 39 |
+
]
|
processing_brand_sentiment/workflow/agents/base_agent.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Agent class for all agents in the brand sentiment analysis workflow.
|
| 3 |
+
Provides a common interface and structure for extensibility.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseAgent(ABC):
    """
    Abstract base class for all agents in the brand sentiment analysis workflow.
    Provides common functionality and enforces consistent interface.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        """
        Initialize the base agent.

        Args:
            name: Name of the agent
            config: Configuration dictionary for the agent
        """
        self.name = name
        self.config = config
        # Model settings fall back to sensible defaults when absent.
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.
        Concrete agent classes must implement this.

        Args:
            input_data: Dictionary containing input data for processing

        Returns:
            Dictionary containing processing results
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.

        Args:
            input_data: Dictionary containing input data

        Returns:
            True if input is valid, False otherwise
        """
        pass

    def get_name(self) -> str:
        """Return the agent name."""
        return self.name

    def get_config(self) -> Dict[str, Any]:
        """Return the agent configuration."""
        return self.config

    def log_processing(self, message: str, level: str = "info"):
        """
        Log processing information, prefixed with the agent name.

        Args:
            message: Log message
            level: Log level (info, warning, error, debug)
        """
        # Unknown level names fall back to logger.info.
        emit = getattr(logger, level, logger.info)
        emit(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Handle errors consistently across all agents.

        Args:
            error: The exception that occurred
            context: Additional context about the error

        Returns:
            Error dictionary with details
        """
        suffix = f" ({context})" if context else ""
        logger.error(f"Error in {self.name}{suffix}: {str(error)}")

        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context
        }

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse an LLM response that may wrap JSON in a markdown code block.

        Args:
            response_content: Raw response content from LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        content = response_content.strip()

        # Strip a ```json or plain ``` fence if present; the json-tagged
        # fence is checked first so the generic one never mis-trims it.
        for fence in ("```json", "```"):
            if content.startswith(fence):
                content = content[len(fence):]
                if content.endswith("```"):
                    content = content[:-3]
                content = content.strip()
                break

        return json.loads(content)

    def _safe_get(self, data: Dict[str, Any], key: str, default: Any = None) -> Any:
        """
        Safely get a value from a dictionary with a default.

        Args:
            data: Dictionary to get value from
            key: Key to look up
            default: Default value if key not found

        Returns:
            Value from dictionary or default
        """
        return data.get(key, default)

    def _ensure_list(self, value: Any) -> list:
        """
        Ensure a value is a list.

        Strings are split on commas (empty pieces dropped); None becomes
        an empty list; anything else is wrapped in a single-item list.

        Args:
            value: Value to convert

        Returns:
            List version of value
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            pieces = (piece.strip() for piece in value.split(","))
            return [piece for piece in pieces if piece]
        return [value]
|
processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comment Preprocessor Agent for brand sentiment analysis on social media comments.
|
| 3 |
+
|
| 4 |
+
Extends ContentPreprocessorAgent but handles plain text (no HTML parsing).
|
| 5 |
+
Builds context from content title, content description, and parent comment text
|
| 6 |
+
instead of thread title and first post.
|
| 7 |
+
|
| 8 |
+
Reuses: keyword sets, product alias mapping, language detection, relevance screening.
|
| 9 |
+
Overrides: process() method for plain text handling and comment-specific context building.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Dict, Any, Optional
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
from .content_preprocessor_agent import ContentPreprocessorAgent
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CommentPreprocessorAgent(ContentPreprocessorAgent):
    """Preprocessor for social-media comments.

    Reuses the keyword sets, product-alias mapping, language detection and
    relevance screening inherited from ContentPreprocessorAgent, but:

    - treats input as plain text (no HTML parsing),
    - builds context from the post title/description and the parent comment
      instead of a forum thread title and first post,
    - expects comment-specific input fields (comment_sk, comment_text).
    """

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """Initialize from agent and brand configuration.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords, products, and aliases
        """
        super().__init__(config, brand_config)
        # The parent constructor names the agent "ContentPreprocessorAgent";
        # override so log lines identify this subclass.
        self.name = "CommentPreprocessorAgent"

        logger.info(
            f"CommentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Return True when the comment payload carries its mandatory fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if both comment_sk and comment_text are present
        """
        return "comment_sk" in input_data and "comment_text" in input_data

    def _build_comment_context(
        self,
        content_title: Optional[str] = None,
        content_description: Optional[str] = None,
        parent_comment_text: Optional[str] = None
    ) -> str:
        """Assemble a context string from post metadata and the parent comment.

        Args:
            content_title: Title of the social media post/content
            content_description: Description/message of the social media post
            parent_comment_text: Text of the parent comment (if this is a reply)

        Returns:
            Pipe-separated context string (empty when no context is available)
        """
        def clip(text: str) -> str:
            # Keep the context compact: cap each piece at 500 characters.
            return text[:500] + "..." if len(text) > 500 else text

        pieces = []
        if content_title:
            pieces.append(f"Post title: {content_title}")
        if content_description:
            pieces.append(f"Post description: {clip(content_description)}")
        if parent_comment_text:
            pieces.append(f"Parent comment: {clip(parent_comment_text)}")
        return " | ".join(pieces)

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Run a social media comment through the preprocessing pipeline.

        Comments arrive as plain text, so no HTML parsing is performed.
        Context comes from the post title/description and, for replies,
        the parent comment.

        Args:
            input_data: Dictionary containing comment data with at least:
                - comment_sk: Comment surrogate key
                - comment_text: Raw comment text (plain text)
                - content_title: Title of the post (optional)
                - content_description: Description of the post (optional)
                - parent_comment_text: Parent comment text if reply (optional)

        Returns:
            Dictionary with preprocessing results
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_sk, comment_text)",
                    **input_data
                }

            raw_text = input_data.get("comment_text", "")
            cleaned = raw_text.strip() if raw_text else ""
            # Everything except the raw text is carried through to the output.
            passthrough = {k: v for k, v in input_data.items() if k != "comment_text"}

            # Empty or too-short comments are short-circuited as irrelevant.
            if not cleaned or len(cleaned) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": cleaned,
                    "quoted_content": None,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **passthrough
                }

            # Keyword-based relevance screen (inherited).
            relevance = self._check_relevance(cleaned)
            primary_hit = relevance.get("has_primary_keywords", False)

            # Comment-specific context replaces the forum thread context.
            context = self._build_comment_context(
                content_title=input_data.get("content_title"),
                content_description=input_data.get("content_description"),
                parent_comment_text=input_data.get("parent_comment_text")
            )

            # Language detection and entity extraction (all inherited).
            lang = self._detect_language(cleaned, primary_hit)
            products = self._extract_mentioned_products(cleaned)
            competitors = self._extract_mentioned_competitors(cleaned)

            # A parent comment plays the role a quote plays in forum posts.
            parent_text = input_data.get("parent_comment_text")
            is_reply = parent_text is not None and str(parent_text).strip() != ""

            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": cleaned,
                "quoted_content": parent_text if is_reply else None,
                "has_quote": is_reply,
                "quoted_author": None,
                "raw_thread_context": context,

                # Language detection
                "detected_language": lang["language"],
                "language_code": lang["language_code"],
                "is_english": lang["is_english"],
                "language_confidence": lang["confidence"],
                "language_detection_skipped": lang.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance["preliminary_relevant"],
                "needs_relevance_validation": relevance["needs_relevance_validation"],
                "relevance_keywords_found": relevance["found_keywords"],
                "relevance_type": relevance["relevance_type"],
                "relevance_confidence": relevance["relevance_confidence"],
                "has_primary_keywords": primary_hit,

                # Initial extractions
                "products_detected": products,
                "competitors_detected": competitors,

                # Preserve original data (raw text excluded to avoid duplication)
                **passthrough
            }
            result["original_text"] = raw_text

            self.log_processing(
                f"Processed comment {input_data.get('comment_sk')}: "
                f"lang={lang['language']}, "
                f"relevant={relevance['preliminary_relevant']}, "
                f"needs_validation={relevance['needs_relevance_validation']}, "
                f"products={products}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing comment {input_data.get('comment_sk')}")
|
processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py
ADDED
|
@@ -0,0 +1,570 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Content Preprocessor Agent for brand sentiment analysis.
|
| 3 |
+
Handles HTML parsing, text cleaning, language detection, product alias mapping,
|
| 4 |
+
and initial relevance screening. This is a deterministic agent (no LLM calls).
|
| 5 |
+
|
| 6 |
+
Enhanced version with:
|
| 7 |
+
- Product alias mapping (B8 -> B8X)
|
| 8 |
+
- Smart language detection (skip for short texts)
|
| 9 |
+
- Always process if primary keywords found
|
| 10 |
+
- Better content separation
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
from typing import Dict, Any, List, Optional, Set
|
| 15 |
+
from lingua import Language, LanguageDetectorBuilder
|
| 16 |
+
import logging
|
| 17 |
+
|
| 18 |
+
from .base_agent import BaseAgent
|
| 19 |
+
from utils.html_parser import HTMLParser
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ContentPreprocessorAgent(BaseAgent):
|
| 25 |
+
"""
|
| 26 |
+
Agent that preprocesses forum posts:
|
| 27 |
+
- Parses HTML to extract reply and quoted content
|
| 28 |
+
- Cleans and normalizes text
|
| 29 |
+
- Maps product aliases to canonical names
|
| 30 |
+
- Detects language (with smart handling for short texts)
|
| 31 |
+
- Performs initial keyword-based relevance screening
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
# Maps lingua Language enum members to ISO 639-1 two-letter codes.
# Detected languages absent from this table are reported as "unknown".
LINGUA_TO_ISO = {
    Language.ENGLISH: "en",
    Language.SPANISH: "es",
    Language.FRENCH: "fr",
    Language.GERMAN: "de",
    Language.ITALIAN: "it",
    Language.PORTUGUESE: "pt",
    Language.RUSSIAN: "ru",
    Language.JAPANESE: "ja",
    Language.KOREAN: "ko",
    Language.CHINESE: "zh",
    Language.ARABIC: "ar",
    Language.HINDI: "hi",
    Language.DUTCH: "nl",
    Language.SWEDISH: "sv",
    Language.POLISH: "pl",
    Language.TURKISH: "tr"
}
|
| 53 |
+
|
| 54 |
+
def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
    """Set up the preprocessor from agent and brand configuration.

    Args:
        config: Agent configuration
        brand_config: Brand-specific configuration with keywords, products, and aliases
    """
    super().__init__("ContentPreprocessorAgent", config)
    self.brand_config = brand_config
    self.html_parser = HTMLParser()

    # Preprocessing thresholds and toggles.
    pp_cfg = brand_config.get("preprocessing", {})
    self.min_length_for_lang_detection = pp_cfg.get("min_length_for_language_detection", 50)
    self.default_language = pp_cfg.get("default_language_for_short_text", "English")
    self.always_process_primary = pp_cfg.get("always_process_if_primary_keyword", True)
    self.min_content_length = pp_cfg.get("min_content_length", 3)

    # lingua detector built over all supported languages.
    self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

    # Derived lookup structures for fast keyword / alias matching.
    self._build_keyword_sets()
    self._build_alias_mappings()

    logger.info(
        f"ContentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
        f"{len(self.product_aliases)} product aliases"
    )
|
| 90 |
+
|
| 91 |
+
def _build_keyword_sets(self) -> None:
    """Precompute lowercase keyword sets for fast relevance checking.

    Populates: primary_keywords, contextual_keywords, cymbal_context_keywords,
    competitor_keywords, competitor_name_map (alias -> canonical name),
    product_keywords, and products_list (original casing preserved).
    """
    rel_cfg = self.brand_config.get("relevance_keywords", {})

    def lowered_keywords(section: str) -> Set[str]:
        # Each relevance section carries a "keywords" list.
        return {k.lower() for k in rel_cfg.get(section, {}).get("keywords", [])}

    # Primary: definitive brand mentions; contextual: ambiguous terms needing
    # disambiguation; cymbal_context: terms that help disambiguate them.
    self.primary_keywords: Set[str] = lowered_keywords("primary")
    self.contextual_keywords: Set[str] = lowered_keywords("contextual")
    self.cymbal_context_keywords: Set[str] = lowered_keywords("cymbal_context")

    # Competitor names and aliases, all mapped back to the canonical name.
    self.competitor_keywords: Set[str] = set()
    self.competitor_name_map: Dict[str, str] = {}
    for comp in self.brand_config.get("brand", {}).get("competitors", []):
        if isinstance(comp, dict):
            name = comp.get("name", "")
            for term in [name] + list(comp.get("aliases", [])):
                key = term.lower()
                self.competitor_keywords.add(key)
                self.competitor_name_map[key] = name
        else:
            # Plain-string competitor entry: it is its own canonical form.
            key = str(comp).lower()
            self.competitor_keywords.add(key)
            self.competitor_name_map[key] = str(comp)

    products = self.brand_config.get("brand", {}).get("products", [])
    self.product_keywords: Set[str] = {p.lower() for p in products}
    self.products_list = products  # original casing kept for output

    logger.debug(
        f"Built keyword sets: {len(self.primary_keywords)} primary, "
        f"{len(self.contextual_keywords)} contextual, "
        f"{len(self.product_keywords)} products, "
        f"{len(self.competitor_keywords)} competitor terms"
    )
|
| 137 |
+
|
| 138 |
+
def _build_alias_mappings(self) -> None:
    """Build the alias -> canonical product mapping from brand configuration.

    Also registers every alias that is not already a primary keyword as a
    contextual keyword, so a post mentioning only an alias (e.g. "b8" for
    "B8X") still triggers the relevance-validation path.
    """
    raw_aliases = self.brand_config.get("brand", {}).get("product_aliases", {})
    self.product_aliases: Dict[str, str] = {
        alias.lower(): canonical for alias, canonical in raw_aliases.items()
    }

    for alias in self.product_aliases:
        if alias not in self.primary_keywords:
            self.contextual_keywords.add(alias)

    logger.debug(f"Built {len(self.product_aliases)} product alias mappings")
|
| 154 |
+
|
| 155 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
    """Return True when the post payload carries its mandatory fields.

    Args:
        input_data: Input dictionary

    Returns:
        True if both post_id and post_content are present
    """
    return "post_id" in input_data and "post_content" in input_data
|
| 167 |
+
|
| 168 |
+
def _detect_language(self, text: str, has_primary_keywords: bool = False) -> Dict[str, Any]:
|
| 169 |
+
"""
|
| 170 |
+
Detect the language of text using lingua library.
|
| 171 |
+
|
| 172 |
+
Enhanced logic:
|
| 173 |
+
- Skip detection for short texts (< min_length_for_lang_detection chars)
|
| 174 |
+
- Always return English if primary Sabian keywords are found
|
| 175 |
+
|
| 176 |
+
Args:
|
| 177 |
+
text: Text to analyze
|
| 178 |
+
has_primary_keywords: Whether primary Sabian keywords were found
|
| 179 |
+
|
| 180 |
+
Returns:
|
| 181 |
+
Dictionary with language detection results
|
| 182 |
+
"""
|
| 183 |
+
try:
|
| 184 |
+
cleaned_text = text.strip()
|
| 185 |
+
|
| 186 |
+
# If text is too short, default to English
|
| 187 |
+
if len(cleaned_text) < self.min_length_for_lang_detection:
|
| 188 |
+
return {
|
| 189 |
+
"language": self.default_language,
|
| 190 |
+
"language_code": "en",
|
| 191 |
+
"is_english": True,
|
| 192 |
+
"confidence": "low",
|
| 193 |
+
"detection_skipped": True,
|
| 194 |
+
"skip_reason": f"Text too short ({len(cleaned_text)} < {self.min_length_for_lang_detection} chars)"
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
# If primary keywords found and always_process_primary is True, treat as English
|
| 198 |
+
if has_primary_keywords and self.always_process_primary:
|
| 199 |
+
# Still try to detect, but override if non-English
|
| 200 |
+
detected = self.language_detector.detect_language_of(cleaned_text)
|
| 201 |
+
|
| 202 |
+
if detected == Language.ENGLISH:
|
| 203 |
+
return {
|
| 204 |
+
"language": "English",
|
| 205 |
+
"language_code": "en",
|
| 206 |
+
"is_english": True,
|
| 207 |
+
"confidence": "high",
|
| 208 |
+
"detection_skipped": False,
|
| 209 |
+
"skip_reason": None
|
| 210 |
+
}
|
| 211 |
+
else:
|
| 212 |
+
# Primary keyword found but detected as non-English
|
| 213 |
+
# Force to English since Sabian is explicitly mentioned
|
| 214 |
+
lang_name = detected.name.capitalize() if detected else "Unknown"
|
| 215 |
+
return {
|
| 216 |
+
"language": "English",
|
| 217 |
+
"language_code": "en",
|
| 218 |
+
"is_english": True,
|
| 219 |
+
"confidence": "medium",
|
| 220 |
+
"detection_skipped": False,
|
| 221 |
+
"skip_reason": None,
|
| 222 |
+
"original_detected_language": lang_name,
|
| 223 |
+
"override_reason": "Primary Sabian keyword found, treating as English"
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
# Standard detection
|
| 227 |
+
detected = self.language_detector.detect_language_of(cleaned_text)
|
| 228 |
+
|
| 229 |
+
if detected is None:
|
| 230 |
+
return {
|
| 231 |
+
"language": self.default_language,
|
| 232 |
+
"language_code": "en",
|
| 233 |
+
"is_english": True,
|
| 234 |
+
"confidence": "low",
|
| 235 |
+
"detection_skipped": False,
|
| 236 |
+
"skip_reason": None
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
if detected == Language.ENGLISH:
|
| 240 |
+
return {
|
| 241 |
+
"language": "English",
|
| 242 |
+
"language_code": "en",
|
| 243 |
+
"is_english": True,
|
| 244 |
+
"confidence": "high",
|
| 245 |
+
"detection_skipped": False,
|
| 246 |
+
"skip_reason": None
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
|
| 250 |
+
lang_name = detected.name.capitalize()
|
| 251 |
+
|
| 252 |
+
return {
|
| 253 |
+
"language": lang_name,
|
| 254 |
+
"language_code": lang_code,
|
| 255 |
+
"is_english": False,
|
| 256 |
+
"confidence": "high",
|
| 257 |
+
"detection_skipped": False,
|
| 258 |
+
"skip_reason": None
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
except Exception as e:
|
| 262 |
+
logger.warning(f"Language detection failed: {e}")
|
| 263 |
+
return {
|
| 264 |
+
"language": self.default_language,
|
| 265 |
+
"language_code": "en",
|
| 266 |
+
"is_english": True,
|
| 267 |
+
"confidence": "low",
|
| 268 |
+
"detection_skipped": False,
|
| 269 |
+
"skip_reason": None,
|
| 270 |
+
"detection_error": str(e)
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
def _normalize_product_mentions(self, found_products: List[str]) -> List[str]:
|
| 274 |
+
"""
|
| 275 |
+
Normalize product mentions using alias mappings.
|
| 276 |
+
|
| 277 |
+
Args:
|
| 278 |
+
found_products: List of product terms found
|
| 279 |
+
|
| 280 |
+
Returns:
|
| 281 |
+
List of canonical product names
|
| 282 |
+
"""
|
| 283 |
+
normalized = []
|
| 284 |
+
for product in found_products:
|
| 285 |
+
product_lower = product.lower()
|
| 286 |
+
|
| 287 |
+
# Check if it's an alias
|
| 288 |
+
if product_lower in self.product_aliases:
|
| 289 |
+
canonical = self.product_aliases[product_lower]
|
| 290 |
+
if canonical not in normalized:
|
| 291 |
+
normalized.append(canonical)
|
| 292 |
+
# Check if it's a direct product match
|
| 293 |
+
elif product_lower in self.product_keywords:
|
| 294 |
+
# Find the original case version
|
| 295 |
+
for p in self.products_list:
|
| 296 |
+
if p.lower() == product_lower:
|
| 297 |
+
if p not in normalized:
|
| 298 |
+
normalized.append(p)
|
| 299 |
+
break
|
| 300 |
+
|
| 301 |
+
return normalized
|
| 302 |
+
|
| 303 |
+
def _check_relevance(self, text: str) -> Dict[str, Any]:
|
| 304 |
+
"""
|
| 305 |
+
Check if text is relevant to the brand using keyword matching.
|
| 306 |
+
|
| 307 |
+
Enhanced to handle product aliases.
|
| 308 |
+
|
| 309 |
+
Returns:
|
| 310 |
+
Dictionary with relevance assessment
|
| 311 |
+
"""
|
| 312 |
+
text_lower = text.lower()
|
| 313 |
+
|
| 314 |
+
# Tokenize for word boundary matching
|
| 315 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 316 |
+
|
| 317 |
+
# Also check for multi-word phrases (for aliases like "hand hammered")
|
| 318 |
+
all_aliases = set(self.product_aliases.keys())
|
| 319 |
+
|
| 320 |
+
# Check for primary keywords (definitive matches)
|
| 321 |
+
found_primary = self.primary_keywords.intersection(words)
|
| 322 |
+
|
| 323 |
+
# Check for product aliases in text
|
| 324 |
+
found_aliases = []
|
| 325 |
+
for alias in all_aliases:
|
| 326 |
+
if ' ' in alias:
|
| 327 |
+
# Multi-word alias - check in full text
|
| 328 |
+
if alias in text_lower:
|
| 329 |
+
found_aliases.append(alias)
|
| 330 |
+
elif alias in words:
|
| 331 |
+
found_aliases.append(alias)
|
| 332 |
+
|
| 333 |
+
# Map aliases to canonical products
|
| 334 |
+
alias_products = []
|
| 335 |
+
for alias in found_aliases:
|
| 336 |
+
if alias in self.product_aliases:
|
| 337 |
+
canonical = self.product_aliases[alias]
|
| 338 |
+
if canonical not in alias_products:
|
| 339 |
+
alias_products.append(canonical)
|
| 340 |
+
|
| 341 |
+
if found_primary or alias_products:
|
| 342 |
+
all_found = list(found_primary) + found_aliases
|
| 343 |
+
return {
|
| 344 |
+
"preliminary_relevant": True,
|
| 345 |
+
"needs_relevance_validation": False,
|
| 346 |
+
"found_keywords": all_found,
|
| 347 |
+
"mapped_products": alias_products,
|
| 348 |
+
"relevance_type": "primary",
|
| 349 |
+
"relevance_confidence": "high",
|
| 350 |
+
"has_primary_keywords": True
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
# Check for contextual keywords (need validation)
|
| 354 |
+
found_contextual = self.contextual_keywords.intersection(words)
|
| 355 |
+
if found_contextual:
|
| 356 |
+
# Check if there's cymbal context
|
| 357 |
+
found_cymbal_context = self.cymbal_context_keywords.intersection(words)
|
| 358 |
+
has_cymbal_context = len(found_cymbal_context) > 0
|
| 359 |
+
|
| 360 |
+
return {
|
| 361 |
+
"preliminary_relevant": True,
|
| 362 |
+
"needs_relevance_validation": True,
|
| 363 |
+
"found_keywords": list(found_contextual),
|
| 364 |
+
"cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
|
| 365 |
+
"has_cymbal_context": has_cymbal_context,
|
| 366 |
+
"mapped_products": [],
|
| 367 |
+
"relevance_type": "contextual",
|
| 368 |
+
"relevance_confidence": "medium" if has_cymbal_context else "low",
|
| 369 |
+
"has_primary_keywords": False
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
# Check for competitor mentions (might be comparative discussion)
|
| 373 |
+
found_competitors = self.competitor_keywords.intersection(words)
|
| 374 |
+
if found_competitors:
|
| 375 |
+
return {
|
| 376 |
+
"preliminary_relevant": False,
|
| 377 |
+
"needs_relevance_validation": True,
|
| 378 |
+
"found_keywords": list(found_competitors),
|
| 379 |
+
"mapped_products": [],
|
| 380 |
+
"relevance_type": "competitor_only",
|
| 381 |
+
"relevance_confidence": "low",
|
| 382 |
+
"has_primary_keywords": False
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
# No relevant keywords found
|
| 386 |
+
return {
|
| 387 |
+
"preliminary_relevant": False,
|
| 388 |
+
"needs_relevance_validation": False,
|
| 389 |
+
"found_keywords": [],
|
| 390 |
+
"mapped_products": [],
|
| 391 |
+
"relevance_type": "none",
|
| 392 |
+
"relevance_confidence": "high",
|
| 393 |
+
"has_primary_keywords": False
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
def _extract_mentioned_products(self, text: str) -> List[str]:
|
| 397 |
+
"""
|
| 398 |
+
Extract product names mentioned in the text, including aliases.
|
| 399 |
+
|
| 400 |
+
Args:
|
| 401 |
+
text: Text to search
|
| 402 |
+
|
| 403 |
+
Returns:
|
| 404 |
+
List of canonical product names found
|
| 405 |
+
"""
|
| 406 |
+
text_lower = text.lower()
|
| 407 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 408 |
+
|
| 409 |
+
found_products = []
|
| 410 |
+
|
| 411 |
+
# Check direct product mentions
|
| 412 |
+
for product in self.products_list:
|
| 413 |
+
if product.lower() in words:
|
| 414 |
+
if product not in found_products:
|
| 415 |
+
found_products.append(product)
|
| 416 |
+
|
| 417 |
+
# Check aliases
|
| 418 |
+
for alias, canonical in self.product_aliases.items():
|
| 419 |
+
if ' ' in alias:
|
| 420 |
+
# Multi-word alias
|
| 421 |
+
if alias in text_lower:
|
| 422 |
+
if canonical not in found_products:
|
| 423 |
+
found_products.append(canonical)
|
| 424 |
+
elif alias in words:
|
| 425 |
+
if canonical not in found_products:
|
| 426 |
+
found_products.append(canonical)
|
| 427 |
+
|
| 428 |
+
return found_products
|
| 429 |
+
|
| 430 |
+
def _extract_mentioned_competitors(self, text: str) -> List[str]:
|
| 431 |
+
"""
|
| 432 |
+
Extract competitor brand names mentioned in the text.
|
| 433 |
+
|
| 434 |
+
Args:
|
| 435 |
+
text: Text to search
|
| 436 |
+
|
| 437 |
+
Returns:
|
| 438 |
+
List of canonical competitor names found
|
| 439 |
+
"""
|
| 440 |
+
text_lower = text.lower()
|
| 441 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 442 |
+
|
| 443 |
+
found_competitors = set()
|
| 444 |
+
|
| 445 |
+
for alias in self.competitor_keywords:
|
| 446 |
+
if ' ' in alias:
|
| 447 |
+
# Multi-word check
|
| 448 |
+
if alias in text_lower:
|
| 449 |
+
canonical = self.competitor_name_map.get(alias, alias)
|
| 450 |
+
found_competitors.add(canonical)
|
| 451 |
+
elif alias in words:
|
| 452 |
+
canonical = self.competitor_name_map.get(alias, alias)
|
| 453 |
+
found_competitors.add(canonical)
|
| 454 |
+
|
| 455 |
+
return list(found_competitors)
|
| 456 |
+
|
| 457 |
+
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline order matters: relevance is checked before language
        detection because the primary-keyword result feeds the language
        detection logic (Step 2 before Step 4).

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results: cleaned/quoted content,
            language detection fields, relevance assessment fields, detected
            products/competitors, plus all original input keys except
            "post_content" (preserved separately as "original_content").
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content
            # NOTE(review): assumes self.min_content_length is set elsewhere
            # (likely __init__/config) - not visible in this block.
            if not reply_content or len(reply_content.strip()) < self.min_content_length:
                # Short-circuit: empty posts are marked non-relevant and skip
                # language detection / extraction entirely.
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Check relevance FIRST (needed for language detection logic)
            relevance_result = self._check_relevance(reply_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build thread context (raw - will be summarized by extraction agent)
            raw_thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 4: Detect language (with smart handling)
            lang_result = self._detect_language(reply_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions from actual post content
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data (spread last, so input keys can
                # overwrite the computed keys above if names collide)
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
processing_brand_sentiment/workflow/agents/output_validator_agent.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Output Validator Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs rule-based validation on the final output to ensure:
|
| 5 |
+
1. All values are from predefined lists
|
| 6 |
+
2. Logical consistency between fields
|
| 7 |
+
3. Anomaly detection for manual review flagging
|
| 8 |
+
|
| 9 |
+
This is a deterministic agent (no LLM calls) that acts as a quality gate.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Dict, Any, List, Set
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
from .base_agent import BaseAgent
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class OutputValidatorAgent(BaseAgent):
|
| 21 |
+
"""
|
| 22 |
+
Agent that validates the final output for consistency and quality.
|
| 23 |
+
|
| 24 |
+
Performs rule-based checks without LLM calls to ensure data quality
|
| 25 |
+
and flag posts that may need manual review.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
    def __init__(
        self,
        config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Output Validator Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration (expects a "brand"
                section containing "products" and "competitors")
            analysis_categories: Category definitions for validation (each
                section carries a "categories" or "levels" list)
        """
        super().__init__("OutputValidatorAgent", config)
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # Precompute lowercase lookup sets once so per-post validation
        # is cheap set membership rather than repeated list scans.
        self._build_valid_value_sets()

        logger.info("OutputValidatorAgent initialized")
|
| 51 |
+
def _build_valid_value_sets(self) -> None:
|
| 52 |
+
"""Build sets of valid values for efficient validation."""
|
| 53 |
+
brand = self.brand_config.get("brand", {})
|
| 54 |
+
|
| 55 |
+
# Products
|
| 56 |
+
self.valid_products: Set[str] = set(
|
| 57 |
+
p.lower() for p in brand.get("products", [])
|
| 58 |
+
)
|
| 59 |
+
self.products_canonical = {p.lower(): p for p in brand.get("products", [])}
|
| 60 |
+
|
| 61 |
+
# Competitors
|
| 62 |
+
self.valid_competitors: Set[str] = set()
|
| 63 |
+
self.competitors_canonical = {}
|
| 64 |
+
for comp in brand.get("competitors", []):
|
| 65 |
+
if isinstance(comp, dict):
|
| 66 |
+
name = comp.get("name", "")
|
| 67 |
+
self.valid_competitors.add(name.lower())
|
| 68 |
+
self.competitors_canonical[name.lower()] = name
|
| 69 |
+
|
| 70 |
+
# Extract all category values
|
| 71 |
+
self.valid_values = {}
|
| 72 |
+
|
| 73 |
+
category_configs = {
|
| 74 |
+
"author_role": self.analysis_categories.get("author_role", {}),
|
| 75 |
+
"sabian_mention_context": self.analysis_categories.get("sabian_mention_context", {}),
|
| 76 |
+
"sentiment_level": self.analysis_categories.get("sentiment", {}),
|
| 77 |
+
"emotion_type": self.analysis_categories.get("emotions", {}),
|
| 78 |
+
"intents": self.analysis_categories.get("intents", {}),
|
| 79 |
+
"purchase_stage": self.analysis_categories.get("purchase_stage", {}),
|
| 80 |
+
"comparison_type": self.analysis_categories.get("comparison_type", {}),
|
| 81 |
+
"feedback_aspects": self.analysis_categories.get("feedback_aspects", {}),
|
| 82 |
+
"decision_drivers": self.analysis_categories.get("decision_drivers", {}),
|
| 83 |
+
"product_attributes": self.analysis_categories.get("product_attributes", {}),
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
for key, config in category_configs.items():
|
| 87 |
+
if "categories" in config:
|
| 88 |
+
self.valid_values[key] = set(
|
| 89 |
+
c["value"].lower() for c in config["categories"]
|
| 90 |
+
)
|
| 91 |
+
elif "levels" in config:
|
| 92 |
+
self.valid_values[key] = set(
|
| 93 |
+
c["value"].lower() for c in config["levels"]
|
| 94 |
+
)
|
| 95 |
+
else:
|
| 96 |
+
self.valid_values[key] = set()
|
| 97 |
+
|
| 98 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 99 |
+
"""Validate that input contains required fields."""
|
| 100 |
+
# The validator accepts any input - it will validate what's there
|
| 101 |
+
return True
|
| 102 |
+
|
| 103 |
+
def _validate_list_values(
|
| 104 |
+
self,
|
| 105 |
+
values: List[Any],
|
| 106 |
+
valid_set: Set[str],
|
| 107 |
+
field_name: str
|
| 108 |
+
) -> Dict[str, Any]:
|
| 109 |
+
"""
|
| 110 |
+
Validate list values against a set of valid values.
|
| 111 |
+
|
| 112 |
+
Returns:
|
| 113 |
+
Dictionary with validation results
|
| 114 |
+
"""
|
| 115 |
+
if not values:
|
| 116 |
+
return {"valid": True, "invalid_values": [], "field": field_name}
|
| 117 |
+
|
| 118 |
+
invalid = []
|
| 119 |
+
for v in values:
|
| 120 |
+
if isinstance(v, str) and v.lower() not in valid_set:
|
| 121 |
+
invalid.append(v)
|
| 122 |
+
|
| 123 |
+
return {
|
| 124 |
+
"valid": len(invalid) == 0,
|
| 125 |
+
"invalid_values": invalid,
|
| 126 |
+
"field": field_name
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
def _validate_single_value(
|
| 130 |
+
self,
|
| 131 |
+
value: Any,
|
| 132 |
+
valid_set: Set[str],
|
| 133 |
+
field_name: str,
|
| 134 |
+
allow_none: bool = True
|
| 135 |
+
) -> Dict[str, Any]:
|
| 136 |
+
"""
|
| 137 |
+
Validate a single value against a set of valid values.
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
Dictionary with validation results
|
| 141 |
+
"""
|
| 142 |
+
if value is None:
|
| 143 |
+
return {"valid": allow_none, "invalid_value": None if allow_none else value, "field": field_name}
|
| 144 |
+
|
| 145 |
+
if isinstance(value, str) and value.lower() in valid_set:
|
| 146 |
+
return {"valid": True, "invalid_value": None, "field": field_name}
|
| 147 |
+
|
| 148 |
+
return {"valid": False, "invalid_value": value, "field": field_name}
|
| 149 |
+
|
| 150 |
+
def _check_logical_consistency(self, data: Dict[str, Any]) -> List[str]:
|
| 151 |
+
"""
|
| 152 |
+
Check for logical consistency between fields.
|
| 153 |
+
|
| 154 |
+
Note: Empty products_mentioned is OK even when relevant - users may
|
| 155 |
+
discuss the Sabian brand generally without specific products.
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
List of inconsistency warnings
|
| 159 |
+
"""
|
| 160 |
+
warnings = []
|
| 161 |
+
is_relevant = data.get("is_relevant", False)
|
| 162 |
+
|
| 163 |
+
# Check 1: If not relevant, certain fields should be empty/null
|
| 164 |
+
if not is_relevant:
|
| 165 |
+
if data.get("sabian_mention_context"):
|
| 166 |
+
warnings.append(
|
| 167 |
+
"sabian_mention_context should be null when is_relevant=False"
|
| 168 |
+
)
|
| 169 |
+
if data.get("sentiment_level") and data.get("sentiment_level") != "neutral":
|
| 170 |
+
warnings.append(
|
| 171 |
+
"sentiment_level should be null/neutral when is_relevant=False"
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# Check 2: Comparison type should only be set if comparing intent exists
|
| 175 |
+
if data.get("comparison_type"):
|
| 176 |
+
intents = data.get("intents", [])
|
| 177 |
+
if "comparing" not in intents:
|
| 178 |
+
warnings.append(
|
| 179 |
+
"comparison_type is set but 'comparing' not in intents"
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# Check 3: Author perspective fields consistency
|
| 183 |
+
# If author is giving advice (providing_information) without sharing experience,
|
| 184 |
+
# pain_points and delight_factors should typically be empty
|
| 185 |
+
intents = data.get("intents", [])
|
| 186 |
+
if "providing_information" in intents and "sharing_experience" not in intents:
|
| 187 |
+
if data.get("pain_points") or data.get("delight_factors"):
|
| 188 |
+
warnings.append(
|
| 189 |
+
"pain_points/delight_factors set for advice-giving post without sharing_experience intent"
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
return warnings
|
| 193 |
+
|
| 194 |
+
def _fix_overlapping_feedback(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
| 195 |
+
"""
|
| 196 |
+
Fix overlapping values between pain_points and delight_factors.
|
| 197 |
+
|
| 198 |
+
Rule: The same aspect cannot be both a pain point and a delight factor.
|
| 199 |
+
Resolution: Use sentiment to determine which to keep, or clear both if neutral.
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
data: Dictionary with analysis results
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
Updated dictionary with fixed pain_points and delight_factors
|
| 206 |
+
"""
|
| 207 |
+
pain_points = data.get("pain_points", []) or []
|
| 208 |
+
delight_factors = data.get("delight_factors", []) or []
|
| 209 |
+
|
| 210 |
+
if not pain_points or not delight_factors:
|
| 211 |
+
return data
|
| 212 |
+
|
| 213 |
+
# Find overlapping values
|
| 214 |
+
pain_set = set(p.lower() if isinstance(p, str) else p for p in pain_points)
|
| 215 |
+
delight_set = set(d.lower() if isinstance(d, str) else d for d in delight_factors)
|
| 216 |
+
overlap = pain_set.intersection(delight_set)
|
| 217 |
+
|
| 218 |
+
if not overlap:
|
| 219 |
+
return data
|
| 220 |
+
|
| 221 |
+
# Get sentiment to determine which to keep
|
| 222 |
+
sentiment = data.get("sentiment_level", "neutral")
|
| 223 |
+
|
| 224 |
+
# Create new lists without overlapping values
|
| 225 |
+
if sentiment in ["positive", "very_positive"]:
|
| 226 |
+
# Keep in delight_factors, remove from pain_points
|
| 227 |
+
new_pain_points = [p for p in pain_points if p.lower() not in overlap]
|
| 228 |
+
new_delight_factors = delight_factors
|
| 229 |
+
elif sentiment in ["negative", "very_negative"]:
|
| 230 |
+
# Keep in pain_points, remove from delight_factors
|
| 231 |
+
new_pain_points = pain_points
|
| 232 |
+
new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
|
| 233 |
+
else:
|
| 234 |
+
# Neutral sentiment - clear both (can't determine intent)
|
| 235 |
+
new_pain_points = [p for p in pain_points if p.lower() not in overlap]
|
| 236 |
+
new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
|
| 237 |
+
|
| 238 |
+
# Update data
|
| 239 |
+
data["pain_points"] = new_pain_points
|
| 240 |
+
data["delight_factors"] = new_delight_factors
|
| 241 |
+
|
| 242 |
+
logger.debug(
|
| 243 |
+
f"Fixed overlapping feedback: removed {overlap} from "
|
| 244 |
+
f"{'pain_points' if sentiment in ['positive', 'very_positive'] else 'delight_factors' if sentiment in ['negative', 'very_negative'] else 'both'}"
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
return data
|
| 248 |
+
|
| 249 |
+
def _detect_anomalies(self, data: Dict[str, Any]) -> List[str]:
|
| 250 |
+
"""
|
| 251 |
+
Detect anomalies that might need manual review.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
List of anomaly flags
|
| 255 |
+
"""
|
| 256 |
+
anomalies = []
|
| 257 |
+
|
| 258 |
+
# Anomaly 1: Low confidence relevance
|
| 259 |
+
if data.get("is_relevant") and data.get("relevance_confidence") == "low":
|
| 260 |
+
anomalies.append("low_confidence_relevant")
|
| 261 |
+
|
| 262 |
+
# Anomaly 2: Sarcasm detected - sentiment might be inverted
|
| 263 |
+
if data.get("sarcasm_detected"):
|
| 264 |
+
anomalies.append("sarcasm_detected")
|
| 265 |
+
|
| 266 |
+
# Anomaly 3: Very short content marked as relevant
|
| 267 |
+
content = data.get("cleaned_content", "")
|
| 268 |
+
if data.get("is_relevant") and len(content) < 20:
|
| 269 |
+
anomalies.append("short_relevant_content")
|
| 270 |
+
|
| 271 |
+
# Anomaly 4: Switching behavior detected
|
| 272 |
+
comparison_type = data.get("comparison_type", "")
|
| 273 |
+
if comparison_type in ["switching_to_sabian", "switching_from_sabian"]:
|
| 274 |
+
anomalies.append(f"brand_switching_{comparison_type}")
|
| 275 |
+
|
| 276 |
+
return anomalies
|
| 277 |
+
|
| 278 |
+
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process and validate the analysis output.

        Runs rule-based checks (no LLM calls): fixes overlapping feedback,
        validates categorical/list fields against the precomputed value sets,
        checks cross-field consistency, and detects anomalies for manual
        review.

        Args:
            input_data: Dictionary with all analysis results

        Returns:
            Dictionary with validation results added: validation_passed,
            validation_errors, validation_warnings, validation_flags and
            processing_status.
        """
        try:
            validation_errors = []
            validation_warnings = []

            # Skip detailed validation for non-relevant or skipped posts
            if not input_data.get("is_relevant", False) or input_data.get("analysis_skipped", False):
                return {
                    **input_data,
                    "validation_passed": True,
                    "validation_errors": [],
                    "validation_warnings": [],
                    "validation_flags": [],
                    "processing_status": "completed"
                }

            # Fix overlapping pain_points and delight_factors (safety net)
            input_data = self._fix_overlapping_feedback(input_data)

            # Validate products_mentioned (errors - canonical names required)
            products_result = self._validate_list_values(
                input_data.get("products_mentioned", []),
                self.valid_products,
                "products_mentioned"
            )
            if not products_result["valid"]:
                validation_errors.append(
                    f"Invalid products: {products_result['invalid_values']}"
                )

            # Validate competitors_mentioned
            competitors_result = self._validate_list_values(
                input_data.get("competitors_mentioned", []),
                self.valid_competitors,
                "competitors_mentioned"
            )
            if not competitors_result["valid"]:
                validation_errors.append(
                    f"Invalid competitors: {competitors_result['invalid_values']}"
                )

            # Validate categorical fields.
            # Tuples are (output field, key into self.valid_values, allow_none).
            categorical_validations = [
                ("author_role", "author_role", True),
                ("sabian_mention_context", "sabian_mention_context", True),
                ("sentiment_level", "sentiment_level", True),
                ("emotion_type", "emotion_type", True),
                ("purchase_stage", "purchase_stage", True),
                ("comparison_type", "comparison_type", True),
            ]

            for field, valid_key, allow_none in categorical_validations:
                result = self._validate_single_value(
                    input_data.get(field),
                    self.valid_values.get(valid_key, set()),
                    field,
                    allow_none
                )
                if not result["valid"]:
                    validation_errors.append(
                        f"Invalid {field}: {result['invalid_value']}"
                    )

            # Validate list fields. Invalid list entries are only warnings
            # (not errors): a partially bad list is still usable downstream.
            list_validations = [
                ("intents", "intents"),
                ("product_attributes", "product_attributes"),
                ("pain_points", "feedback_aspects"),
                ("delight_factors", "feedback_aspects"),
                ("decision_drivers", "decision_drivers"),
            ]

            for field, valid_key in list_validations:
                result = self._validate_list_values(
                    input_data.get(field, []),
                    self.valid_values.get(valid_key, set()),
                    field
                )
                if not result["valid"]:
                    validation_warnings.append(
                        f"Invalid values in {field}: {result['invalid_values']}"
                    )

            # Check logical consistency
            consistency_warnings = self._check_logical_consistency(input_data)
            validation_warnings.extend(consistency_warnings)

            # Detect anomalies
            anomalies = self._detect_anomalies(input_data)

            # Determine overall validation status (warnings never fail a post)
            validation_passed = len(validation_errors) == 0

            # Set processing status
            if validation_errors:
                processing_status = "validation_failed"
            elif anomalies:
                processing_status = "completed_with_flags"
            else:
                processing_status = "completed"

            result = {
                **input_data,
                "validation_passed": validation_passed,
                "validation_errors": validation_errors,
                "validation_warnings": validation_warnings,
                "validation_flags": anomalies,
                "processing_status": processing_status
            }

            if validation_errors or validation_warnings or anomalies:
                self.log_processing(
                    f"Validation complete: passed={validation_passed}, "
                    f"errors={len(validation_errors)}, warnings={len(validation_warnings)}, "
                    f"flags={anomalies}",
                    "debug"
                )

            return result

        except Exception as e:
            return self.handle_error(e, "output validation")
|
processing_brand_sentiment/workflow/agents/preprocessor_agent.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Preprocessor Agent for brand sentiment analysis.
|
| 3 |
+
Handles HTML parsing, text cleaning, language detection, and initial relevance screening.
|
| 4 |
+
This is a deterministic agent (no LLM calls except for language detection fallback).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
from typing import Dict, Any, List, Optional, Set
|
| 9 |
+
from lingua import Language, LanguageDetectorBuilder
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
from .base_agent import BaseAgent
|
| 13 |
+
from utils.html_parser import HTMLParser
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PreprocessorAgent(BaseAgent):
|
| 19 |
+
"""
|
| 20 |
+
Agent that preprocesses forum posts:
|
| 21 |
+
- Parses HTML to extract reply and quoted content
|
| 22 |
+
- Cleans and normalizes text
|
| 23 |
+
- Detects language
|
| 24 |
+
- Performs initial keyword-based relevance screening
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
    # Lingua to ISO 639-1 language code mapping.
    # Languages lingua can detect but that are missing here map to "unknown"
    # via LINGUA_TO_ISO.get(detected, "unknown") in _detect_language.
    LINGUA_TO_ISO = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }
+
|
| 47 |
+
    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]) -> None:
        """
        Initialize the Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords and
                products (expects "relevance_keywords" and "brand" sections)
        """
        super().__init__("PreprocessorAgent", config)
        self.brand_config = brand_config
        self.html_parser = HTMLParser()

        # Initialize lingua detector.
        # NOTE(review): from_all_languages() loads every language model and is
        # memory-heavy - confirm this is intended over a restricted subset.
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

        # Build keyword sets for efficient lookup
        self._build_keyword_sets()

        logger.info("PreprocessorAgent initialized")
|
| 67 |
+
def _build_keyword_sets(self) -> None:
|
| 68 |
+
"""Build keyword sets from brand configuration for efficient relevance checking."""
|
| 69 |
+
relevance_config = self.brand_config.get("relevance_keywords", {})
|
| 70 |
+
|
| 71 |
+
# Primary keywords - definitive Sabian mentions
|
| 72 |
+
primary = relevance_config.get("primary", {}).get("keywords", [])
|
| 73 |
+
self.primary_keywords: Set[str] = set(k.lower() for k in primary)
|
| 74 |
+
|
| 75 |
+
# Contextual keywords - need disambiguation (HH, AA)
|
| 76 |
+
contextual = relevance_config.get("contextual", {}).get("keywords", [])
|
| 77 |
+
self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)
|
| 78 |
+
|
| 79 |
+
# Cymbal context keywords - help disambiguate contextual terms
|
| 80 |
+
cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
|
| 81 |
+
self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)
|
| 82 |
+
|
| 83 |
+
# Competitor names for detection
|
| 84 |
+
competitors = self.brand_config.get("brand", {}).get("competitors", [])
|
| 85 |
+
self.competitor_keywords: Set[str] = set()
|
| 86 |
+
for comp in competitors:
|
| 87 |
+
if isinstance(comp, dict):
|
| 88 |
+
self.competitor_keywords.add(comp.get("name", "").lower())
|
| 89 |
+
for alias in comp.get("aliases", []):
|
| 90 |
+
self.competitor_keywords.add(alias.lower())
|
| 91 |
+
else:
|
| 92 |
+
self.competitor_keywords.add(str(comp).lower())
|
| 93 |
+
|
| 94 |
+
# Product names
|
| 95 |
+
products = self.brand_config.get("brand", {}).get("products", [])
|
| 96 |
+
self.product_keywords: Set[str] = set(p.lower() for p in products)
|
| 97 |
+
|
| 98 |
+
logger.info(f"Built keyword sets: {len(self.primary_keywords)} primary, "
|
| 99 |
+
f"{len(self.contextual_keywords)} contextual, "
|
| 100 |
+
f"{len(self.product_keywords)} products")
|
| 101 |
+
|
| 102 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 103 |
+
"""
|
| 104 |
+
Validate that input contains required fields.
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
input_data: Input dictionary
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
True if valid, False otherwise
|
| 111 |
+
"""
|
| 112 |
+
required_fields = ["post_id", "post_content"]
|
| 113 |
+
return all(field in input_data for field in required_fields)
|
| 114 |
+
|
| 115 |
+
def _detect_language(self, text: str) -> Dict[str, Any]:
|
| 116 |
+
"""
|
| 117 |
+
Detect the language of text using lingua library.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
text: Text to analyze
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
Dictionary with language detection results
|
| 124 |
+
"""
|
| 125 |
+
try:
|
| 126 |
+
cleaned_text = text.strip()
|
| 127 |
+
if not cleaned_text or len(cleaned_text) < 3:
|
| 128 |
+
return {
|
| 129 |
+
"language": "English",
|
| 130 |
+
"language_code": "en",
|
| 131 |
+
"is_english": True,
|
| 132 |
+
"confidence": "low"
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
detected = self.language_detector.detect_language_of(cleaned_text)
|
| 136 |
+
|
| 137 |
+
if detected is None:
|
| 138 |
+
return {
|
| 139 |
+
"language": "English",
|
| 140 |
+
"language_code": "en",
|
| 141 |
+
"is_english": True,
|
| 142 |
+
"confidence": "low"
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
if detected == Language.ENGLISH:
|
| 146 |
+
return {
|
| 147 |
+
"language": "English",
|
| 148 |
+
"language_code": "en",
|
| 149 |
+
"is_english": True,
|
| 150 |
+
"confidence": "high"
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
|
| 154 |
+
lang_name = detected.name.capitalize()
|
| 155 |
+
|
| 156 |
+
return {
|
| 157 |
+
"language": lang_name,
|
| 158 |
+
"language_code": lang_code,
|
| 159 |
+
"is_english": False,
|
| 160 |
+
"confidence": "high"
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
except Exception as e:
|
| 164 |
+
logger.warning(f"Language detection failed: {e}")
|
| 165 |
+
return {
|
| 166 |
+
"language": "English",
|
| 167 |
+
"language_code": "en",
|
| 168 |
+
"is_english": True,
|
| 169 |
+
"confidence": "low"
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
def _check_relevance(self, text: str) -> Dict[str, Any]:
|
| 173 |
+
"""
|
| 174 |
+
Check if text is relevant to the brand using keyword matching.
|
| 175 |
+
|
| 176 |
+
Returns:
|
| 177 |
+
Dictionary with relevance assessment:
|
| 178 |
+
- preliminary_relevant: Initial relevance assessment
|
| 179 |
+
- needs_relevance_validation: True if contains ambiguous terms needing LLM check
|
| 180 |
+
- found_keywords: Keywords found in the text
|
| 181 |
+
- relevance_type: 'primary', 'contextual', or 'none'
|
| 182 |
+
"""
|
| 183 |
+
text_lower = text.lower()
|
| 184 |
+
|
| 185 |
+
# Tokenize for word boundary matching
|
| 186 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 187 |
+
|
| 188 |
+
# Check for primary keywords (definitive matches)
|
| 189 |
+
found_primary = self.primary_keywords.intersection(words)
|
| 190 |
+
if found_primary:
|
| 191 |
+
return {
|
| 192 |
+
"preliminary_relevant": True,
|
| 193 |
+
"needs_relevance_validation": False,
|
| 194 |
+
"found_keywords": list(found_primary),
|
| 195 |
+
"relevance_type": "primary",
|
| 196 |
+
"relevance_confidence": "high"
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
# Check for contextual keywords (need validation)
|
| 200 |
+
found_contextual = self.contextual_keywords.intersection(words)
|
| 201 |
+
if found_contextual:
|
| 202 |
+
# Check if there's cymbal context
|
| 203 |
+
found_cymbal_context = self.cymbal_context_keywords.intersection(words)
|
| 204 |
+
has_cymbal_context = len(found_cymbal_context) > 0
|
| 205 |
+
|
| 206 |
+
return {
|
| 207 |
+
"preliminary_relevant": True, # Potentially relevant
|
| 208 |
+
"needs_relevance_validation": True, # Needs LLM confirmation
|
| 209 |
+
"found_keywords": list(found_contextual),
|
| 210 |
+
"cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
|
| 211 |
+
"has_cymbal_context": has_cymbal_context,
|
| 212 |
+
"relevance_type": "contextual",
|
| 213 |
+
"relevance_confidence": "medium" if has_cymbal_context else "low"
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
# Check for competitor mentions (might be comparative discussion)
|
| 217 |
+
found_competitors = self.competitor_keywords.intersection(words)
|
| 218 |
+
if found_competitors:
|
| 219 |
+
# Has competitor mention but no Sabian mention
|
| 220 |
+
# Could still be relevant in a comparison context
|
| 221 |
+
return {
|
| 222 |
+
"preliminary_relevant": False,
|
| 223 |
+
"needs_relevance_validation": True, # LLM should check context
|
| 224 |
+
"found_keywords": list(found_competitors),
|
| 225 |
+
"relevance_type": "competitor_only",
|
| 226 |
+
"relevance_confidence": "low"
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
# No relevant keywords found
|
| 230 |
+
return {
|
| 231 |
+
"preliminary_relevant": False,
|
| 232 |
+
"needs_relevance_validation": False,
|
| 233 |
+
"found_keywords": [],
|
| 234 |
+
"relevance_type": "none",
|
| 235 |
+
"relevance_confidence": "high"
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
def _extract_mentioned_products(self, text: str) -> List[str]:
|
| 239 |
+
"""
|
| 240 |
+
Extract product names mentioned in the text.
|
| 241 |
+
|
| 242 |
+
Args:
|
| 243 |
+
text: Text to search
|
| 244 |
+
|
| 245 |
+
Returns:
|
| 246 |
+
List of product names found
|
| 247 |
+
"""
|
| 248 |
+
text_lower = text.lower()
|
| 249 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 250 |
+
|
| 251 |
+
found_products = []
|
| 252 |
+
products = self.brand_config.get("brand", {}).get("products", [])
|
| 253 |
+
|
| 254 |
+
for product in products:
|
| 255 |
+
if product.lower() in words:
|
| 256 |
+
found_products.append(product)
|
| 257 |
+
|
| 258 |
+
return found_products
|
| 259 |
+
|
| 260 |
+
def _extract_mentioned_competitors(self, text: str) -> List[str]:
|
| 261 |
+
"""
|
| 262 |
+
Extract competitor names mentioned in the text.
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
text: Text to search
|
| 266 |
+
|
| 267 |
+
Returns:
|
| 268 |
+
List of competitor names found
|
| 269 |
+
"""
|
| 270 |
+
text_lower = text.lower()
|
| 271 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 272 |
+
|
| 273 |
+
found_competitors = []
|
| 274 |
+
competitors = self.brand_config.get("brand", {}).get("competitors", [])
|
| 275 |
+
|
| 276 |
+
for comp in competitors:
|
| 277 |
+
if isinstance(comp, dict):
|
| 278 |
+
name = comp.get("name", "")
|
| 279 |
+
aliases = comp.get("aliases", [])
|
| 280 |
+
|
| 281 |
+
# Check name and aliases
|
| 282 |
+
if name.lower() in words:
|
| 283 |
+
if name not in found_competitors:
|
| 284 |
+
found_competitors.append(name)
|
| 285 |
+
else:
|
| 286 |
+
for alias in aliases:
|
| 287 |
+
if alias.lower() in words:
|
| 288 |
+
if name not in found_competitors:
|
| 289 |
+
found_competitors.append(name)
|
| 290 |
+
break
|
| 291 |
+
else:
|
| 292 |
+
if str(comp).lower() in words:
|
| 293 |
+
found_competitors.append(str(comp))
|
| 294 |
+
|
| 295 |
+
return found_competitors
|
| 296 |
+
|
| 297 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a forum post through the full preprocessing pipeline:
    HTML parsing, thread-context assembly, language detection,
    keyword-based relevance screening, and product/competitor extraction.

    Relevance, products, and competitors are derived ONLY from the post's
    own reply text — quoted content and thread context are carried along
    for downstream understanding, never used for relevance here.

    Args:
        input_data: Dictionary containing post data with at least:
            - post_id: Post identifier
            - post_content: Raw HTML content
            - thread_title: Thread title (optional)
            - thread_first_post: First post content (optional)
            - category_title: Category title (optional)
            - category_topic: Category topic (optional)

    Returns:
        Dictionary with preprocessing results; on failure, the dict
        produced by ``handle_error``.
    """
    try:
        # Validate input
        if not self.validate_input(input_data):
            # NOTE(review): **input_data is spread last, so an upstream
            # "success" key would override the False set here — confirm
            # callers never pass one.
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                **input_data
            }

        post_content = input_data.get("post_content", "")

        # Step 1: Parse HTML content into the author's own reply text
        # plus any quoted material.
        parsed = self.html_parser.parse_post_content(post_content)
        reply_content = parsed.get("reply_content", "")
        quoted_content = parsed.get("quoted_content")

        # Check for empty content — short-circuit: nothing to analyze.
        # post_content is dropped from the passthrough to keep the row small.
        if not reply_content or len(reply_content.strip()) < 3:
            return {
                "success": True,
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "is_empty": True,
                "preliminary_relevant": False,
                "needs_relevance_validation": False,
                **{k: v for k, v in input_data.items() if k != "post_content"}
            }

        # Step 2: Build thread context (title, first post, category) for
        # downstream LLM stages.
        thread_context = self.html_parser.build_thread_context(
            thread_title=input_data.get("thread_title"),
            first_post_content=input_data.get("thread_first_post"),
            category_title=input_data.get("category_title"),
            category_topic=input_data.get("category_topic")
        )

        # Step 3: Detect language
        lang_result = self._detect_language(reply_content)

        # Step 4: Check relevance - ONLY on the actual post content, NOT quoted/context
        # The quoted content and thread context are for understanding, not for relevance determination
        relevance_result = self._check_relevance(reply_content)

        # Step 5: Extract product and competitor mentions - ONLY from actual post content
        # We don't want to extract from quoted content as that will be processed separately
        products_found = self._extract_mentioned_products(reply_content)
        competitors_found = self._extract_mentioned_competitors(reply_content)

        # Build result. The input spread comes LAST, so any key that also
        # exists in input_data overrides the computed value above.
        result = {
            "success": True,
            "is_empty": False,

            # Cleaned content
            "cleaned_content": reply_content,
            "quoted_content": quoted_content,
            "has_quote": parsed.get("has_quote", False),
            "quoted_author": parsed.get("quoted_author"),
            "thread_context": thread_context,

            # Language detection
            "detected_language": lang_result["language"],
            "language_code": lang_result["language_code"],
            "is_english": lang_result["is_english"],
            "language_confidence": lang_result["confidence"],

            # Relevance assessment
            "preliminary_relevant": relevance_result["preliminary_relevant"],
            "needs_relevance_validation": relevance_result["needs_relevance_validation"],
            "relevance_keywords_found": relevance_result["found_keywords"],
            "relevance_type": relevance_result["relevance_type"],
            "relevance_confidence": relevance_result["relevance_confidence"],

            # Initial extractions
            "products_detected": products_found,
            "competitors_detected": competitors_found,

            # Preserve original data
            **{k: v for k, v in input_data.items() if k not in ["post_content"]}
        }

        # Keep original content for reference (raw HTML, pre-parse).
        result["original_content"] = post_content

        self.log_processing(
            f"Processed post {input_data.get('post_id')}: "
            f"lang={lang_result['language']}, "
            f"relevant={relevance_result['preliminary_relevant']}, "
            f"needs_validation={relevance_result['needs_relevance_validation']}",
            "debug"
        )

        return result

    except Exception as e:
        return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
|
processing_brand_sentiment/workflow/agents/relevance_validator_agent.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Relevance Validator Agent for brand sentiment analysis.
|
| 3 |
+
Lightweight LLM-based agent that confirms whether ambiguous terms (HH, AA)
|
| 4 |
+
refer to Sabian products or generic terms.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
import json
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from .base_agent import BaseAgent
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RelevanceValidatorAgent(BaseAgent):
    """
    Agent that validates whether posts with ambiguous terms (like HH, AA)
    are actually referring to Sabian products or generic terms.

    This is a lightweight LLM call specifically for disambiguation.
    On any failure the agent defaults to "relevant" so that posts are
    never silently dropped by a validation error.
    """

    def __init__(self, config: Dict[str, Any], api_key: str, brand_config: Dict[str, Any]):
        """
        Initialize the Relevance Validator Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with product info
        """
        super().__init__("RelevanceValidatorAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config

        # self.model / self.temperature are assumed to be set by BaseAgent
        # from `config` — TODO confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Build disambiguation context from brand config
        self._build_disambiguation_context()

        logger.info("RelevanceValidatorAgent initialized")

    def _build_disambiguation_context(self) -> None:
        """Build context strings for disambiguation from brand config.

        Populates ``self.disambiguation_info`` (term -> description +
        context clues) and ``self.product_descriptions``.
        """
        brand = self.brand_config.get("brand", {})
        ambiguous = brand.get("ambiguous_terms", {})

        self.disambiguation_info = {}
        for term, info in ambiguous.items():
            if isinstance(info, dict):
                self.disambiguation_info[term] = {
                    "description": info.get("description", ""),
                    "context_clues": info.get("disambiguation_context", [])
                }
            else:
                # Plain-string entries become a description with no clues.
                self.disambiguation_info[term] = {
                    "description": str(info),
                    "context_clues": []
                }

        # Product descriptions for context
        self.product_descriptions = brand.get("product_descriptions", {})

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required = ["cleaned_content", "relevance_keywords_found"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance validation."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        products = self.brand_config.get("brand", {}).get("products", [])

        # Build disambiguation rules
        disambiguation_rules = []
        for term, info in self.disambiguation_info.items():
            desc = info.get("description", "")
            clues = info.get("context_clues", [])
            rule = f"- '{term}': {desc}"
            if clues:
                rule += f" Context clues for {brand_name}: {', '.join(clues)}"
            disambiguation_rules.append(rule)

        disambiguation_text = "\n".join(disambiguation_rules) if disambiguation_rules else "No specific disambiguation rules."

        system_prompt = f"""You are an expert at identifying brand mentions in drum/cymbal forum discussions.

Your task is to determine if the POST CONTENT itself discusses {brand_name} products.

**CRITICAL RULE:**
- You must determine relevance based ONLY on the POST CONTENT
- The context (thread info, quoted/parent content) is provided to help you understand ambiguous terms
- But if the POST CONTENT itself does not mention or discuss {brand_name}, it is NOT relevant
- Example: If quoted content mentions Sabian but the post just says "Got it! Thanks!" → NOT relevant

**{brand_name} Product Lines:**
{', '.join(products)}

**Ambiguous Terms to Watch For:**
{disambiguation_text}

**Key Disambiguation Rules:**
- "HH" alone usually means "Hi-Hat" (a type of cymbal), NOT Sabian HH series
- "HH" WITH Sabian context IN THE POST (e.g., "Sabian HH", "HH crashes", "my HH ride") likely refers to Sabian
- "AA" alone might be a general abbreviation, NOT Sabian AA series
- "AA" WITH Sabian context IN THE POST (e.g., "Sabian AA", "AA cymbals", "AA medium ride") likely refers to Sabian
- Generic replies like "Thanks!", "Got it!", "Good point!" are NOT relevant even if context mentions {brand_name}

**Return JSON with:**
- is_relevant: boolean - true ONLY if the POST CONTENT itself discusses {brand_name} products
- confidence: "high", "medium", or "low"
- reason: brief explanation (1-2 sentences) - explain what IN THE POST made you decide
- detected_products: list of {brand_name} products mentioned IN THE POST (empty if none)

Return only valid JSON."""

        return system_prompt

    def validate_relevance(
        self,
        content: str,
        keywords_found: list,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """
        Validate whether content is relevant to the brand via one LLM call.

        Args:
            content: The cleaned post content
            keywords_found: Keywords that triggered validation
            thread_context: Thread context for additional context
            quoted_content: Quoted content if any

        Returns:
            Dictionary with validation results; on parse/LLM failure the
            result defaults to relevant with low confidence.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build context for the LLM (for disambiguation only, not relevance)
        context_parts = []
        if thread_context:
            context_parts.append(f"Thread context: {thread_context}")
        if quoted_content:
            # Truncate long quotes to keep the prompt small.
            context_parts.append(f"Replying to: {quoted_content[:300]}...")

        context_str = "\n".join(context_parts) if context_parts else "No additional context."

        user_prompt = f"""Determine if this POST CONTENT discusses {brand_name} cymbal products.

**Keywords found in post:** {', '.join(keywords_found)}

**CONTEXT (for understanding ambiguous terms only - do NOT base relevance on this):**
{context_str}

**POST CONTENT TO EVALUATE (base your relevance decision ONLY on this):**
"{content}"

Does the POST CONTENT itself discuss {brand_name} products? Remember: generic replies are NOT relevant even if context mentions {brand_name}. Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                "is_relevant": result.get("is_relevant", False),
                "relevance_confidence": result.get("confidence", "low"),
                "relevance_reason": result.get("reason", ""),
                "detected_products": result.get("detected_products", [])
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error in relevance validation: {e}", "warning")
            # Default to relevant if we can't determine
            return {
                "success": True,
                "is_relevant": True,
                "relevance_confidence": "low",
                "relevance_reason": "Could not parse LLM response, defaulting to relevant",
                "detected_products": []
            }

        except Exception as e:
            self.log_processing(f"Relevance validation error: {e}", "error")
            return {
                "success": False,
                "is_relevant": True,  # Default to relevant on error
                "relevance_confidence": "low",
                "relevance_reason": f"Error during validation: {str(e)}",
                "detected_products": [],
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post to validate its relevance to the brand.

        Skips the LLM entirely when the preprocessor did not flag the post
        for validation, echoing the preliminary assessment instead.

        Args:
            input_data: Dictionary containing:
                - cleaned_content: Cleaned post text
                - relevance_keywords_found: Keywords that triggered validation
                - thread_context: Optional thread context
                - quoted_content: Optional quoted content

        Returns:
            Dictionary with validation results and original data
        """
        try:
            if not self.validate_input(input_data):
                # Spread input FIRST so the computed status fields below
                # cannot be clobbered by stale upstream keys (previously
                # **input_data came last and could overwrite "success").
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": True,  # Default to relevant
                    "relevance_confidence": "low",
                }

            # Check if validation is actually needed
            if not input_data.get("needs_relevance_validation", False):
                # No validation needed, use preliminary assessment.
                # Input spread first for the same clobber-safety reason.
                return {
                    **input_data,
                    "success": True,
                    "is_relevant": input_data.get("preliminary_relevant", False),
                    "relevance_confidence": input_data.get("relevance_confidence", "high"),
                    "relevance_reason": "No validation needed - preliminary assessment used",
                    "validation_performed": False,
                }

            # Perform LLM validation
            validation_result = self.validate_relevance(
                content=input_data.get("cleaned_content", ""),
                keywords_found=input_data.get("relevance_keywords_found", []),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Merge results (computed fields intentionally override input)
            result = {
                **input_data,
                "is_relevant": validation_result["is_relevant"],
                "relevance_confidence": validation_result["relevance_confidence"],
                "relevance_reason": validation_result["relevance_reason"],
                "validation_performed": True,
                "success": validation_result["success"]
            }

            # Update products detected if LLM found any
            if validation_result.get("detected_products"):
                existing_products = input_data.get("products_detected", [])
                llm_products = validation_result["detected_products"]
                # Merge without duplicates (order is not preserved)
                all_products = list(set(existing_products + llm_products))
                result["products_detected"] = all_products

            if "error" in validation_result:
                result["validation_error"] = validation_result["error"]

            self.log_processing(
                f"Validated relevance for post: is_relevant={result['is_relevant']}, "
                f"confidence={result['relevance_confidence']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "relevance validation")
|
processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Analyzer Agent for comprehensive brand sentiment analysis.
|
| 3 |
+
LLM-based agent that extracts products, competitors, sentiment, intents,
|
| 4 |
+
pain points, and other brand intelligence from forum posts.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Any, List
|
| 8 |
+
import json
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from .base_agent import BaseAgent
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SabianAnalyzerAgent(BaseAgent):
|
| 19 |
+
"""
|
| 20 |
+
Comprehensive brand analysis agent for Sabian cymbal discussions.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
def __init__(
    self,
    config: Dict[str, Any],
    api_key: str,
    brand_config: Dict[str, Any],
    analysis_categories: Dict[str, Any]
):
    """
    Initialize the Sabian analyzer.

    Args:
        config: Agent configuration (model/temperature consumed by BaseAgent).
        api_key: OpenAI API key.
        brand_config: Brand configuration (products, competitors).
        analysis_categories: Category definitions used to constrain outputs.
    """
    super().__init__("SabianAnalyzerAgent", config)

    # Store raw configuration for prompt building and validation.
    self.api_key = api_key
    self.brand_config = brand_config
    self.analysis_categories = analysis_categories

    # LLM client; model/temperature come from BaseAgent-parsed config.
    self.llm = ChatOpenAI(model=self.model, temperature=self.temperature, api_key=self.api_key)

    # Pre-compute valid values once so every analysis call can validate cheaply.
    self._valid_values = self._compute_valid_values()
    logger.info("SabianAnalyzerAgent initialized")
|
| 44 |
+
|
| 45 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
    """Return True when the analyzer's mandatory keys are present."""
    return "cleaned_content" in input_data and "is_relevant" in input_data
|
| 48 |
+
|
| 49 |
+
def _compute_valid_values(self) -> Dict[str, List[str]]:
|
| 50 |
+
"""Pre-compute all valid values from config for validation."""
|
| 51 |
+
valid = {}
|
| 52 |
+
|
| 53 |
+
# Products from brand config
|
| 54 |
+
valid["products"] = self.brand_config.get("brand", {}).get("products", [])
|
| 55 |
+
|
| 56 |
+
# Competitors
|
| 57 |
+
competitor_names = []
|
| 58 |
+
for comp in self.brand_config.get("brand", {}).get("competitors", []):
|
| 59 |
+
if isinstance(comp, dict):
|
| 60 |
+
competitor_names.append(comp.get("name", ""))
|
| 61 |
+
valid["competitors"] = competitor_names
|
| 62 |
+
|
| 63 |
+
# Extract category values from analysis_categories
|
| 64 |
+
category_map = {
|
| 65 |
+
"author_role": "author_role",
|
| 66 |
+
"sabian_mention_context": "sabian_mention_context",
|
| 67 |
+
"sentiment_level": "sentiment",
|
| 68 |
+
"emotion_type": "emotions",
|
| 69 |
+
"intents": "intents",
|
| 70 |
+
"purchase_stage": "purchase_stage",
|
| 71 |
+
"comparison_type": "comparison_type",
|
| 72 |
+
"feedback_aspects": "feedback_aspects",
|
| 73 |
+
"decision_drivers": "decision_drivers",
|
| 74 |
+
"product_attributes": "product_attributes",
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
for key, config_key in category_map.items():
|
| 78 |
+
config_section = self.analysis_categories.get(config_key, {})
|
| 79 |
+
if "categories" in config_section:
|
| 80 |
+
valid[key] = [c["value"] for c in config_section["categories"]]
|
| 81 |
+
elif "levels" in config_section:
|
| 82 |
+
valid[key] = [c["value"] for c in config_section["levels"]]
|
| 83 |
+
else:
|
| 84 |
+
valid[key] = []
|
| 85 |
+
|
| 86 |
+
return valid
|
| 87 |
+
|
| 88 |
+
def _get_category_list(self, key: str) -> List[str]:
|
| 89 |
+
"""Get list of valid values for a category."""
|
| 90 |
+
return self._valid_values.get(key, [])
|
| 91 |
+
|
| 92 |
+
def _build_system_prompt(self) -> str:
    """Build the system prompt for the brand-analysis LLM call.

    Embeds every allowed value list (products, competitors and the
    category vocabularies pre-computed into ``self._valid_values``) so the
    model is constrained to values this agent can validate downstream.

    Returns:
        The complete system prompt as one formatted string.
    """
    brand = self.brand_config.get("brand", {})
    brand_name = brand.get("name", "Sabian")  # fallback label if config lacks a name
    products = brand.get("products", [])

    # Competitor entries are dicts in config; only their display names are embedded.
    competitors = [c.get("name", "") for c in brand.get("competitors", []) if isinstance(c, dict)]

    # Pre-computed vocabularies; .get() below keeps the prompt valid even if a key is absent.
    v = self._valid_values

    return f"""You are a brand analyst extracting insights from forum posts about {brand_name} cymbals.

## STRICT RULES
1. Extract ONLY from POST CONTENT, never from quoted/context text
2. Use ONLY values from the lists below - return null/[] if no match
3. Sentiment must be about {brand_name} specifically, NOT overall post tone
4. pain_points/delight_factors use SAME value list (feedback_aspects) - classification determines positive vs negative

## VALID VALUES

**{brand_name} Products:** {products}
**Competitors:** {competitors}

| Field | Valid Values |
|-------|--------------|
| author_role | {v.get('author_role', [])} |
| sabian_mention_context | {v.get('sabian_mention_context', [])} |
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |

## KEY DISTINCTIONS

**Sentiment vs Intent:**
- sentiment_level = How author FEELS about {brand_name} (positive/negative/neutral)
- praising/criticizing intent = Author is actively ENDORSING or WARNING others

**Author-only fields (null if giving advice to others):**
- purchase_stage, decision_drivers, pain_points, delight_factors

**Example - Sabian-specific sentiment:**
Post: "Love my new drum kit! The SBR cymbals sound terrible though."
- Overall post: positive (happy about kit)
- {brand_name} sentiment: NEGATIVE (dislikes SBR sound)
- pain_points: ["sound_quality"]

## OUTPUT JSON
```json
{{
    "author_role": "value from list",
    "sabian_mention_context": "value from list",
    "sentiment_level": "value from list",
    "emotion_type": "value or null",
    "sentiment_confidence": "high|medium|low",
    "sarcasm_detected": false,
    "products_mentioned": [],
    "product_attributes": [],
    "competitors_mentioned": [],
    "competitor_products_owned": [],
    "comparison_type": "value or null",
    "intents": [],
    "purchase_stage": "value or null",
    "decision_drivers": [],
    "pain_points": [],
    "delight_factors": [],
    "analysis_notes": "1-2 sentences on key {brand_name}-specific insights"
}}
```

Return ONLY valid JSON."""
|
| 168 |
+
|
| 169 |
+
def analyze_post(
    self,
    content: str,
    thread_context: str = "",
    quoted_content: str = ""
) -> Dict[str, Any]:
    """Run the LLM brand analysis over a single post.

    Context strings are included for understanding only; the system prompt
    instructs the model to extract exclusively from *content*.
    """
    brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

    # Assemble the optional context preamble, truncating long pieces.
    context_parts = []
    if thread_context:
        context_parts.append(f"[Thread: {thread_context[:200]}] ")
    if quoted_content:
        context_parts.append(f"[Replying to: {quoted_content[:200]}...]")
    context_str = "".join(context_parts)

    user_prompt = f"""Analyze this post about {brand_name}.

CONTEXT (for understanding only, DO NOT extract from): {context_str or "None"}

POST CONTENT (extract from THIS only):
"{content}"

Return JSON only."""

    try:
        reply = self.llm.invoke([
            SystemMessage(content=self._build_system_prompt()),
            HumanMessage(content=user_prompt),
        ])
        parsed = self._parse_llm_json_response(reply.content)
        return {"success": True, **self._validate_and_normalize(parsed)}

    except json.JSONDecodeError as e:
        self.log_processing(f"JSON decode error: {e}", "warning")
        # Fall back to a neutral record so downstream code still has usable fields.
        return {
            "success": False,
            "error": f"JSON parse error: {str(e)}",
            "sentiment_level": "neutral",
            "intents": ["general_discussion"]
        }
    except Exception as e:
        self.log_processing(f"Analysis error: {e}", "error")
        return {"success": False, "error": str(e)}
|
| 216 |
+
|
| 217 |
+
def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
|
| 218 |
+
"""Validate single value against list, return canonical form or default."""
|
| 219 |
+
if value is None:
|
| 220 |
+
return default
|
| 221 |
+
if isinstance(value, str):
|
| 222 |
+
val_lower = value.lower()
|
| 223 |
+
for v in valid_list:
|
| 224 |
+
if v.lower() == val_lower:
|
| 225 |
+
return v
|
| 226 |
+
return default
|
| 227 |
+
|
| 228 |
+
def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
|
| 229 |
+
"""Validate list values, return only valid items in canonical form."""
|
| 230 |
+
if not values:
|
| 231 |
+
return []
|
| 232 |
+
if not isinstance(values, list):
|
| 233 |
+
values = [values]
|
| 234 |
+
|
| 235 |
+
validated = []
|
| 236 |
+
valid_lower = {v.lower(): v for v in valid_list}
|
| 237 |
+
for val in values:
|
| 238 |
+
if isinstance(val, str) and val.lower() in valid_lower:
|
| 239 |
+
validated.append(valid_lower[val.lower()])
|
| 240 |
+
return validated
|
| 241 |
+
|
| 242 |
+
def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
    """Validate all fields against predefined values and normalize.

    Every LLM-supplied field is clamped to the pre-computed vocabularies in
    ``self._valid_values``: single-valued fields fall back to a per-field
    default, list fields keep only recognized items (canonical casing).

    Args:
        result: Parsed JSON dict returned by the LLM.

    Returns:
        Dict with exactly the normalized analysis fields (no passthrough keys).
    """
    v = self._valid_values

    normalized = {
        # Classification
        "author_role": self._validate_single(
            result.get("author_role"), v["author_role"], "unknown"
        ),
        "sabian_mention_context": self._validate_single(
            result.get("sabian_mention_context"), v["sabian_mention_context"], "casual_mention"
        ),

        # Sentiment
        "sentiment_level": self._validate_single(
            result.get("sentiment_level"), v["sentiment_level"], "neutral"
        ),
        "emotion_type": self._validate_single(
            result.get("emotion_type"), v["emotion_type"], None
        ),
        # Confidence is passed through unvalidated — TODO confirm allowed values are enforced upstream.
        "sentiment_confidence": result.get("sentiment_confidence", "medium"),
        "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

        # Products
        "products_mentioned": self._validate_list(
            result.get("products_mentioned"), v["products"]
        ),
        "product_attributes": self._validate_list(
            result.get("product_attributes"), v["product_attributes"]
        ),

        # Competitors (both fields validate against competitor brand names)
        "competitors_mentioned": self._validate_list(
            result.get("competitors_mentioned"), v["competitors"]
        ),
        "competitor_products_owned": self._validate_list(
            result.get("competitor_products_owned"), v["competitors"]
        ),
        "comparison_type": self._validate_single(
            result.get("comparison_type"), v["comparison_type"], None
        ),

        # Intents — an empty/unrecognized list falls back to a generic intent
        "intents": self._validate_list(
            result.get("intents"), v["intents"]
        ) or ["general_discussion"],

        # Author journey (null if advising others)
        "purchase_stage": self._validate_single(
            result.get("purchase_stage"), v["purchase_stage"], None
        ),
        "decision_drivers": self._validate_list(
            result.get("decision_drivers"), v["decision_drivers"]
        ),

        # Feedback - both use the shared feedback_aspects vocabulary
        "pain_points": self._validate_list(
            result.get("pain_points"), v["feedback_aspects"]
        ),
        "delight_factors": self._validate_list(
            result.get("delight_factors"), v["feedback_aspects"]
        ),

        # Notes
        "analysis_notes": result.get("analysis_notes", ""),
    }

    # Log items the validators rejected, to help tune prompts/config.
    for field in ["products_mentioned", "product_attributes", "pain_points", "delight_factors"]:
        original = result.get(field, [])
        if isinstance(original, list) and len(original) > len(normalized[field]):
            filtered = set(str(x) for x in original) - set(normalized[field])
            if filtered:
                logger.debug(f"Filtered invalid {field}: {filtered}")

    return normalized
|
| 318 |
+
|
| 319 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """Process a post through brand analysis.

    Flow: validate input -> short-circuit non-relevant and non-English
    posts with a skip record -> otherwise call :meth:`analyze_post` and
    merge its output over the input payload.

    Args:
        input_data: Post dict; must contain "cleaned_content" and
            "is_relevant" (checked by ``validate_input``).

    Returns:
        Input dict augmented with analysis fields (or skip markers).
    """
    try:
        if not self.validate_input(input_data):
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                **input_data
            }

        # Skip non-relevant posts
        # NOTE(review): **input_data is spread LAST, so any same-named key
        # already present in the input overrides the None/[] skip values —
        # confirm this precedence is intended.
        if not input_data.get("is_relevant", False):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": "Post marked as not relevant",
                "author_role": None,
                "sabian_mention_context": None,
                "sentiment_level": None,
                "emotion_type": None,
                "products_mentioned": [],
                "competitors_mentioned": [],
                "competitor_products_owned": [],
                "intents": [],
                "purchase_stage": None,
                "decision_drivers": [],
                "pain_points": [],
                "delight_factors": [],
                **input_data
            }

        # Skip non-English posts (missing flag defaults to English)
        if not input_data.get("is_english", True):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                "author_role": None,
                "sabian_mention_context": None,
                "sentiment_level": None,
                "emotion_type": None,
                "intents": [],
                "competitor_products_owned": [],
                **input_data
            }

        # Perform analysis
        analysis_result = self.analyze_post(
            content=input_data.get("cleaned_content", ""),
            thread_context=input_data.get("thread_context", ""),
            quoted_content=input_data.get("quoted_content", "")
        )

        # Analysis fields win over input fields on key collisions here.
        result = {
            **input_data,
            **analysis_result,
            "analysis_skipped": False
        }

        self.log_processing(
            f"Analyzed: sentiment={result.get('sentiment_level')}, "
            f"products={len(result.get('products_mentioned', []))}, "
            f"intents={result.get('intents', [])}",
            "debug"
        )

        return result

    except Exception as e:
        return self.handle_error(e, "brand analysis")
|
processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Relevance & Extraction Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs two critical functions:
|
| 5 |
+
1. Determines relevance with HIGH confidence using strict rules
|
| 6 |
+
2. Extracts verifiable facts (products, author role, context summary)
|
| 7 |
+
|
| 8 |
+
Key Design Principles:
|
| 9 |
+
- Strict product matching: ONLY return products from predefined list
|
| 10 |
+
- Competitor awareness: Know what products belong to competitors
|
| 11 |
+
- Conservative relevance: When uncertain, mark as NOT relevant
|
| 12 |
+
- Thread context summarization: Provide clean, concise context for next agent
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from typing import Dict, Any, List
|
| 16 |
+
import json
|
| 17 |
+
from langchain_openai import ChatOpenAI
|
| 18 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
from .base_agent import BaseAgent
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SabianRelevanceExtractionAgent(BaseAgent):
|
| 27 |
+
"""
|
| 28 |
+
Agent that validates relevance and extracts key facts from posts.
|
| 29 |
+
|
| 30 |
+
This agent is the first LLM call in the pipeline and serves as the
|
| 31 |
+
gatekeeper for relevance while also extracting structured information
|
| 32 |
+
for downstream analysis.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(
    self,
    config: Dict[str, Any],
    api_key: str,
    brand_config: Dict[str, Any],
    analysis_categories: Dict[str, Any]
) -> None:
    """
    Initialize the Relevance & Extraction Agent.

    Args:
        config: Agent configuration
        api_key: OpenAI API key
        brand_config: Brand-specific configuration with products and competitors
        analysis_categories: Category definitions for validation
    """
    # BaseAgent is expected to set self.model / self.temperature from
    # config — TODO confirm against base_agent.py.
    super().__init__("SabianRelevanceExtractionAgent", config)
    self.api_key = api_key
    self.brand_config = brand_config
    self.analysis_categories = analysis_categories

    self.llm = ChatOpenAI(
        model=self.model,
        temperature=self.temperature,
        api_key=self.api_key
    )

    # Pre-compute valid values and competitor-product lookup used by the
    # prompt builder and response validator.
    self._build_valid_values()
    self._build_competitor_product_warnings()

    logger.info("SabianRelevanceExtractionAgent initialized")
|
| 67 |
+
|
| 68 |
+
def _build_valid_values(self) -> None:
|
| 69 |
+
"""Build valid value lists for validation."""
|
| 70 |
+
brand = self.brand_config.get("brand", {})
|
| 71 |
+
|
| 72 |
+
# Products
|
| 73 |
+
self.valid_products = brand.get("products", [])
|
| 74 |
+
|
| 75 |
+
# Competitors (brand names only)
|
| 76 |
+
self.valid_competitors = []
|
| 77 |
+
for comp in brand.get("competitors", []):
|
| 78 |
+
if isinstance(comp, dict):
|
| 79 |
+
self.valid_competitors.append(comp.get("name", ""))
|
| 80 |
+
else:
|
| 81 |
+
self.valid_competitors.append(str(comp))
|
| 82 |
+
|
| 83 |
+
# Author roles from categories
|
| 84 |
+
author_role_config = self.analysis_categories.get("author_role", {})
|
| 85 |
+
self.valid_author_roles = [
|
| 86 |
+
c["value"] for c in author_role_config.get("categories", [])
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# Sabian mention context from categories
|
| 90 |
+
mention_context_config = self.analysis_categories.get("sabian_mention_context", {})
|
| 91 |
+
self.valid_mention_contexts = [
|
| 92 |
+
c["value"] for c in mention_context_config.get("categories", [])
|
| 93 |
+
]
|
| 94 |
+
|
| 95 |
+
def _build_competitor_product_warnings(self) -> None:
|
| 96 |
+
"""Build list of competitor products to warn about in prompts."""
|
| 97 |
+
warnings = self.brand_config.get("brand", {}).get("competitor_products_warning", {})
|
| 98 |
+
|
| 99 |
+
self.competitor_products_by_brand = {}
|
| 100 |
+
for key, products in warnings.items():
|
| 101 |
+
if key == "description":
|
| 102 |
+
continue
|
| 103 |
+
# Extract brand name from key (e.g., "paiste_products" -> "Paiste")
|
| 104 |
+
brand_name = key.replace("_products", "").capitalize()
|
| 105 |
+
self.competitor_products_by_brand[brand_name] = products
|
| 106 |
+
|
| 107 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
    """Return True when the post payload carries the field this agent reads."""
    return "cleaned_content" in input_data
|
| 111 |
+
|
| 112 |
+
def _build_system_prompt(self) -> str:
    """Build the system prompt for relevance and extraction.

    Embeds the allowed product list, competitor brands, and a per-brand
    sample of competitor products so the model can distinguish them from
    the target brand's products.

    Returns:
        The complete system prompt as one formatted string.
    """
    brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

    # Build competitor product warnings — show at most 5 example products
    # per brand to keep the prompt short.
    competitor_warnings = []
    for brand, products in self.competitor_products_by_brand.items():
        products_str = ", ".join(f'"{p}"' for p in products[:5])  # Limit to 5 examples
        if len(products) > 5:
            products_str += f" (and {len(products)-5} more)"
        competitor_warnings.append(f"- {brand}: {products_str}")

    competitor_warnings_text = "\n".join(competitor_warnings) if competitor_warnings else "None specified"

    return f"""You are a brand mention extractor for {brand_name} cymbals. Your job is to:
1. Determine if the POST CONTENT discusses {brand_name} products or brand
2. Extract ONLY verifiable facts, not interpretations

## CRITICAL RULES

### Rule 1: Relevance Based on POST CONTENT Only
- The post is relevant ONLY if the POST CONTENT itself mentions {brand_name} brand or products
- Quoted/parent content mentioning {brand_name} does NOT make the post relevant
- Generic replies ("Thanks!", "Got it!", "Good point!") are NEVER relevant
- Posts can be relevant even without specific product mentions if they discuss the {brand_name} brand

### Rule 2: Strict Product Matching
{brand_name.upper()} PRODUCTS (use ONLY these exact values):
{self.valid_products}

CRITICAL:
- Return ONLY products from this exact list above
- If you see a product not in this list, do NOT include it
- Return empty list [] if no products from the list are mentioned
- It's OK to have empty products_mentioned if the post discusses {brand_name} brand generally

### Rule 3: Competitor Product Awareness
These products belong to COMPETITORS, NOT {brand_name}:
{competitor_warnings_text}

COMPETITOR BRANDS: {self.valid_competitors}
- Only return competitor BRAND names in competitors_mentioned (not their products)
- If you see "2002", "Signature", "Sound Edge", "Formula 602" - these are PAISTE, not {brand_name}
- If you see "K Custom", "A Custom" - these are ZILDJIAN, not {brand_name}

### Rule 4: Thread Context Summary
- Summarize thread context in 1-2 sentences MAXIMUM
- Focus only on what helps understand what the post is responding to
- If thread is about unrelated topics (pizza, general life), say so briefly
- Keep it factual and concise

### Rule 5: Author Role Classification
Determine the author's relationship to {brand_name}:
- current_owner: Currently owns/uses {brand_name} products
- past_owner: Previously owned but sold/replaced
- potential_buyer: Considering purchasing {brand_name}
- never_owned: Explicitly states they don't own {brand_name}
- unknown: Cannot determine from post content

### Rule 6: Mention Context Classification
How prominently is {brand_name} discussed IN THE POST CONTENT:
- primary_focus: {brand_name} is the main topic of the post
- significant_mention: {brand_name} discussed with some detail, but not main focus
- casual_mention: Brief mention among other topics
- comparison_context: Mentioned while comparing to competitors
- null: Not relevant (use when is_relevant=false)

## OUTPUT FORMAT
Return ONLY valid JSON with these exact fields:
```json
{{
    "is_relevant": true/false,
    "relevance_confidence": "high" | "medium" | "low",
    "relevance_reason": "1-2 sentences explaining your decision",
    "products_mentioned": [],
    "sabian_mention_context": "value from list" | null,
    "author_role": "value from list",
    "competitors_mentioned": [],
    "thread_context_summary": "1-2 sentence summary of thread context"
}}
```

IMPORTANT: Return ONLY the JSON object, no additional text."""
|
| 195 |
+
|
| 196 |
+
def _build_user_prompt(
|
| 197 |
+
self,
|
| 198 |
+
content: str,
|
| 199 |
+
quoted_content: str,
|
| 200 |
+
raw_thread_context: str,
|
| 201 |
+
keywords_found: List[str]
|
| 202 |
+
) -> str:
|
| 203 |
+
"""Build the user prompt with post content and context."""
|
| 204 |
+
brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
|
| 205 |
+
|
| 206 |
+
context_section = ""
|
| 207 |
+
if raw_thread_context:
|
| 208 |
+
# Truncate if too long
|
| 209 |
+
truncated_context = raw_thread_context[:1000] if len(raw_thread_context) > 1000 else raw_thread_context
|
| 210 |
+
context_section += f"THREAD CONTEXT (for understanding only):\n{truncated_context}\n\n"
|
| 211 |
+
|
| 212 |
+
if quoted_content:
|
| 213 |
+
truncated_quote = quoted_content[:500] if len(quoted_content) > 500 else quoted_content
|
| 214 |
+
context_section += f"QUOTED/PARENT CONTENT (for understanding only):\n{truncated_quote}\n\n"
|
| 215 |
+
|
| 216 |
+
keywords_info = ""
|
| 217 |
+
if keywords_found:
|
| 218 |
+
keywords_info = f"Keywords detected by preprocessor: {', '.join(keywords_found)}\n\n"
|
| 219 |
+
|
| 220 |
+
return f"""Analyze this post for {brand_name} relevance and extract facts.
|
| 221 |
+
|
| 222 |
+
{keywords_info}{context_section}POST CONTENT TO EVALUATE (base your decision ONLY on this):
|
| 223 |
+
\"\"\"{content}\"\"\"
|
| 224 |
+
|
| 225 |
+
Remember:
|
| 226 |
+
- is_relevant=true ONLY if POST CONTENT discusses {brand_name}
|
| 227 |
+
- products_mentioned must be from the exact product list provided
|
| 228 |
+
- competitors_mentioned should be brand names only (Zildjian, Paiste, etc.)
|
| 229 |
+
- thread_context_summary should be 1-2 sentences max
|
| 230 |
+
|
| 231 |
+
Return JSON only."""
|
| 232 |
+
|
| 233 |
+
def extract_and_validate(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run the relevance check + fact extraction LLM call for one post.

    Args:
        input_data: Preprocessed post data

    Returns:
        Dictionary with extraction results; on failure, a safe
        not-relevant payload carrying an "error" field.
    """
    try:
        user_prompt = self._build_user_prompt(
            input_data.get("cleaned_content", ""),
            input_data.get("quoted_content", ""),
            input_data.get("raw_thread_context", ""),
            input_data.get("relevance_keywords_found", []),
        )
        reply = self.llm.invoke([
            SystemMessage(content=self._build_system_prompt()),
            HumanMessage(content=user_prompt),
        ])
        parsed = self._parse_llm_json_response(reply.content)
        # Clamp every extracted field to the configured value lists.
        return {"success": True, **self._validate_response(parsed)}

    except json.JSONDecodeError as e:
        self.log_processing(f"JSON decode error: {e}", "warning")
        return {
            "success": False,
            "error": f"JSON parse error: {str(e)}",
            "is_relevant": False,
            "relevance_confidence": "low",
            "relevance_reason": "Failed to parse LLM response"
        }
    except Exception as e:
        self.log_processing(f"Extraction error: {e}", "error")
        return {
            "success": False,
            "error": str(e),
            "is_relevant": False,
            "relevance_confidence": "low",
            "relevance_reason": f"Error during extraction: {str(e)}"
        }
|
| 286 |
+
|
| 287 |
+
def _validate_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
|
| 288 |
+
"""Validate and normalize LLM response against allowed values."""
|
| 289 |
+
|
| 290 |
+
# Validate products
|
| 291 |
+
products = result.get("products_mentioned", [])
|
| 292 |
+
if not isinstance(products, list):
|
| 293 |
+
products = []
|
| 294 |
+
valid_products = [
|
| 295 |
+
p for p in products
|
| 296 |
+
if any(p.lower() == vp.lower() for vp in self.valid_products)
|
| 297 |
+
]
|
| 298 |
+
# Normalize to canonical case
|
| 299 |
+
normalized_products = []
|
| 300 |
+
for p in valid_products:
|
| 301 |
+
for vp in self.valid_products:
|
| 302 |
+
if p.lower() == vp.lower():
|
| 303 |
+
normalized_products.append(vp)
|
| 304 |
+
break
|
| 305 |
+
|
| 306 |
+
# Validate competitors
|
| 307 |
+
competitors = result.get("competitors_mentioned", [])
|
| 308 |
+
if not isinstance(competitors, list):
|
| 309 |
+
competitors = []
|
| 310 |
+
valid_competitors = [
|
| 311 |
+
c for c in competitors
|
| 312 |
+
if any(c.lower() == vc.lower() for vc in self.valid_competitors)
|
| 313 |
+
]
|
| 314 |
+
# Normalize to canonical case
|
| 315 |
+
normalized_competitors = []
|
| 316 |
+
for c in valid_competitors:
|
| 317 |
+
for vc in self.valid_competitors:
|
| 318 |
+
if c.lower() == vc.lower():
|
| 319 |
+
normalized_competitors.append(vc)
|
| 320 |
+
break
|
| 321 |
+
|
| 322 |
+
# Validate author_role
|
| 323 |
+
author_role = result.get("author_role", "unknown")
|
| 324 |
+
if author_role not in self.valid_author_roles:
|
| 325 |
+
author_role = "unknown"
|
| 326 |
+
|
| 327 |
+
# Validate sabian_mention_context
|
| 328 |
+
mention_context = result.get("sabian_mention_context")
|
| 329 |
+
is_relevant = result.get("is_relevant", False)
|
| 330 |
+
|
| 331 |
+
if not is_relevant:
|
| 332 |
+
mention_context = None
|
| 333 |
+
elif mention_context and mention_context not in self.valid_mention_contexts:
|
| 334 |
+
mention_context = "casual_mention" # Default for relevant posts
|
| 335 |
+
|
| 336 |
+
# Validate confidence
|
| 337 |
+
confidence = result.get("relevance_confidence", "medium")
|
| 338 |
+
if confidence not in ["high", "medium", "low"]:
|
| 339 |
+
confidence = "medium"
|
| 340 |
+
|
| 341 |
+
return {
|
| 342 |
+
"is_relevant": bool(is_relevant),
|
| 343 |
+
"relevance_confidence": confidence,
|
| 344 |
+
"relevance_reason": result.get("relevance_reason", ""),
|
| 345 |
+
"products_mentioned": normalized_products,
|
| 346 |
+
"sabian_mention_context": mention_context,
|
| 347 |
+
"author_role": author_role,
|
| 348 |
+
"competitors_mentioned": normalized_competitors,
|
| 349 |
+
"thread_context_summary": result.get("thread_context_summary", "")
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a post through relevance validation and fact extraction.

    Args:
        input_data: Dictionary from preprocessor containing:
            - cleaned_content: Cleaned post text
            - quoted_content: Quoted content if any
            - raw_thread_context: Raw thread context
            - relevance_keywords_found: Keywords from preprocessor
            - preliminary_relevant: Preprocessor's relevance assessment
            - needs_relevance_validation: Whether LLM validation needed

    Returns:
        Dictionary with extraction results merged over the original data.
        NOTE: status fields set here (``success``, ``is_relevant``, ...) are
        written AFTER ``**input_data`` is spread, so stale copies of those
        keys carried in the pipeline state (e.g. ``success: True`` from the
        preprocessor) can no longer clobber this agent's results.
    """
    try:
        if not self.validate_input(input_data):
            # Spread input first: the failure status below must win over
            # any "success" flag a previous agent left in the state.
            return {
                **input_data,
                "success": False,
                "error": "Invalid input: missing required fields",
                "is_relevant": False,
            }

        # Skip if already determined not relevant and no validation needed
        if (not input_data.get("preliminary_relevant", False) and
                not input_data.get("needs_relevance_validation", False)):
            return {
                **input_data,
                "success": True,
                "is_relevant": False,
                "relevance_confidence": "high",
                "relevance_reason": "No Sabian-related keywords found in post",
                "products_mentioned": [],
                "sabian_mention_context": None,
                "author_role": "unknown",
                # Keep the preprocessor's keyword-based competitor detections.
                "competitors_mentioned": input_data.get("competitors_detected", []),
                "thread_context_summary": "",
                "extraction_performed": False,
            }

        # Skip non-English posts
        if not input_data.get("is_english", True):
            return {
                **input_data,
                "success": True,
                "is_relevant": False,
                "relevance_confidence": "high",
                "relevance_reason": f"Non-English post: {input_data.get('detected_language')}",
                "products_mentioned": [],
                "sabian_mention_context": None,
                "author_role": "unknown",
                "competitors_mentioned": [],
                "thread_context_summary": "",
                "extraction_performed": False,
            }

        # Perform LLM extraction
        extraction_result = self.extract_and_validate(input_data)

        # Merge results (extraction output intentionally overrides input)
        result = {
            **input_data,
            **extraction_result,
            "extraction_performed": True
        }

        # Log the result
        self.log_processing(
            f"Extraction complete: is_relevant={result.get('is_relevant')}, "
            f"products={result.get('products_mentioned')}, "
            f"context={result.get('sabian_mention_context')}",
            "debug"
        )

        return result

    except Exception as e:
        return self.handle_error(e, "relevance extraction")
|
processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Sentiment & Intent Analyzer Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs deep analysis on VERIFIED relevant posts with STRUCTURED input.
|
| 5 |
+
It receives pre-validated data from the Relevance Extraction Agent including:
|
| 6 |
+
- Products already extracted and validated
|
| 7 |
+
- Thread context already summarized
|
| 8 |
+
- Author role already determined
|
| 9 |
+
|
| 10 |
+
Key Design Principles:
|
| 11 |
+
- Focused analysis: Only sentiment, intents, and customer journey
|
| 12 |
+
- No re-extraction: Products are given, not re-detected
|
| 13 |
+
- Sabian-specific sentiment: How author feels about Sabian, not overall post tone
|
| 14 |
+
- Author perspective: Pain points/delights only from author's own experience
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List
|
| 18 |
+
import json
|
| 19 |
+
from langchain_openai import ChatOpenAI
|
| 20 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 21 |
+
import logging
|
| 22 |
+
|
| 23 |
+
from .base_agent import BaseAgent
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SabianSentimentAnalyzerAgent(BaseAgent):
    """
    Agent that performs deep sentiment and intent analysis on relevant posts.

    This agent is the second LLM call in the pipeline and focuses purely on
    analysis, not extraction. It receives structured input from the extraction
    agent and produces sentiment, intent, and customer journey insights.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        api_key: str,
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Sentiment Analyzer Agent.

        Args:
            config: Agent configuration (model/temperature are read via
                BaseAgent attributes — assumed set there; TODO confirm).
            api_key: OpenAI API key
            brand_config: Brand-specific configuration
            analysis_categories: Category definitions for analysis
        """
        super().__init__("SabianSentimentAnalyzerAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Pre-compute valid values once so per-post validation never
        # re-reads the config dictionaries.
        self._valid_values = self._compute_valid_values()

        logger.info("SabianSentimentAnalyzerAgent initialized")

    def _compute_valid_values(self) -> Dict[str, List[str]]:
        """Pre-compute all valid values from config for validation."""
        valid = {}

        # Products from brand config
        valid["products"] = self.brand_config.get("brand", {}).get("products", [])

        # Competitors (config entries are dicts with a "name" key)
        competitor_names = []
        for comp in self.brand_config.get("brand", {}).get("competitors", []):
            if isinstance(comp, dict):
                competitor_names.append(comp.get("name", ""))
        valid["competitors"] = competitor_names

        # Extract category values from analysis_categories.
        # Maps output-field name -> section key in the config file.
        category_map = {
            "sentiment_level": "sentiment",
            "emotion_type": "emotions",
            "intents": "intents",
            "purchase_stage": "purchase_stage",
            "comparison_type": "comparison_type",
            "feedback_aspects": "feedback_aspects",
            "decision_drivers": "decision_drivers",
            "product_attributes": "product_attributes",
        }

        for key, config_key in category_map.items():
            config_section = self.analysis_categories.get(config_key, {})
            # Sections store options either under "categories" or "levels".
            if "categories" in config_section:
                valid[key] = [c["value"] for c in config_section["categories"]]
            elif "levels" in config_section:
                valid[key] = [c["value"] for c in config_section["levels"]]
            else:
                valid[key] = []

        return valid

    def _get_valid_list(self, key: str) -> List[str]:
        """Get list of valid values for a category."""
        return self._valid_values.get(key, [])

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Validate that input contains required fields."""
        required = ["cleaned_content", "is_relevant"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build optimized system prompt for sentiment analysis."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        v = self._valid_values

        return f"""You are a sentiment analyst for {brand_name} cymbal discussions.

## YOUR TASK
Analyze the sentiment, emotions, and intents in posts about {brand_name}.
You will receive PRE-VALIDATED context (products, author role, etc.) - trust these values.

## CRITICAL RULES

### Rule 1: Neutral by Default
Sentiment defaults to NEUTRAL unless there is EXPLICIT positive or negative language toward {brand_name}.
- Factual statements = neutral
- Comparative statements ("sounds different", "not the same as") = neutral (different ≠ worse)
- Advice-giving without personal opinion = neutral

Only assign positive/negative sentiment when the author CLEARLY expresses satisfaction or dissatisfaction with {brand_name}.

### Rule 2: {brand_name}-Specific Sentiment
Sentiment MUST be about {brand_name} specifically, NOT overall post tone or other products.

EXAMPLE:
Post: "I have SBR cymbals and bought a Pearl crash. The Pearl sounds different from the SBR. Go with what feels best!"
- This is NEUTRAL toward {brand_name} - "different" is not criticism
- The author owns SBR (no complaint), is giving advice
- pain_points: [] (no negative experience expressed)
- delight_factors: [] (no positive experience expressed)

### Rule 3: Mutually Exclusive Feedback
pain_points and delight_factors CANNOT contain the same values.
- If an aspect is positive → delight_factors only
- If an aspect is negative → pain_points only
- Never both

### Rule 4: Author Perspective Only
These fields are ONLY for author's OWN experience, not advice to others:
- purchase_stage, decision_drivers, pain_points, delight_factors

If author is primarily giving ADVICE to someone else, these should be null/empty.

### Rule 5: Valid Values

| Field | Valid Values |
|-------|--------------|
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi-select) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |
| competitor brands | {v.get('competitors', [])} |

### Rule 6: Intent Classification
- seeking_information: Asking questions, seeking advice
- providing_information: Answering questions, giving advice
- sharing_experience: Personal experience, review, testimonial
- comparing: Comparing brands/products
- praising: Actively endorsing {brand_name}
- criticizing: Actively complaining about {brand_name}
- buying_selling: Listing gear for sale/trade
- general_discussion: General conversation

## OUTPUT FORMAT
```json
{{
  "sentiment_level": "neutral unless explicit positive/negative",
  "emotion_type": "value or null",
  "sentiment_confidence": "high" | "medium" | "low",
  "sarcasm_detected": false,
  "product_attributes": [],
  "competitor_products_owned": [],
  "comparison_type": "value or null",
  "intents": [],
  "purchase_stage": "value or null",
  "decision_drivers": [],
  "pain_points": [],
  "delight_factors": [],
  "analysis_notes": "1-2 sentences"
}}
```

Return ONLY valid JSON."""

    def _build_user_prompt(self, input_data: Dict[str, Any]) -> str:
        """Build user prompt with structured context."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        content = input_data.get("cleaned_content", "")
        products_mentioned = input_data.get("products_mentioned", [])
        sabian_context = input_data.get("sabian_mention_context", "")
        author_role = input_data.get("author_role", "unknown")
        thread_summary = input_data.get("thread_context_summary", "")
        competitors_mentioned = input_data.get("competitors_mentioned", [])

        context_section = f"""## PRE-VALIDATED CONTEXT (trust these values)
- Products mentioned: {products_mentioned if products_mentioned else 'None specific'}
- {brand_name} mention context: {sabian_context}
- Author role: {author_role}
- Competitors mentioned: {competitors_mentioned if competitors_mentioned else 'None'}
- Thread summary: {thread_summary if thread_summary else 'Not available'}
"""

        return f"""Analyze this post about {brand_name} for sentiment and intents.

{context_section}
## POST CONTENT TO ANALYZE:
\"\"\"{content}\"\"\"

Remember:
- Sentiment is about {brand_name} ONLY, not overall post tone
- pain_points/delight_factors only from author's OWN experience
- Use only values from the valid lists provided

Return JSON only."""

    def analyze_post(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Perform sentiment and intent analysis.

        Args:
            input_data: Structured data from extraction agent

        Returns:
            Dictionary with analysis results (always contains "success");
            on failure a degraded-but-usable default payload is returned.
        """
        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=self._build_user_prompt(input_data))
            ]

            response = self.llm.invoke(messages)
            # _parse_llm_json_response is provided by BaseAgent (not visible
            # here); assumed to raise json.JSONDecodeError on bad output.
            result = self._parse_llm_json_response(response.content)

            # Validate and normalize against the pre-computed valid values.
            validated = self._validate_and_normalize(result)

            return {"success": True, **validated}

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            # Safe defaults keep the pipeline flowing on a bad LLM response.
            return {
                "success": False,
                "error": f"JSON parse error: {str(e)}",
                "sentiment_level": "neutral",
                "intents": ["general_discussion"]
            }

        except Exception as e:
            self.log_processing(f"Analysis error: {e}", "error")
            return {"success": False, "error": str(e)}

    def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
        """Validate single value against list, return canonical form or default."""
        if value is None:
            return default
        if isinstance(value, str):
            val_lower = value.lower()
            for v in valid_list:
                if v.lower() == val_lower:
                    return v
        # Non-string or unrecognized value -> fall back to the default.
        return default

    def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
        """Validate list values, return only valid items in canonical form."""
        if not values:
            return []
        if not isinstance(values, list):
            values = [values]

        validated = []
        # Case-insensitive lookup mapping back to canonical casing.
        valid_lower = {v.lower(): v for v in valid_list}
        for val in values:
            if isinstance(val, str) and val.lower() in valid_lower:
                validated.append(valid_lower[val.lower()])
        return validated

    def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate all fields against predefined values and normalize."""
        v = self._valid_values

        normalized = {
            # Sentiment
            "sentiment_level": self._validate_single(
                result.get("sentiment_level"), v["sentiment_level"], "neutral"
            ),
            "emotion_type": self._validate_single(
                result.get("emotion_type"), v["emotion_type"], None
            ),
            "sentiment_confidence": result.get("sentiment_confidence", "medium"),
            "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

            # Product info
            "product_attributes": self._validate_list(
                result.get("product_attributes"), v["product_attributes"]
            ),

            # Competitors
            "competitor_products_owned": self._validate_list(
                result.get("competitor_products_owned"), v["competitors"]
            ),
            "comparison_type": self._validate_single(
                result.get("comparison_type"), v["comparison_type"], None
            ),

            # Intents (never empty: fall back to general_discussion)
            "intents": self._validate_list(
                result.get("intents"), v["intents"]
            ) or ["general_discussion"],

            # Author journey (null if advising others)
            "purchase_stage": self._validate_single(
                result.get("purchase_stage"), v["purchase_stage"], None
            ),
            "decision_drivers": self._validate_list(
                result.get("decision_drivers"), v["decision_drivers"]
            ),

            # Feedback - both use feedback_aspects as the value vocabulary
            "pain_points": self._validate_list(
                result.get("pain_points"), v["feedback_aspects"]
            ),
            "delight_factors": self._validate_list(
                result.get("delight_factors"), v["feedback_aspects"]
            ),

            # Notes
            "analysis_notes": result.get("analysis_notes", ""),
        }

        # Validate confidence
        if normalized["sentiment_confidence"] not in ["high", "medium", "low"]:
            normalized["sentiment_confidence"] = "medium"

        return normalized

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post through sentiment and intent analysis.

        Args:
            input_data: Dictionary from extraction agent containing:
                - cleaned_content: Post text
                - is_relevant: Relevance determination
                - products_mentioned: Pre-validated products
                - sabian_mention_context: How Sabian is discussed
                - author_role: Author's relationship to Sabian
                - thread_context_summary: Summarized context
                - competitors_mentioned: Competitor brands

        Returns:
            Dictionary with analysis results merged over the original data.
            Skip/error returns spread ``**input_data`` FIRST so the status
            fields set here cannot be clobbered by stale keys (``success``,
            ``analysis_skipped``, ``sentiment_level``, ...) that earlier
            agents may have left in the pipeline state.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                }

            # Skip non-relevant posts
            if not input_data.get("is_relevant", False):
                return {
                    **input_data,
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": "Post marked as not relevant",
                    "sentiment_level": None,
                    "emotion_type": None,
                    "sentiment_confidence": None,
                    "sarcasm_detected": False,
                    "product_attributes": [],
                    "competitor_products_owned": [],
                    "comparison_type": None,
                    "intents": [],
                    "purchase_stage": None,
                    "decision_drivers": [],
                    "pain_points": [],
                    "delight_factors": [],
                    "analysis_notes": "",
                }

            # Skip non-English posts (should already be filtered, but double-check)
            if not input_data.get("is_english", True):
                return {
                    **input_data,
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                    "sentiment_level": None,
                    "emotion_type": None,
                    "intents": [],
                }

            # Perform analysis
            analysis_result = self.analyze_post(input_data)

            result = {
                **input_data,
                **analysis_result,
                "analysis_skipped": False
            }

            self.log_processing(
                f"Analyzed: sentiment={result.get('sentiment_level')}, "
                f"intents={result.get('intents')}, "
                f"pain_points={result.get('pain_points')}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "sentiment analysis")
|
processing_brand_sentiment/workflow/comment_orchestrator.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comment Analysis Workflow Orchestrator using LangGraph.
|
| 3 |
+
|
| 4 |
+
Coordinates the 4-agent pipeline for social media comments:
|
| 5 |
+
1. CommentPreprocessorAgent - Plain text cleaning, keyword detection (no LLM)
|
| 6 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) [shared]
|
| 7 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) [shared]
|
| 8 |
+
4. OutputValidatorAgent - Rule-based validation (no LLM) [shared]
|
| 9 |
+
|
| 10 |
+
Architecture v4.0:
|
| 11 |
+
- Same analysis pipeline as forums, different preprocessing and state
|
| 12 |
+
- Plain text input (no HTML parsing)
|
| 13 |
+
- Context from social media content metadata and parent comments
|
| 14 |
+
- Comment-specific identifiers (comment_sk, comment_id, platform, etc.)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List, TypedDict, Annotated, Optional
|
| 18 |
+
import operator
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
from langgraph.graph import StateGraph, END
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
from .agents.comment_preprocessor_agent import CommentPreprocessorAgent
|
| 25 |
+
from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 26 |
+
from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 27 |
+
from .agents.output_validator_agent import OutputValidatorAgent
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class CommentAnalysisState(TypedDict):
    """
    State definition for the comment analysis workflow v4.0.

    Uses comment-specific identifiers but shares the same analysis fields
    as the forum workflow for consistent output. Each agent node reads the
    fields of the previous stage and writes its own section below.
    """
    # ============== Source Identifiers (Comment-specific) ==============
    comment_sk: int
    comment_id: str
    platform: str
    comment_timestamp: Any  # type varies by source (datetime or string) — TODO confirm
    author_name: str
    author_id: str
    parent_comment_id: str
    parent_comment_text: str

    # Content metadata (the post/video the comment belongs to)
    content_sk: int
    content_id: str
    content_description: str
    content_title: str
    channel_sk: int
    channel_name: str
    channel_display_name: str

    # ============== Original Content ==============
    comment_text: str
    original_text: str

    # ============== Preprocessor Output ==============
    cleaned_content: str
    quoted_content: str
    has_quote: bool
    quoted_author: str
    raw_thread_context: str  # Comment context (reuses field name for agent compatibility)
    is_empty: bool

    # Language detection
    detected_language: str
    language_code: str
    is_english: bool
    language_confidence: str
    language_detection_skipped: bool

    # Preliminary relevance (keyword-based, no LLM)
    preliminary_relevant: bool
    needs_relevance_validation: bool
    relevance_keywords_found: List[str]
    relevance_type: str
    has_primary_keywords: bool

    # Initial detections (keyword matches, later refined by the LLM agents)
    products_detected: List[str]
    competitors_detected: List[str]

    # ============== Extraction Agent Output (LLM #1) ==============
    is_relevant: bool
    relevance_confidence: str
    relevance_reason: str
    extraction_performed: bool

    # Extracted facts
    products_mentioned: List[str]
    sabian_mention_context: str
    author_role: str
    competitors_mentioned: List[str]
    thread_context_summary: str

    # ============== Sentiment Analyzer Output (LLM #2) ==============
    sentiment_level: str
    emotion_type: str
    sentiment_confidence: str
    sarcasm_detected: bool

    # Product information
    product_attributes: List[str]

    # Competitive intelligence
    competitor_products_owned: List[str]
    comparison_type: str

    # Customer journey (AUTHOR PERSPECTIVE ONLY)
    intents: List[str]
    purchase_stage: str
    decision_drivers: List[str]
    pain_points: List[str]
    delight_factors: List[str]

    # Analysis notes
    analysis_notes: str
    analysis_skipped: bool
    analysis_skip_reason: str

    # ============== Validator Output ==============
    validation_passed: bool
    validation_errors: List[str]
    validation_warnings: List[str]
    validation_flags: List[str]
    processing_status: str

    # ============== Processing Metadata ==============
    # operator.add makes LangGraph concatenate error lists across nodes
    # instead of overwriting them.
    processing_errors: Annotated[List[str], operator.add]
    success: bool
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class CommentAnalysisWorkflow:
    """
    LangGraph-based workflow for comment brand sentiment analysis v4.0.

    Pipeline:
    1. Comment Preprocessor (no LLM) - plain text, comment context
    2. Relevance & Extraction Agent (LLM #1) - shared with forums
    3. Sentiment Analyzer Agent (LLM #2) - shared with forums
    4. Output Validator (no LLM) - shared with forums

    Each node mutates and returns the shared ``CommentAnalysisState`` dict;
    conditional edges skip the LLM stages for empty, non-English, or
    keyword-irrelevant comments so only candidate comments incur LLM cost.
    """

    def __init__(
        self,
        workflow_config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any],
        api_key: str
    ):
        """
        Initialize the workflow with agents and configuration.

        Args:
            workflow_config: Workflow and agent configuration
            brand_config: Brand-specific configuration
            analysis_categories: Analysis category definitions
            api_key: OpenAI API key
        """
        self.workflow_config = workflow_config
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories
        self.api_key = api_key

        # Initialize agents
        self._init_agents()

        # Build the workflow graph (compiled once; reused for every comment)
        self.workflow = self._build_workflow()

        logger.info("CommentAnalysisWorkflow v4.0 initialized successfully")

    def _init_agents(self) -> None:
        """Initialize all agents with their configurations."""
        agents_config = self.workflow_config.get("agents", {})

        # 1. Comment Preprocessor Agent (no LLM) - comment-specific
        preprocessor_config = agents_config.get("preprocessor", {})
        self.preprocessor = CommentPreprocessorAgent(
            preprocessor_config,
            self.brand_config
        )

        # 2. Relevance & Extraction Agent (LLM #1) - shared with forums
        # Falls back to the legacy "relevance_validator" config key when the
        # newer "relevance_extraction" key is absent.
        extraction_config = agents_config.get("relevance_extraction",
            agents_config.get("relevance_validator", {})
        )
        self.extraction_agent = SabianRelevanceExtractionAgent(
            extraction_config,
            self.api_key,
            self.brand_config,
            self.analysis_categories
        )

        # 3. Sentiment Analyzer Agent (LLM #2) - shared with forums
        # Same fallback pattern: "brand_analyzer" is the legacy config key.
        analyzer_config = agents_config.get("sentiment_analyzer",
            agents_config.get("brand_analyzer", {})
        )
        self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
            analyzer_config,
            self.api_key,
            self.brand_config,
            self.analysis_categories
        )

        # 4. Output Validator Agent (no LLM) - shared with forums
        validator_config = agents_config.get("output_validator", {})
        self.output_validator = OutputValidatorAgent(
            validator_config,
            self.brand_config,
            self.analysis_categories
        )

        logger.info("All 4 agents initialized for comment processing")

    def _build_workflow(self) -> StateGraph:
        """
        Build the LangGraph workflow.

        Flow:
        preprocessing -> extraction -> (analysis if relevant) -> validation -> END

        Returns:
            Compiled StateGraph workflow
        """
        workflow = StateGraph(CommentAnalysisState)

        # Add nodes
        workflow.add_node("preprocessing", self._preprocessing_node)
        workflow.add_node("extraction", self._extraction_node)
        workflow.add_node("analysis", self._analysis_node)
        workflow.add_node("validation", self._validation_node)

        # Set entry point
        workflow.set_entry_point("preprocessing")

        # Define edges
        workflow.add_conditional_edges(
            "preprocessing",
            self._route_after_preprocessing,
            {
                "extract": "extraction",
                "skip_to_validation": "validation"
            }
        )

        workflow.add_conditional_edges(
            "extraction",
            self._route_after_extraction,
            {
                "analyze": "analysis",
                "skip_to_validation": "validation"
            }
        )

        workflow.add_edge("analysis", "validation")
        workflow.add_edge("validation", END)

        return workflow.compile()

    def _preprocessing_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Preprocessing node: Plain text cleaning, language detection, keyword check.

        Copies the preprocessor agent's output fields into the shared state
        on success; on failure, appends to ``processing_errors`` and marks
        ``success`` False. Never raises — exceptions are captured in state.
        """
        try:
            input_data = {
                "comment_sk": state.get("comment_sk"),
                "comment_text": state.get("comment_text", ""),
                "content_title": state.get("content_title"),
                "content_description": state.get("content_description"),
                "parent_comment_text": state.get("parent_comment_text")
            }

            result = self.preprocessor.process(input_data)

            if result.get("success", False):
                # Content
                state["cleaned_content"] = result.get("cleaned_content", "")
                state["quoted_content"] = result.get("quoted_content")
                state["has_quote"] = result.get("has_quote", False)
                state["quoted_author"] = result.get("quoted_author")
                state["raw_thread_context"] = result.get("raw_thread_context", "")
                state["is_empty"] = result.get("is_empty", False)
                state["original_text"] = result.get("original_text", state.get("comment_text", ""))

                # Language
                state["detected_language"] = result.get("detected_language", "English")
                state["language_code"] = result.get("language_code", "en")
                state["is_english"] = result.get("is_english", True)
                state["language_confidence"] = result.get("language_confidence", "low")
                state["language_detection_skipped"] = result.get("language_detection_skipped", False)

                # Relevance
                state["preliminary_relevant"] = result.get("preliminary_relevant", False)
                state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
                state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
                state["relevance_type"] = result.get("relevance_type", "none")
                state["has_primary_keywords"] = result.get("has_primary_keywords", False)

                # Detections
                state["products_detected"] = result.get("products_detected", [])
                state["competitors_detected"] = result.get("competitors_detected", [])

                state["success"] = True
            else:
                error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
                state["success"] = False

            logger.debug(f"Preprocessing complete for comment {state.get('comment_sk')}")
            return state

        except Exception as e:
            error_msg = f"Preprocessing node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False
            return state

    def _extraction_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Extraction node: LLM-based relevance validation and fact extraction.
        Reuses the same extraction agent as forums.

        On exception, forces ``is_relevant`` False so the router sends the
        comment straight to validation instead of the analyzer.
        """
        try:
            input_data = {
                "cleaned_content": state.get("cleaned_content", ""),
                "quoted_content": state.get("quoted_content"),
                "raw_thread_context": state.get("raw_thread_context", ""),
                "relevance_keywords_found": state.get("relevance_keywords_found", []),
                "preliminary_relevant": state.get("preliminary_relevant", False),
                "needs_relevance_validation": state.get("needs_relevance_validation", True),
                "products_detected": state.get("products_detected", []),
                "competitors_detected": state.get("competitors_detected", []),
                "is_english": state.get("is_english", True),
                "detected_language": state.get("detected_language", "English")
            }

            result = self.extraction_agent.process(input_data)

            # Update state with extraction results
            state["is_relevant"] = result.get("is_relevant", False)
            state["relevance_confidence"] = result.get("relevance_confidence", "low")
            state["relevance_reason"] = result.get("relevance_reason", "")
            state["extraction_performed"] = result.get("extraction_performed", True)

            # Extracted facts
            state["products_mentioned"] = result.get("products_mentioned", [])
            # NOTE(review): no default here — sabian_mention_context may be None
            # even though the state declares it as str; confirm downstream
            # consumers tolerate None.
            state["sabian_mention_context"] = result.get("sabian_mention_context")
            state["author_role"] = result.get("author_role", "unknown")
            state["competitors_mentioned"] = result.get("competitors_mentioned", [])
            state["thread_context_summary"] = result.get("thread_context_summary", "")

            if not result.get("success", False) and result.get("error"):
                state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]

            logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
            return state

        except Exception as e:
            error_msg = f"Extraction node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["is_relevant"] = False
            state["relevance_confidence"] = "low"
            return state

    def _analysis_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Analysis node: Deep sentiment and intent analysis for relevant comments.
        Reuses the same sentiment analyzer as forums.

        Only reached when the extraction router returned "analyze"
        (i.e. ``is_relevant`` is True).
        """
        try:
            input_data = {
                "cleaned_content": state.get("cleaned_content", ""),
                "is_relevant": state.get("is_relevant", True),
                "is_english": state.get("is_english", True),
                "detected_language": state.get("detected_language", "English"),
                "products_mentioned": state.get("products_mentioned", []),
                "sabian_mention_context": state.get("sabian_mention_context"),
                "author_role": state.get("author_role", "unknown"),
                "competitors_mentioned": state.get("competitors_mentioned", []),
                "thread_context_summary": state.get("thread_context_summary", "")
            }

            result = self.sentiment_analyzer.process(input_data)

            if result.get("success", False):
                # Sentiment
                state["sentiment_level"] = result.get("sentiment_level")
                state["emotion_type"] = result.get("emotion_type")
                state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
                state["sarcasm_detected"] = result.get("sarcasm_detected", False)

                # Products
                state["product_attributes"] = result.get("product_attributes", [])

                # Competitive
                state["competitor_products_owned"] = result.get("competitor_products_owned", [])
                state["comparison_type"] = result.get("comparison_type")

                # Journey
                state["intents"] = result.get("intents", [])
                state["purchase_stage"] = result.get("purchase_stage")
                state["decision_drivers"] = result.get("decision_drivers", [])
                state["pain_points"] = result.get("pain_points", [])
                state["delight_factors"] = result.get("delight_factors", [])

                # Notes
                state["analysis_notes"] = result.get("analysis_notes", "")
                state["analysis_skipped"] = result.get("analysis_skipped", False)
                state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
            else:
                error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]

            logger.debug(f"Analysis complete for comment {state.get('comment_sk')}")
            return state

        except Exception as e:
            error_msg = f"Analysis node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            return state

    def _validation_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Validation node: Rule-based validation and anomaly detection.
        Reuses the same output validator as forums.

        Terminal node for every path (relevant, skipped, or errored).
        """
        try:
            result = self.output_validator.process(dict(state))

            state["validation_passed"] = result.get("validation_passed", True)
            state["validation_errors"] = result.get("validation_errors", [])
            state["validation_warnings"] = result.get("validation_warnings", [])
            state["validation_flags"] = result.get("validation_flags", [])
            state["processing_status"] = result.get("processing_status", "completed")

            # Set overall success
            # NOTE(review): this expression marks success True whenever
            # is_relevant is set (even False), regardless of accumulated
            # errors — verify that is the intended "partial result is still
            # usable" semantics rather than `not has_errors and ...`.
            has_errors = len(state.get("processing_errors", [])) > 0
            state["success"] = not has_errors or state.get("is_relevant") is not None

            logger.debug(f"Validation complete: status={state['processing_status']}")
            return state

        except Exception as e:
            error_msg = f"Validation node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["validation_passed"] = False
            state["processing_status"] = "validation_failed"
            state["success"] = False
            return state

    def _route_after_preprocessing(self, state: CommentAnalysisState) -> str:
        """Determine routing after preprocessing.

        Returns "skip_to_validation" (bypassing both LLM stages) for empty,
        non-English, or keyword-irrelevant comments; "extract" otherwise.
        Side effect: sets is_relevant/relevance_reason when skipping.
        """
        if state.get("is_empty", False):
            state["is_relevant"] = False
            state["relevance_reason"] = "Empty content"
            return "skip_to_validation"

        if not state.get("is_english", True):
            state["is_relevant"] = False
            state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
            return "skip_to_validation"

        if (not state.get("preliminary_relevant", False) and
            not state.get("needs_relevance_validation", False)):
            state["is_relevant"] = False
            state["relevance_reason"] = "No relevant keywords found"
            return "skip_to_validation"

        return "extract"

    def _route_after_extraction(self, state: CommentAnalysisState) -> str:
        """Determine routing after extraction: analyze only relevant comments."""
        if state.get("is_relevant", False):
            return "analyze"
        return "skip_to_validation"

    def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a single social media comment through the workflow.

        Args:
            comment_data: Dictionary containing comment data

        Returns:
            Dictionary with processed results. On workflow failure, returns
            the input data merged with success=False and
            processing_status="workflow_error" instead of raising.
        """
        try:
            initial_state = {
                # Comment identifiers
                "comment_sk": comment_data.get("comment_sk"),
                "comment_id": comment_data.get("comment_id"),
                "platform": comment_data.get("platform"),
                "comment_timestamp": comment_data.get("comment_timestamp"),
                "author_name": comment_data.get("author_name"),
                "author_id": comment_data.get("author_id"),
                "parent_comment_id": comment_data.get("parent_comment_id"),
                "parent_comment_text": comment_data.get("parent_comment_text"),

                # Content metadata
                "content_sk": comment_data.get("content_sk"),
                "content_id": comment_data.get("content_id"),
                "content_description": comment_data.get("content_description"),
                "content_title": comment_data.get("content_title"),
                "channel_sk": comment_data.get("channel_sk"),
                "channel_name": comment_data.get("channel_name"),
                "channel_display_name": comment_data.get("channel_display_name"),

                # Comment text
                "comment_text": comment_data.get("comment_text", ""),

                # Processing metadata
                "processing_errors": [],
                "success": True
            }

            final_state = self.workflow.invoke(initial_state)

            return dict(final_state)

        except Exception as e:
            logger.error(f"Workflow execution error: {str(e)}")
            return {
                **comment_data,
                "success": False,
                "processing_errors": [str(e)],
                "processing_status": "workflow_error"
            }

    def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process a batch of social media comments.

        Comments are processed sequentially, one workflow invocation each;
        a per-comment failure is captured in that comment's result and does
        not abort the batch.

        Args:
            comments: List of comment dictionaries

        Returns:
            List of processed comment dictionaries
        """
        results = []
        total = len(comments)

        for idx, comment in enumerate(comments, 1):
            logger.info(f"Processing comment {idx}/{total} (SK: {comment.get('comment_sk')})")
            result = self.process_comment(comment)
            results.append(result)

        logger.info(f"Batch processing complete: {total} comments processed")
        return results
|
processing_brand_sentiment/workflow/orchestrator.py
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Brand Analysis Workflow Orchestrator using LangGraph.
|
| 3 |
+
|
| 4 |
+
Coordinates the 4-agent pipeline:
|
| 5 |
+
1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (no LLM)
|
| 6 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1)
|
| 7 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2)
|
| 8 |
+
4. OutputValidatorAgent - Rule-based validation (no LLM)
|
| 9 |
+
|
| 10 |
+
Architecture v4.0:
|
| 11 |
+
- Separation of concerns: extraction vs analysis
|
| 12 |
+
- Strict validation at every step
|
| 13 |
+
- Structured data flow between agents
|
| 14 |
+
- Conservative relevance determination
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List, TypedDict, Annotated, Optional
|
| 18 |
+
import operator
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
from langgraph.graph import StateGraph, END
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
from .agents.content_preprocessor_agent import ContentPreprocessorAgent
|
| 25 |
+
from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 26 |
+
from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 27 |
+
from .agents.output_validator_agent import OutputValidatorAgent
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class BrandAnalysisState(TypedDict):
    """
    State definition for the brand analysis workflow v4.0.

    This state flows through all agents, accumulating data at each step.
    Fields are grouped by the pipeline stage that produces them.
    """
    # ============== Source Identifiers ==============
    post_id: int
    thread_id: int
    post_author_id: int

    # ============== Original Content ==============
    post_content: str       # Raw post body as fetched (may contain HTML)
    original_content: str   # Copy preserved by the preprocessor

    # ============== Thread Context ==============
    thread_title: str
    thread_first_post: str
    thread_started_at: Any
    category_title: str
    category_topic: str

    # ============== Timestamps ==============
    post_created_at: Any

    # ============== Preprocessor Output ==============
    cleaned_content: str
    quoted_content: str
    has_quote: bool
    quoted_author: str
    raw_thread_context: str  # Raw context for extraction agent
    is_empty: bool

    # Language detection
    detected_language: str
    language_code: str
    is_english: bool
    language_confidence: str
    language_detection_skipped: bool

    # Preliminary relevance (keyword-based)
    preliminary_relevant: bool
    needs_relevance_validation: bool
    relevance_keywords_found: List[str]
    relevance_type: str
    has_primary_keywords: bool

    # Initial detections
    products_detected: List[str]
    competitors_detected: List[str]

    # ============== Extraction Agent Output ==============
    is_relevant: bool
    relevance_confidence: str
    relevance_reason: str
    extraction_performed: bool

    # Extracted facts
    products_mentioned: List[str]
    sabian_mention_context: str  # primary_focus, significant_mention, casual_mention, comparison_context
    author_role: str  # current_owner, past_owner, potential_buyer, never_owned, unknown
    competitors_mentioned: List[str]
    thread_context_summary: str  # NEW: Summarized context for storage and analysis

    # ============== Sentiment Analyzer Output ==============
    sentiment_level: str
    emotion_type: str
    sentiment_confidence: str
    sarcasm_detected: bool

    # Product information
    product_attributes: List[str]

    # Competitive intelligence
    competitor_products_owned: List[str]
    comparison_type: str

    # Customer journey (AUTHOR PERSPECTIVE ONLY)
    intents: List[str]
    purchase_stage: str
    decision_drivers: List[str]
    pain_points: List[str]
    delight_factors: List[str]

    # Analysis notes
    analysis_notes: str
    analysis_skipped: bool
    analysis_skip_reason: str

    # ============== Validator Output ==============
    validation_passed: bool
    validation_errors: List[str]
    validation_warnings: List[str]
    validation_flags: List[str]
    processing_status: str  # completed, completed_with_flags, validation_failed

    # ============== Processing Metadata ==============
    # Annotated with operator.add so LangGraph concatenates error lists
    # contributed by successive node updates rather than overwriting them.
    processing_errors: Annotated[List[str], operator.add]
    success: bool
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class BrandAnalysisWorkflow:
|
| 134 |
+
"""
|
| 135 |
+
LangGraph-based workflow for brand sentiment analysis v4.0.
|
| 136 |
+
|
| 137 |
+
Pipeline:
|
| 138 |
+
1. Content Preprocessor (no LLM)
|
| 139 |
+
2. Relevance & Extraction Agent (LLM #1)
|
| 140 |
+
3. Sentiment Analyzer Agent (LLM #2) - only for relevant posts
|
| 141 |
+
4. Output Validator (no LLM)
|
| 142 |
+
"""
|
| 143 |
+
|
| 144 |
+
def __init__(
|
| 145 |
+
self,
|
| 146 |
+
workflow_config: Dict[str, Any],
|
| 147 |
+
brand_config: Dict[str, Any],
|
| 148 |
+
analysis_categories: Dict[str, Any],
|
| 149 |
+
api_key: str
|
| 150 |
+
):
|
| 151 |
+
"""
|
| 152 |
+
Initialize the workflow with agents and configuration.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
workflow_config: Workflow and agent configuration
|
| 156 |
+
brand_config: Brand-specific configuration
|
| 157 |
+
analysis_categories: Analysis category definitions
|
| 158 |
+
api_key: OpenAI API key
|
| 159 |
+
"""
|
| 160 |
+
self.workflow_config = workflow_config
|
| 161 |
+
self.brand_config = brand_config
|
| 162 |
+
self.analysis_categories = analysis_categories
|
| 163 |
+
self.api_key = api_key
|
| 164 |
+
|
| 165 |
+
# Initialize agents
|
| 166 |
+
self._init_agents()
|
| 167 |
+
|
| 168 |
+
# Build the workflow graph
|
| 169 |
+
self.workflow = self._build_workflow()
|
| 170 |
+
|
| 171 |
+
logger.info("BrandAnalysisWorkflow v4.0 initialized successfully")
|
| 172 |
+
|
| 173 |
+
def _init_agents(self) -> None:
|
| 174 |
+
"""Initialize all agents with their configurations."""
|
| 175 |
+
agents_config = self.workflow_config.get("agents", {})
|
| 176 |
+
|
| 177 |
+
# 1. Content Preprocessor Agent (no LLM)
|
| 178 |
+
preprocessor_config = agents_config.get("preprocessor", {})
|
| 179 |
+
self.preprocessor = ContentPreprocessorAgent(
|
| 180 |
+
preprocessor_config,
|
| 181 |
+
self.brand_config
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# 2. Relevance & Extraction Agent (LLM #1)
|
| 185 |
+
extraction_config = agents_config.get("relevance_extraction",
|
| 186 |
+
agents_config.get("relevance_validator", {}) # Fallback to old config
|
| 187 |
+
)
|
| 188 |
+
self.extraction_agent = SabianRelevanceExtractionAgent(
|
| 189 |
+
extraction_config,
|
| 190 |
+
self.api_key,
|
| 191 |
+
self.brand_config,
|
| 192 |
+
self.analysis_categories
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# 3. Sentiment Analyzer Agent (LLM #2)
|
| 196 |
+
analyzer_config = agents_config.get("sentiment_analyzer",
|
| 197 |
+
agents_config.get("brand_analyzer", {}) # Fallback to old config
|
| 198 |
+
)
|
| 199 |
+
self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
|
| 200 |
+
analyzer_config,
|
| 201 |
+
self.api_key,
|
| 202 |
+
self.brand_config,
|
| 203 |
+
self.analysis_categories
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# 4. Output Validator Agent (no LLM)
|
| 207 |
+
validator_config = agents_config.get("output_validator", {})
|
| 208 |
+
self.output_validator = OutputValidatorAgent(
|
| 209 |
+
validator_config,
|
| 210 |
+
self.brand_config,
|
| 211 |
+
self.analysis_categories
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
logger.info("All 4 agents initialized")
|
| 215 |
+
|
| 216 |
+
def _build_workflow(self) -> StateGraph:
|
| 217 |
+
"""
|
| 218 |
+
Build the LangGraph workflow.
|
| 219 |
+
|
| 220 |
+
Flow:
|
| 221 |
+
preprocessing -> extraction -> (analysis if relevant) -> validation -> END
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
Compiled StateGraph workflow
|
| 225 |
+
"""
|
| 226 |
+
workflow = StateGraph(BrandAnalysisState)
|
| 227 |
+
|
| 228 |
+
# Add nodes
|
| 229 |
+
workflow.add_node("preprocessing", self._preprocessing_node)
|
| 230 |
+
workflow.add_node("extraction", self._extraction_node)
|
| 231 |
+
workflow.add_node("analysis", self._analysis_node)
|
| 232 |
+
workflow.add_node("validation", self._validation_node)
|
| 233 |
+
|
| 234 |
+
# Set entry point
|
| 235 |
+
workflow.set_entry_point("preprocessing")
|
| 236 |
+
|
| 237 |
+
# Define edges
|
| 238 |
+
# Preprocessing -> conditional routing
|
| 239 |
+
workflow.add_conditional_edges(
|
| 240 |
+
"preprocessing",
|
| 241 |
+
self._route_after_preprocessing,
|
| 242 |
+
{
|
| 243 |
+
"extract": "extraction",
|
| 244 |
+
"skip_to_validation": "validation"
|
| 245 |
+
}
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
# Extraction -> conditional routing
|
| 249 |
+
workflow.add_conditional_edges(
|
| 250 |
+
"extraction",
|
| 251 |
+
self._route_after_extraction,
|
| 252 |
+
{
|
| 253 |
+
"analyze": "analysis",
|
| 254 |
+
"skip_to_validation": "validation"
|
| 255 |
+
}
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
# Analysis -> validation
|
| 259 |
+
workflow.add_edge("analysis", "validation")
|
| 260 |
+
|
| 261 |
+
# Validation -> END
|
| 262 |
+
workflow.add_edge("validation", END)
|
| 263 |
+
|
| 264 |
+
return workflow.compile()
|
| 265 |
+
|
| 266 |
+
def _preprocessing_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 267 |
+
"""
|
| 268 |
+
Preprocessing node: HTML parsing, cleaning, language detection, keyword check.
|
| 269 |
+
"""
|
| 270 |
+
try:
|
| 271 |
+
input_data = {
|
| 272 |
+
"post_id": state.get("post_id"),
|
| 273 |
+
"post_content": state.get("post_content", ""),
|
| 274 |
+
"thread_title": state.get("thread_title"),
|
| 275 |
+
"thread_first_post": state.get("thread_first_post"),
|
| 276 |
+
"category_title": state.get("category_title"),
|
| 277 |
+
"category_topic": state.get("category_topic")
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
result = self.preprocessor.process(input_data)
|
| 281 |
+
|
| 282 |
+
if result.get("success", False):
|
| 283 |
+
# Content
|
| 284 |
+
state["cleaned_content"] = result.get("cleaned_content", "")
|
| 285 |
+
state["quoted_content"] = result.get("quoted_content")
|
| 286 |
+
state["has_quote"] = result.get("has_quote", False)
|
| 287 |
+
state["quoted_author"] = result.get("quoted_author")
|
| 288 |
+
state["raw_thread_context"] = result.get("raw_thread_context", "")
|
| 289 |
+
state["is_empty"] = result.get("is_empty", False)
|
| 290 |
+
state["original_content"] = result.get("original_content", state.get("post_content", ""))
|
| 291 |
+
|
| 292 |
+
# Language
|
| 293 |
+
state["detected_language"] = result.get("detected_language", "English")
|
| 294 |
+
state["language_code"] = result.get("language_code", "en")
|
| 295 |
+
state["is_english"] = result.get("is_english", True)
|
| 296 |
+
state["language_confidence"] = result.get("language_confidence", "low")
|
| 297 |
+
state["language_detection_skipped"] = result.get("language_detection_skipped", False)
|
| 298 |
+
|
| 299 |
+
# Relevance
|
| 300 |
+
state["preliminary_relevant"] = result.get("preliminary_relevant", False)
|
| 301 |
+
state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
|
| 302 |
+
state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
|
| 303 |
+
state["relevance_type"] = result.get("relevance_type", "none")
|
| 304 |
+
state["has_primary_keywords"] = result.get("has_primary_keywords", False)
|
| 305 |
+
|
| 306 |
+
# Detections
|
| 307 |
+
state["products_detected"] = result.get("products_detected", [])
|
| 308 |
+
state["competitors_detected"] = result.get("competitors_detected", [])
|
| 309 |
+
|
| 310 |
+
state["success"] = True
|
| 311 |
+
else:
|
| 312 |
+
error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
|
| 313 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 314 |
+
state["success"] = False
|
| 315 |
+
|
| 316 |
+
logger.debug(f"Preprocessing complete for post {state.get('post_id')}")
|
| 317 |
+
return state
|
| 318 |
+
|
| 319 |
+
except Exception as e:
|
| 320 |
+
error_msg = f"Preprocessing node error: {str(e)}"
|
| 321 |
+
logger.error(error_msg)
|
| 322 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 323 |
+
state["success"] = False
|
| 324 |
+
return state
|
| 325 |
+
|
| 326 |
+
def _extraction_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 327 |
+
"""
|
| 328 |
+
Extraction node: LLM-based relevance validation and fact extraction.
|
| 329 |
+
"""
|
| 330 |
+
try:
|
| 331 |
+
input_data = {
|
| 332 |
+
"cleaned_content": state.get("cleaned_content", ""),
|
| 333 |
+
"quoted_content": state.get("quoted_content"),
|
| 334 |
+
"raw_thread_context": state.get("raw_thread_context", ""),
|
| 335 |
+
"relevance_keywords_found": state.get("relevance_keywords_found", []),
|
| 336 |
+
"preliminary_relevant": state.get("preliminary_relevant", False),
|
| 337 |
+
"needs_relevance_validation": state.get("needs_relevance_validation", True),
|
| 338 |
+
"products_detected": state.get("products_detected", []),
|
| 339 |
+
"competitors_detected": state.get("competitors_detected", []),
|
| 340 |
+
"is_english": state.get("is_english", True),
|
| 341 |
+
"detected_language": state.get("detected_language", "English")
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
result = self.extraction_agent.process(input_data)
|
| 345 |
+
|
| 346 |
+
# Update state with extraction results
|
| 347 |
+
state["is_relevant"] = result.get("is_relevant", False)
|
| 348 |
+
state["relevance_confidence"] = result.get("relevance_confidence", "low")
|
| 349 |
+
state["relevance_reason"] = result.get("relevance_reason", "")
|
| 350 |
+
state["extraction_performed"] = result.get("extraction_performed", True)
|
| 351 |
+
|
| 352 |
+
# Extracted facts
|
| 353 |
+
state["products_mentioned"] = result.get("products_mentioned", [])
|
| 354 |
+
state["sabian_mention_context"] = result.get("sabian_mention_context")
|
| 355 |
+
state["author_role"] = result.get("author_role", "unknown")
|
| 356 |
+
state["competitors_mentioned"] = result.get("competitors_mentioned", [])
|
| 357 |
+
state["thread_context_summary"] = result.get("thread_context_summary", "")
|
| 358 |
+
|
| 359 |
+
if not result.get("success", False) and result.get("error"):
|
| 360 |
+
state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]
|
| 361 |
+
|
| 362 |
+
logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
|
| 363 |
+
return state
|
| 364 |
+
|
| 365 |
+
except Exception as e:
|
| 366 |
+
error_msg = f"Extraction node error: {str(e)}"
|
| 367 |
+
logger.error(error_msg)
|
| 368 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 369 |
+
state["is_relevant"] = False
|
| 370 |
+
state["relevance_confidence"] = "low"
|
| 371 |
+
return state
|
| 372 |
+
|
| 373 |
+
def _analysis_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 374 |
+
"""
|
| 375 |
+
Analysis node: Deep sentiment and intent analysis for relevant posts.
|
| 376 |
+
"""
|
| 377 |
+
try:
|
| 378 |
+
input_data = {
|
| 379 |
+
"cleaned_content": state.get("cleaned_content", ""),
|
| 380 |
+
"is_relevant": state.get("is_relevant", True),
|
| 381 |
+
"is_english": state.get("is_english", True),
|
| 382 |
+
"detected_language": state.get("detected_language", "English"),
|
| 383 |
+
"products_mentioned": state.get("products_mentioned", []),
|
| 384 |
+
"sabian_mention_context": state.get("sabian_mention_context"),
|
| 385 |
+
"author_role": state.get("author_role", "unknown"),
|
| 386 |
+
"competitors_mentioned": state.get("competitors_mentioned", []),
|
| 387 |
+
"thread_context_summary": state.get("thread_context_summary", "")
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
result = self.sentiment_analyzer.process(input_data)
|
| 391 |
+
|
| 392 |
+
if result.get("success", False):
|
| 393 |
+
# Sentiment
|
| 394 |
+
state["sentiment_level"] = result.get("sentiment_level")
|
| 395 |
+
state["emotion_type"] = result.get("emotion_type")
|
| 396 |
+
state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
|
| 397 |
+
state["sarcasm_detected"] = result.get("sarcasm_detected", False)
|
| 398 |
+
|
| 399 |
+
# Products
|
| 400 |
+
state["product_attributes"] = result.get("product_attributes", [])
|
| 401 |
+
|
| 402 |
+
# Competitive
|
| 403 |
+
state["competitor_products_owned"] = result.get("competitor_products_owned", [])
|
| 404 |
+
state["comparison_type"] = result.get("comparison_type")
|
| 405 |
+
|
| 406 |
+
# Journey
|
| 407 |
+
state["intents"] = result.get("intents", [])
|
| 408 |
+
state["purchase_stage"] = result.get("purchase_stage")
|
| 409 |
+
state["decision_drivers"] = result.get("decision_drivers", [])
|
| 410 |
+
state["pain_points"] = result.get("pain_points", [])
|
| 411 |
+
state["delight_factors"] = result.get("delight_factors", [])
|
| 412 |
+
|
| 413 |
+
# Notes
|
| 414 |
+
state["analysis_notes"] = result.get("analysis_notes", "")
|
| 415 |
+
state["analysis_skipped"] = result.get("analysis_skipped", False)
|
| 416 |
+
state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
|
| 417 |
+
else:
|
| 418 |
+
error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
|
| 419 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 420 |
+
|
| 421 |
+
logger.debug(f"Analysis complete for post {state.get('post_id')}")
|
| 422 |
+
return state
|
| 423 |
+
|
| 424 |
+
except Exception as e:
|
| 425 |
+
error_msg = f"Analysis node error: {str(e)}"
|
| 426 |
+
logger.error(error_msg)
|
| 427 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 428 |
+
return state
|
| 429 |
+
|
| 430 |
+
def _validation_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 431 |
+
"""
|
| 432 |
+
Validation node: Rule-based validation and anomaly detection.
|
| 433 |
+
"""
|
| 434 |
+
try:
|
| 435 |
+
result = self.output_validator.process(dict(state))
|
| 436 |
+
|
| 437 |
+
state["validation_passed"] = result.get("validation_passed", True)
|
| 438 |
+
state["validation_errors"] = result.get("validation_errors", [])
|
| 439 |
+
state["validation_warnings"] = result.get("validation_warnings", [])
|
| 440 |
+
state["validation_flags"] = result.get("validation_flags", [])
|
| 441 |
+
state["processing_status"] = result.get("processing_status", "completed")
|
| 442 |
+
|
| 443 |
+
# Set overall success
|
| 444 |
+
has_errors = len(state.get("processing_errors", [])) > 0
|
| 445 |
+
state["success"] = not has_errors or state.get("is_relevant") is not None
|
| 446 |
+
|
| 447 |
+
logger.debug(f"Validation complete: status={state['processing_status']}")
|
| 448 |
+
return state
|
| 449 |
+
|
| 450 |
+
except Exception as e:
|
| 451 |
+
error_msg = f"Validation node error: {str(e)}"
|
| 452 |
+
logger.error(error_msg)
|
| 453 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 454 |
+
state["validation_passed"] = False
|
| 455 |
+
state["processing_status"] = "validation_failed"
|
| 456 |
+
state["success"] = False
|
| 457 |
+
return state
|
| 458 |
+
|
| 459 |
+
def _route_after_preprocessing(self, state: BrandAnalysisState) -> str:
|
| 460 |
+
"""
|
| 461 |
+
Determine routing after preprocessing.
|
| 462 |
+
"""
|
| 463 |
+
# If empty content, skip to validation
|
| 464 |
+
if state.get("is_empty", False):
|
| 465 |
+
state["is_relevant"] = False
|
| 466 |
+
state["relevance_reason"] = "Empty content"
|
| 467 |
+
return "skip_to_validation"
|
| 468 |
+
|
| 469 |
+
# If not English, skip to validation
|
| 470 |
+
if not state.get("is_english", True):
|
| 471 |
+
state["is_relevant"] = False
|
| 472 |
+
state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
|
| 473 |
+
return "skip_to_validation"
|
| 474 |
+
|
| 475 |
+
# If no keywords found and no need for validation, skip
|
| 476 |
+
if (not state.get("preliminary_relevant", False) and
|
| 477 |
+
not state.get("needs_relevance_validation", False)):
|
| 478 |
+
state["is_relevant"] = False
|
| 479 |
+
state["relevance_reason"] = "No relevant keywords found"
|
| 480 |
+
return "skip_to_validation"
|
| 481 |
+
|
| 482 |
+
# Otherwise, go to extraction
|
| 483 |
+
return "extract"
|
| 484 |
+
|
| 485 |
+
def _route_after_extraction(self, state: BrandAnalysisState) -> str:
|
| 486 |
+
"""
|
| 487 |
+
Determine routing after extraction.
|
| 488 |
+
"""
|
| 489 |
+
if state.get("is_relevant", False):
|
| 490 |
+
return "analyze"
|
| 491 |
+
return "skip_to_validation"
|
| 492 |
+
|
| 493 |
+
def process_post(self, post_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 494 |
+
"""
|
| 495 |
+
Process a single forum post through the workflow.
|
| 496 |
+
|
| 497 |
+
Args:
|
| 498 |
+
post_data: Dictionary containing post data
|
| 499 |
+
|
| 500 |
+
Returns:
|
| 501 |
+
Dictionary with processed results
|
| 502 |
+
"""
|
| 503 |
+
try:
|
| 504 |
+
initial_state = {
|
| 505 |
+
"post_id": post_data.get("post_id"),
|
| 506 |
+
"thread_id": post_data.get("thread_id"),
|
| 507 |
+
"post_author_id": post_data.get("post_author_id"),
|
| 508 |
+
"post_content": post_data.get("post_content", ""),
|
| 509 |
+
"thread_title": post_data.get("thread_title"),
|
| 510 |
+
"thread_first_post": post_data.get("thread_first_post"),
|
| 511 |
+
"thread_started_at": post_data.get("thread_started_at"),
|
| 512 |
+
"category_title": post_data.get("category_title"),
|
| 513 |
+
"category_topic": post_data.get("category_topic"),
|
| 514 |
+
"post_created_at": post_data.get("post_created_at"),
|
| 515 |
+
"processing_errors": [],
|
| 516 |
+
"success": True
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
final_state = self.workflow.invoke(initial_state)
|
| 520 |
+
|
| 521 |
+
return dict(final_state)
|
| 522 |
+
|
| 523 |
+
except Exception as e:
|
| 524 |
+
logger.error(f"Workflow execution error: {str(e)}")
|
| 525 |
+
return {
|
| 526 |
+
**post_data,
|
| 527 |
+
"success": False,
|
| 528 |
+
"processing_errors": [str(e)],
|
| 529 |
+
"processing_status": "workflow_error"
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
def process_batch(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 533 |
+
"""
|
| 534 |
+
Process a batch of forum posts.
|
| 535 |
+
|
| 536 |
+
Args:
|
| 537 |
+
posts: List of post dictionaries
|
| 538 |
+
|
| 539 |
+
Returns:
|
| 540 |
+
List of processed post dictionaries
|
| 541 |
+
"""
|
| 542 |
+
results = []
|
| 543 |
+
total = len(posts)
|
| 544 |
+
|
| 545 |
+
for idx, post in enumerate(posts, 1):
|
| 546 |
+
logger.info(f"Processing post {idx}/{total} (ID: {post.get('post_id')})")
|
| 547 |
+
result = self.process_post(post)
|
| 548 |
+
results.append(result)
|
| 549 |
+
|
| 550 |
+
logger.info(f"Batch processing complete: {total} posts processed")
|
| 551 |
+
return results
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
src/streamlit_app.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
visualization/README.md
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Musora Sentiment Analysis Dashboard
|
| 2 |
+
|
| 3 |
+
A comprehensive, interactive Streamlit dashboard for visualizing sentiment analysis results from **multiple data sources**: social media comments (Facebook, Instagram, YouTube, Twitter) and Musora internal app comments across Musora brands (Drumeo, Pianote, Guitareo, Singeo).
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
### Main Dashboard
|
| 8 |
+
- **Overall sentiment distribution** with interactive pie charts and gauge indicators
|
| 9 |
+
- **Sentiment analysis by brand** (Drumeo, Pianote, Musora) with stacked bar charts
|
| 10 |
+
- **Sentiment analysis by platform** (Facebook, Instagram, etc.) with percentage distributions
|
| 11 |
+
- **Intent analysis** showing multi-label intent distributions (praise, question, request, etc.)
|
| 12 |
+
- **Cross-dimensional heatmaps** showing negative sentiment by brand and platform
|
| 13 |
+
- **Reply requirements analysis** with urgency breakdown
|
| 14 |
+
- **Language distribution** analysis
|
| 15 |
+
- **Temporal trends** with customizable time granularity (daily, weekly, monthly)
|
| 16 |
+
- **Hierarchical sunburst** visualization for brand > platform > sentiment
|
| 17 |
+
|
| 18 |
+
### Sentiment Analysis Page
|
| 19 |
+
- **Multi-sentiment filtering** - Filter by any combination of sentiments (positive, negative, neutral, etc.) to analyze both good and bad performance
|
| 20 |
+
- **Intent filtering** - Filter contents by specific user intents (question, praise, feedback_negative, etc.)
|
| 21 |
+
- **Dynamic severity scoring** - Ranks contents based on selected sentiments, adapts calculations to your filter choices
|
| 22 |
+
- **Advanced ranking controls** - Customize with minimum comment thresholds and multiple dynamic sort options
|
| 23 |
+
- **Sort options** - Severity Score (balanced), Sentiment %, Sentiment Count (absolute), or Total Comments (volume)
|
| 24 |
+
- **Engagement scatter plot** showing relationship between comment volume and sentiment
|
| 25 |
+
- **Thumbnail display** for Musora internal app content (visual content previews)
|
| 26 |
+
- **Detailed content analysis** with sentiment and intent distributions for each content
|
| 27 |
+
- **AI-Powered Analysis** - Optional AI-generated insights and recommendations for each content
|
| 28 |
+
- **View filtered comments** for each content with expandable sections
|
| 29 |
+
- **Actionable insights** and recommendations based on sentiment patterns
|
| 30 |
+
- **Export functionality** to download results as CSV with dynamic columns
|
| 31 |
+
|
| 32 |
+
### Reply Required Page
|
| 33 |
+
- **Prioritized comment queue** with urgency indicators (Urgent, High, Medium, Low)
|
| 34 |
+
- **Smart filtering** by priority, platform, brand, and intent
|
| 35 |
+
- **Pagination** for easy navigation through large comment lists
|
| 36 |
+
- **Comment cards** showing full context (author, timestamp, sentiment, intent)
|
| 37 |
+
- **Original and translated text** with expandable view for non-English comments
|
| 38 |
+
- **Reply requirements by content** showing which contents need most attention
|
| 39 |
+
- **Export functionality** for team collaboration or CRM import
|
| 40 |
+
|
| 41 |
+
## Architecture
|
| 42 |
+
|
| 43 |
+
```
|
| 44 |
+
visualization/
|
| 45 |
+
├── app.py # Main Streamlit application
|
| 46 |
+
├── config/
|
| 47 |
+
│ └── viz_config.json # Configuration for colors, settings, queries
|
| 48 |
+
├── data/
|
| 49 |
+
│ └── data_loader.py # Snowflake data loading with caching
|
| 50 |
+
├── utils/
|
| 51 |
+
│ ├── data_processor.py # Data aggregation and processing
|
| 52 |
+
│ └── metrics.py # Metrics calculation (KPIs, scores)
|
| 53 |
+
├── components/
|
| 54 |
+
│ ├── dashboard.py # Main dashboard page
|
| 55 |
+
│ ├── sentiment_analysis.py # Comprehensive sentiment analysis page
|
| 56 |
+
│ └── reply_required.py # Reply management page
|
| 57 |
+
├── visualizations/
|
| 58 |
+
│ ├── sentiment_charts.py # Sentiment visualization functions
|
| 59 |
+
│ ├── distribution_charts.py # Distribution visualization functions
|
| 60 |
+
│ └── content_cards.py # Display components and cards
|
| 61 |
+
├── requirements.txt # Python dependencies
|
| 62 |
+
└── README.md # This file
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Installation
|
| 66 |
+
|
| 67 |
+
### Prerequisites
|
| 68 |
+
- Python 3.8+
|
| 69 |
+
- Snowflake account with access to sentiment analysis data
|
| 70 |
+
- Required environment variables in parent `.env` file:
|
| 71 |
+
- `SNOWFLAKE_USER`
|
| 72 |
+
- `SNOWFLAKE_PASSWORD`
|
| 73 |
+
- `SNOWFLAKE_ACCOUNT`
|
| 74 |
+
- `SNOWFLAKE_ROLE`
|
| 75 |
+
- `SNOWFLAKE_DATABASE`
|
| 76 |
+
- `SNOWFLAKE_WAREHOUSE`
|
| 77 |
+
- `SNOWFLAKE_SCHEMA`
|
| 78 |
+
|
| 79 |
+
### Setup
|
| 80 |
+
|
| 81 |
+
1. Navigate to the visualization directory:
|
| 82 |
+
```bash
|
| 83 |
+
cd visualization
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
2. Install dependencies:
|
| 87 |
+
```bash
|
| 88 |
+
pip install -r requirements.txt
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
3. Ensure parent `.env` file is properly configured with Snowflake credentials
|
| 92 |
+
|
| 93 |
+
## Usage
|
| 94 |
+
|
| 95 |
+
### Running the Dashboard
|
| 96 |
+
|
| 97 |
+
From the `visualization` directory:
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
streamlit run app.py
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
The dashboard will open in your default browser at `http://localhost:8501`
|
| 104 |
+
|
| 105 |
+
### Navigation
|
| 106 |
+
|
| 107 |
+
Use the sidebar to:
|
| 108 |
+
- **Select pages** (Dashboard, Sentiment Analysis, Reply Required)
|
| 109 |
+
- **Apply global filters** by platform, brand, sentiment, and date range
|
| 110 |
+
- **Reload data** to fetch latest updates from Snowflake
|
| 111 |
+
- **View data information** (record count, last update time)
|
| 112 |
+
|
| 113 |
+
### Filtering Data
|
| 114 |
+
|
| 115 |
+
1. Select desired filters in the sidebar:
|
| 116 |
+
- **Platforms**: Filter by data source (Facebook, Instagram, YouTube, Twitter, musora_app)
|
| 117 |
+
- **Brands**: Filter by Musora brand (Drumeo, Pianote, Guitareo, Singeo, Musora)
|
| 118 |
+
- **Sentiments**: Filter by sentiment polarity
|
| 119 |
+
- **Date Range**: Filter by comment timestamp
|
| 120 |
+
|
| 121 |
+
2. Click "Apply Filters" to update visualizations
|
| 122 |
+
|
| 123 |
+
3. Click "Reset Filters" to clear all filters
|
| 124 |
+
|
| 125 |
+
### Exporting Data
|
| 126 |
+
|
| 127 |
+
Each page provides export functionality:
|
| 128 |
+
- **Sentiment Analysis**: Download top N contents as CSV with dynamic columns based on active filters
|
| 129 |
+
- **Reply Required**: Download filtered comments as CSV
|
| 130 |
+
|
| 131 |
+
## Configuration
|
| 132 |
+
|
| 133 |
+
### Color Schemes
|
| 134 |
+
|
| 135 |
+
Edit `config/viz_config.json` to customize:
|
| 136 |
+
- **Sentiment colors**: Colors for each sentiment polarity
|
| 137 |
+
- **Intent colors**: Colors for each intent category
|
| 138 |
+
- **Platform colors**: Brand colors for each platform
|
| 139 |
+
- **Brand colors**: Colors for each Musora brand
|
| 140 |
+
|
| 141 |
+
### Dashboard Settings
|
| 142 |
+
|
| 143 |
+
Configure in `viz_config.json`:
|
| 144 |
+
- `default_date_range_days`: Default date range for filtering
|
| 145 |
+
- `max_comments_display`: Maximum comments to display per page
|
| 146 |
+
- `chart_height`: Default height for charts
|
| 147 |
+
- `top_n_contents`: Number of contents to show in poor sentiment page
|
| 148 |
+
|
| 149 |
+
### Data Query
|
| 150 |
+
|
| 151 |
+
The Snowflake query is configured in `viz_config.json`:
|
| 152 |
+
```json
|
| 153 |
+
"snowflake": {
|
| 154 |
+
"query": "SELECT s.*, c.CHANNEL_NAME as BRAND, c.MESSAGE as CONTENT_DESCRIPTION, c.PERMALINK_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK"
|
| 155 |
+
}
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
## Extending the Dashboard
|
| 159 |
+
|
| 160 |
+
### Adding New Pages
|
| 161 |
+
|
| 162 |
+
1. Create a new component file in `components/`:
|
| 163 |
+
```python
|
| 164 |
+
# components/new_page.py
|
| 165 |
+
def render_new_page(df):
|
| 166 |
+
st.title("New Page")
|
| 167 |
+
# Your page logic here
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
2. Import and add to navigation in `app.py`:
|
| 171 |
+
```python
|
| 172 |
+
from components.new_page import render_new_page
|
| 173 |
+
|
| 174 |
+
# Add to page selection
|
| 175 |
+
page = st.radio("Select Page", [..., "New Page"])
|
| 176 |
+
|
| 177 |
+
# Add to page rendering
|
| 178 |
+
elif page == "New Page":
|
| 179 |
+
render_new_page(df)
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
### Adding New Visualizations
|
| 183 |
+
|
| 184 |
+
1. Add visualization function to appropriate module:
|
| 185 |
+
- `visualizations/sentiment_charts.py` for sentiment-related charts
|
| 186 |
+
- `visualizations/distribution_charts.py` for distribution charts
|
| 187 |
+
|
| 188 |
+
2. Use the function in page components
|
| 189 |
+
|
| 190 |
+
Example:
|
| 191 |
+
```python
|
| 192 |
+
def create_new_chart(df, title="New Chart"):
|
| 193 |
+
fig = go.Figure(...)
|
| 194 |
+
return fig
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
### Adding New Metrics
|
| 198 |
+
|
| 199 |
+
Add calculation methods to `utils/metrics.py`:
|
| 200 |
+
```python
|
| 201 |
+
@staticmethod
|
| 202 |
+
def calculate_new_metric(df):
|
| 203 |
+
# Your metric calculation
|
| 204 |
+
return metric_value
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
### Customizing Card Displays
|
| 208 |
+
|
| 209 |
+
Modify display methods in `visualizations/content_cards.py`:
|
| 210 |
+
```python
|
| 211 |
+
@staticmethod
|
| 212 |
+
def display_custom_card(data):
|
| 213 |
+
# Your custom card layout
|
| 214 |
+
pass
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
## Data Schema
|
| 218 |
+
|
| 219 |
+
The dashboard expects the following columns from Snowflake:
|
| 220 |
+
|
| 221 |
+
### Required Columns
|
| 222 |
+
- `comment_sk`: Unique comment identifier
|
| 223 |
+
- `comment_id`: Comment ID
|
| 224 |
+
- `original_text`: Original comment text
|
| 225 |
+
- `platform`: Social media platform
|
| 226 |
+
- `brand`: Musora brand name
|
| 227 |
+
- `sentiment_polarity`: Sentiment classification (very_positive, positive, neutral, negative, very_negative)
|
| 228 |
+
- `intent`: Comma-separated intent labels
|
| 229 |
+
- `requires_reply`: Boolean indicating if reply is needed
|
| 230 |
+
- `content_sk`: Content identifier
|
| 231 |
+
- `content_description`: Description of the content
|
| 232 |
+
- `permalink_url`: URL to the original content
|
| 233 |
+
|
| 234 |
+
### Optional Columns
|
| 235 |
+
- `comment_timestamp`: When comment was posted
|
| 236 |
+
- `processed_at`: When sentiment analysis was performed
|
| 237 |
+
- `translated_text`: English translation for non-English comments
|
| 238 |
+
- `detected_language`: Detected language of comment
|
| 239 |
+
- `is_english`: Boolean indicating if comment is in English
|
| 240 |
+
- `sentiment_confidence`: Confidence level of sentiment analysis
|
| 241 |
+
- `author_name`: Comment author name
|
| 242 |
+
- `channel_name`: Channel name
|
| 243 |
+
- `thumbnail_url`: Content thumbnail URL (for Musora internal app content)
|
| 244 |
+
- `parent_comment_id`: ID of parent comment (for replies)
|
| 245 |
+
- `parent_comment_text`: Text of parent comment (for reply context)
|
| 246 |
+
|
| 247 |
+
## Performance Optimization
|
| 248 |
+
|
| 249 |
+
### Caching
|
| 250 |
+
- Data loading is cached for 5 minutes using `@st.cache_data`
|
| 251 |
+
- Clear cache using "Reload Data" button in sidebar
|
| 252 |
+
|
| 253 |
+
### Pagination
|
| 254 |
+
- Comments requiring reply are paginated (10 per page)
|
| 255 |
+
- Reduces memory usage and improves rendering speed
|
| 256 |
+
|
| 257 |
+
### Filtering
|
| 258 |
+
- Apply filters to reduce dataset size before visualization
|
| 259 |
+
- Filters are applied efficiently using pandas operations
|
| 260 |
+
|
| 261 |
+
## Troubleshooting
|
| 262 |
+
|
| 263 |
+
### Connection Issues
|
| 264 |
+
- Verify Snowflake credentials in parent `.env` file
|
| 265 |
+
- Check network connectivity to Snowflake
|
| 266 |
+
- Ensure correct database, schema, and table names
|
| 267 |
+
|
| 268 |
+
### No Data Displayed
|
| 269 |
+
- Check if Snowflake query returns data
|
| 270 |
+
- Verify column names match expected schema
|
| 271 |
+
- Check applied filters - try resetting them
|
| 272 |
+
|
| 273 |
+
### Slow Performance
|
| 274 |
+
- Reduce date range in filters
|
| 275 |
+
- Use "Apply Filters" to work with smaller datasets
|
| 276 |
+
- Consider adding database indexes on frequently filtered columns
|
| 277 |
+
|
| 278 |
+
### Visualization Errors
|
| 279 |
+
- Check for missing or null values in data
|
| 280 |
+
- Verify data types match expected types (dates, booleans, etc.)
|
| 281 |
+
- Review browser console for JavaScript errors
|
| 282 |
+
|
| 283 |
+
## Best Practices
|
| 284 |
+
|
| 285 |
+
1. **Regular Data Updates**: Reload data periodically to see latest comments
|
| 286 |
+
2. **Use Filters**: Apply filters to focus on specific segments
|
| 287 |
+
3. **Export Insights**: Download CSV reports for offline analysis
|
| 288 |
+
4. **Monitor Reply Queue**: Check "Reply Required" page daily
|
| 289 |
+
5. **Track Trends**: Use temporal visualizations to identify patterns
|
| 290 |
+
6. **Prioritize Urgent**: Address urgent replies (negative sentiment) first
|
| 291 |
+
|
| 292 |
+
## Support
|
| 293 |
+
|
| 294 |
+
For issues or feature requests:
|
| 295 |
+
1. Check the troubleshooting section
|
| 296 |
+
2. Review configuration files for correct settings
|
| 297 |
+
3. Consult the main project README for sentiment analysis pipeline details
|
| 298 |
+
|
| 299 |
+
## Version History
|
| 300 |
+
|
| 301 |
+
### v1.3 (Current)
|
| 302 |
+
- **Comprehensive Sentiment Analysis Page Redesign**
|
| 303 |
+
- Renamed "Poor Sentiment Contents" to "Sentiment Analysis" page
|
| 304 |
+
- **NEW: Multi-Sentiment Filtering** - Filter by any combination of sentiments (positive, negative, neutral, very_positive, very_negative)
|
| 305 |
+
- **NEW: Intent Filtering** - Filter contents by specific user intents (question, praise, feedback_negative, request, etc.)
|
| 306 |
+
- **Filter Status Indicator** - Visual feedback showing when filters are active
|
| 307 |
+
|
| 308 |
+
- **Dynamic Ranking & Calculations**
|
| 309 |
+
- **Dynamic severity scoring** - Automatically calculates based on selected sentiments (not just negative)
|
| 310 |
+
- **Dynamic metrics** - Sentiment percentages and counts adapt to your filter selection
|
| 311 |
+
- Sort options now work with any sentiment combination
|
| 312 |
+
|
| 313 |
+
- **Enhanced User Experience**
|
| 314 |
+
- Summary statistics that dynamically adapt to filters
|
| 315 |
+
- Contextual explanations that change based on selected sentiments
|
| 316 |
+
- Export with dynamic columns based on active filters
|
| 317 |
+
- Backward compatible - works like original when no filters selected
|
| 318 |
+
|
| 319 |
+
- **New Use Cases Enabled**
|
| 320 |
+
- Analyze high-performing content (filter by positive sentiments)
|
| 321 |
+
- Identify successful patterns (combine sentiment + intent filters)
|
| 322 |
+
- Compare sentiment types side-by-side
|
| 323 |
+
- Focus on specific user behaviors
|
| 324 |
+
|
| 325 |
+
### v1.2
|
| 326 |
+
- **Multi-source data support** - Integrated Musora internal app comments alongside social media
|
| 327 |
+
- **Smart severity scoring** - Content ranking now balances sentiment % with comment volume
|
| 328 |
+
- **Advanced ranking controls** - Min comments filter and multiple sort options (severity, %, count, volume)
|
| 329 |
+
- **Thumbnail display** - Visual content previews for Musora internal app content
|
| 330 |
+
- **Platform disambiguation** - Renamed internal platform to "musora_app" to differentiate from "musora" brand
|
| 331 |
+
- **Improved chart stability** - Fixed duplicate chart ID errors with unique keys
|
| 332 |
+
- **Enhanced data schema** - Added support for thumbnail_url and parent comment fields
|
| 333 |
+
|
| 334 |
+
### v1.1
|
| 335 |
+
- **AI-Powered Agents** - ContentSummaryAgent for intelligent comment analysis
|
| 336 |
+
- AI Analysis button on Sentiment Analysis page
|
| 337 |
+
- LLM Helper with OpenAI API integration
|
| 338 |
+
- Modular agent architecture ready for expansion
|
| 339 |
+
|
| 340 |
+
### v1.0
|
| 341 |
+
- Initial release
|
| 342 |
+
- Main dashboard with comprehensive visualizations
|
| 343 |
+
- Sentiment contents analysis page
|
| 344 |
+
- Reply required management page
|
| 345 |
+
- Global filtering and export functionality
|
| 346 |
+
- Plotly-based interactive visualizations
|
| 347 |
+
- Modular, extensible architecture
|
visualization/SnowFlakeConnection.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This class create a connection to Snowflake, run queries (read and write)
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from snowflake.snowpark import Session
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
import logging
|
| 9 |
+
logger = logging.getLogger()
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
class SnowFlakeConn:
    """
    Thin wrapper around a Snowflake Snowpark session.

    Builds a session from environment variables at construction time and
    exposes helpers to read query results into pandas, write pandas
    DataFrames back to Snowflake, and run arbitrary SQL (inline or from
    a file).
    """

    def __init__(self):
        # Fail fast: constructing the object establishes the session.
        self.session = self.connect_to_snowflake()


    # =========================================================
    def connect_to_snowflake(self):
        """
        Create and return a Snowpark Session configured from env vars.

        :return: an open ``snowflake.snowpark.Session``
        :raises ValueError: if any required credential is missing
        :raises Exception: re-raised from Snowpark when the connection fails
        """
        # --- Snowflake connection via env vars ---
        # Validate all required credentials up front so a single error
        # message lists everything that is missing.
        required_credentials = [
            "SNOWFLAKE_USER",
            "SNOWFLAKE_PASSWORD",
            "SNOWFLAKE_ACCOUNT",
            "SNOWFLAKE_ROLE",
            "SNOWFLAKE_DATABASE",
            "SNOWFLAKE_WAREHOUSE",
            "SNOWFLAKE_SCHEMA",
        ]

        missing_credentials = [
            cred for cred in required_credentials if not self.get_credential(cred)
        ]

        if missing_credentials:
            error_msg = f"Missing required Snowflake credentials: {', '.join(missing_credentials)}"
            logger.error(error_msg)
            raise ValueError(error_msg)

        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        try:
            session = Session.builder.configs(conn).create()
            logger.info("Successfully connected to Snowflake")
            return session
        except Exception as e:
            logger.error(f"Failed to connect to Snowflake: {e}")
            raise

    # =========================================================
    def get_credential(self, key):
        """Return the value of environment variable ``key``, or None if unset."""
        return os.getenv(key)

    # =========================================================
    def run_read_query(self, query, data):
        """
        Execute a read query on Snowflake and return the results.

        :param query: SQL query string to execute
        :param data: human-readable name of the data being read (logging only)
        :return: pandas DataFrame containing the query results, with
            lower-cased column names
        :raises Exception: re-raised if the query fails
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Snowflake returns upper-case identifiers; normalize so
            # downstream code can rely on lower-case column names.
            dataframe.columns = dataframe.columns.str.lower()
            print(f"reading {data} table successfully")
            return dataframe
        except Exception as e:
            error_msg = f"Error reading {data}: {e}"
            print(error_msg)
            logger.error(error_msg)
            raise

    # =========================================================
    def store_df_to_snowflake(self, table_name, dataframe, database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False):
        """
        Write a pandas DataFrame to a Snowflake table.

        :param table_name: destination table name (auto-created if missing)
        :param dataframe: pandas DataFrame to store
        :param database: target database (default SOCIAL_MEDIA_DB)
        :param schema: target schema (default ML_FEATURES)
        :param overwrite: replace existing table contents when True
        :return: None; failures are logged but deliberately not re-raised
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            # Drop the index so it is not written as an extra column, and
            # upper-case columns to match Snowflake identifier conventions.
            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(df=dataframe,
                                      table_name=table_name.strip().upper(),
                                      auto_create_table=True,
                                      overwrite=overwrite,
                                      use_logical_type=True)
            print(f"Data inserted into {table_name} successfully.")

        except Exception as e:
            # Best-effort write: record the failure in the log as well as
            # stdout so it is not lost in non-interactive runs.
            error_msg = f"Error in creating/updating/inserting table: {e}"
            print(error_msg)
            logger.error(error_msg)

    # =========================================================
    def execute_sql_file(self, file_path):
        """
        Execute the SQL contained in a file.

        :param file_path: Path to SQL file
        :return: Query result rows, or None on failure (and for DDL/DML
            statements that return nothing useful)
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            print(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            error_msg = f"Error executing SQL file {file_path}: {e}"
            print(error_msg)
            logger.error(error_msg)
            return None

    # =========================================================
    def execute_query(self, query, description="query"):
        """
        Execute a SQL query and return its rows.

        :param query: SQL query string
        :param description: Description of the query for logging
        :return: Query result rows, or None on failure
        """
        try:
            result = self.session.sql(query).collect()
            print(f"Successfully executed {description}")
            return result
        except Exception as e:
            error_msg = f"Error executing {description}: {e}"
            print(error_msg)
            logger.error(error_msg)
            return None


    # =========================================================
    def get_data(self, data):
        """
        Placeholder for a generic data fetcher.

        Intended to fetch any sort of data based on requirement
        (comments, contents, etc). Not implemented yet; returns None.
        """
        # get any sort of data based on requirement --> comments, contents, etc
        pass

    # =========================================================
    def close_connection(self):
        """Close the underlying Snowpark session."""
        self.session.close()
|
| 150 |
+
|
visualization/agents/README.md
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Visualization Agents
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
This folder contains AI-powered agents that enhance the sentiment analysis dashboard with intelligent, context-aware insights and analysis capabilities.
|
| 5 |
+
|
| 6 |
+
## Architecture
|
| 7 |
+
|
| 8 |
+
### Base Agent Pattern
|
| 9 |
+
All agents inherit from `BaseVisualizationAgent` which provides:
|
| 10 |
+
- Common interface (`process()`, `validate_input()`)
|
| 11 |
+
- Error handling
|
| 12 |
+
- Logging functionality
|
| 13 |
+
- Consistent configuration
|
| 14 |
+
|
| 15 |
+
### LLM Helper
|
| 16 |
+
`utils/llm_helper.py` provides:
|
| 17 |
+
- OpenAI API integration
|
| 18 |
+
- Retry logic with exponential backoff
|
| 19 |
+
- JSON mode support
|
| 20 |
+
- Token usage tracking
|
| 21 |
+
|
| 22 |
+
## Available Agents
|
| 23 |
+
|
| 24 |
+
### 1. ContentSummaryAgent
|
| 25 |
+
|
| 26 |
+
**Purpose**: Analyze and summarize comments for content pieces
|
| 27 |
+
|
| 28 |
+
**Location**: `agents/content_summary_agent.py`
|
| 29 |
+
|
| 30 |
+
**Input**:
|
| 31 |
+
```python
|
| 32 |
+
{
|
| 33 |
+
'content_sk': str, # Content identifier
|
| 34 |
+
'content_description': str, # Content title/description
|
| 35 |
+
'comments': DataFrame or list # Comments data
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
**Output**:
|
| 40 |
+
```python
|
| 41 |
+
{
|
| 42 |
+
'success': bool,
|
| 43 |
+
'content_sk': str,
|
| 44 |
+
'summary': {
|
| 45 |
+
'executive_summary': str, # 2-3 sentence overview
|
| 46 |
+
'main_themes': [ # Top themes discussed
|
| 47 |
+
{
|
| 48 |
+
'theme': str,
|
| 49 |
+
'sentiment': str, # positive/negative/mixed
|
| 50 |
+
'description': str
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
'praise_points': [str], # What users love
|
| 54 |
+
'key_complaints': [str], # Main concerns
|
| 55 |
+
'frequently_asked_questions': [str], # Common questions
|
| 56 |
+
'unexpected_insights': [str], # Surprising patterns
|
| 57 |
+
'action_recommendations': [ # Suggested actions
|
| 58 |
+
{
|
| 59 |
+
'priority': str, # high/medium/low
|
| 60 |
+
'action': str
|
| 61 |
+
}
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
'metadata': {
|
| 65 |
+
'total_comments_analyzed': int,
|
| 66 |
+
'model_used': str,
|
| 67 |
+
'tokens_used': int
|
| 68 |
+
}
|
| 69 |
+
}
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
**Configuration**:
|
| 73 |
+
- Model: `gpt-5-nano` (configurable)
|
| 74 |
+
- Temperature: 1 by default (configurable; pass a lower value such as 0.3 for more focused summaries)
|
| 75 |
+
- Sampling: All negative comments + up to 50 positive/neutral (if >100 total)
|
| 76 |
+
|
| 77 |
+
**Features**:
|
| 78 |
+
- **Smart sampling**: Prioritizes negative comments, samples others
|
| 79 |
+
- **Context preservation**: Includes sentiment and intent metadata
|
| 80 |
+
- **Token optimization**: Truncates long comments to 300 chars
|
| 81 |
+
- **Structured output**: JSON format with guaranteed fields
|
| 82 |
+
- **Error handling**: Graceful failures with retry capability
|
| 83 |
+
|
| 84 |
+
## UI Integration
|
| 85 |
+
|
| 86 |
+
### Poor Sentiment Contents Page
|
| 87 |
+
|
| 88 |
+
**Location**: `components/poor_sentiment_contents.py`
|
| 89 |
+
|
| 90 |
+
**User Flow**:
|
| 91 |
+
1. User views content cards on Poor Sentiment Contents page
|
| 92 |
+
2. Clicks "🔍 Generate AI Analysis" button
|
| 93 |
+
3. Agent processes comments (with spinner indicator)
|
| 94 |
+
4. Summary displays in expandable section
|
| 95 |
+
5. Result cached in session state
|
| 96 |
+
|
| 97 |
+
**Display Sections**:
|
| 98 |
+
- **Executive Summary**: High-level overview (info box)
|
| 99 |
+
- **Main Themes**: Key topics with sentiment indicators
|
| 100 |
+
- **Praise Points** ✅ & **Key Complaints** ⚠️ (side-by-side)
|
| 101 |
+
- **FAQs** ❓ & **Unexpected Insights** 💡 (side-by-side)
|
| 102 |
+
- **Recommended Actions** 🎯 (priority-coded)
|
| 103 |
+
- **Analysis Metadata** ℹ️ (expandable details)
|
| 104 |
+
|
| 105 |
+
**Session Caching**:
|
| 106 |
+
- Summaries stored in `st.session_state.content_summaries`
|
| 107 |
+
- Key: `content_sk`
|
| 108 |
+
- Persists during session, cleared on page reload
|
| 109 |
+
- Prevents redundant API calls
|
| 110 |
+
|
| 111 |
+
## Usage Example
|
| 112 |
+
|
| 113 |
+
```python
|
| 114 |
+
from agents.content_summary_agent import ContentSummaryAgent
|
| 115 |
+
import pandas as pd
|
| 116 |
+
|
| 117 |
+
# Initialize agent
|
| 118 |
+
agent = ContentSummaryAgent(model="gpt-5-nano", temperature=0.3)
|
| 119 |
+
|
| 120 |
+
# Prepare input
|
| 121 |
+
input_data = {
|
| 122 |
+
'content_sk': '12345',
|
| 123 |
+
'content_description': 'Advanced Drum Fills Tutorial',
|
| 124 |
+
'comments': comments_df # DataFrame with comments
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
# Generate summary
|
| 128 |
+
result = agent.process(input_data)
|
| 129 |
+
|
| 130 |
+
if result['success']:
|
| 131 |
+
summary = result['summary']
|
| 132 |
+
print(summary['executive_summary'])
|
| 133 |
+
|
| 134 |
+
for theme in summary['main_themes']:
|
| 135 |
+
print(f"Theme: {theme['theme']} ({theme['sentiment']})")
|
| 136 |
+
print(f" {theme['description']}")
|
| 137 |
+
else:
|
| 138 |
+
print(f"Error: {result['error']}")
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
## Environment Setup
|
| 142 |
+
|
| 143 |
+
### Required Environment Variables
|
| 144 |
+
Add to `.env` file (parent directory):
|
| 145 |
+
```bash
|
| 146 |
+
OPENAI_API_KEY=your_openai_api_key_here
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
### Dependencies
|
| 150 |
+
All dependencies already in `visualization/requirements.txt`:
|
| 151 |
+
- `streamlit>=1.28.0`
|
| 152 |
+
- `pandas>=2.0.0`
|
| 153 |
+
- `python-dotenv>=1.0.0`
|
| 154 |
+
- OpenAI library (inherited from parent project)
|
| 155 |
+
|
| 156 |
+
## Error Handling
|
| 157 |
+
|
| 158 |
+
### Agent-Level Errors
|
| 159 |
+
- **Invalid input**: Returns `{'success': False, 'error': 'Invalid input data'}`
|
| 160 |
+
- **LLM API failure**: Retries up to 3 times with exponential backoff
|
| 161 |
+
- **JSON parsing error**: Returns error with raw content
|
| 162 |
+
- **Exception**: Catches all exceptions, logs, returns error dict
|
| 163 |
+
|
| 164 |
+
### UI-Level Errors
|
| 165 |
+
- Displays error message in red box
|
| 166 |
+
- Provides "🔄 Retry Analysis" button
|
| 167 |
+
- Clears cache and regenerates on retry
|
| 168 |
+
- Logs errors to agent logger
|
| 169 |
+
|
| 170 |
+
## Performance Considerations
|
| 171 |
+
|
| 172 |
+
### API Costs
|
| 173 |
+
- Model: `gpt-5-nano` (cost-effective)
|
| 174 |
+
- Sampling strategy: Reduces tokens by up to 50% for large comment sets
|
| 175 |
+
- Comment truncation: Max 300 chars per comment
|
| 176 |
+
- Session caching: Eliminates duplicate API calls
|
| 177 |
+
|
| 178 |
+
### Response Time
|
| 179 |
+
- Average: 5-10 seconds for 50-100 comments
|
| 180 |
+
- Depends on: Comment count, OpenAI API latency
|
| 181 |
+
- User feedback: Spinner shows "Analyzing comments with AI..."
|
| 182 |
+
|
| 183 |
+
### Scalability
|
| 184 |
+
- Handles up to 100 comments per analysis (after sampling)
|
| 185 |
+
- Parallel requests: Each content analyzed independently
|
| 186 |
+
- Session state: Memory usage scales with number of analyzed contents
|
| 187 |
+
|
| 188 |
+
## Extending Agents
|
| 189 |
+
|
| 190 |
+
### Adding New Agents
|
| 191 |
+
|
| 192 |
+
1. **Create agent file**:
|
| 193 |
+
```python
|
| 194 |
+
# agents/new_agent.py
|
| 195 |
+
from agents.base_agent import BaseVisualizationAgent
|
| 196 |
+
from utils.llm_helper import LLMHelper
|
| 197 |
+
|
| 198 |
+
class NewAgent(BaseVisualizationAgent):
|
| 199 |
+
def __init__(self, model="gpt-5-nano", temperature=0.7):
|
| 200 |
+
super().__init__(name="NewAgent", model=model, temperature=temperature)
|
| 201 |
+
self.llm_helper = LLMHelper(model=model, temperature=temperature)
|
| 202 |
+
|
| 203 |
+
def validate_input(self, input_data):
|
| 204 |
+
# Validation logic
|
| 205 |
+
return True
|
| 206 |
+
|
| 207 |
+
def process(self, input_data):
|
| 208 |
+
# Processing logic
|
| 209 |
+
pass
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
2. **Update `__init__.py`**:
|
| 213 |
+
```python
|
| 214 |
+
from .new_agent import NewAgent
|
| 215 |
+
|
| 216 |
+
__all__ = ['ContentSummaryAgent', 'NewAgent']
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
3. **Integrate in UI**:
|
| 220 |
+
- Import agent in component file
|
| 221 |
+
- Add UI controls (buttons, inputs)
|
| 222 |
+
- Display results
|
| 223 |
+
- Handle caching if needed
|
| 224 |
+
|
| 225 |
+
### Best Practices
|
| 226 |
+
|
| 227 |
+
1. **Input Validation**: Always validate required fields
|
| 228 |
+
2. **Error Handling**: Use `handle_error()` method
|
| 229 |
+
3. **Logging**: Use `log_processing()` for debugging
|
| 230 |
+
4. **Structured Output**: Return consistent dict format
|
| 231 |
+
5. **Caching**: Use session state for expensive operations
|
| 232 |
+
6. **Token Optimization**: Sample/truncate data for large inputs
|
| 233 |
+
7. **User Feedback**: Show spinners for async operations
|
| 234 |
+
8. **Graceful Degradation**: Provide fallbacks for failures
|
| 235 |
+
|
| 236 |
+
## Testing
|
| 237 |
+
|
| 238 |
+
### Manual Testing
|
| 239 |
+
1. Start dashboard: `streamlit run app.py`
|
| 240 |
+
2. Navigate to "⚠️ Poor Sentiment Contents" page
|
| 241 |
+
3. Click "🔍 Generate AI Analysis" for any content
|
| 242 |
+
4. Verify summary displays correctly
|
| 243 |
+
5. Check session caching (click button again)
|
| 244 |
+
6. Test error handling (disconnect network)
|
| 245 |
+
|
| 246 |
+
### Unit Testing
|
| 247 |
+
```python
|
| 248 |
+
# tests/test_content_summary_agent.py
|
| 249 |
+
import pytest
|
| 250 |
+
from agents.content_summary_agent import ContentSummaryAgent
|
| 251 |
+
|
| 252 |
+
def test_validate_input():
|
| 253 |
+
agent = ContentSummaryAgent()
|
| 254 |
+
|
| 255 |
+
# Valid input
|
| 256 |
+
valid_input = {
|
| 257 |
+
'content_sk': '123',
|
| 258 |
+
'content_description': 'Test',
|
| 259 |
+
'comments': []
|
| 260 |
+
}
|
| 261 |
+
assert agent.validate_input(valid_input) == True
|
| 262 |
+
|
| 263 |
+
# Missing field
|
| 264 |
+
invalid_input = {'content_sk': '123'}
|
| 265 |
+
assert agent.validate_input(invalid_input) == False
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
## Future Enhancements
|
| 269 |
+
|
| 270 |
+
### Planned Features
|
| 271 |
+
1. **Batch Analysis**: Analyze multiple contents at once
|
| 272 |
+
2. **Trend Detection**: Compare with historical summaries
|
| 273 |
+
3. **Export Summaries**: Download as PDF/CSV
|
| 274 |
+
4. **Custom Prompts**: User-defined analysis focus
|
| 275 |
+
5. **Multi-language Support**: Summaries in user's language
|
| 276 |
+
|
| 277 |
+
### Additional Agents (Roadmap)
|
| 278 |
+
- **InsightsSummaryAgent**: Overall dataset insights
|
| 279 |
+
- **InteractiveChatbotAgent**: Conversational analysis
|
| 280 |
+
- **ComparativeContentAgent**: Content comparison
|
| 281 |
+
- **ReplySuggestionAgent**: Generate reply suggestions
|
| 282 |
+
- **TrendForecastingAgent**: Predict sentiment trends
|
| 283 |
+
|
| 284 |
+
## Troubleshooting
|
| 285 |
+
|
| 286 |
+
### Common Issues
|
| 287 |
+
|
| 288 |
+
**Issue**: `OPENAI_API_KEY not found`
|
| 289 |
+
- **Solution**: Add key to `.env` file in parent directory
|
| 290 |
+
|
| 291 |
+
**Issue**: Import error for `agents` module
|
| 292 |
+
- **Solution**: Ensure `__init__.py` exists in `visualization/agents/`
|
| 293 |
+
|
| 294 |
+
**Issue**: LLM timeout errors
|
| 295 |
+
- **Solution**: Reduce comment count or increase retry limit
|
| 296 |
+
|
| 297 |
+
**Issue**: JSON parsing errors
|
| 298 |
+
- **Solution**: Check LLM prompt format, ensure JSON mode enabled
|
| 299 |
+
|
| 300 |
+
**Issue**: Cached summaries not showing
|
| 301 |
+
- **Solution**: Check `st.session_state.content_summaries` initialization
|
| 302 |
+
|
| 303 |
+
## Support
|
| 304 |
+
|
| 305 |
+
For issues or questions:
|
| 306 |
+
1. Check this README
|
| 307 |
+
2. Review agent logs in console
|
| 308 |
+
3. Inspect session state in Streamlit
|
| 309 |
+
4. Verify environment variables
|
| 310 |
+
5. Check OpenAI API status
|
| 311 |
+
|
| 312 |
+
## Version History
|
| 313 |
+
|
| 314 |
+
### v1.0.0 (Current)
|
| 315 |
+
- Initial release
|
| 316 |
+
- ContentSummaryAgent implementation
|
| 317 |
+
- Poor Sentiment Contents page integration
|
| 318 |
+
- Session-based caching
|
| 319 |
+
- Error handling and retry logic
|
| 320 |
+
- Comprehensive UI display
|
visualization/agents/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualization Agents Package
|
| 3 |
+
Contains AI agents for intelligent dashboard features
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .content_summary_agent import ContentSummaryAgent
|
| 7 |
+
|
| 8 |
+
__all__ = ['ContentSummaryAgent']
|
visualization/agents/base_agent.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Agent class for visualization agents
|
| 3 |
+
Provides common functionality and interface for all agents
|
| 4 |
+
"""
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BaseVisualizationAgent(ABC):
    """
    Common contract for all visualization agents.

    Subclasses implement ``process`` and ``validate_input``; this base
    class supplies shared configuration, a namespaced logger, and a
    uniform error-reporting helper.
    """

    def __init__(self, name: str, model: str = "gpt-5-nano", temperature: float = 0.7):
        """
        Store the agent configuration and create a namespaced logger.

        Args:
            name: Agent name
            model: LLM model to use
            temperature: LLM temperature
        """
        self.name = name
        self.model = model
        self.temperature = temperature
        # One logger per agent, namespaced under visualization.agents.
        self.logger = logging.getLogger(f"visualization.agents.{name}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run the agent on the given input and return its results.

        Args:
            input_data: Input data dictionary

        Returns:
            Results dictionary
        """

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Report whether ``input_data`` is acceptable for ``process``.

        Args:
            input_data: Input data dictionary

        Returns:
            True if valid, False otherwise
        """

    def log_processing(self, message: str, level: str = "info"):
        """
        Emit ``message`` through this agent's logger at ``level``.

        Unknown levels fall back to ``info``.

        Args:
            message: Log message
            level: Log level (info, warning, error)
        """
        emit = getattr(self.logger, level.lower(), self.logger.info)
        emit(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Log an exception and package it into a uniform error response.

        Args:
            error: Exception that occurred
            context: Additional context information

        Returns:
            Error response dictionary with ``success`` set to False
        """
        pieces = [f"Error in {self.name}: {str(error)}"]
        if context:
            pieces.append(f"Context: {context}")
        self.log_processing(" | ".join(pieces), level="error")

        return {
            'success': False,
            'error': str(error),
            'error_type': type(error).__name__,
            'context': context,
        }
|
visualization/agents/content_summary_agent.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Content Summary Agent
|
| 3 |
+
Analyzes and summarizes comments for content pieces
|
| 4 |
+
"""
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import Dict, Any, List
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Add parent directory to path
|
| 11 |
+
parent_dir = Path(__file__).resolve().parent.parent
|
| 12 |
+
sys.path.append(str(parent_dir))
|
| 13 |
+
|
| 14 |
+
from agents.base_agent import BaseVisualizationAgent
|
| 15 |
+
from utils.llm_helper import LLMHelper
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ContentSummaryAgent(BaseVisualizationAgent):
|
| 19 |
+
"""
|
| 20 |
+
Agent that analyzes and summarizes comments for content
|
| 21 |
+
Extracts themes, praise points, complaints, FAQs, and insights
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
    """
    Build the agent and its LLM helper with a shared configuration.

    Args:
        model: LLM model to use
        temperature: Temperature for generation (lower values produce
            more focused summaries)
    """
    # Base class and helper share the same model settings.
    llm_settings = {"model": model, "temperature": temperature}
    super().__init__(name="ContentSummaryAgent", **llm_settings)
    self.llm_helper = LLMHelper(**llm_settings)
|
| 34 |
+
|
| 35 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
    """
    Check that input_data carries every required field and that the
    comments payload has a supported type.

    Args:
        input_data: Input dictionary

    Returns:
        True if valid, False otherwise
    """
    # Report only the first missing field, in declaration order.
    missing = next(
        (f for f in ('content_sk', 'content_description', 'comments') if f not in input_data),
        None,
    )
    if missing is not None:
        self.log_processing(f"Missing required field: {missing}", level="error")
        return False

    if isinstance(input_data['comments'], (list, pd.DataFrame)):
        return True

    self.log_processing("Comments must be a list or DataFrame", level="error")
    return False
|
| 57 |
+
|
| 58 |
+
def _prepare_comments_context(self, comments: Any, sentiment_type: str = 'negative') -> str:
    """
    Prepare comments data for LLM analysis.

    Args:
        comments: Comments as DataFrame or list of dicts
        sentiment_type: Type of sentiment to analyze
            ('negative', 'positive', 'combined')

    Returns:
        Formatted string with one numbered entry per comment
    """
    NEGATIVE = ['negative', 'very_negative']
    POSITIVE = ['positive', 'very_positive']

    # Normalize to a DataFrame; copy so the caller's frame is untouched.
    if isinstance(comments, list):
        comments_df = pd.DataFrame(comments)
    else:
        comments_df = comments.copy()

    # Keep only the requested sentiment slice; 'combined' keeps everything.
    if sentiment_type == 'negative':
        comments_df = comments_df[comments_df['sentiment_polarity'].isin(NEGATIVE)]
    elif sentiment_type == 'positive':
        comments_df = comments_df[comments_df['sentiment_polarity'].isin(POSITIVE)]

    # Limit to a reasonable number of comments for the API.
    if len(comments_df) > 100:
        if sentiment_type == 'combined':
            # Sample up to 50 from each polarity so both sides are
            # represented; fixed random_state keeps runs deterministic.
            neg_df = comments_df[comments_df['sentiment_polarity'].isin(NEGATIVE)]
            pos_df = comments_df[comments_df['sentiment_polarity'].isin(POSITIVE)]
            comments_df = pd.concat([
                neg_df.sample(n=min(50, len(neg_df)), random_state=42),
                pos_df.sample(n=min(50, len(pos_df)), random_state=42),
            ])
        else:
            # Single sentiment type: plain sample.
            comments_df = comments_df.sample(n=min(100, len(comments_df)), random_state=42)

    # Format comments for analysis. Number entries by sequential position:
    # after filtering/sampling the DataFrame index labels are the original
    # arbitrary row labels, so `idx + 1` would produce misleading numbers.
    comments_text = []
    for pos, (_, row) in enumerate(comments_df.iterrows(), start=1):
        # Coerce to str up front: a non-string value (e.g. NaN) would
        # otherwise break the slice below.
        text = str(row.get('display_text', row.get('original_text', '')))
        sentiment = row.get('sentiment_polarity', 'unknown')
        intent = row.get('intent', 'unknown')

        comment_entry = f"""
Comment #{pos}:
- Text: {text[:300]}{'...' if len(text) > 300 else ''}
- Sentiment: {sentiment}
- Intent: {intent}
"""
        comments_text.append(comment_entry)

    return "\n".join(comments_text)
|
| 121 |
+
|
| 122 |
+
def _generate_summary_prompt(
|
| 123 |
+
self,
|
| 124 |
+
content_description: str,
|
| 125 |
+
comments_context: str,
|
| 126 |
+
total_comments: int,
|
| 127 |
+
sentiment_type: str = 'negative'
|
| 128 |
+
) -> str:
|
| 129 |
+
"""
|
| 130 |
+
Generate prompt for LLM
|
| 131 |
+
|
| 132 |
+
Args:
|
| 133 |
+
content_description: Description of the content
|
| 134 |
+
comments_context: Formatted comments
|
| 135 |
+
total_comments: Total number of comments
|
| 136 |
+
sentiment_type: Type of sentiment being analyzed ('negative', 'positive', 'combined')
|
| 137 |
+
|
| 138 |
+
Returns:
|
| 139 |
+
Prompt string
|
| 140 |
+
"""
|
| 141 |
+
# Customize prompt based on sentiment type
|
| 142 |
+
if sentiment_type == 'negative':
|
| 143 |
+
focus_instruction = "Focus on understanding negative feedback, complaints, and issues that need attention."
|
| 144 |
+
elif sentiment_type == 'positive':
|
| 145 |
+
focus_instruction = "Focus on understanding what users love, praise points, and successful elements that should be maintained or amplified."
|
| 146 |
+
else: # combined
|
| 147 |
+
focus_instruction = "Provide a balanced analysis covering both positive feedback and areas for improvement."
|
| 148 |
+
|
| 149 |
+
prompt = f"""Analyze the {sentiment_type} comments below for the following content and provide a brief executive summary.
|
| 150 |
+
|
| 151 |
+
**Content:** {content_description}
|
| 152 |
+
|
| 153 |
+
**Total Comments Analyzed:** {total_comments}
|
| 154 |
+
|
| 155 |
+
**Analysis Focus:** {focus_instruction}
|
| 156 |
+
|
| 157 |
+
**Comments to Analyze:**
|
| 158 |
+
{comments_context}
|
| 159 |
+
|
| 160 |
+
**Task:** Provide a concise executive summary in JSON format with the following structure:
|
| 161 |
+
|
| 162 |
+
{{
|
| 163 |
+
"executive_summary": "2-3 sentence high-level overview focusing on {sentiment_type} sentiment",
|
| 164 |
+
"main_themes": [
|
| 165 |
+
{{
|
| 166 |
+
"theme": "theme name",
|
| 167 |
+
"sentiment": "positive/negative/mixed",
|
| 168 |
+
"description": "brief description"
|
| 169 |
+
}}
|
| 170 |
+
],
|
| 171 |
+
"praise_points": ["point 1", "point 2", "point 3"],
|
| 172 |
+
"key_complaints": ["complaint 1", "complaint 2", "complaint 3"],
|
| 173 |
+
"frequently_asked_questions": ["question 1", "question 2"],
|
| 174 |
+
"unexpected_insights": ["insight 1", "insight 2"],
|
| 175 |
+
"action_recommendations": [
|
| 176 |
+
{{
|
| 177 |
+
"priority": "high/medium/low",
|
| 178 |
+
"action": "recommended action"
|
| 179 |
+
}}
|
| 180 |
+
]
|
| 181 |
+
}}
|
| 182 |
+
|
| 183 |
+
**Guidelines:**
|
| 184 |
+
- Be concise and actionable
|
| 185 |
+
- Focus on the most important insights from {sentiment_type} comments
|
| 186 |
+
- Limit each list to top 3-5 items
|
| 187 |
+
- If a section has no relevant items, use an empty list
|
| 188 |
+
- Executive summary should capture the overall patterns and key takeaways
|
| 189 |
+
"""
|
| 190 |
+
return prompt
|
| 191 |
+
|
| 192 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 193 |
+
"""
|
| 194 |
+
Process comments and generate summary
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
input_data: {
|
| 198 |
+
'content_sk': content identifier,
|
| 199 |
+
'content_description': content title/description,
|
| 200 |
+
'comments': DataFrame or list of comment dicts,
|
| 201 |
+
'sentiment_type': 'negative', 'positive', or 'combined' (optional, defaults to 'negative')
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
{
|
| 206 |
+
'success': bool,
|
| 207 |
+
'content_sk': str,
|
| 208 |
+
'sentiment_type': str,
|
| 209 |
+
'summary': {
|
| 210 |
+
'executive_summary': str,
|
| 211 |
+
'main_themes': list,
|
| 212 |
+
'praise_points': list,
|
| 213 |
+
'key_complaints': list,
|
| 214 |
+
'frequently_asked_questions': list,
|
| 215 |
+
'unexpected_insights': list,
|
| 216 |
+
'action_recommendations': list
|
| 217 |
+
},
|
| 218 |
+
'metadata': {
|
| 219 |
+
'total_comments_analyzed': int,
|
| 220 |
+
'model_used': str,
|
| 221 |
+
'tokens_used': int
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
"""
|
| 225 |
+
try:
|
| 226 |
+
# Validate input
|
| 227 |
+
if not self.validate_input(input_data):
|
| 228 |
+
return {
|
| 229 |
+
'success': False,
|
| 230 |
+
'error': 'Invalid input data',
|
| 231 |
+
'content_sk': input_data.get('content_sk', 'unknown')
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
content_sk = input_data['content_sk']
|
| 235 |
+
content_description = input_data['content_description']
|
| 236 |
+
comments = input_data['comments']
|
| 237 |
+
sentiment_type = input_data.get('sentiment_type', 'negative') # Default to negative for backward compatibility
|
| 238 |
+
|
| 239 |
+
self.log_processing(f"Starting {sentiment_type} analysis for content: {content_sk}")
|
| 240 |
+
|
| 241 |
+
# Convert to DataFrame if needed
|
| 242 |
+
if isinstance(comments, list):
|
| 243 |
+
comments_df = pd.DataFrame(comments)
|
| 244 |
+
else:
|
| 245 |
+
comments_df = comments.copy()
|
| 246 |
+
|
| 247 |
+
total_comments = len(comments_df)
|
| 248 |
+
|
| 249 |
+
if total_comments == 0:
|
| 250 |
+
return {
|
| 251 |
+
'success': True,
|
| 252 |
+
'content_sk': content_sk,
|
| 253 |
+
'sentiment_type': sentiment_type,
|
| 254 |
+
'summary': {
|
| 255 |
+
'executive_summary': 'No comments available for analysis.',
|
| 256 |
+
'main_themes': [],
|
| 257 |
+
'praise_points': [],
|
| 258 |
+
'key_complaints': [],
|
| 259 |
+
'frequently_asked_questions': [],
|
| 260 |
+
'unexpected_insights': [],
|
| 261 |
+
'action_recommendations': []
|
| 262 |
+
},
|
| 263 |
+
'metadata': {
|
| 264 |
+
'total_comments_analyzed': 0,
|
| 265 |
+
'model_used': self.model,
|
| 266 |
+
'tokens_used': 0
|
| 267 |
+
}
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
# Prepare comments context based on sentiment type
|
| 271 |
+
comments_context = self._prepare_comments_context(comments_df, sentiment_type)
|
| 272 |
+
|
| 273 |
+
# Get count of comments after filtering
|
| 274 |
+
if sentiment_type == 'negative':
|
| 275 |
+
filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])])
|
| 276 |
+
elif sentiment_type == 'positive':
|
| 277 |
+
filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])])
|
| 278 |
+
else:
|
| 279 |
+
filtered_count = total_comments
|
| 280 |
+
|
| 281 |
+
if filtered_count == 0:
|
| 282 |
+
return {
|
| 283 |
+
'success': True,
|
| 284 |
+
'content_sk': content_sk,
|
| 285 |
+
'sentiment_type': sentiment_type,
|
| 286 |
+
'summary': {
|
| 287 |
+
'executive_summary': f'No {sentiment_type} comments available for analysis.',
|
| 288 |
+
'main_themes': [],
|
| 289 |
+
'praise_points': [],
|
| 290 |
+
'key_complaints': [],
|
| 291 |
+
'frequently_asked_questions': [],
|
| 292 |
+
'unexpected_insights': [],
|
| 293 |
+
'action_recommendations': []
|
| 294 |
+
},
|
| 295 |
+
'metadata': {
|
| 296 |
+
'total_comments_analyzed': 0,
|
| 297 |
+
'model_used': self.model,
|
| 298 |
+
'tokens_used': 0
|
| 299 |
+
}
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
# Generate prompt
|
| 303 |
+
prompt = self._generate_summary_prompt(
|
| 304 |
+
content_description,
|
| 305 |
+
comments_context,
|
| 306 |
+
filtered_count,
|
| 307 |
+
sentiment_type
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
# System message
|
| 311 |
+
system_message = """You are an expert social media analyst specializing in
|
| 312 |
+
sentiment analysis and community insights. Provide concise, actionable summaries
|
| 313 |
+
that help content creators understand their audience feedback."""
|
| 314 |
+
|
| 315 |
+
# Get LLM response
|
| 316 |
+
self.log_processing(f"Calling LLM for {sentiment_type} summary generation")
|
| 317 |
+
response = self.llm_helper.get_structured_completion(
|
| 318 |
+
prompt=prompt,
|
| 319 |
+
system_message=system_message,
|
| 320 |
+
max_retries=3
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
if not response['success']:
|
| 324 |
+
return self.handle_error(
|
| 325 |
+
Exception(response.get('error', 'LLM call failed')),
|
| 326 |
+
context=f"content_sk={content_sk}, sentiment_type={sentiment_type}"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
# Extract summary
|
| 330 |
+
summary = response['content']
|
| 331 |
+
|
| 332 |
+
# Ensure all expected fields exist
|
| 333 |
+
default_summary = {
|
| 334 |
+
'executive_summary': '',
|
| 335 |
+
'main_themes': [],
|
| 336 |
+
'praise_points': [],
|
| 337 |
+
'key_complaints': [],
|
| 338 |
+
'frequently_asked_questions': [],
|
| 339 |
+
'unexpected_insights': [],
|
| 340 |
+
'action_recommendations': []
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
# Merge with defaults
|
| 344 |
+
for key in default_summary:
|
| 345 |
+
if key not in summary:
|
| 346 |
+
summary[key] = default_summary[key]
|
| 347 |
+
|
| 348 |
+
self.log_processing(f"Successfully generated {sentiment_type} summary for content: {content_sk}")
|
| 349 |
+
|
| 350 |
+
return {
|
| 351 |
+
'success': True,
|
| 352 |
+
'content_sk': content_sk,
|
| 353 |
+
'sentiment_type': sentiment_type,
|
| 354 |
+
'summary': summary,
|
| 355 |
+
'metadata': {
|
| 356 |
+
'total_comments_analyzed': filtered_count,
|
| 357 |
+
'model_used': response['model'],
|
| 358 |
+
'tokens_used': response['usage']['total_tokens']
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
except Exception as e:
|
| 363 |
+
return self.handle_error(
|
| 364 |
+
e,
|
| 365 |
+
context=f"content_sk={input_data.get('content_sk', 'unknown')}, sentiment_type={input_data.get('sentiment_type', 'negative')}"
|
| 366 |
+
)
|
visualization/app.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Musora Sentiment Analysis Dashboard
|
| 3 |
+
Main Streamlit Application
|
| 4 |
+
|
| 5 |
+
Run with: streamlit run app.py
|
| 6 |
+
"""
|
| 7 |
+
import streamlit as st
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
# Add parent directory to path
|
| 13 |
+
parent_dir = Path(__file__).resolve().parent
|
| 14 |
+
sys.path.append(str(parent_dir))
|
| 15 |
+
|
| 16 |
+
from data.data_loader import SentimentDataLoader
|
| 17 |
+
from components.dashboard import render_dashboard
|
| 18 |
+
from components.sentiment_analysis import render_sentiment_analysis
|
| 19 |
+
from components.reply_required import render_reply_required
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Load configuration
|
| 23 |
+
config_path = parent_dir / "config" / "viz_config.json"
|
| 24 |
+
with open(config_path, 'r') as f:
|
| 25 |
+
config = json.load(f)
|
| 26 |
+
|
| 27 |
+
# Page configuration
|
| 28 |
+
st.set_page_config(
|
| 29 |
+
page_title=config['page_config']['page_title'],
|
| 30 |
+
page_icon=config['page_config']['page_icon'],
|
| 31 |
+
layout=config['page_config']['layout'],
|
| 32 |
+
initial_sidebar_state=config['page_config']['initial_sidebar_state']
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def main():
|
| 37 |
+
"""
|
| 38 |
+
Main application function
|
| 39 |
+
"""
|
| 40 |
+
# Sidebar
|
| 41 |
+
with st.sidebar:
|
| 42 |
+
st.image("visualization/img/musora.png", use_container_width=True)
|
| 43 |
+
st.title("Navigation")
|
| 44 |
+
|
| 45 |
+
# Page selection
|
| 46 |
+
page = st.radio(
|
| 47 |
+
"Select Page",
|
| 48 |
+
["📊 Dashboard", "🔍 Sentiment Analysis", "💬 Reply Required"],
|
| 49 |
+
index=0
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
st.markdown("---")
|
| 53 |
+
|
| 54 |
+
# Filters section
|
| 55 |
+
st.markdown("### 🔍 Global Filters")
|
| 56 |
+
|
| 57 |
+
# Initialize session state for filters
|
| 58 |
+
if 'filters_applied' not in st.session_state:
|
| 59 |
+
st.session_state.filters_applied = False
|
| 60 |
+
|
| 61 |
+
# Load data first to get filter options
|
| 62 |
+
with st.spinner("Loading data..."):
|
| 63 |
+
data_loader = SentimentDataLoader()
|
| 64 |
+
df = data_loader.load_data()
|
| 65 |
+
|
| 66 |
+
if df.empty:
|
| 67 |
+
st.error("No data available. Please check your Snowflake connection.")
|
| 68 |
+
return
|
| 69 |
+
|
| 70 |
+
# Get filter options
|
| 71 |
+
filter_options = data_loader.get_filter_options(df)
|
| 72 |
+
|
| 73 |
+
# Platform filter
|
| 74 |
+
selected_platforms = st.multiselect(
|
| 75 |
+
"Platforms",
|
| 76 |
+
options=filter_options['platforms'],
|
| 77 |
+
default=[]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# Brand filter
|
| 81 |
+
selected_brands = st.multiselect(
|
| 82 |
+
"Brands",
|
| 83 |
+
options=filter_options['brands'],
|
| 84 |
+
default=[]
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
# Sentiment filter
|
| 88 |
+
selected_sentiments = st.multiselect(
|
| 89 |
+
"Sentiments",
|
| 90 |
+
options=filter_options['sentiments'],
|
| 91 |
+
default=[]
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Date range filter (if available)
|
| 95 |
+
if 'comment_timestamp' in df.columns and not df.empty:
|
| 96 |
+
min_date = df['comment_timestamp'].min().date()
|
| 97 |
+
max_date = df['comment_timestamp'].max().date()
|
| 98 |
+
|
| 99 |
+
date_range = st.date_input(
|
| 100 |
+
"Date Range",
|
| 101 |
+
value=(min_date, max_date),
|
| 102 |
+
min_value=min_date,
|
| 103 |
+
max_value=max_date
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
date_range = None
|
| 107 |
+
|
| 108 |
+
# Apply filters button
|
| 109 |
+
if st.button("🔍 Apply Filters", use_container_width=True):
|
| 110 |
+
st.session_state.filters_applied = True
|
| 111 |
+
|
| 112 |
+
# Reset filters button
|
| 113 |
+
if st.button("🔄 Reset Filters", use_container_width=True):
|
| 114 |
+
st.session_state.filters_applied = False
|
| 115 |
+
st.rerun()
|
| 116 |
+
|
| 117 |
+
st.markdown("---")
|
| 118 |
+
|
| 119 |
+
# Data refresh
|
| 120 |
+
st.markdown("### 🔄 Data Management")
|
| 121 |
+
|
| 122 |
+
if st.button("♻️ Reload Data", use_container_width=True):
|
| 123 |
+
st.cache_data.clear()
|
| 124 |
+
st.rerun()
|
| 125 |
+
|
| 126 |
+
# Display data info
|
| 127 |
+
st.markdown("---")
|
| 128 |
+
st.markdown("### ℹ️ Data Info")
|
| 129 |
+
st.info(f"**Total Records:** {len(df):,}")
|
| 130 |
+
|
| 131 |
+
if 'processed_at' in df.columns and not df.empty:
|
| 132 |
+
last_update = df['processed_at'].max()
|
| 133 |
+
st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}")
|
| 134 |
+
|
| 135 |
+
# Apply filters if needed
|
| 136 |
+
if st.session_state.filters_applied:
|
| 137 |
+
df = data_loader.apply_filters(
|
| 138 |
+
df,
|
| 139 |
+
platforms=selected_platforms if selected_platforms else None,
|
| 140 |
+
brands=selected_brands if selected_brands else None,
|
| 141 |
+
sentiments=selected_sentiments if selected_sentiments else None,
|
| 142 |
+
date_range=date_range if date_range and len(date_range) == 2 else None
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
# Show filter summary
|
| 146 |
+
if df.empty:
|
| 147 |
+
st.warning("No data matches the selected filters. Please adjust your filters.")
|
| 148 |
+
return
|
| 149 |
+
else:
|
| 150 |
+
st.info(f"Showing {len(df):,} records after applying filters")
|
| 151 |
+
|
| 152 |
+
# Main content area - render selected page
|
| 153 |
+
if page == "📊 Dashboard":
|
| 154 |
+
render_dashboard(df)
|
| 155 |
+
|
| 156 |
+
elif page == "🔍 Sentiment Analysis":
|
| 157 |
+
render_sentiment_analysis(df)
|
| 158 |
+
|
| 159 |
+
elif page == "💬 Reply Required":
|
| 160 |
+
render_reply_required(df)
|
| 161 |
+
|
| 162 |
+
# Footer
|
| 163 |
+
st.markdown("---")
|
| 164 |
+
st.markdown(
|
| 165 |
+
"""
|
| 166 |
+
<div style='text-align: center; color: gray; padding: 20px;'>
|
| 167 |
+
<p>Musora Sentiment Analysis Dashboard v1.0</p>
|
| 168 |
+
<p>Powered by Streamlit | Data from Snowflake</p>
|
| 169 |
+
</div>
|
| 170 |
+
""",
|
| 171 |
+
unsafe_allow_html=True
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
|
| 176 |
+
try:
|
| 177 |
+
main()
|
| 178 |
+
except Exception as e:
|
| 179 |
+
st.error(f"An error occurred: {str(e)}")
|
| 180 |
+
st.exception(e)
|
visualization/components/dashboard.py
ADDED
|
@@ -0,0 +1,538 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main Dashboard Page
|
| 3 |
+
Displays overall sentiment distributions by brand and platform
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path
|
| 10 |
+
parent_dir = Path(__file__).resolve().parent.parent
|
| 11 |
+
sys.path.append(str(parent_dir))
|
| 12 |
+
|
| 13 |
+
from utils.data_processor import SentimentDataProcessor
|
| 14 |
+
from utils.metrics import SentimentMetrics
|
| 15 |
+
from visualizations.sentiment_charts import SentimentCharts
|
| 16 |
+
from visualizations.distribution_charts import DistributionCharts
|
| 17 |
+
from visualizations.demographic_charts import DemographicCharts
|
| 18 |
+
from visualizations.content_cards import ContentCards
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def render_dashboard(df):
|
| 22 |
+
"""
|
| 23 |
+
Render the main dashboard page
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
df: Sentiment dataframe
|
| 27 |
+
"""
|
| 28 |
+
st.title("📊 Sentiment Analysis Dashboard")
|
| 29 |
+
|
| 30 |
+
# Performance tip
|
| 31 |
+
if len(df) > 10000:
|
| 32 |
+
st.info(f"💡 **Performance Tip**: Loaded {len(df):,} comments. Use the global filters in the sidebar to narrow down your analysis for faster performance.")
|
| 33 |
+
|
| 34 |
+
st.markdown("---")
|
| 35 |
+
|
| 36 |
+
# Initialize components
|
| 37 |
+
sentiment_charts = SentimentCharts()
|
| 38 |
+
distribution_charts = DistributionCharts()
|
| 39 |
+
processor = SentimentDataProcessor()
|
| 40 |
+
|
| 41 |
+
# Display overall summary statistics
|
| 42 |
+
ContentCards.display_summary_stats(df)
|
| 43 |
+
|
| 44 |
+
st.markdown("---")
|
| 45 |
+
|
| 46 |
+
# Calculate overall metrics
|
| 47 |
+
overall_metrics = SentimentMetrics.calculate_overall_metrics(df)
|
| 48 |
+
|
| 49 |
+
# Display health indicator
|
| 50 |
+
col1, col2, col3 = st.columns([1, 2, 1])
|
| 51 |
+
with col2:
|
| 52 |
+
ContentCards.display_health_indicator(overall_metrics['negative_pct'])
|
| 53 |
+
|
| 54 |
+
st.markdown("---")
|
| 55 |
+
|
| 56 |
+
# Overall sentiment distribution
|
| 57 |
+
st.markdown("## 🎯 Overall Sentiment Distribution")
|
| 58 |
+
|
| 59 |
+
col1, col2 = st.columns(2)
|
| 60 |
+
|
| 61 |
+
with col1:
|
| 62 |
+
# Sentiment pie chart
|
| 63 |
+
sentiment_pie = sentiment_charts.create_sentiment_pie_chart(df, title="Overall Sentiment Distribution")
|
| 64 |
+
st.plotly_chart(sentiment_pie, use_container_width=True)
|
| 65 |
+
|
| 66 |
+
with col2:
|
| 67 |
+
# Sentiment score gauge
|
| 68 |
+
sentiment_gauge = sentiment_charts.create_sentiment_score_gauge(
|
| 69 |
+
overall_metrics['avg_sentiment_score'],
|
| 70 |
+
title="Overall Sentiment Score"
|
| 71 |
+
)
|
| 72 |
+
st.plotly_chart(sentiment_gauge, use_container_width=True)
|
| 73 |
+
|
| 74 |
+
# Additional metrics
|
| 75 |
+
metric_col1, metric_col2 = st.columns(2)
|
| 76 |
+
with metric_col1:
|
| 77 |
+
st.metric("Positive %", f"{overall_metrics['positive_pct']:.1f}%")
|
| 78 |
+
with metric_col2:
|
| 79 |
+
st.metric("Reply Rate %", f"{overall_metrics['reply_required_pct']:.1f}%")
|
| 80 |
+
|
| 81 |
+
st.markdown("---")
|
| 82 |
+
|
| 83 |
+
# Sentiment by Brand
|
| 84 |
+
st.markdown("## 🏢 Sentiment Analysis by Brand")
|
| 85 |
+
|
| 86 |
+
col1, col2 = st.columns(2)
|
| 87 |
+
|
| 88 |
+
with col1:
|
| 89 |
+
# Stacked bar chart
|
| 90 |
+
brand_sentiment_bar = sentiment_charts.create_sentiment_bar_chart(
|
| 91 |
+
df, group_by='brand', title="Sentiment Distribution by Brand"
|
| 92 |
+
)
|
| 93 |
+
st.plotly_chart(brand_sentiment_bar, use_container_width=True)
|
| 94 |
+
|
| 95 |
+
with col2:
|
| 96 |
+
# Percentage bar chart
|
| 97 |
+
brand_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart(
|
| 98 |
+
df, group_by='brand', title="Sentiment Distribution by Brand (%)"
|
| 99 |
+
)
|
| 100 |
+
st.plotly_chart(brand_sentiment_pct, use_container_width=True)
|
| 101 |
+
|
| 102 |
+
# Brand metrics table
|
| 103 |
+
with st.expander("📈 Detailed Brand Metrics"):
|
| 104 |
+
brand_metrics = SentimentMetrics.calculate_brand_metrics(df)
|
| 105 |
+
|
| 106 |
+
brand_data = []
|
| 107 |
+
for brand, metrics in brand_metrics.items():
|
| 108 |
+
brand_data.append({
|
| 109 |
+
'Brand': brand.title(),
|
| 110 |
+
'Total Comments': metrics['total_comments'],
|
| 111 |
+
'Replies Needed': metrics['total_reply_required'],
|
| 112 |
+
'Negative %': f"{metrics['negative_pct']:.1f}%",
|
| 113 |
+
'Positive %': f"{metrics['positive_pct']:.1f}%",
|
| 114 |
+
'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}"
|
| 115 |
+
})
|
| 116 |
+
|
| 117 |
+
st.table(brand_data)
|
| 118 |
+
|
| 119 |
+
st.markdown("---")
|
| 120 |
+
|
| 121 |
+
# Sentiment by Platform
|
| 122 |
+
st.markdown("## 🌐 Sentiment Analysis by Platform")
|
| 123 |
+
|
| 124 |
+
col1, col2 = st.columns(2)
|
| 125 |
+
|
| 126 |
+
with col1:
|
| 127 |
+
# Stacked bar chart
|
| 128 |
+
platform_sentiment_bar = sentiment_charts.create_sentiment_bar_chart(
|
| 129 |
+
df, group_by='platform', title="Sentiment Distribution by Platform"
|
| 130 |
+
)
|
| 131 |
+
st.plotly_chart(platform_sentiment_bar, use_container_width=True)
|
| 132 |
+
|
| 133 |
+
with col2:
|
| 134 |
+
# Percentage bar chart
|
| 135 |
+
platform_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart(
|
| 136 |
+
df, group_by='platform', title="Sentiment Distribution by Platform (%)"
|
| 137 |
+
)
|
| 138 |
+
st.plotly_chart(platform_sentiment_pct, use_container_width=True)
|
| 139 |
+
|
| 140 |
+
# Platform metrics table
|
| 141 |
+
with st.expander("📈 Detailed Platform Metrics"):
|
| 142 |
+
platform_metrics = SentimentMetrics.calculate_platform_metrics(df)
|
| 143 |
+
|
| 144 |
+
platform_data = []
|
| 145 |
+
for platform, metrics in platform_metrics.items():
|
| 146 |
+
platform_data.append({
|
| 147 |
+
'Platform': platform.title(),
|
| 148 |
+
'Total Comments': metrics['total_comments'],
|
| 149 |
+
'Replies Needed': metrics['total_reply_required'],
|
| 150 |
+
'Negative %': f"{metrics['negative_pct']:.1f}%",
|
| 151 |
+
'Positive %': f"{metrics['positive_pct']:.1f}%",
|
| 152 |
+
'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}"
|
| 153 |
+
})
|
| 154 |
+
|
| 155 |
+
st.table(platform_data)
|
| 156 |
+
|
| 157 |
+
st.markdown("---")
|
| 158 |
+
|
| 159 |
+
# Intent Analysis
|
| 160 |
+
st.markdown("## 🎭 Intent Analysis")
|
| 161 |
+
|
| 162 |
+
col1, col2 = st.columns(2)
|
| 163 |
+
|
| 164 |
+
with col1:
|
| 165 |
+
# Intent bar chart
|
| 166 |
+
intent_bar = distribution_charts.create_intent_bar_chart(
|
| 167 |
+
df, title="Intent Distribution", orientation='h'
|
| 168 |
+
)
|
| 169 |
+
st.plotly_chart(intent_bar, use_container_width=True)
|
| 170 |
+
|
| 171 |
+
with col2:
|
| 172 |
+
# Intent pie chart
|
| 173 |
+
intent_pie = distribution_charts.create_intent_pie_chart(df, title="Intent Distribution")
|
| 174 |
+
st.plotly_chart(intent_pie, use_container_width=True)
|
| 175 |
+
|
| 176 |
+
st.markdown("---")
|
| 177 |
+
|
| 178 |
+
# Brand-Platform Matrix
|
| 179 |
+
st.markdown("## 🔀 Cross-Dimensional Analysis")
|
| 180 |
+
|
| 181 |
+
col1, col2 = st.columns(2)
|
| 182 |
+
|
| 183 |
+
with col1:
|
| 184 |
+
# Heatmap showing comment distribution
|
| 185 |
+
brand_platform_matrix = distribution_charts.create_brand_platform_matrix(
|
| 186 |
+
df, title="Brand-Platform Comment Matrix"
|
| 187 |
+
)
|
| 188 |
+
st.plotly_chart(brand_platform_matrix, use_container_width=True)
|
| 189 |
+
|
| 190 |
+
with col2:
|
| 191 |
+
# Sentiment heatmap
|
| 192 |
+
sentiment_heatmap = sentiment_charts.create_sentiment_heatmap(
|
| 193 |
+
df, row_dimension='brand', col_dimension='platform', title="Negative Sentiment Heatmap"
|
| 194 |
+
)
|
| 195 |
+
st.plotly_chart(sentiment_heatmap, use_container_width=True)
|
| 196 |
+
|
| 197 |
+
st.markdown("---")
|
| 198 |
+
|
| 199 |
+
# Platform and Brand Distribution
|
| 200 |
+
st.markdown("## 📊 Volume Analysis")
|
| 201 |
+
|
| 202 |
+
col1, col2 = st.columns(2)
|
| 203 |
+
|
| 204 |
+
with col1:
|
| 205 |
+
# Platform distribution
|
| 206 |
+
platform_dist = distribution_charts.create_platform_distribution(df, title="Comments by Platform")
|
| 207 |
+
st.plotly_chart(platform_dist, use_container_width=True)
|
| 208 |
+
|
| 209 |
+
with col2:
|
| 210 |
+
# Brand distribution
|
| 211 |
+
brand_dist = distribution_charts.create_brand_distribution(df, title="Comments by Brand")
|
| 212 |
+
st.plotly_chart(brand_dist, use_container_width=True)
|
| 213 |
+
|
| 214 |
+
st.markdown("---")
|
| 215 |
+
|
| 216 |
+
# Reply Requirements
|
| 217 |
+
st.markdown("## ⚠️ Reply Requirements Analysis")
|
| 218 |
+
|
| 219 |
+
col1, col2 = st.columns(2)
|
| 220 |
+
|
| 221 |
+
with col1:
|
| 222 |
+
# Reply required by brand
|
| 223 |
+
reply_brand = distribution_charts.create_reply_required_chart(
|
| 224 |
+
df, group_by='brand', title="Comments Requiring Reply by Brand"
|
| 225 |
+
)
|
| 226 |
+
st.plotly_chart(reply_brand, use_container_width=True)
|
| 227 |
+
|
| 228 |
+
with col2:
|
| 229 |
+
# Reply required by platform
|
| 230 |
+
reply_platform = distribution_charts.create_reply_required_chart(
|
| 231 |
+
df, group_by='platform', title="Comments Requiring Reply by Platform"
|
| 232 |
+
)
|
| 233 |
+
st.plotly_chart(reply_platform, use_container_width=True)
|
| 234 |
+
|
| 235 |
+
# Response urgency metrics
|
| 236 |
+
urgency_metrics = SentimentMetrics.calculate_response_urgency(df)
|
| 237 |
+
|
| 238 |
+
st.markdown("### 🚨 Response Urgency Breakdown")
|
| 239 |
+
urgency_col1, urgency_col2, urgency_col3, urgency_col4 = st.columns(4)
|
| 240 |
+
|
| 241 |
+
with urgency_col1:
|
| 242 |
+
st.metric("🔴 Urgent", urgency_metrics['urgent_count'], help="Negative sentiment + requires reply")
|
| 243 |
+
|
| 244 |
+
with urgency_col2:
|
| 245 |
+
st.metric("🟠 High Priority", urgency_metrics['high_priority_count'], help="Neutral with feedback/request")
|
| 246 |
+
|
| 247 |
+
with urgency_col3:
|
| 248 |
+
st.metric("🟡 Medium Priority", urgency_metrics['medium_priority_count'], help="Positive requiring reply")
|
| 249 |
+
|
| 250 |
+
with urgency_col4:
|
| 251 |
+
st.metric("🟢 Low Priority", urgency_metrics['low_priority_count'], help="Very positive requiring reply")
|
| 252 |
+
|
| 253 |
+
st.markdown("---")
|
| 254 |
+
|
| 255 |
+
st.markdown("---")
|
| 256 |
+
|
| 257 |
+
# Demographics Analysis (for musora_app only)
|
| 258 |
+
# Check if we have musora_app data and demographic fields
|
| 259 |
+
has_musora_app = 'platform' in df.columns and 'musora_app' in df['platform'].values
|
| 260 |
+
has_demographics = (
|
| 261 |
+
has_musora_app and
|
| 262 |
+
'age_group' in df.columns and
|
| 263 |
+
'timezone' in df.columns and
|
| 264 |
+
'experience_level' in df.columns
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
if has_demographics:
|
| 268 |
+
# Filter for musora_app data only
|
| 269 |
+
df_musora = df[df['platform'] == 'musora_app'].copy()
|
| 270 |
+
|
| 271 |
+
# Check if we have any demographic data (not all Unknown)
|
| 272 |
+
has_valid_demographics = (
|
| 273 |
+
(df_musora['age_group'] != 'Unknown').any() or
|
| 274 |
+
(df_musora['timezone_region'] != 'Unknown').any() or
|
| 275 |
+
(df_musora['experience_group'] != 'Unknown').any()
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
if has_valid_demographics and len(df_musora) > 0:
|
| 279 |
+
st.markdown("## 👥 Demographics Analysis (Musora App)")
|
| 280 |
+
st.info(f"📊 Analyzing demographics for **{len(df_musora):,}** Musora App comments")
|
| 281 |
+
|
| 282 |
+
# Initialize demographic charts
|
| 283 |
+
demographic_charts = DemographicCharts()
|
| 284 |
+
|
| 285 |
+
# Get demographic summary
|
| 286 |
+
demo_summary = processor.get_demographics_summary(df_musora)
|
| 287 |
+
|
| 288 |
+
# Display summary metrics
|
| 289 |
+
demo_col1, demo_col2, demo_col3, demo_col4 = st.columns(4)
|
| 290 |
+
|
| 291 |
+
with demo_col1:
|
| 292 |
+
st.metric(
|
| 293 |
+
"Comments with Demographics",
|
| 294 |
+
f"{demo_summary['users_with_demographics']:,}",
|
| 295 |
+
f"{demo_summary['coverage_percentage']:.1f}% coverage"
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
with demo_col2:
|
| 299 |
+
if demo_summary['avg_age'] is not None:
|
| 300 |
+
st.metric("Average Age", f"{demo_summary['avg_age']:.1f} years")
|
| 301 |
+
else:
|
| 302 |
+
st.metric("Average Age", "N/A")
|
| 303 |
+
|
| 304 |
+
with demo_col3:
|
| 305 |
+
st.metric("Most Common Region", demo_summary['most_common_region'])
|
| 306 |
+
|
| 307 |
+
with demo_col4:
|
| 308 |
+
if demo_summary['avg_experience'] is not None:
|
| 309 |
+
st.metric("Avg Experience", f"{demo_summary['avg_experience']:.1f}/10")
|
| 310 |
+
else:
|
| 311 |
+
st.metric("Avg Experience", "N/A")
|
| 312 |
+
|
| 313 |
+
st.markdown("---")
|
| 314 |
+
|
| 315 |
+
# Age Analysis
|
| 316 |
+
st.markdown("### 🎂 Age Distribution")
|
| 317 |
+
|
| 318 |
+
age_dist = processor.get_demographics_distribution(df_musora, 'age_group')
|
| 319 |
+
age_sentiment = processor.get_demographics_by_sentiment(df_musora, 'age_group')
|
| 320 |
+
|
| 321 |
+
if not age_dist.empty:
|
| 322 |
+
col1, col2 = st.columns(2)
|
| 323 |
+
|
| 324 |
+
with col1:
|
| 325 |
+
age_chart = demographic_charts.create_age_distribution_chart(
|
| 326 |
+
age_dist,
|
| 327 |
+
title="Comments by Age Group"
|
| 328 |
+
)
|
| 329 |
+
st.plotly_chart(age_chart, use_container_width=True)
|
| 330 |
+
|
| 331 |
+
with col2:
|
| 332 |
+
age_sent_chart = demographic_charts.create_age_sentiment_chart(
|
| 333 |
+
age_sentiment,
|
| 334 |
+
title="Sentiment Distribution by Age Group"
|
| 335 |
+
)
|
| 336 |
+
st.plotly_chart(age_sent_chart, use_container_width=True)
|
| 337 |
+
|
| 338 |
+
# Insights
|
| 339 |
+
with st.expander("💡 Age Insights"):
|
| 340 |
+
if len(age_dist) > 0:
|
| 341 |
+
top_age_group = age_dist.iloc[0]['age_group']
|
| 342 |
+
top_age_count = age_dist.iloc[0]['count']
|
| 343 |
+
top_age_pct = age_dist.iloc[0]['percentage']
|
| 344 |
+
|
| 345 |
+
st.write(f"**Most Active Age Group:** {top_age_group} ({top_age_count:,} comments, {top_age_pct:.1f}%)")
|
| 346 |
+
|
| 347 |
+
# Find age group with most negative sentiment
|
| 348 |
+
if not age_sentiment.empty:
|
| 349 |
+
negative_sentiments = age_sentiment[
|
| 350 |
+
age_sentiment['sentiment_polarity'].isin(['negative', 'very_negative'])
|
| 351 |
+
].groupby('age_group')['percentage'].sum().reset_index()
|
| 352 |
+
|
| 353 |
+
if len(negative_sentiments) > 0:
|
| 354 |
+
negative_sentiments = negative_sentiments.sort_values('percentage', ascending=False)
|
| 355 |
+
most_negative_age = negative_sentiments.iloc[0]['age_group']
|
| 356 |
+
most_negative_pct = negative_sentiments.iloc[0]['percentage']
|
| 357 |
+
st.write(f"**Highest Negative Sentiment:** {most_negative_age} ({most_negative_pct:.1f}% negative)")
|
| 358 |
+
else:
|
| 359 |
+
st.info("No age data available for visualization")
|
| 360 |
+
|
| 361 |
+
st.markdown("---")
|
| 362 |
+
|
| 363 |
+
# Timezone Analysis
|
| 364 |
+
st.markdown("### 🌍 Geographic Distribution")
|
| 365 |
+
|
| 366 |
+
# Get timezone data
|
| 367 |
+
top_timezones = processor.get_top_timezones(df_musora, top_n=15)
|
| 368 |
+
region_dist = processor.get_timezone_regions_distribution(df_musora)
|
| 369 |
+
region_sentiment = processor.get_demographics_by_sentiment(df_musora, 'timezone_region')
|
| 370 |
+
|
| 371 |
+
if not top_timezones.empty or not region_dist.empty:
|
| 372 |
+
# Top timezones
|
| 373 |
+
if not top_timezones.empty:
|
| 374 |
+
st.markdown("#### Top 15 Timezones")
|
| 375 |
+
timezone_chart = demographic_charts.create_timezone_chart(
|
| 376 |
+
top_timezones,
|
| 377 |
+
title="Most Common Timezones",
|
| 378 |
+
top_n=15
|
| 379 |
+
)
|
| 380 |
+
st.plotly_chart(timezone_chart, use_container_width=True)
|
| 381 |
+
|
| 382 |
+
# Regional distribution
|
| 383 |
+
if not region_dist.empty:
|
| 384 |
+
st.markdown("#### Regional Distribution")
|
| 385 |
+
col1, col2 = st.columns(2)
|
| 386 |
+
|
| 387 |
+
with col1:
|
| 388 |
+
region_chart = demographic_charts.create_region_distribution_chart(
|
| 389 |
+
region_dist,
|
| 390 |
+
title="Comments by Region"
|
| 391 |
+
)
|
| 392 |
+
st.plotly_chart(region_chart, use_container_width=True)
|
| 393 |
+
|
| 394 |
+
with col2:
|
| 395 |
+
if not region_sentiment.empty:
|
| 396 |
+
region_sent_chart = demographic_charts.create_region_sentiment_chart(
|
| 397 |
+
region_sentiment,
|
| 398 |
+
title="Sentiment Distribution by Region"
|
| 399 |
+
)
|
| 400 |
+
st.plotly_chart(region_sent_chart, use_container_width=True)
|
| 401 |
+
|
| 402 |
+
# Insights
|
| 403 |
+
with st.expander("💡 Geographic Insights"):
|
| 404 |
+
if not top_timezones.empty:
|
| 405 |
+
top_tz = top_timezones.iloc[0]['timezone']
|
| 406 |
+
top_tz_count = top_timezones.iloc[0]['count']
|
| 407 |
+
top_tz_pct = top_timezones.iloc[0]['percentage']
|
| 408 |
+
st.write(f"**Most Common Timezone:** {top_tz} ({top_tz_count:,} comments, {top_tz_pct:.1f}%)")
|
| 409 |
+
|
| 410 |
+
if not region_dist.empty:
|
| 411 |
+
top_region = region_dist.iloc[0]['timezone_region']
|
| 412 |
+
top_region_count = region_dist.iloc[0]['count']
|
| 413 |
+
top_region_pct = region_dist.iloc[0]['percentage']
|
| 414 |
+
st.write(f"**Most Active Region:** {top_region} ({top_region_count:,} comments, {top_region_pct:.1f}%)")
|
| 415 |
+
|
| 416 |
+
# Find region with most negative sentiment
|
| 417 |
+
if not region_sentiment.empty:
|
| 418 |
+
negative_regions = region_sentiment[
|
| 419 |
+
region_sentiment['sentiment_polarity'].isin(['negative', 'very_negative'])
|
| 420 |
+
].groupby('timezone_region')['percentage'].sum().reset_index()
|
| 421 |
+
|
| 422 |
+
if len(negative_regions) > 0:
|
| 423 |
+
negative_regions = negative_regions.sort_values('percentage', ascending=False)
|
| 424 |
+
most_negative_region = negative_regions.iloc[0]['timezone_region']
|
| 425 |
+
most_negative_region_pct = negative_regions.iloc[0]['percentage']
|
| 426 |
+
st.write(f"**Highest Negative Sentiment:** {most_negative_region} ({most_negative_region_pct:.1f}% negative)")
|
| 427 |
+
else:
|
| 428 |
+
st.info("No timezone/region data available for visualization")
|
| 429 |
+
|
| 430 |
+
st.markdown("---")
|
| 431 |
+
|
| 432 |
+
# Experience Level Analysis
|
| 433 |
+
st.markdown("### 🎯 Experience Level Distribution")
|
| 434 |
+
|
| 435 |
+
# Get both detailed and grouped experience data
|
| 436 |
+
exp_dist_detailed = processor.get_experience_level_distribution(df_musora, use_groups=False)
|
| 437 |
+
exp_dist_grouped = processor.get_experience_level_distribution(df_musora, use_groups=True)
|
| 438 |
+
exp_sentiment_grouped = processor.get_demographics_by_sentiment(df_musora, 'experience_group')
|
| 439 |
+
|
| 440 |
+
if not exp_dist_detailed.empty or not exp_dist_grouped.empty:
|
| 441 |
+
# Tabs for detailed vs grouped view
|
| 442 |
+
tab1, tab2 = st.tabs(["📊 Detailed (0-10)", "📊 Grouped (Beginner/Intermediate/Advanced)"])
|
| 443 |
+
|
| 444 |
+
with tab1:
|
| 445 |
+
if not exp_dist_detailed.empty:
|
| 446 |
+
exp_chart_detailed = demographic_charts.create_experience_distribution_chart(
|
| 447 |
+
exp_dist_detailed,
|
| 448 |
+
title="Comments by Experience Level (0-10 Scale)",
|
| 449 |
+
use_groups=False
|
| 450 |
+
)
|
| 451 |
+
st.plotly_chart(exp_chart_detailed, use_container_width=True)
|
| 452 |
+
else:
|
| 453 |
+
st.info("No detailed experience level data available")
|
| 454 |
+
|
| 455 |
+
with tab2:
|
| 456 |
+
if not exp_dist_grouped.empty:
|
| 457 |
+
col1, col2 = st.columns(2)
|
| 458 |
+
|
| 459 |
+
with col1:
|
| 460 |
+
exp_chart_grouped = demographic_charts.create_experience_distribution_chart(
|
| 461 |
+
exp_dist_grouped,
|
| 462 |
+
title="Comments by Experience Group",
|
| 463 |
+
use_groups=True
|
| 464 |
+
)
|
| 465 |
+
st.plotly_chart(exp_chart_grouped, use_container_width=True)
|
| 466 |
+
|
| 467 |
+
with col2:
|
| 468 |
+
if not exp_sentiment_grouped.empty:
|
| 469 |
+
exp_sent_chart = demographic_charts.create_experience_sentiment_chart(
|
| 470 |
+
exp_sentiment_grouped,
|
| 471 |
+
title="Sentiment by Experience Group",
|
| 472 |
+
use_groups=True
|
| 473 |
+
)
|
| 474 |
+
st.plotly_chart(exp_sent_chart, use_container_width=True)
|
| 475 |
+
else:
|
| 476 |
+
st.info("No grouped experience level data available")
|
| 477 |
+
|
| 478 |
+
# Insights
|
| 479 |
+
with st.expander("💡 Experience Insights"):
|
| 480 |
+
if not exp_dist_grouped.empty:
|
| 481 |
+
top_exp_group = exp_dist_grouped.iloc[0]['experience_group']
|
| 482 |
+
top_exp_count = exp_dist_grouped.iloc[0]['count']
|
| 483 |
+
top_exp_pct = exp_dist_grouped.iloc[0]['percentage']
|
| 484 |
+
st.write(f"**Most Active Group:** {top_exp_group} ({top_exp_count:,} comments, {top_exp_pct:.1f}%)")
|
| 485 |
+
|
| 486 |
+
# Find experience group with most negative sentiment
|
| 487 |
+
if not exp_sentiment_grouped.empty:
|
| 488 |
+
negative_exp = exp_sentiment_grouped[
|
| 489 |
+
exp_sentiment_grouped['sentiment_polarity'].isin(['negative', 'very_negative'])
|
| 490 |
+
].groupby('experience_group')['percentage'].sum().reset_index()
|
| 491 |
+
|
| 492 |
+
if len(negative_exp) > 0:
|
| 493 |
+
negative_exp = negative_exp.sort_values('percentage', ascending=False)
|
| 494 |
+
most_negative_exp = negative_exp.iloc[0]['experience_group']
|
| 495 |
+
most_negative_exp_pct = negative_exp.iloc[0]['percentage']
|
| 496 |
+
st.write(f"**Highest Negative Sentiment:** {most_negative_exp} ({most_negative_exp_pct:.1f}% negative)")
|
| 497 |
+
|
| 498 |
+
if demo_summary['avg_experience'] is not None:
|
| 499 |
+
st.write(f"**Average Experience Level:** {demo_summary['avg_experience']:.2f}/10")
|
| 500 |
+
st.write(f"**Most Common Experience Group:** {demo_summary.get('most_common_experience', 'Unknown')}")
|
| 501 |
+
else:
|
| 502 |
+
st.info("No experience level data available for visualization")
|
| 503 |
+
|
| 504 |
+
st.markdown("---")
|
| 505 |
+
|
| 506 |
+
# Language Distribution (if available)
|
| 507 |
+
if 'detected_language' in df.columns:
|
| 508 |
+
st.markdown("## 🌍 Language Distribution")
|
| 509 |
+
|
| 510 |
+
lang_dist = distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages")
|
| 511 |
+
st.plotly_chart(lang_dist, use_container_width=True)
|
| 512 |
+
|
| 513 |
+
st.markdown("---")
|
| 514 |
+
|
| 515 |
+
# Temporal trends (if timestamp available)
|
| 516 |
+
if 'comment_timestamp' in df.columns and not df.empty:
|
| 517 |
+
with st.expander("📈 Temporal Trends", expanded=False):
|
| 518 |
+
# Frequency selector
|
| 519 |
+
freq_col1, freq_col2 = st.columns([1, 3])
|
| 520 |
+
|
| 521 |
+
with freq_col1:
|
| 522 |
+
freq = st.selectbox(
|
| 523 |
+
"Time Granularity",
|
| 524 |
+
options=['D', 'W', 'M'],
|
| 525 |
+
format_func=lambda x: {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}[x],
|
| 526 |
+
index=1 # Default to Weekly
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
sentiment_timeline = sentiment_charts.create_sentiment_timeline(df, freq=freq, title="Sentiment Trends Over Time")
|
| 530 |
+
st.plotly_chart(sentiment_timeline, use_container_width=True)
|
| 531 |
+
|
| 532 |
+
# Hierarchical sunburst
|
| 533 |
+
with st.expander("🌟 Hierarchical View", expanded=False):
|
| 534 |
+
st.markdown("**Interactive Brand > Platform > Sentiment Distribution**")
|
| 535 |
+
sunburst = distribution_charts.create_combined_distribution_sunburst(
|
| 536 |
+
df, title="Brand > Platform > Sentiment Distribution"
|
| 537 |
+
)
|
| 538 |
+
st.plotly_chart(sunburst, use_container_width=True)
|
visualization/components/reply_required.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reply Required Page
|
| 3 |
+
Displays comments that require replies with filtering and prioritization
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Add parent directory to path
|
| 11 |
+
parent_dir = Path(__file__).resolve().parent.parent
|
| 12 |
+
sys.path.append(str(parent_dir))
|
| 13 |
+
|
| 14 |
+
from utils.data_processor import SentimentDataProcessor
|
| 15 |
+
from utils.metrics import SentimentMetrics
|
| 16 |
+
from visualizations.sentiment_charts import SentimentCharts
|
| 17 |
+
from visualizations.distribution_charts import DistributionCharts
|
| 18 |
+
from visualizations.content_cards import ContentCards
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def render_reply_required(df):
    """
    Render the "Comments Requiring Reply" page.

    Shows summary metrics, an urgency breakdown, filter controls, charts,
    a paginated comment list, a CSV export and a per-content reply summary
    for every comment flagged as requiring a reply.

    Args:
        df: Sentiment dataframe (full dataset from app.py). Expected to
            contain at least 'sentiment_polarity', 'intent', 'platform'
            and 'brand' columns — TODO confirm against the loader.
    """
    st.title("⚠️ Comments Requiring Reply")
    st.markdown("Manage and prioritize comments that need responses")
    st.markdown("---")

    # Initialize components
    processor = SentimentDataProcessor()
    metrics = SentimentMetrics()

    # Get comments requiring reply
    reply_comments = processor.get_comments_requiring_reply(df)

    if reply_comments.empty:
        st.success("🎉 Great news! No comments currently require replies.")
        return

    # Urgency metrics are needed in both the summary and the breakdown
    # sections; compute them once (previously computed twice).
    urgency_metrics = metrics.calculate_response_urgency(df)

    # Display summary statistics
    st.markdown("### 📊 Summary")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Total Replies Needed", len(reply_comments))

    with col2:
        st.metric("🔴 Urgent", urgency_metrics['urgent_count'], help="Negative sentiment")

    with col3:
        unique_contents = reply_comments['content_sk'].nunique() if 'content_sk' in reply_comments.columns else 0
        st.metric("Affected Contents", unique_contents)

    with col4:
        negative_count = reply_comments['sentiment_polarity'].isin(['negative', 'very_negative']).sum()
        negative_pct = (negative_count / len(reply_comments) * 100) if len(reply_comments) > 0 else 0
        st.metric("Negative %", f"{negative_pct:.1f}%")

    st.markdown("---")

    # Urgency breakdown
    st.markdown("### 🚨 Response Urgency Breakdown")

    urgency_col1, urgency_col2, urgency_col3, urgency_col4 = st.columns(4)

    with urgency_col1:
        st.metric(
            "🔴 Urgent",
            urgency_metrics['urgent_count'],
            help="Negative sentiment requiring reply - immediate action needed"
        )

    with urgency_col2:
        st.metric(
            "🟠 High Priority",
            urgency_metrics['high_priority_count'],
            help="Neutral with feedback/request - respond within 24 hours"
        )

    with urgency_col3:
        st.metric(
            "🟡 Medium Priority",
            urgency_metrics['medium_priority_count'],
            help="Positive requiring reply - respond within 48 hours"
        )

    with urgency_col4:
        st.metric(
            "🟢 Low Priority",
            urgency_metrics['low_priority_count'],
            help="Very positive requiring reply - respond when convenient"
        )

    st.markdown("---")

    # Filters
    st.markdown("### 🔍 Filters")

    filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)

    with filter_col1:
        # Priority filter
        priority_options = ['All', '🔴 Urgent', '🟠 High', '🟡 Medium', '🟢 Low']
        selected_priority = st.selectbox("Priority", priority_options, index=0)

    with filter_col2:
        # Platform filter
        platform_options = ['All'] + sorted(reply_comments['platform'].unique().tolist())
        selected_platform = st.selectbox("Platform", platform_options, index=0)

    with filter_col3:
        # Brand filter
        brand_options = ['All'] + sorted(reply_comments['brand'].unique().tolist())
        selected_brand = st.selectbox("Brand", brand_options, index=0)

    with filter_col4:
        # Intent filter. 'intent' is a comma-separated string column; NaN
        # values are dropped BEFORE sorting, otherwise sorted() would mix
        # float nan with str and raise TypeError.
        intent_list = (
            reply_comments['intent']
            .dropna()
            .str.split(',')
            .explode()
            .str.strip()
            .dropna()
            .unique()
            .tolist()
        )
        intent_options = ['All'] + sorted(intent_list)
        selected_intent = st.selectbox("Intent", intent_options, index=0)

    # Apply filters
    filtered_comments = reply_comments.copy()

    # Priority filtering — mirrors the urgency buckets above
    if selected_priority != 'All':
        if selected_priority == '🔴 Urgent':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'].isin(['negative', 'very_negative'])
            ]
        elif selected_priority == '🟠 High':
            filtered_comments = filtered_comments[
                (filtered_comments['sentiment_polarity'] == 'neutral') &
                (filtered_comments['intent'].str.contains('feedback_negative|request', na=False))
            ]
        elif selected_priority == '🟡 Medium':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'] == 'positive'
            ]
        elif selected_priority == '🟢 Low':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'] == 'very_positive'
            ]

    # Platform filtering
    if selected_platform != 'All':
        filtered_comments = filtered_comments[filtered_comments['platform'] == selected_platform]

    # Brand filtering
    if selected_brand != 'All':
        filtered_comments = filtered_comments[filtered_comments['brand'] == selected_brand]

    # Intent filtering (substring match against the comma-separated field)
    if selected_intent != 'All':
        filtered_comments = filtered_comments[
            filtered_comments['intent'].str.contains(selected_intent, na=False)
        ]

    st.markdown(f"**Showing {len(filtered_comments)} comments after filtering**")

    st.markdown("---")

    # Visualizations
    if not filtered_comments.empty:
        st.markdown("### 📈 Analysis")

        viz_col1, viz_col2 = st.columns(2)

        with viz_col1:
            # Sentiment distribution
            sentiment_charts = SentimentCharts()
            sentiment_pie = sentiment_charts.create_sentiment_pie_chart(
                filtered_comments, title="Sentiment Distribution"
            )
            st.plotly_chart(sentiment_pie, use_container_width=True)

        with viz_col2:
            # Intent distribution
            distribution_charts = DistributionCharts()
            intent_bar = distribution_charts.create_intent_bar_chart(
                filtered_comments, title="Intent Distribution", orientation='h'
            )
            st.plotly_chart(intent_bar, use_container_width=True)

    st.markdown("---")

    # Display comments
    st.markdown("### 💬 Comments Requiring Reply")

    # Pagination (at least 1 page even when the filtered set is empty)
    items_per_page = 10
    total_pages = max((len(filtered_comments) - 1) // items_per_page + 1, 1)

    if 'reply_page' not in st.session_state:
        st.session_state.reply_page = 1

    # Clamp the stored page index: if the filters shrank the result set,
    # a stale page number would otherwise show an empty list.
    st.session_state.reply_page = min(max(st.session_state.reply_page, 1), total_pages)

    # Pagination controls at top
    if total_pages > 1:
        page_col1, page_col2, page_col3 = st.columns([1, 2, 1])

        with page_col1:
            if st.button("⬅️ Previous", key="prev_top", disabled=(st.session_state.reply_page <= 1)):
                st.session_state.reply_page -= 1
                st.rerun()

        with page_col2:
            st.markdown(f"<center>Page {st.session_state.reply_page} of {total_pages}</center>", unsafe_allow_html=True)

        with page_col3:
            if st.button("Next ➡️", key="next_top", disabled=(st.session_state.reply_page >= total_pages)):
                st.session_state.reply_page += 1
                st.rerun()

        st.markdown("---")

    # Get paginated comments
    start_idx = (st.session_state.reply_page - 1) * items_per_page
    end_idx = start_idx + items_per_page
    paginated_comments = filtered_comments.iloc[start_idx:end_idx]

    # Display comments
    if paginated_comments.empty:
        st.info("No comments match the selected filters")
    else:
        for idx, (_, comment) in enumerate(paginated_comments.iterrows(), start=start_idx + 1):
            # Priority badge. Guard against a NaN intent: `x in nan`
            # would raise TypeError, so substitute an empty string.
            intent_value = comment['intent'] if isinstance(comment['intent'], str) else ''
            priority_emoji = "🟢"
            if comment['sentiment_polarity'] in ['negative', 'very_negative']:
                priority_emoji = "🔴"
            elif comment['sentiment_polarity'] == 'neutral' and any(
                intent in intent_value for intent in ['feedback_negative', 'request']
            ):
                priority_emoji = "🟠"
            elif comment['sentiment_polarity'] == 'positive':
                priority_emoji = "🟡"

            st.markdown(f"#### {priority_emoji} Comment #{idx}")

            # Display comment card
            ContentCards.display_comment_card(comment, show_original=True)

    # Pagination controls at bottom
    if total_pages > 1:
        st.markdown("---")

        page_col1, page_col2, page_col3 = st.columns([1, 2, 1])

        with page_col1:
            if st.button("⬅️ Previous", key="prev_bottom", disabled=(st.session_state.reply_page <= 1)):
                st.session_state.reply_page -= 1
                st.rerun()

        with page_col2:
            st.markdown(f"<center>Page {st.session_state.reply_page} of {total_pages}</center>", unsafe_allow_html=True)

        with page_col3:
            if st.button("Next ➡️", key="next_bottom", disabled=(st.session_state.reply_page >= total_pages)):
                st.session_state.reply_page += 1
                st.rerun()

    st.markdown("---")

    # Export option
    st.markdown("### 💾 Export Data")

    col1, col2 = st.columns([1, 3])

    with col1:
        # Prepare export data
        export_columns = [
            'comment_id', 'author_name', 'platform', 'brand', 'comment_timestamp',
            'display_text', 'original_text', 'detected_language', 'sentiment_polarity',
            'intent', 'sentiment_confidence', 'content_description', 'permalink_url'
        ]

        # Filter only available columns
        available_columns = [col for col in export_columns if col in filtered_comments.columns]
        export_data = filtered_comments[available_columns]

        csv = export_data.to_csv(index=False)

        st.download_button(
            label="📥 Download as CSV",
            data=csv,
            file_name="comments_requiring_reply.csv",
            mime="text/csv"
        )

    with col2:
        st.info("Download the filtered comments for team collaboration or CRM import")

    st.markdown("---")

    # Quick stats by content. Guard the groupby: the summary metric above
    # already treats 'content_sk' as optional, so a missing column should
    # not crash this section either.
    st.markdown("### 📋 Reply Requirements by Content")

    if 'content_sk' in filtered_comments.columns:
        content_reply_summary = filtered_comments.groupby('content_sk').agg({
            'comment_sk': 'count',
            'content_description': 'first',
            'permalink_url': 'first'
        }).reset_index()

        content_reply_summary.columns = ['content_sk', 'replies_needed', 'content_description', 'permalink_url']
        content_reply_summary = content_reply_summary.sort_values('replies_needed', ascending=False).head(10)

        for idx, (_, content) in enumerate(content_reply_summary.iterrows(), 1):
            with st.expander(f"📝 Content #{idx} - {content['replies_needed']} replies needed"):
                st.markdown(f"**Description:** {content['content_description']}")
                if pd.notna(content['permalink_url']):
                    st.markdown(f"**Link:** [View Content]({content['permalink_url']})")

                # Show comments for this content
                content_comments = filtered_comments[filtered_comments['content_sk'] == content['content_sk']].head(3)

                st.markdown(f"**Top {len(content_comments)} comments:**")
                for _, comment in content_comments.iterrows():
                    ContentCards.display_comment_card(comment, show_original=True)
|
visualization/components/sentiment_analysis.py
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sentiment Analysis Page
|
| 3 |
+
Analyze content performance across all sentiment types with advanced filtering
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path
|
| 10 |
+
parent_dir = Path(__file__).resolve().parent.parent
|
| 11 |
+
sys.path.append(str(parent_dir))
|
| 12 |
+
|
| 13 |
+
from utils.data_processor import SentimentDataProcessor
|
| 14 |
+
from visualizations.sentiment_charts import SentimentCharts
|
| 15 |
+
from visualizations.distribution_charts import DistributionCharts
|
| 16 |
+
from visualizations.content_cards import ContentCards
|
| 17 |
+
from agents.content_summary_agent import ContentSummaryAgent
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def render_sentiment_analysis(df):
    """
    Render the Sentiment Analysis page.

    Flow: mandatory platform/brand selection -> content-level filters
    (sentiment / intent / top-N) -> advanced ranking controls -> ranked,
    paginated content cards with per-content charts, on-demand AI summaries,
    raw comment listings, aggregate insights, action items, and CSV export.

    Args:
        df: Sentiment dataframe (full dataset from app.py). Expected columns
            (inferred from usage below — confirm against the loader):
            'platform', 'brand', 'sentiment_polarity', 'intent',
            'content_sk', 'content_description'.

    Returns:
        None. Renders directly into the Streamlit app; returns early when the
        required platform/brand selection is missing or no data matches.
    """
    st.title("🔍 Sentiment Analysis")
    st.markdown("Analyze content performance based on sentiment patterns and user feedback")
    st.markdown("---")

    # Initialize data-processing and charting components (stateless helpers).
    processor = SentimentDataProcessor()
    sentiment_charts = SentimentCharts()
    distribution_charts = DistributionCharts()

    # Initialize AI agent used for on-demand comment summarization.
    # NOTE(review): temperature=1 with this model — confirm intended.
    summary_agent = ContentSummaryAgent(model="gpt-5-nano", temperature=1)

    # Initialize session state for caching AI summaries across reruns,
    # keyed by "<content_sk>_<summary_type>".
    if 'content_summaries' not in st.session_state:
        st.session_state.content_summaries = {}

    # Page-specific filters (platform and brand selection). Both are required
    # before any analysis runs, to keep the page responsive on large datasets.
    st.markdown("### 🎯 Select Platform and Brand")
    st.info("⚡ **Performance Optimization**: Select a specific platform and brand to analyze. This filters the data and makes the page load faster.")

    filter_col1, filter_col2 = st.columns(2)

    with filter_col1:
        # Get available platforms; empty string sentinel forces an explicit choice.
        available_platforms = sorted(df['platform'].unique().tolist())
        selected_platform = st.selectbox(
            "Platform *",
            options=[''] + available_platforms,
            index=0,
            help="Select the platform to analyze"
        )

    with filter_col2:
        # Get available brands; same empty-string sentinel pattern as platform.
        available_brands = sorted(df['brand'].unique().tolist())
        selected_brand = st.selectbox(
            "Brand *",
            options=[''] + available_brands,
            index=0,
            help="Select the brand to analyze"
        )

    # Until both platform and brand are selected, show a summary of what data
    # exists and stop rendering the rest of the page.
    if not selected_platform or not selected_brand:
        st.warning("⚠️ Please select both **Platform** and **Brand** to view sentiment analysis.")
        st.markdown("---")

        # Show summary of available data to help the user pick a combination.
        st.markdown("### 📊 Available Data Summary")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Total Comments", f"{len(df):,}")

        with col2:
            st.metric("Platforms", len(available_platforms))
            with st.expander("View Platforms"):
                for platform in available_platforms:
                    count = len(df[df['platform'] == platform])
                    st.write(f"- **{platform}**: {count:,} comments")

        with col3:
            st.metric("Brands", len(available_brands))
            with st.expander("View Brands"):
                for brand in available_brands:
                    count = len(df[df['brand'] == brand])
                    st.write(f"- **{brand}**: {count:,} comments")

        return

    # Filter dataframe by selected platform and brand. .copy() avoids
    # SettingWithCopy warnings on downstream mutation.
    df_filtered = df[
        (df['platform'] == selected_platform) &
        (df['brand'] == selected_brand)
    ].copy()

    if df_filtered.empty:
        st.error(f"❌ No data found for **{selected_platform}** + **{selected_brand}** combination.")
        return

    # Show data info for the chosen combination.
    st.success(f"✅ Loaded **{len(df_filtered):,}** comments for **{selected_platform}** + **{selected_brand}**")
    st.markdown("---")

    # Content filters section (optional narrowing on top of platform/brand).
    st.markdown("### 🔍 Content Filters")
    st.markdown("Filter contents by sentiment and intent to focus your analysis")

    filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)

    with filter_col1:
        # Sentiment filter; empty selection means "all sentiments".
        sentiment_options = df_filtered['sentiment_polarity'].unique().tolist()
        selected_sentiments = st.multiselect(
            "Sentiment",
            options=sorted(sentiment_options),
            default=[],
            help="Filter by dominant sentiment. Leave empty to show all sentiments."
        )

    with filter_col2:
        # Intent filter: 'intent' is a comma-separated string per comment, so
        # split + explode to get the distinct individual intent labels.
        intent_list = df_filtered['intent'].str.split(',').explode().str.strip().unique().tolist()
        selected_intents = st.multiselect(
            "Intent",
            options=sorted([i for i in intent_list if i]),
            default=[],
            help="Filter contents that have comments with these intents"
        )

    with filter_col3:
        # Top N selector — caps how many ranked contents are analyzed below.
        top_n = st.selectbox(
            "Top N Contents",
            options=[5, 10, 15, 20, 25],
            index=1,  # Default to 10
            help="Number of contents to display"
        )

    with filter_col4:
        # Show filter status so the user knows whether narrowing is active.
        filter_active = bool(selected_sentiments or selected_intents)
        if filter_active:
            st.metric("Filters Active", "✓ Yes", help="Sentiment or intent filters are applied")
        else:
            st.metric("Filters Active", "✗ No", help="Showing all sentiments")

    st.markdown("---")

    # Advanced controls (defaults optimized for balanced analysis).
    with st.expander("⚙️ Advanced Ranking Controls", expanded=False):
        st.markdown("**Customize how contents are ranked and filtered**")

        adv_col1, adv_col2 = st.columns(2)

        with adv_col1:
            min_comments = st.slider(
                "Minimum Comments Required",
                min_value=1,
                max_value=50,
                value=10,
                step=1,
                help="Filter out contents with fewer comments. Default: 10 (excludes low-volume contents)"
            )

        with adv_col2:
            # Each option is a (value, label) tuple; format_func shows the label
            # while the selected tuple's first element is used as the sort key.
            sort_by = st.selectbox(
                "Sort By",
                options=[
                    ('severity_score', '🎯 Severity Score (Balanced) - Recommended'),
                    ('sentiment_percentage', '📊 Sentiment Percentage'),
                    ('sentiment_count', '🔢 Sentiment Count (Absolute)'),
                    ('total_comments', '💬 Total Comments (Volume)')
                ],
                format_func=lambda x: x[1],
                index=0,
                help="Severity Score balances sentiment % with comment volume for smarter ranking"
            )
        sort_by_value = sort_by[0]

        # Show an explanation of the currently selected sort method.
        sentiment_label = "selected sentiments" if selected_sentiments else "all sentiments"
        if sort_by_value == 'severity_score':
            st.info(f"📘 **Severity Score** = Sentiment % × √(Total Comments). Balances {sentiment_label} percentage with volume for high-impact ranking.")
        elif sort_by_value == 'sentiment_percentage':
            st.info(f"📘 Ranks by highest % of {sentiment_label}. May include low-volume contents.")
        elif sort_by_value == 'sentiment_count':
            st.info(f"📘 Ranks by absolute number of comments with {sentiment_label}. Prioritizes volume over percentage.")
        else:
            st.info("📘 Ranks by total comment volume, regardless of sentiment.")

    # Rank and filter contents; None means "no restriction" for each filter.
    filtered_contents = processor.get_sentiment_filtered_contents(
        df_filtered,
        selected_sentiments=selected_sentiments if selected_sentiments else None,
        selected_intents=selected_intents if selected_intents else None,
        top_n=top_n,
        min_comments=min_comments,
        sort_by=sort_by_value
    )

    # Reset pagination to page 1 whenever any filter input changes, detected
    # by comparing a composite key of all filter values against the last run.
    filter_key = f"{selected_platform}_{selected_brand}_{top_n}_{min_comments}_{sort_by_value}_{str(selected_sentiments)}_{str(selected_intents)}"
    if 'last_filter_key' not in st.session_state or st.session_state.last_filter_key != filter_key:
        st.session_state.sentiment_page = 1
        st.session_state.last_filter_key = filter_key

    if filtered_contents.empty:
        st.warning("No content data available with the selected filters. Try adjusting your filters.")
        return

    # Display summary statistics for the ranked set.
    st.markdown("### 📊 Summary")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Contents Analyzed", len(filtered_contents))

    with col2:
        # Dynamic metric: when a sentiment filter was applied the processor
        # emits 'selected_sentiment_percentage'; otherwise fall back to the
        # always-present negative percentage.
        if 'selected_sentiment_percentage' in filtered_contents.columns:
            avg_sentiment_pct = filtered_contents['selected_sentiment_percentage'].mean()
            sentiment_label = "Selected Sentiment %" if selected_sentiments else "All Sentiment %"
            st.metric(sentiment_label, f"{avg_sentiment_pct:.1f}%")
        else:
            avg_negative_pct = filtered_contents['negative_percentage'].mean()
            st.metric("Avg Negative %", f"{avg_negative_pct:.1f}%")

    with col3:
        total_comments = filtered_contents['total_comments'].sum()
        st.metric("Total Comments", int(total_comments))

    with col4:
        total_replies_needed = filtered_contents['reply_required_count'].sum()
        st.metric("Total Replies Needed", int(total_replies_needed))

    st.markdown("---")

    # Engagement scatter plot across all ranked contents.
    st.markdown("### 📈 Content Engagement Analysis")
    engagement_scatter = distribution_charts.create_engagement_scatter(
        filtered_contents, title="Content Engagement vs. Sentiment"
    )
    st.plotly_chart(engagement_scatter, use_container_width=True, key="engagement_scatter_chart")

    st.markdown("---")

    # Per-content detailed analysis, paginated for render performance.
    st.markdown("### 🔍 Detailed Content Analysis")

    # Current page number is kept in session state so it survives reruns.
    if 'sentiment_page' not in st.session_state:
        st.session_state.sentiment_page = 1

    items_per_page = 5  # Show 5 contents per page
    total_contents = len(filtered_contents)
    total_pages = (total_contents + items_per_page - 1) // items_per_page  # Ceiling division

    if total_contents > items_per_page:
        # Display pagination info.
        st.info(f"📄 Showing page {st.session_state.sentiment_page} of {total_pages} ({total_contents} total contents)")

        # Pagination controls at top of the list.
        col_prev, col_info, col_next = st.columns([1, 2, 1])

        with col_prev:
            if st.button("⬅️ Previous", key="prev_top", disabled=st.session_state.sentiment_page == 1):
                st.session_state.sentiment_page -= 1
                st.rerun()

        with col_info:
            st.markdown(f"<div style='text-align: center; padding-top: 8px;'>Page {st.session_state.sentiment_page} / {total_pages}</div>", unsafe_allow_html=True)

        with col_next:
            if st.button("Next ➡️", key="next_top", disabled=st.session_state.sentiment_page >= total_pages):
                st.session_state.sentiment_page += 1
                st.rerun()

        st.markdown("---")

    # Slice the ranked contents down to the current page.
    start_idx = (st.session_state.sentiment_page - 1) * items_per_page
    end_idx = min(start_idx + items_per_page, total_contents)

    paginated_contents = filtered_contents.iloc[start_idx:end_idx]

    # Rank numbering continues across pages (start_idx + 1).
    for idx, (_, content_row) in enumerate(paginated_contents.iterrows(), start_idx + 1):
        # Display content card with its global rank.
        ContentCards.display_content_card(content_row, rank=idx)

        # Get all comments belonging to this content.
        content_comments = df_filtered[df_filtered['content_sk'] == content_row['content_sk']]

        if content_comments.empty:
            st.info("No comment details available for this content")
            continue

        # Side-by-side per-content visualizations.
        viz_col1, viz_col2 = st.columns(2)

        with viz_col1:
            # Sentiment distribution for this content.
            content_sentiment_pie = sentiment_charts.create_sentiment_pie_chart(
                content_comments, title=f"Sentiment Distribution"
            )
            st.plotly_chart(content_sentiment_pie, use_container_width=True, key=f"sentiment_pie_{content_row['content_sk']}")

        with viz_col2:
            # Intent distribution for this content.
            content_intent_bar = distribution_charts.create_intent_bar_chart(
                content_comments, title=f"Intent Distribution", orientation='h'
            )
            st.plotly_chart(content_intent_bar, use_container_width=True, key=f"intent_bar_{content_row['content_sk']}")

        # AI analysis section — summaries are generated on demand and cached
        # in session state so reruns (e.g. pagination clicks) don't re-bill.
        st.markdown("#### 🤖 AI-Powered Analysis")

        content_sk = content_row['content_sk']

        # Three buttons in a row for the different analysis scopes.
        st.markdown("**Select analysis type:**")
        btn_col1, btn_col2, btn_col3 = st.columns(3)

        with btn_col1:
            generate_negative = st.button(
                "📉 Negative Summary",
                key=f"ai_negative_{content_sk}",
                help="Analyze negative comments only",
                use_container_width=True
            )

        with btn_col2:
            generate_combined = st.button(
                "📊 Combined Summary",
                key=f"ai_combined_{content_sk}",
                help="Analyze both positive and negative comments",
                use_container_width=True
            )

        with btn_col3:
            generate_positive = st.button(
                "📈 Positive Summary",
                key=f"ai_positive_{content_sk}",
                help="Analyze positive comments only",
                use_container_width=True
            )

        # Determine which summary (if any) was requested this rerun.
        summary_type = None
        if generate_negative:
            summary_type = 'negative'
        elif generate_positive:
            summary_type = 'positive'
        elif generate_combined:
            summary_type = 'combined'

        # Cache keys for each summary scope of this content.
        summary_key_negative = f"{content_sk}_negative"
        summary_key_positive = f"{content_sk}_positive"
        summary_key_combined = f"{content_sk}_combined"

        # Render when a new summary was requested OR any cached one exists.
        if summary_type or summary_key_negative in st.session_state.content_summaries or \
           summary_key_positive in st.session_state.content_summaries or \
           summary_key_combined in st.session_state.content_summaries:

            # Generate a new summary if a button was clicked this rerun.
            if summary_type:
                summary_key = f"{content_sk}_{summary_type}"
                with st.spinner(f"Analyzing {summary_type} comments with AI..."):
                    # Prepare input payload for the agent.
                    agent_input = {
                        'content_sk': content_sk,
                        'content_description': content_row['content_description'],
                        'comments': content_comments,
                        'sentiment_type': summary_type
                    }

                    # Generate summary via the LLM agent.
                    result = summary_agent.process(agent_input)

                    # Cache the result so future reruns skip the API call.
                    st.session_state.content_summaries[summary_key] = result

            # Collect all cached summaries for this content, in display order.
            available_summaries = []
            if summary_key_negative in st.session_state.content_summaries:
                available_summaries.append(('Negative', summary_key_negative))
            if summary_key_combined in st.session_state.content_summaries:
                available_summaries.append(('Combined', summary_key_combined))
            if summary_key_positive in st.session_state.content_summaries:
                available_summaries.append(('Positive', summary_key_positive))

            # Display each available summary.
            for summary_label, summary_key in available_summaries:
                result = st.session_state.content_summaries[summary_key]

                # Display the summary report, or the failure with a retry option.
                if result['success']:
                    summary = result['summary']

                    with st.expander(f"📊 AI Analysis Report - {summary_label}", expanded=True):
                        # Executive summary
                        st.markdown("### Executive Summary")
                        st.info(summary['executive_summary'])

                        # Main themes, each tagged with a sentiment emoji.
                        if summary['main_themes']:
                            st.markdown("### 🎯 Main Themes")
                            for theme in summary['main_themes']:
                                sentiment_emoji = {
                                    'positive': '😊',
                                    'negative': '😟',
                                    'mixed': '🤔'
                                }.get(theme.get('sentiment', 'mixed'), '🤔')

                                st.markdown(f"""
                                **{sentiment_emoji} {theme.get('theme', 'Unknown')}** ({theme.get('sentiment', 'mixed').title()})
                                - {theme.get('description', 'No description')}
                                """)

                        # Two-column layout for praise and complaints.
                        col_praise, col_complaints = st.columns(2)

                        with col_praise:
                            st.markdown("### ✅ Praise Points")
                            if summary['praise_points']:
                                for point in summary['praise_points']:
                                    st.markdown(f"- {point}")
                            else:
                                st.markdown("*No significant praise points identified*")

                        with col_complaints:
                            st.markdown("### ⚠️ Key Complaints")
                            if summary['key_complaints']:
                                for complaint in summary['key_complaints']:
                                    st.markdown(f"- {complaint}")
                            else:
                                st.markdown("*No significant complaints identified*")

                        # FAQs and unexpected insights, side by side.
                        col_faq, col_insights = st.columns(2)

                        with col_faq:
                            st.markdown("### ❓ Frequently Asked Questions")
                            if summary['frequently_asked_questions']:
                                for faq in summary['frequently_asked_questions']:
                                    st.markdown(f"- {faq}")
                            else:
                                st.markdown("*No frequent questions identified*")

                        with col_insights:
                            st.markdown("### 💡 Unexpected Insights")
                            if summary['unexpected_insights']:
                                for insight in summary['unexpected_insights']:
                                    st.markdown(f"- {insight}")
                            else:
                                st.markdown("*No unexpected insights identified*")

                        # Action recommendations with priority color coding.
                        if summary['action_recommendations']:
                            st.markdown("### 🎯 Recommended Actions")
                            for action in summary['action_recommendations']:
                                priority = action.get('priority', 'medium').upper()
                                priority_color = {
                                    'HIGH': '🔴',
                                    'MEDIUM': '🟡',
                                    'LOW': '🟢'
                                }.get(priority, '🟡')

                                st.markdown(f"{priority_color} **[{priority}]** {action.get('action', 'No action specified')}")

                    # Metadata expander is a sibling of the report expander
                    # (Streamlit does not allow nesting expanders).
                    with st.expander("ℹ️ Analysis Metadata"):
                        metadata = result.get('metadata', {})
                        meta_col1, meta_col2, meta_col3 = st.columns(3)

                        with meta_col1:
                            st.metric("Comments Analyzed", metadata.get('total_comments_analyzed', 0))

                        with meta_col2:
                            st.metric("Model Used", metadata.get('model_used', 'N/A'))

                        with meta_col3:
                            st.metric("Tokens Used", metadata.get('tokens_used', 0))

                else:
                    # Display the error from the failed generation.
                    st.error(f"❌ Failed to generate AI analysis: {result.get('error', 'Unknown error')}")

                    # Retry: evict the failed result from the cache and rerun.
                    if st.button("🔄 Retry Analysis", key=f"retry_{summary_key}"):
                        if summary_key in st.session_state.content_summaries:
                            del st.session_state.content_summaries[summary_key]
                        st.rerun()

        # Raw comments, split into negative and positive expandable sections.
        st.markdown("#### 💬 View Comments by Sentiment")

        # Partition this content's comments by sentiment polarity.
        negative_comments = content_comments[
            content_comments['sentiment_polarity'].isin(['negative', 'very_negative'])
        ]
        positive_comments = content_comments[
            content_comments['sentiment_polarity'].isin(['positive', 'very_positive'])
        ]

        negative_count = len(negative_comments)
        positive_count = len(positive_comments)

        # Two columns for side-by-side expandable sections.
        col_neg, col_pos = st.columns(2)

        with col_neg:
            # Negative comments (collapsed by default).
            with st.expander(f"📉 Negative Comments ({negative_count})", expanded=False):
                if not negative_comments.empty:
                    st.markdown(f"**Showing all {negative_count} negative comments:**")
                    for _, comment in negative_comments.iterrows():
                        ContentCards.display_comment_card(comment, show_original=True)
                else:
                    st.info("No negative comments found for this content")

        with col_pos:
            # Positive comments (collapsed by default).
            with st.expander(f"📈 Positive Comments ({positive_count})", expanded=False):
                if not positive_comments.empty:
                    st.markdown(f"**Showing all {positive_count} positive comments:**")
                    for _, comment in positive_comments.iterrows():
                        ContentCards.display_comment_card(comment, show_original=True)
                else:
                    st.info("No positive comments found for this content")

        st.markdown("---")

    # Pagination controls repeated at the bottom of the list.
    if total_contents > items_per_page:
        st.markdown("---")

        col_prev_bottom, col_info_bottom, col_next_bottom = st.columns([1, 2, 1])

        with col_prev_bottom:
            if st.button("⬅️ Previous", key="prev_bottom", disabled=st.session_state.sentiment_page == 1):
                st.session_state.sentiment_page -= 1
                st.rerun()

        with col_info_bottom:
            st.markdown(f"<div style='text-align: center; padding-top: 8px;'>Page {st.session_state.sentiment_page} / {total_pages}</div>", unsafe_allow_html=True)

        with col_next_bottom:
            if st.button("Next ➡️", key="next_bottom", disabled=st.session_state.sentiment_page >= total_pages):
                st.session_state.sentiment_page += 1
                st.rerun()

        st.markdown("---")

    # Aggregate insights across every content in the ranked set (all pages).
    st.markdown("### 💡 Insights & Recommendations")

    # Union of comments belonging to any ranked content.
    all_filtered_comments = df_filtered[df_filtered['content_sk'].isin(filtered_contents['content_sk'])]

    insight_col1, insight_col2 = st.columns(2)

    with insight_col1:
        st.markdown("#### 🎯 Common Intent Patterns")

        # Top-5 intents across the ranked contents.
        intent_distribution = processor.get_intent_distribution(all_filtered_comments)
        intent_distribution = intent_distribution.sort_values('count', ascending=False).head(5)

        for _, intent_row in intent_distribution.iterrows():
            st.markdown(f"- **{intent_row['intent']}**: {intent_row['count']} occurrences ({intent_row['percentage']:.1f}%)")

    with insight_col2:
        st.markdown("#### 🌐 Platform Breakdown")

        # Platform distribution (single value after the page-level filter,
        # but kept generic).
        platform_dist = all_filtered_comments['platform'].value_counts()

        for platform, count in platform_dist.items():
            pct = (count / len(all_filtered_comments) * 100)
            st.markdown(f"- **{platform.title()}**: {count} comments ({pct:.1f}%)")

    st.markdown("---")

    # Rule-based action items derived from the ranked set.
    st.markdown("### ✅ Recommended Actions")

    action_items = []

    # High reply requirement.
    if filtered_contents['reply_required_count'].sum() > 0:
        action_items.append(
            f"🔴 **High Priority**: {int(filtered_contents['reply_required_count'].sum())} comments require immediate response"
        )

    # Critical negative percentage (> 50%).
    critical_contents = filtered_contents[filtered_contents['negative_percentage'] > 50]
    if not critical_contents.empty:
        action_items.append(
            f"🚨 **Critical**: {len(critical_contents)} content(s) have over 50% negative sentiment - investigate root causes"
        )

    # Negative-feedback patterns.
    feedback_comments = all_filtered_comments[
        all_filtered_comments['intent'].str.contains('feedback_negative', na=False)
    ]
    if not feedback_comments.empty:
        action_items.append(
            f"💬 **Feedback**: {len(feedback_comments)} comments contain negative feedback - consider product improvements"
        )

    # Unanswered questions.
    question_comments = all_filtered_comments[
        all_filtered_comments['intent'].str.contains('question', na=False)
    ]
    if not question_comments.empty:
        action_items.append(
            f"❓ **Questions**: {len(question_comments)} unanswered questions - improve FAQ or support documentation"
        )

    if action_items:
        for action in action_items:
            st.markdown(action)
    else:
        st.success("No critical action items at this time")

    st.markdown("---")

    # CSV export of the ranked contents.
    st.markdown("### 💾 Export Data")

    col1, col2 = st.columns([1, 3])

    with col1:
        # Build export column list dynamically from what the processor emitted.
        base_columns = ['content_sk', 'content_description', 'permalink_url',
                        'total_comments', 'reply_required_count', 'dominant_sentiment']

        # Add sentiment-specific columns if they exist.
        if 'selected_sentiment_count' in filtered_contents.columns:
            base_columns.extend(['selected_sentiment_count', 'selected_sentiment_percentage'])

        if 'negative_count' in filtered_contents.columns:
            base_columns.extend(['negative_count', 'negative_percentage'])

        # Keep only columns actually present in the dataframe.
        export_columns = [col for col in base_columns if col in filtered_contents.columns]
        export_data = filtered_contents[export_columns]

        csv = export_data.to_csv(index=False)

        st.download_button(
            label="📥 Download as CSV",
            data=csv,
            file_name=f"sentiment_analysis_top{top_n}.csv",
            mime="text/csv"
        )

    with col2:
        st.info("Download the data for further analysis or reporting")
visualization/config/viz_config.json
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"color_schemes": {
|
| 3 |
+
"sentiment_polarity": {
|
| 4 |
+
"very_positive": "#00C851",
|
| 5 |
+
"positive": "#7CB342",
|
| 6 |
+
"neutral": "#FFB300",
|
| 7 |
+
"negative": "#FF6F00",
|
| 8 |
+
"very_negative": "#D32F2F"
|
| 9 |
+
},
|
| 10 |
+
"intent": {
|
| 11 |
+
"praise": "#4CAF50",
|
| 12 |
+
"question": "#2196F3",
|
| 13 |
+
"request": "#9C27B0",
|
| 14 |
+
"feedback_negative": "#FF5722",
|
| 15 |
+
"suggestion": "#00BCD4",
|
| 16 |
+
"humor_sarcasm": "#FFC107",
|
| 17 |
+
"off_topic": "#9E9E9E",
|
| 18 |
+
"spam_selfpromo": "#795548"
|
| 19 |
+
},
|
| 20 |
+
"platform": {
|
| 21 |
+
"facebook": "#1877F2",
|
| 22 |
+
"instagram": "#E4405F",
|
| 23 |
+
"youtube": "#FF0000",
|
| 24 |
+
"twitter": "#1DA1F2",
|
| 25 |
+
"musora_app": "#1982C4",
|
| 26 |
+
"default": "#607D8B"
|
| 27 |
+
},
|
| 28 |
+
"brand": {
|
| 29 |
+
"drumeo": "#FF6B35",
|
| 30 |
+
"pianote": "#6A4C93",
|
| 31 |
+
"musora": "#1982C4",
|
| 32 |
+
"default": "#8AC926"
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"sentiment_order": [
|
| 36 |
+
"very_positive",
|
| 37 |
+
"positive",
|
| 38 |
+
"neutral",
|
| 39 |
+
"negative",
|
| 40 |
+
"very_negative"
|
| 41 |
+
],
|
| 42 |
+
"intent_order": [
|
| 43 |
+
"praise",
|
| 44 |
+
"question",
|
| 45 |
+
"request",
|
| 46 |
+
"feedback_negative",
|
| 47 |
+
"suggestion",
|
| 48 |
+
"humor_sarcasm",
|
| 49 |
+
"off_topic",
|
| 50 |
+
"spam_selfpromo"
|
| 51 |
+
],
|
| 52 |
+
"negative_sentiments": [
|
| 53 |
+
"negative",
|
| 54 |
+
"very_negative"
|
| 55 |
+
],
|
| 56 |
+
"dashboard": {
|
| 57 |
+
"default_date_range_days": 30,
|
| 58 |
+
"max_comments_display": 100,
|
| 59 |
+
"chart_height": 400,
|
| 60 |
+
"top_n_contents": 10
|
| 61 |
+
},
|
| 62 |
+
"page_config": {
|
| 63 |
+
"page_title": "Musora Sentiment Analysis Dashboard",
|
| 64 |
+
"page_icon": "📊",
|
| 65 |
+
"layout": "wide",
|
| 66 |
+
"initial_sidebar_state": "expanded"
|
| 67 |
+
},
|
| 68 |
+
"snowflake": {
|
| 69 |
+
"query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
|
| 70 |
+
"demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id"
|
| 71 |
+
},
|
| 72 |
+
"demographics": {
|
| 73 |
+
"age_groups": {
|
| 74 |
+
"18-24": [18, 24],
|
| 75 |
+
"25-34": [25, 34],
|
| 76 |
+
"35-44": [35, 44],
|
| 77 |
+
"45-54": [45, 54],
|
| 78 |
+
"55+": [55, 150]
|
| 79 |
+
},
|
| 80 |
+
"experience_groups": {
|
| 81 |
+
"Beginner (0-3)": [0, 3],
|
| 82 |
+
"Intermediate (4-7)": [4, 7],
|
| 83 |
+
"Advanced (8-10)": [8, 10]
|
| 84 |
+
},
|
| 85 |
+
"top_timezones_count": 15
|
| 86 |
+
}
|
| 87 |
+
}
|
visualization/data/data_loader.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data loader module for Sentiment Analysis Visualization
|
| 3 |
+
Handles Snowflake connection and data loading with caching
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import json
|
| 11 |
+
from datetime import datetime, timedelta
|
| 12 |
+
from dateutil.relativedelta import relativedelta
|
| 13 |
+
|
| 14 |
+
# Add parent directory to path to import SnowFlakeConnection
|
| 15 |
+
parent_dir = Path(__file__).resolve().parent.parent.parent
|
| 16 |
+
sys.path.append(str(parent_dir))
|
| 17 |
+
|
| 18 |
+
from visualization.SnowFlakeConnection import SnowFlakeConn
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class SentimentDataLoader:
|
| 22 |
+
"""
|
| 23 |
+
Loads sentiment analysis data from Snowflake with caching
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self, config_path=None):
|
| 27 |
+
"""
|
| 28 |
+
Initialize data loader
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
config_path: Path to configuration file
|
| 32 |
+
"""
|
| 33 |
+
if config_path is None:
|
| 34 |
+
config_path = Path(__file__).parent.parent / "config" / "viz_config.json"
|
| 35 |
+
|
| 36 |
+
with open(config_path, 'r') as f:
|
| 37 |
+
self.config = json.load(f)
|
| 38 |
+
|
| 39 |
+
self.query = self.config['snowflake']['query']
|
| 40 |
+
self.demographics_query = self.config['snowflake'].get('demographics_query', None)
|
| 41 |
+
|
| 42 |
+
    @st.cache_data(ttl=300)  # Streamlit caches the result for 5 minutes
    def load_data(_self, reload=False):
        """
        Load sentiment data from Snowflake.

        The first parameter is named ``_self`` so Streamlit's cache does
        not try to hash the loader instance itself.

        Args:
            reload: Intended to force a reload (bypass cache). NOTE(review):
                the body never reads it — it only varies the Streamlit cache
                key, so ``reload=True`` calls are cached separately rather
                than truly refreshed. Confirm intended semantics.

        Returns:
            pd.DataFrame: Processed sentiment data, enriched with user
            demographics when a demographics query is configured; an empty
            frame on failure (error is surfaced via ``st.error``).
        """
        try:
            # Connect to Snowflake
            conn = SnowFlakeConn()

            # Execute query
            df = conn.run_read_query(_self.query, "sentiment features")

            # Close connection
            conn.close_connection()

            if df is None or df.empty:
                st.error("No data returned from Snowflake")
                return pd.DataFrame()

            # Normalise columns/timestamps and derive display fields.
            df = _self._process_dataframe(df)

            # Load and merge demographics data for musora_app users
            if _self.demographics_query:
                demographics_df = _self.load_demographics_data()
                df = _self.merge_demographics_with_comments(df, demographics_df)

            return df

        except Exception as e:
            # Broad catch is deliberate: the dashboard should degrade to an
            # empty frame instead of crashing the Streamlit app.
            st.error(f"Error loading data from Snowflake: {e}")
            return pd.DataFrame()
|
| 80 |
+
|
| 81 |
+
def _process_dataframe(self, df):
|
| 82 |
+
"""
|
| 83 |
+
Process and clean the dataframe
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
df: Raw dataframe from Snowflake
|
| 87 |
+
|
| 88 |
+
Returns:
|
| 89 |
+
pd.DataFrame: Processed dataframe
|
| 90 |
+
"""
|
| 91 |
+
# Convert column names to lowercase (should already be done by run_read_query)
|
| 92 |
+
df.columns = df.columns.str.lower()
|
| 93 |
+
|
| 94 |
+
# Parse datetime columns
|
| 95 |
+
if 'comment_timestamp' in df.columns:
|
| 96 |
+
df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce')
|
| 97 |
+
|
| 98 |
+
if 'processed_at' in df.columns:
|
| 99 |
+
df['processed_at'] = pd.to_datetime(df['processed_at'], errors='coerce')
|
| 100 |
+
|
| 101 |
+
# Handle null values in key columns
|
| 102 |
+
df['sentiment_polarity'] = df['sentiment_polarity'].fillna('unknown')
|
| 103 |
+
df['intent'] = df['intent'].fillna('unknown')
|
| 104 |
+
df['platform'] = df['platform'].fillna('unknown').str.lower()
|
| 105 |
+
df['brand'] = df['brand'].fillna('unknown').str.lower()
|
| 106 |
+
|
| 107 |
+
# Convert requires_reply to boolean
|
| 108 |
+
if 'requires_reply' in df.columns:
|
| 109 |
+
df['requires_reply'] = df['requires_reply'].astype(bool)
|
| 110 |
+
|
| 111 |
+
# Extract display text (translated if available, otherwise original)
|
| 112 |
+
df['display_text'] = df.apply(
|
| 113 |
+
lambda row: row['translated_text'] if pd.notna(row.get('translated_text')) and row.get('is_english') == False
|
| 114 |
+
else row.get('original_text', ''),
|
| 115 |
+
axis=1
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
# Create a shortened version for display
|
| 119 |
+
df['display_text_short'] = df['display_text'].apply(
|
| 120 |
+
lambda x: x[:100] + '...' if isinstance(x, str) and len(x) > 100 else x
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
return df
|
| 124 |
+
|
| 125 |
+
@staticmethod
|
| 126 |
+
def get_filter_options(df):
|
| 127 |
+
"""
|
| 128 |
+
Get unique values for filters
|
| 129 |
+
|
| 130 |
+
Args:
|
| 131 |
+
df: Sentiment dataframe
|
| 132 |
+
|
| 133 |
+
Returns:
|
| 134 |
+
dict: Filter options
|
| 135 |
+
"""
|
| 136 |
+
return {
|
| 137 |
+
'platforms': sorted(df['platform'].unique().tolist()),
|
| 138 |
+
'brands': sorted(df['brand'].unique().tolist()),
|
| 139 |
+
'sentiments': sorted(df['sentiment_polarity'].unique().tolist()),
|
| 140 |
+
'languages': sorted(df['detected_language'].dropna().unique().tolist()) if 'detected_language' in df.columns else []
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
@staticmethod
|
| 144 |
+
def apply_filters(df, platforms=None, brands=None, sentiments=None,
|
| 145 |
+
date_range=None, languages=None):
|
| 146 |
+
"""
|
| 147 |
+
Apply filters to dataframe
|
| 148 |
+
|
| 149 |
+
Args:
|
| 150 |
+
df: Sentiment dataframe
|
| 151 |
+
platforms: List of platforms to include
|
| 152 |
+
brands: List of brands to include
|
| 153 |
+
sentiments: List of sentiment polarities to include
|
| 154 |
+
date_range: Tuple of (start_date, end_date)
|
| 155 |
+
languages: List of languages to include
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
pd.DataFrame: Filtered dataframe
|
| 159 |
+
"""
|
| 160 |
+
filtered_df = df.copy()
|
| 161 |
+
|
| 162 |
+
if platforms and len(platforms) > 0:
|
| 163 |
+
filtered_df = filtered_df[filtered_df['platform'].isin(platforms)]
|
| 164 |
+
|
| 165 |
+
if brands and len(brands) > 0:
|
| 166 |
+
filtered_df = filtered_df[filtered_df['brand'].isin(brands)]
|
| 167 |
+
|
| 168 |
+
if sentiments and len(sentiments) > 0:
|
| 169 |
+
filtered_df = filtered_df[filtered_df['sentiment_polarity'].isin(sentiments)]
|
| 170 |
+
|
| 171 |
+
if languages and len(languages) > 0:
|
| 172 |
+
filtered_df = filtered_df[filtered_df['detected_language'].isin(languages)]
|
| 173 |
+
|
| 174 |
+
if date_range and len(date_range) == 2 and 'comment_timestamp' in filtered_df.columns:
|
| 175 |
+
start_date, end_date = date_range
|
| 176 |
+
filtered_df = filtered_df[
|
| 177 |
+
(filtered_df['comment_timestamp'] >= pd.Timestamp(start_date)) &
|
| 178 |
+
(filtered_df['comment_timestamp'] <= pd.Timestamp(end_date))
|
| 179 |
+
]
|
| 180 |
+
|
| 181 |
+
return filtered_df
|
| 182 |
+
|
| 183 |
+
@staticmethod
|
| 184 |
+
def get_date_range(df, default_days=30):
|
| 185 |
+
"""
|
| 186 |
+
Get default date range for filtering
|
| 187 |
+
|
| 188 |
+
Args:
|
| 189 |
+
df: Sentiment dataframe
|
| 190 |
+
default_days: Number of days to include by default
|
| 191 |
+
|
| 192 |
+
Returns:
|
| 193 |
+
tuple: (min_date, max_date)
|
| 194 |
+
"""
|
| 195 |
+
if 'comment_timestamp' in df.columns and not df.empty:
|
| 196 |
+
max_date = df['comment_timestamp'].max()
|
| 197 |
+
min_date = max_date - timedelta(days=default_days)
|
| 198 |
+
return (min_date, max_date)
|
| 199 |
+
else:
|
| 200 |
+
return (datetime.now() - timedelta(days=default_days), datetime.now())
|
| 201 |
+
|
| 202 |
+
    @st.cache_data(ttl=600)  # Cache for 10 minutes (demographics change less frequently)
    def load_demographics_data(_self):
        """
        Load user demographic data from Snowflake.

        The first parameter is named ``_self`` so Streamlit's cache does
        not try to hash the loader instance.

        Returns:
            pd.DataFrame: User demographics (user_id, birthday, timezone,
            experience_level) enriched with derived groupings; empty when
            no demographics query is configured or the load fails.
        """
        if not _self.demographics_query:
            return pd.DataFrame()

        try:
            # Connect to Snowflake
            conn = SnowFlakeConn()

            # Execute demographics query with explicit timestamp conversion
            # Cast timestamp to string to avoid precision issues, then convert in pandas.
            # NOTE(review): this textual replace assumes the configured SQL
            # selects exactly "u.birthday as BIRTHDAY" — it silently does
            # nothing if the query text changes. Confirm when editing config.
            query_with_cast = _self.demographics_query.replace(
                "u.birthday as BIRTHDAY",
                "TO_VARCHAR(u.birthday, 'YYYY-MM-DD HH24:MI:SS.FF6 TZHTZM') as BIRTHDAY"
            )

            df = conn.run_read_query(query_with_cast, "user demographics")

            # Close connection
            conn.close_connection()

            if df is None or df.empty:
                return pd.DataFrame()

            # Derive age / region / experience groupings.
            df = _self._process_demographics_dataframe(df)

            return df

        except Exception as e:
            # Demographics are optional enrichment: warn and continue with
            # an empty frame instead of failing the whole dashboard.
            st.warning(f"Could not load demographic data: {str(e)}")
            import traceback
            st.error(f"Error reading user demographics: {traceback.format_exc()}")
            return pd.DataFrame()
|
| 243 |
+
|
| 244 |
+
def _process_demographics_dataframe(self, df):
|
| 245 |
+
"""
|
| 246 |
+
Process and enrich demographic dataframe
|
| 247 |
+
|
| 248 |
+
Args:
|
| 249 |
+
df: Raw demographics dataframe
|
| 250 |
+
|
| 251 |
+
Returns:
|
| 252 |
+
pd.DataFrame: Processed demographics with age and region fields
|
| 253 |
+
"""
|
| 254 |
+
# Convert column names to lowercase
|
| 255 |
+
df.columns = df.columns.str.lower()
|
| 256 |
+
|
| 257 |
+
# Parse birthday as datetime
|
| 258 |
+
if 'birthday' in df.columns:
|
| 259 |
+
# Convert to datetime, handling both string and timestamp formats
|
| 260 |
+
# First convert to string to ensure consistency
|
| 261 |
+
df['birthday'] = df['birthday'].astype(str)
|
| 262 |
+
# Then parse as datetime
|
| 263 |
+
df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce', utc=True)
|
| 264 |
+
# Remove timezone info to avoid issues
|
| 265 |
+
df['birthday'] = df['birthday'].dt.tz_localize(None)
|
| 266 |
+
|
| 267 |
+
# Calculate age
|
| 268 |
+
df['age'] = df['birthday'].apply(self._calculate_age)
|
| 269 |
+
|
| 270 |
+
# Create age groups
|
| 271 |
+
df['age_group'] = df['age'].apply(self._categorize_age)
|
| 272 |
+
|
| 273 |
+
# Extract region from timezone
|
| 274 |
+
if 'timezone' in df.columns:
|
| 275 |
+
df['timezone_region'] = df['timezone'].apply(self._extract_timezone_region)
|
| 276 |
+
|
| 277 |
+
# Create experience level groups
|
| 278 |
+
if 'experience_level' in df.columns:
|
| 279 |
+
df['experience_group'] = df['experience_level'].apply(self._categorize_experience)
|
| 280 |
+
|
| 281 |
+
# Remove records with null user_id
|
| 282 |
+
if 'user_id' in df.columns:
|
| 283 |
+
df = df[df['user_id'].notna()]
|
| 284 |
+
|
| 285 |
+
return df
|
| 286 |
+
|
| 287 |
+
@staticmethod
|
| 288 |
+
def _calculate_age(birthday):
|
| 289 |
+
"""
|
| 290 |
+
Calculate age from birthday
|
| 291 |
+
|
| 292 |
+
Args:
|
| 293 |
+
birthday: datetime object
|
| 294 |
+
|
| 295 |
+
Returns:
|
| 296 |
+
int: Age in years, or None if birthday is invalid
|
| 297 |
+
"""
|
| 298 |
+
if pd.isna(birthday):
|
| 299 |
+
return None
|
| 300 |
+
|
| 301 |
+
try:
|
| 302 |
+
today = datetime.now()
|
| 303 |
+
age = relativedelta(today, birthday).years
|
| 304 |
+
|
| 305 |
+
# Sanity check: age should be between 0 and 120
|
| 306 |
+
if 0 <= age <= 120:
|
| 307 |
+
return age
|
| 308 |
+
return None
|
| 309 |
+
except:
|
| 310 |
+
return None
|
| 311 |
+
|
| 312 |
+
def _categorize_age(self, age):
|
| 313 |
+
"""
|
| 314 |
+
Categorize age into groups based on config
|
| 315 |
+
|
| 316 |
+
Args:
|
| 317 |
+
age: Age in years
|
| 318 |
+
|
| 319 |
+
Returns:
|
| 320 |
+
str: Age group label or 'Unknown'
|
| 321 |
+
"""
|
| 322 |
+
if pd.isna(age) or age is None:
|
| 323 |
+
return 'Unknown'
|
| 324 |
+
|
| 325 |
+
age_groups = self.config.get('demographics', {}).get('age_groups', {})
|
| 326 |
+
|
| 327 |
+
for group_name, (min_age, max_age) in age_groups.items():
|
| 328 |
+
if min_age <= age <= max_age:
|
| 329 |
+
return group_name
|
| 330 |
+
|
| 331 |
+
return 'Unknown'
|
| 332 |
+
|
| 333 |
+
@staticmethod
|
| 334 |
+
def _extract_timezone_region(timezone):
|
| 335 |
+
"""
|
| 336 |
+
Extract region from timezone string (e.g., 'America/New_York' -> 'America')
|
| 337 |
+
|
| 338 |
+
Args:
|
| 339 |
+
timezone: Timezone string
|
| 340 |
+
|
| 341 |
+
Returns:
|
| 342 |
+
str: Region name or 'Unknown'
|
| 343 |
+
"""
|
| 344 |
+
if pd.isna(timezone) or not isinstance(timezone, str):
|
| 345 |
+
return 'Unknown'
|
| 346 |
+
|
| 347 |
+
# Split by '/' and take the first part
|
| 348 |
+
parts = timezone.split('/')
|
| 349 |
+
if len(parts) > 0:
|
| 350 |
+
return parts[0]
|
| 351 |
+
|
| 352 |
+
return 'Unknown'
|
| 353 |
+
|
| 354 |
+
def _categorize_experience(self, experience_level):
|
| 355 |
+
"""
|
| 356 |
+
Categorize experience level into groups based on config
|
| 357 |
+
|
| 358 |
+
Args:
|
| 359 |
+
experience_level: Numeric experience level
|
| 360 |
+
|
| 361 |
+
Returns:
|
| 362 |
+
str: Experience group label or 'Unknown'
|
| 363 |
+
"""
|
| 364 |
+
if pd.isna(experience_level):
|
| 365 |
+
return 'Unknown'
|
| 366 |
+
|
| 367 |
+
try:
|
| 368 |
+
exp_level = float(experience_level)
|
| 369 |
+
except:
|
| 370 |
+
return 'Unknown'
|
| 371 |
+
|
| 372 |
+
exp_groups = self.config.get('demographics', {}).get('experience_groups', {})
|
| 373 |
+
|
| 374 |
+
for group_name, (min_exp, max_exp) in exp_groups.items():
|
| 375 |
+
if min_exp <= exp_level <= max_exp:
|
| 376 |
+
return group_name
|
| 377 |
+
|
| 378 |
+
return 'Unknown'
|
| 379 |
+
|
| 380 |
+
def merge_demographics_with_comments(self, comments_df, demographics_df):
|
| 381 |
+
"""
|
| 382 |
+
Merge demographic data with comment data for musora_app platform only
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
comments_df: Main comments dataframe
|
| 386 |
+
demographics_df: Demographics dataframe
|
| 387 |
+
|
| 388 |
+
Returns:
|
| 389 |
+
pd.DataFrame: Merged dataframe with demographic fields for musora_app comments
|
| 390 |
+
"""
|
| 391 |
+
if demographics_df.empty:
|
| 392 |
+
# Add empty demographic columns if no demographics data
|
| 393 |
+
comments_df['age'] = None
|
| 394 |
+
comments_df['age_group'] = 'Unknown'
|
| 395 |
+
comments_df['timezone'] = None
|
| 396 |
+
comments_df['timezone_region'] = 'Unknown'
|
| 397 |
+
comments_df['experience_level'] = None
|
| 398 |
+
comments_df['experience_group'] = 'Unknown'
|
| 399 |
+
return comments_df
|
| 400 |
+
|
| 401 |
+
# Ensure author_id is in the same format for merging
|
| 402 |
+
if 'author_id' in comments_df.columns and 'user_id' in demographics_df.columns:
|
| 403 |
+
# Convert both to string for consistent merging
|
| 404 |
+
comments_df['author_id_str'] = comments_df['author_id'].astype(str)
|
| 405 |
+
demographics_df['user_id_str'] = demographics_df['user_id'].astype(str)
|
| 406 |
+
|
| 407 |
+
# Merge demographics only for musora_app platform
|
| 408 |
+
merged_df = comments_df.merge(
|
| 409 |
+
demographics_df[['user_id_str', 'age', 'age_group', 'timezone', 'timezone_region',
|
| 410 |
+
'experience_level', 'experience_group']],
|
| 411 |
+
left_on='author_id_str',
|
| 412 |
+
right_on='user_id_str',
|
| 413 |
+
how='left'
|
| 414 |
+
)
|
| 415 |
+
|
| 416 |
+
# Drop temporary merge columns
|
| 417 |
+
merged_df = merged_df.drop(columns=['author_id_str', 'user_id_str'], errors='ignore')
|
| 418 |
+
|
| 419 |
+
# Fill NaN demographic fields with 'Unknown' for non-musora_app platforms
|
| 420 |
+
demographic_cols = ['age_group', 'timezone_region', 'experience_group']
|
| 421 |
+
for col in demographic_cols:
|
| 422 |
+
if col in merged_df.columns:
|
| 423 |
+
merged_df[col] = merged_df[col].fillna('Unknown')
|
| 424 |
+
|
| 425 |
+
return merged_df
|
| 426 |
+
|
| 427 |
+
return comments_df
|
visualization/img/musora.png
ADDED
|
visualization/requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Visualization Requirements
|
| 2 |
+
# Install with: pip install -r requirements.txt
|
| 3 |
+
|
| 4 |
+
# Core visualization
|
| 5 |
+
streamlit>=1.28.0
|
| 6 |
+
plotly>=5.17.0
|
| 7 |
+
|
| 8 |
+
# Data processing
|
| 9 |
+
pandas>=2.0.0
|
| 10 |
+
numpy>=1.24.0
|
| 11 |
+
python-dateutil>=2.8.0
|
| 12 |
+
|
| 13 |
+
# Snowflake connectivity (inherited from parent project)
|
| 14 |
+
snowflake-snowpark-python>=1.8.0
|
| 15 |
+
|
| 16 |
+
# Environment management (inherited from parent project)
|
| 17 |
+
python-dotenv>=1.0.0
|
visualization/utils/data_processor.py
ADDED
|
@@ -0,0 +1,604 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data processing utilities for sentiment analysis
|
| 3 |
+
Handles aggregation, grouping, and transformation operations
|
| 4 |
+
"""
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
from typing import List, Dict, Tuple
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SentimentDataProcessor:
|
| 11 |
+
"""
|
| 12 |
+
Processes sentiment data for visualization
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
@staticmethod
|
| 16 |
+
def aggregate_by_dimensions(df, group_by_cols, agg_cols=None):
|
| 17 |
+
"""
|
| 18 |
+
Aggregate data by specified dimensions
|
| 19 |
+
|
| 20 |
+
Args:
|
| 21 |
+
df: Sentiment dataframe
|
| 22 |
+
group_by_cols: List of columns to group by
|
| 23 |
+
agg_cols: Dictionary of columns and aggregation functions
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
pd.DataFrame: Aggregated dataframe
|
| 27 |
+
"""
|
| 28 |
+
if agg_cols is None:
|
| 29 |
+
agg_cols = {
|
| 30 |
+
'comment_sk': 'count',
|
| 31 |
+
'requires_reply': 'sum'
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
return df.groupby(group_by_cols, as_index=False).agg(agg_cols)
|
| 35 |
+
|
| 36 |
+
@staticmethod
|
| 37 |
+
def get_sentiment_distribution(df, group_by=None):
|
| 38 |
+
"""
|
| 39 |
+
Calculate sentiment distribution
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
df: Sentiment dataframe
|
| 43 |
+
group_by: Optional column(s) to group by
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
pd.DataFrame: Sentiment distribution
|
| 47 |
+
"""
|
| 48 |
+
if group_by:
|
| 49 |
+
# Group by specified columns and sentiment
|
| 50 |
+
if isinstance(group_by, str):
|
| 51 |
+
group_by = [group_by]
|
| 52 |
+
|
| 53 |
+
sentiment_counts = df.groupby(
|
| 54 |
+
group_by + ['sentiment_polarity'],
|
| 55 |
+
as_index=False
|
| 56 |
+
).size().rename(columns={'size': 'count'})
|
| 57 |
+
|
| 58 |
+
# Calculate percentages within each group
|
| 59 |
+
sentiment_counts['percentage'] = sentiment_counts.groupby(group_by)['count'].transform(
|
| 60 |
+
lambda x: (x / x.sum() * 100).round(2)
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
else:
|
| 64 |
+
# Overall sentiment distribution
|
| 65 |
+
sentiment_counts = df['sentiment_polarity'].value_counts().reset_index()
|
| 66 |
+
sentiment_counts.columns = ['sentiment_polarity', 'count']
|
| 67 |
+
sentiment_counts['percentage'] = (
|
| 68 |
+
sentiment_counts['count'] / sentiment_counts['count'].sum() * 100
|
| 69 |
+
).round(2)
|
| 70 |
+
|
| 71 |
+
return sentiment_counts
|
| 72 |
+
|
| 73 |
+
@staticmethod
|
| 74 |
+
def get_intent_distribution(df, group_by=None):
|
| 75 |
+
"""
|
| 76 |
+
Calculate intent distribution (handles multi-label)
|
| 77 |
+
|
| 78 |
+
Args:
|
| 79 |
+
df: Sentiment dataframe
|
| 80 |
+
group_by: Optional column(s) to group by
|
| 81 |
+
|
| 82 |
+
Returns:
|
| 83 |
+
pd.DataFrame: Intent distribution
|
| 84 |
+
"""
|
| 85 |
+
# Explode intents (split comma-separated values)
|
| 86 |
+
df_exploded = df.copy()
|
| 87 |
+
df_exploded['intent'] = df_exploded['intent'].str.split(',')
|
| 88 |
+
df_exploded = df_exploded.explode('intent')
|
| 89 |
+
df_exploded['intent'] = df_exploded['intent'].str.strip()
|
| 90 |
+
|
| 91 |
+
if group_by:
|
| 92 |
+
# Group by specified columns and intent
|
| 93 |
+
if isinstance(group_by, str):
|
| 94 |
+
group_by = [group_by]
|
| 95 |
+
|
| 96 |
+
intent_counts = df_exploded.groupby(
|
| 97 |
+
group_by + ['intent'],
|
| 98 |
+
as_index=False
|
| 99 |
+
).size().rename(columns={'size': 'count'})
|
| 100 |
+
|
| 101 |
+
# Calculate percentages within each group
|
| 102 |
+
intent_counts['percentage'] = intent_counts.groupby(group_by)['count'].transform(
|
| 103 |
+
lambda x: (x / x.sum() * 100).round(2)
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
else:
|
| 107 |
+
# Overall intent distribution
|
| 108 |
+
intent_counts = df_exploded['intent'].value_counts().reset_index()
|
| 109 |
+
intent_counts.columns = ['intent', 'count']
|
| 110 |
+
intent_counts['percentage'] = (
|
| 111 |
+
intent_counts['count'] / intent_counts['count'].sum() * 100
|
| 112 |
+
).round(2)
|
| 113 |
+
|
| 114 |
+
return intent_counts
|
| 115 |
+
|
| 116 |
+
@staticmethod
|
| 117 |
+
def get_content_summary(df):
|
| 118 |
+
"""
|
| 119 |
+
Get summary statistics for each content
|
| 120 |
+
|
| 121 |
+
Args:
|
| 122 |
+
df: Sentiment dataframe
|
| 123 |
+
|
| 124 |
+
Returns:
|
| 125 |
+
pd.DataFrame: Content summary with statistics
|
| 126 |
+
"""
|
| 127 |
+
# Group by content (dropna=False to include records with NULL permalink_url, e.g., YouTube)
|
| 128 |
+
content_summary = df.groupby(['content_sk', 'content_description', 'permalink_url'], dropna=False).agg({
|
| 129 |
+
'comment_sk': 'count',
|
| 130 |
+
'requires_reply': 'sum',
|
| 131 |
+
'sentiment_polarity': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'unknown'
|
| 132 |
+
}).reset_index()
|
| 133 |
+
|
| 134 |
+
content_summary.columns = [
|
| 135 |
+
'content_sk', 'content_description', 'permalink_url',
|
| 136 |
+
'total_comments', 'reply_required_count', 'dominant_sentiment'
|
| 137 |
+
]
|
| 138 |
+
|
| 139 |
+
# Calculate negative sentiment percentage for each content
|
| 140 |
+
negative_sentiments = ['negative', 'very_negative']
|
| 141 |
+
content_negative = df[df['sentiment_polarity'].isin(negative_sentiments)].groupby(
|
| 142 |
+
'content_sk'
|
| 143 |
+
).size().reset_index(name='negative_count')
|
| 144 |
+
|
| 145 |
+
content_summary = content_summary.merge(content_negative, on='content_sk', how='left')
|
| 146 |
+
content_summary['negative_count'] = content_summary['negative_count'].fillna(0)
|
| 147 |
+
content_summary['negative_percentage'] = (
|
| 148 |
+
content_summary['negative_count'] / content_summary['total_comments'] * 100
|
| 149 |
+
).round(2)
|
| 150 |
+
|
| 151 |
+
# Calculate severity score (balances percentage and volume)
|
| 152 |
+
# Formula: negative_percentage * sqrt(total_comments)
|
| 153 |
+
# This gives weight to both high negative % and high comment volume
|
| 154 |
+
content_summary['severity_score'] = (
|
| 155 |
+
content_summary['negative_percentage'] *
|
| 156 |
+
(content_summary['total_comments'] ** 0.5)
|
| 157 |
+
).round(2)
|
| 158 |
+
|
| 159 |
+
return content_summary
|
| 160 |
+
|
| 161 |
+
@staticmethod
|
| 162 |
+
def get_top_poor_sentiment_contents(df, top_n=10, min_comments=1, sort_by='severity_score'):
|
| 163 |
+
"""
|
| 164 |
+
Get contents with highest poor sentiment based on selected criteria
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
df: Sentiment dataframe
|
| 168 |
+
top_n: Number of top contents to return
|
| 169 |
+
min_comments: Minimum number of comments a content must have to be included
|
| 170 |
+
sort_by: Sorting criteria - 'severity_score', 'negative_percentage', 'negative_count', 'total_comments'
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
pd.DataFrame: Top contents with poor sentiment
|
| 174 |
+
"""
|
| 175 |
+
content_summary = SentimentDataProcessor.get_content_summary(df)
|
| 176 |
+
|
| 177 |
+
# Filter by minimum comments
|
| 178 |
+
content_summary = content_summary[content_summary['total_comments'] >= min_comments]
|
| 179 |
+
|
| 180 |
+
# Determine sort columns based on sort_by parameter
|
| 181 |
+
if sort_by == 'severity_score':
|
| 182 |
+
# Sort by severity score (balanced), then by negative percentage as tie-breaker
|
| 183 |
+
sort_columns = ['severity_score', 'negative_percentage']
|
| 184 |
+
elif sort_by == 'negative_percentage':
|
| 185 |
+
# Sort by negative percentage, then by total comments
|
| 186 |
+
sort_columns = ['negative_percentage', 'total_comments']
|
| 187 |
+
elif sort_by == 'negative_count':
|
| 188 |
+
# Sort by absolute negative count, then by negative percentage
|
| 189 |
+
sort_columns = ['negative_count', 'negative_percentage']
|
| 190 |
+
elif sort_by == 'total_comments':
|
| 191 |
+
# Sort by total comments volume
|
| 192 |
+
sort_columns = ['total_comments', 'negative_count']
|
| 193 |
+
else:
|
| 194 |
+
# Default to severity score
|
| 195 |
+
sort_columns = ['severity_score', 'negative_percentage']
|
| 196 |
+
|
| 197 |
+
# Sort and get top N
|
| 198 |
+
top_poor = content_summary.sort_values(
|
| 199 |
+
by=sort_columns,
|
| 200 |
+
ascending=[False, False]
|
| 201 |
+
).head(top_n)
|
| 202 |
+
|
| 203 |
+
return top_poor
|
| 204 |
+
|
| 205 |
+
@staticmethod
|
| 206 |
+
def get_comments_requiring_reply(df):
|
| 207 |
+
"""
|
| 208 |
+
Get all comments that require reply
|
| 209 |
+
|
| 210 |
+
Args:
|
| 211 |
+
df: Sentiment dataframe
|
| 212 |
+
|
| 213 |
+
Returns:
|
| 214 |
+
pd.DataFrame: Comments requiring reply
|
| 215 |
+
"""
|
| 216 |
+
reply_df = df[df['requires_reply'] == True].copy()
|
| 217 |
+
|
| 218 |
+
# Sort by timestamp (most recent first)
|
| 219 |
+
if 'comment_timestamp' in reply_df.columns:
|
| 220 |
+
reply_df = reply_df.sort_values('comment_timestamp', ascending=False)
|
| 221 |
+
|
| 222 |
+
return reply_df
|
| 223 |
+
|
| 224 |
+
@staticmethod
def get_platform_brand_summary(df):
    """
    Get summary statistics by platform and brand

    Args:
        df: Sentiment dataframe

    Returns:
        pd.DataFrame: Platform and brand summary with total comments,
        reply counts, and one column per sentiment polarity
    """
    # Per (platform, brand): comment volume and how many comments need a reply
    grouped = df.groupby(['platform', 'brand']).agg(
        total_comments=('comment_sk', 'count'),
        reply_required=('requires_reply', 'sum'),
    ).reset_index()

    # Per-sentiment counts for the same grouping
    dist = SentimentDataProcessor.get_sentiment_distribution(
        df, group_by=['platform', 'brand']
    )

    # Widen: one column per sentiment polarity, zero-filled where absent
    dist_wide = dist.pivot_table(
        index=['platform', 'brand'],
        columns='sentiment_polarity',
        values='count',
        fill_value=0,
    ).reset_index()

    return grouped.merge(dist_wide, on=['platform', 'brand'], how='left')
|
| 259 |
+
|
| 260 |
+
@staticmethod
|
| 261 |
+
def get_temporal_trends(df, freq='D'):
|
| 262 |
+
"""
|
| 263 |
+
Get temporal trends of sentiment over time
|
| 264 |
+
|
| 265 |
+
Args:
|
| 266 |
+
df: Sentiment dataframe
|
| 267 |
+
freq: Frequency for aggregation ('D'=daily, 'W'=weekly, 'M'=monthly)
|
| 268 |
+
|
| 269 |
+
Returns:
|
| 270 |
+
pd.DataFrame: Temporal sentiment trends
|
| 271 |
+
"""
|
| 272 |
+
if 'comment_timestamp' not in df.columns:
|
| 273 |
+
return pd.DataFrame()
|
| 274 |
+
|
| 275 |
+
df_temporal = df.copy()
|
| 276 |
+
df_temporal['date'] = pd.to_datetime(df_temporal['comment_timestamp']).dt.to_period(freq)
|
| 277 |
+
|
| 278 |
+
# Aggregate by date and sentiment
|
| 279 |
+
trends = df_temporal.groupby(['date', 'sentiment_polarity']).size().reset_index(name='count')
|
| 280 |
+
trends['date'] = trends['date'].dt.to_timestamp()
|
| 281 |
+
|
| 282 |
+
return trends
|
| 283 |
+
|
| 284 |
+
@staticmethod
|
| 285 |
+
def calculate_sentiment_score(df):
|
| 286 |
+
"""
|
| 287 |
+
Calculate weighted sentiment score
|
| 288 |
+
|
| 289 |
+
Args:
|
| 290 |
+
df: Sentiment dataframe
|
| 291 |
+
|
| 292 |
+
Returns:
|
| 293 |
+
float: Average sentiment score (-2 to +2)
|
| 294 |
+
"""
|
| 295 |
+
sentiment_weights = {
|
| 296 |
+
'very_negative': -2,
|
| 297 |
+
'negative': -1,
|
| 298 |
+
'neutral': 0,
|
| 299 |
+
'positive': 1,
|
| 300 |
+
'very_positive': 2
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
df['sentiment_score'] = df['sentiment_polarity'].map(sentiment_weights)
|
| 304 |
+
return df['sentiment_score'].mean()
|
| 305 |
+
|
| 306 |
+
@staticmethod
|
| 307 |
+
def get_language_distribution(df):
|
| 308 |
+
"""
|
| 309 |
+
Get distribution of detected languages
|
| 310 |
+
|
| 311 |
+
Args:
|
| 312 |
+
df: Sentiment dataframe
|
| 313 |
+
|
| 314 |
+
Returns:
|
| 315 |
+
pd.DataFrame: Language distribution
|
| 316 |
+
"""
|
| 317 |
+
if 'detected_language' not in df.columns:
|
| 318 |
+
return pd.DataFrame()
|
| 319 |
+
|
| 320 |
+
lang_dist = df['detected_language'].value_counts().reset_index()
|
| 321 |
+
lang_dist.columns = ['language', 'count']
|
| 322 |
+
lang_dist['percentage'] = (lang_dist['count'] / lang_dist['count'].sum() * 100).round(2)
|
| 323 |
+
|
| 324 |
+
return lang_dist
|
| 325 |
+
|
| 326 |
+
@staticmethod
def get_sentiment_filtered_contents(df, selected_sentiments=None, selected_intents=None,
                                    top_n=10, min_comments=1, sort_by='severity_score'):
    """
    Get contents filtered by selected sentiments and intents with dynamic sorting

    Builds a per-content summary, narrows it by dominant sentiment and
    (optionally) by intent, then scores each content against the selected
    sentiments and returns the top N by the requested sort criterion.

    Args:
        df: Sentiment dataframe
        selected_sentiments: List of sentiments to filter by (filters by dominant sentiment)
        selected_intents: List of intents to filter by (content must have at least one comment with these intents)
        top_n: Number of top contents to return
        min_comments: Minimum number of comments a content must have
        sort_by: Sorting criteria - 'severity_score', 'sentiment_percentage', 'sentiment_count', 'total_comments'

    Returns:
        pd.DataFrame: Filtered and sorted contents
    """
    # NOTE(review): assumes get_content_summary returns one row per content
    # with 'content_sk', 'total_comments', and 'dominant_sentiment' columns
    # -- confirm against that helper's definition.
    content_summary = SentimentDataProcessor.get_content_summary(df)

    # Filter by minimum comments
    content_summary = content_summary[content_summary['total_comments'] >= min_comments]

    # If no sentiments selected, default to all sentiments
    if not selected_sentiments:
        selected_sentiments = df['sentiment_polarity'].unique().tolist()

    # Filter by dominant sentiment
    content_summary = content_summary[content_summary['dominant_sentiment'].isin(selected_sentiments)]

    # Filter by intents if specified
    if selected_intents:
        # Get content_sks that have at least one comment with the selected intents
        # (case-insensitive substring match; NaN intents are treated as non-matching)
        content_sks_with_intent = set()
        for intent in selected_intents:
            matching_contents = df[df['intent'].str.contains(intent, na=False, case=False)]['content_sk'].unique()
            content_sks_with_intent.update(matching_contents)

        content_summary = content_summary[content_summary['content_sk'].isin(content_sks_with_intent)]

    # Calculate percentage and count for selected sentiments
    # (counted over the full dataframe, then left-joined so contents with
    # zero matching comments keep a count of 0 via fillna below)
    sentiment_counts = df[df['sentiment_polarity'].isin(selected_sentiments)].groupby(
        'content_sk'
    ).size().reset_index(name='selected_sentiment_count')

    content_summary = content_summary.merge(sentiment_counts, on='content_sk', how='left')
    content_summary['selected_sentiment_count'] = content_summary['selected_sentiment_count'].fillna(0)
    content_summary['selected_sentiment_percentage'] = (
        content_summary['selected_sentiment_count'] / content_summary['total_comments'] * 100
    ).round(2)

    # Calculate dynamic severity score based on selected sentiments:
    # percentage weighted by sqrt(volume) so high-traffic contents rank
    # above tiny ones with the same percentage
    content_summary['dynamic_severity_score'] = (
        content_summary['selected_sentiment_percentage'] *
        (content_summary['total_comments'] ** 0.5)
    ).round(2)

    # Determine sort columns based on sort_by parameter
    # (second column acts as the tie-breaker; unknown values fall back
    # to the severity-score ordering)
    if sort_by == 'severity_score':
        sort_columns = ['dynamic_severity_score', 'selected_sentiment_percentage']
    elif sort_by == 'sentiment_percentage':
        sort_columns = ['selected_sentiment_percentage', 'total_comments']
    elif sort_by == 'sentiment_count':
        sort_columns = ['selected_sentiment_count', 'selected_sentiment_percentage']
    elif sort_by == 'total_comments':
        sort_columns = ['total_comments', 'selected_sentiment_count']
    else:
        sort_columns = ['dynamic_severity_score', 'selected_sentiment_percentage']

    # Sort and get top N
    filtered_contents = content_summary.sort_values(
        by=sort_columns,
        ascending=[False, False]
    ).head(top_n)

    return filtered_contents
|
| 401 |
+
|
| 402 |
+
@staticmethod
|
| 403 |
+
def get_demographics_distribution(df, demographic_field, filter_platform='musora_app'):
|
| 404 |
+
"""
|
| 405 |
+
Get distribution of a demographic field (only for specified platform)
|
| 406 |
+
|
| 407 |
+
Args:
|
| 408 |
+
df: Sentiment dataframe with demographic fields
|
| 409 |
+
demographic_field: Field to analyze ('age_group', 'timezone', 'timezone_region', 'experience_level', 'experience_group')
|
| 410 |
+
filter_platform: Platform to filter (default: 'musora_app')
|
| 411 |
+
|
| 412 |
+
Returns:
|
| 413 |
+
pd.DataFrame: Distribution with count and percentage
|
| 414 |
+
"""
|
| 415 |
+
# Filter for specified platform only
|
| 416 |
+
if filter_platform and 'platform' in df.columns:
|
| 417 |
+
df_filtered = df[df['platform'] == filter_platform].copy()
|
| 418 |
+
else:
|
| 419 |
+
df_filtered = df.copy()
|
| 420 |
+
|
| 421 |
+
if df_filtered.empty or demographic_field not in df_filtered.columns:
|
| 422 |
+
return pd.DataFrame()
|
| 423 |
+
|
| 424 |
+
# Remove 'Unknown' and null values
|
| 425 |
+
df_filtered = df_filtered[
|
| 426 |
+
(df_filtered[demographic_field].notna()) &
|
| 427 |
+
(df_filtered[demographic_field] != 'Unknown')
|
| 428 |
+
]
|
| 429 |
+
|
| 430 |
+
if df_filtered.empty:
|
| 431 |
+
return pd.DataFrame()
|
| 432 |
+
|
| 433 |
+
# Count distribution
|
| 434 |
+
distribution = df_filtered[demographic_field].value_counts().reset_index()
|
| 435 |
+
distribution.columns = [demographic_field, 'count']
|
| 436 |
+
|
| 437 |
+
# Calculate percentage
|
| 438 |
+
distribution['percentage'] = (
|
| 439 |
+
distribution['count'] / distribution['count'].sum() * 100
|
| 440 |
+
).round(2)
|
| 441 |
+
|
| 442 |
+
# Sort by count descending
|
| 443 |
+
distribution = distribution.sort_values('count', ascending=False)
|
| 444 |
+
|
| 445 |
+
return distribution
|
| 446 |
+
|
| 447 |
+
@staticmethod
|
| 448 |
+
def get_demographics_by_sentiment(df, demographic_field, filter_platform='musora_app'):
|
| 449 |
+
"""
|
| 450 |
+
Get sentiment distribution for each demographic group
|
| 451 |
+
|
| 452 |
+
Args:
|
| 453 |
+
df: Sentiment dataframe with demographic fields
|
| 454 |
+
demographic_field: Field to analyze
|
| 455 |
+
filter_platform: Platform to filter (default: 'musora_app')
|
| 456 |
+
|
| 457 |
+
Returns:
|
| 458 |
+
pd.DataFrame: Sentiment distribution per demographic group
|
| 459 |
+
"""
|
| 460 |
+
# Filter for specified platform only
|
| 461 |
+
if filter_platform and 'platform' in df.columns:
|
| 462 |
+
df_filtered = df[df['platform'] == filter_platform].copy()
|
| 463 |
+
else:
|
| 464 |
+
df_filtered = df.copy()
|
| 465 |
+
|
| 466 |
+
if df_filtered.empty or demographic_field not in df_filtered.columns:
|
| 467 |
+
return pd.DataFrame()
|
| 468 |
+
|
| 469 |
+
# Remove 'Unknown' and null values
|
| 470 |
+
df_filtered = df_filtered[
|
| 471 |
+
(df_filtered[demographic_field].notna()) &
|
| 472 |
+
(df_filtered[demographic_field] != 'Unknown')
|
| 473 |
+
]
|
| 474 |
+
|
| 475 |
+
if df_filtered.empty:
|
| 476 |
+
return pd.DataFrame()
|
| 477 |
+
|
| 478 |
+
# Group by demographic field and sentiment
|
| 479 |
+
sentiment_by_demo = df_filtered.groupby(
|
| 480 |
+
[demographic_field, 'sentiment_polarity'],
|
| 481 |
+
as_index=False
|
| 482 |
+
).size().rename(columns={'size': 'count'})
|
| 483 |
+
|
| 484 |
+
# Calculate percentage within each demographic group
|
| 485 |
+
sentiment_by_demo['percentage'] = sentiment_by_demo.groupby(demographic_field)['count'].transform(
|
| 486 |
+
lambda x: (x / x.sum() * 100).round(2)
|
| 487 |
+
)
|
| 488 |
+
|
| 489 |
+
return sentiment_by_demo
|
| 490 |
+
|
| 491 |
+
@staticmethod
def get_top_timezones(df, top_n=15, filter_platform='musora_app'):
    """
    Get top N timezones with most comments

    Args:
        df: Sentiment dataframe with timezone field
        top_n: Number of top timezones to return
        filter_platform: Platform to filter (default: 'musora_app')

    Returns:
        pd.DataFrame: Top timezones with counts
    """
    # Reuse the generic distribution helper and keep only the top rows
    timezone_dist = SentimentDataProcessor.get_demographics_distribution(
        df, 'timezone', filter_platform
    )
    return timezone_dist.head(top_n)
|
| 507 |
+
|
| 508 |
+
@staticmethod
def get_timezone_regions_distribution(df, filter_platform='musora_app'):
    """
    Get distribution of timezone regions

    Args:
        df: Sentiment dataframe with timezone_region field
        filter_platform: Platform to filter (default: 'musora_app')

    Returns:
        pd.DataFrame: Region distribution with counts
    """
    # Thin wrapper: delegate to the generic demographics helper
    return SentimentDataProcessor.get_demographics_distribution(
        df,
        'timezone_region',
        filter_platform,
    )
|
| 523 |
+
|
| 524 |
+
@staticmethod
def get_experience_level_distribution(df, filter_platform='musora_app', use_groups=False):
    """
    Get distribution of experience levels

    Args:
        df: Sentiment dataframe with experience fields
        filter_platform: Platform to filter (default: 'musora_app')
        use_groups: If True, use grouped experience levels, otherwise use raw values

    Returns:
        pd.DataFrame: Experience distribution
    """
    # Choose between the bucketed and the raw experience column
    if use_groups:
        field = 'experience_group'
    else:
        field = 'experience_level'
    return SentimentDataProcessor.get_demographics_distribution(df, field, filter_platform)
|
| 541 |
+
|
| 542 |
+
@staticmethod
|
| 543 |
+
def get_demographics_summary(df, filter_platform='musora_app'):
|
| 544 |
+
"""
|
| 545 |
+
Get summary statistics for demographic data
|
| 546 |
+
|
| 547 |
+
Args:
|
| 548 |
+
df: Sentiment dataframe with demographic fields
|
| 549 |
+
filter_platform: Platform to filter (default: 'musora_app')
|
| 550 |
+
|
| 551 |
+
Returns:
|
| 552 |
+
dict: Summary statistics
|
| 553 |
+
"""
|
| 554 |
+
# Filter for specified platform only
|
| 555 |
+
if filter_platform and 'platform' in df.columns:
|
| 556 |
+
df_filtered = df[df['platform'] == filter_platform].copy()
|
| 557 |
+
else:
|
| 558 |
+
df_filtered = df.copy()
|
| 559 |
+
|
| 560 |
+
if df_filtered.empty:
|
| 561 |
+
return {
|
| 562 |
+
'total_comments': 0,
|
| 563 |
+
'users_with_demographics': 0,
|
| 564 |
+
'avg_age': None,
|
| 565 |
+
'most_common_age_group': 'Unknown',
|
| 566 |
+
'most_common_region': 'Unknown',
|
| 567 |
+
'avg_experience': None
|
| 568 |
+
}
|
| 569 |
+
|
| 570 |
+
# Remove records without demographic data
|
| 571 |
+
df_with_demo = df_filtered[
|
| 572 |
+
(df_filtered['age'].notna()) |
|
| 573 |
+
(df_filtered['timezone'].notna()) |
|
| 574 |
+
(df_filtered['experience_level'].notna())
|
| 575 |
+
].copy()
|
| 576 |
+
|
| 577 |
+
summary = {
|
| 578 |
+
'total_comments': len(df_filtered),
|
| 579 |
+
'users_with_demographics': len(df_with_demo),
|
| 580 |
+
'coverage_percentage': round(len(df_with_demo) / len(df_filtered) * 100, 2) if len(df_filtered) > 0 else 0
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
# Age statistics
|
| 584 |
+
if 'age' in df_with_demo.columns:
|
| 585 |
+
valid_ages = df_with_demo['age'].dropna()
|
| 586 |
+
summary['avg_age'] = round(valid_ages.mean(), 1) if len(valid_ages) > 0 else None
|
| 587 |
+
|
| 588 |
+
age_groups = df_with_demo['age_group'].value_counts()
|
| 589 |
+
summary['most_common_age_group'] = age_groups.index[0] if len(age_groups) > 0 else 'Unknown'
|
| 590 |
+
|
| 591 |
+
# Timezone statistics
|
| 592 |
+
if 'timezone_region' in df_with_demo.columns:
|
| 593 |
+
regions = df_with_demo[df_with_demo['timezone_region'] != 'Unknown']['timezone_region'].value_counts()
|
| 594 |
+
summary['most_common_region'] = regions.index[0] if len(regions) > 0 else 'Unknown'
|
| 595 |
+
|
| 596 |
+
# Experience statistics
|
| 597 |
+
if 'experience_level' in df_with_demo.columns:
|
| 598 |
+
valid_exp = df_with_demo['experience_level'].dropna()
|
| 599 |
+
summary['avg_experience'] = round(valid_exp.mean(), 2) if len(valid_exp) > 0 else None
|
| 600 |
+
|
| 601 |
+
exp_groups = df_with_demo['experience_group'].value_counts()
|
| 602 |
+
summary['most_common_experience'] = exp_groups.index[0] if len(exp_groups) > 0 else 'Unknown'
|
| 603 |
+
|
| 604 |
+
return summary
|
visualization/utils/llm_helper.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM Helper for visualization agents
|
| 3 |
+
Handles OpenAI API calls with retry logic and error handling
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
# Load environment variables from root directory (parent of visualization)
|
| 13 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 14 |
+
VISUALIZATION_DIR = os.path.dirname(SCRIPT_DIR)
|
| 15 |
+
ROOT_DIR = os.path.dirname(VISUALIZATION_DIR)
|
| 16 |
+
load_dotenv(os.path.join(ROOT_DIR, '.env'))
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LLMHelper:
    """
    Helper class for LLM interactions

    Wraps the OpenAI chat completions API with retry logic (exponential
    backoff) and optional JSON-mode responses. Both API errors and
    malformed JSON responses consume retry attempts.
    """

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
        """
        Initialize LLM helper

        Args:
            model: Model name to use
            temperature: Temperature for generation

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment
        """
        self.model = model
        self.temperature = temperature
        self.api_key = os.getenv('OPENAI_API_KEY')

        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        self.client = OpenAI(api_key=self.api_key)

    def get_completion(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_retries: int = 3,
        json_mode: bool = False
    ) -> Dict[str, Any]:
        """
        Get completion from LLM with retry logic

        Fix over the previous version: a JSON parse failure now consumes a
        retry attempt (with backoff) instead of returning immediately --
        models occasionally emit malformed JSON and a retry often succeeds.
        The failure dict shapes are unchanged.

        Args:
            prompt: User prompt
            system_message: Optional system message
            max_retries: Maximum number of attempts
            json_mode: Whether to force JSON response

        Returns:
            Dict with 'success' flag; on success also 'content', 'model'
            and 'usage'; on failure 'error' (plus 'raw_content' for JSON
            parse failures, or 'error_type' for API errors)
        """
        messages = []

        if system_message:
            messages.append({"role": "system", "content": system_message})

        messages.append({"role": "user", "content": prompt})

        # Returned if max_retries <= 0 (loop never runs); overwritten with
        # the most recent failure otherwise
        last_error: Dict[str, Any] = {
            'success': False,
            'error': f"Failed after {max_retries} attempts"
        }

        for attempt in range(max_retries):
            try:
                # Prepare API call parameters
                api_params = {
                    "model": self.model,
                    "messages": messages,
                    "temperature": self.temperature,
                    "reasoning_effort": "low",
                    "n": 1
                }

                # Add response format if JSON mode requested
                if json_mode:
                    api_params["response_format"] = {"type": "json_object"}

                # Make API call
                response = self.client.chat.completions.create(**api_params)

                # Extract response
                content = response.choices[0].message.content

                # Parse JSON if requested
                if json_mode:
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError as e:
                        # Malformed JSON: record failure, back off, retry
                        last_error = {
                            'success': False,
                            'error': f"Failed to parse JSON response: {str(e)}",
                            'raw_content': content
                        }
                        if attempt < max_retries - 1:
                            time.sleep(2 ** attempt)
                        continue

                return {
                    'success': True,
                    'content': content,
                    'model': response.model,
                    'usage': {
                        'prompt_tokens': response.usage.prompt_tokens,
                        'completion_tokens': response.usage.completion_tokens,
                        'total_tokens': response.usage.total_tokens
                    }
                }

            except Exception as e:
                # API/network error: record failure, back off, retry
                last_error = {
                    'success': False,
                    'error': str(e),
                    'error_type': type(e).__name__
                }
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)

        return last_error

    def get_structured_completion(
        self,
        prompt: str,
        system_message: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Get structured JSON completion

        Convenience wrapper around get_completion with json_mode enabled.

        Args:
            prompt: User prompt
            system_message: System message
            max_retries: Maximum retries

        Returns:
            Structured response dictionary
        """
        return self.get_completion(
            prompt=prompt,
            system_message=system_message,
            max_retries=max_retries,
            json_mode=True
        )
|