Commit ·
9858829
1
Parent(s): f89e3ef
Deploying sentiment analysis project
Browse files — This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -35
- .idea/vcs.xml +4 -0
- Dockerfile +0 -20
- README.md +304 -15
- processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md +437 -0
- processing_brand_sentiment/README.md +402 -0
- processing_brand_sentiment/config_files/analysis_categories.json +123 -0
- processing_brand_sentiment/config_files/brand_config.json +111 -0
- processing_brand_sentiment/config_files/workflow_config.json +60 -0
- processing_brand_sentiment/database/__init__.py +8 -0
- processing_brand_sentiment/database/snowflake_connection.py +240 -0
- processing_brand_sentiment/database/sql/create_comments_output_table.sql +161 -0
- processing_brand_sentiment/database/sql/create_output_table.sql +250 -0
- processing_brand_sentiment/database/sql/fetch_comments.sql +82 -0
- processing_brand_sentiment/database/sql/fetch_forum_posts.sql +106 -0
- processing_brand_sentiment/database/sql/init_comments_output_table.sql +78 -0
- processing_brand_sentiment/database/sql/init_output_table.sql +89 -0
- processing_brand_sentiment/main.py +1088 -0
- processing_brand_sentiment/utils/__init__.py +8 -0
- processing_brand_sentiment/utils/html_parser.py +253 -0
- processing_brand_sentiment/workflow/__init__.py +10 -0
- processing_brand_sentiment/workflow/agents/__init__.py +39 -0
- processing_brand_sentiment/workflow/agents/base_agent.py +169 -0
- processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py +211 -0
- processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py +570 -0
- processing_brand_sentiment/workflow/agents/output_validator_agent.py +408 -0
- processing_brand_sentiment/workflow/agents/preprocessor_agent.py +408 -0
- processing_brand_sentiment/workflow/agents/relevance_validator_agent.py +289 -0
- processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py +388 -0
- processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py +431 -0
- processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py +434 -0
- processing_brand_sentiment/workflow/comment_orchestrator.py +558 -0
- processing_brand_sentiment/workflow/orchestrator.py +551 -0
- processing_comments/.dockerignore +8 -0
- processing_comments/LICENSE +201 -0
- processing_comments/README.md +726 -0
- processing_comments/SnowFlakeConnection.py +121 -0
- processing_comments/agents/README.md +1571 -0
- processing_comments/agents/__init__.py +14 -0
- processing_comments/agents/base_agent.py +104 -0
- processing_comments/agents/language_detection_agent.py +292 -0
- processing_comments/agents/sentiment_analysis_agent.py +381 -0
- processing_comments/agents/translation_agent.py +210 -0
- processing_comments/config_files/data_sources_config.json +56 -0
- processing_comments/config_files/sentiment_analysis_config.json +96 -0
- processing_comments/config_files/sentiment_config.json +49 -0
- processing_comments/main.py +572 -0
- processing_comments/requirements.txt +10 -0
- processing_comments/sql/create_ml_features_table.sql +127 -0
- processing_comments/sql/create_musora_ml_features_table.sql +135 -0
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.idea/vcs.xml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="VcsDirectoryMappings" defaultProject="true" />
|
| 4 |
+
</project>
|
Dockerfile
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
FROM python:3.13.5-slim
|
| 2 |
-
|
| 3 |
-
WORKDIR /app
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
git \
|
| 9 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
-
|
| 11 |
-
COPY requirements.txt ./
|
| 12 |
-
COPY src/ ./src/
|
| 13 |
-
|
| 14 |
-
RUN pip3 install -r requirements.txt
|
| 15 |
-
|
| 16 |
-
EXPOSE 8501
|
| 17 |
-
|
| 18 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
-
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,20 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
---
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Musora Sentiment Analysis Dashboard
|
| 2 |
+
|
| 3 |
+
A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter) and the **Musora internal app** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora).
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Table of Contents
|
| 8 |
+
|
| 9 |
+
1. [Project Structure](#project-structure)
|
| 10 |
+
2. [How Data Flows](#how-data-flows)
|
| 11 |
+
3. [Data Loading Strategy](#data-loading-strategy)
|
| 12 |
+
4. [Pages](#pages)
|
| 13 |
+
5. [Global Filters & Session State](#global-filters--session-state)
|
| 14 |
+
6. [Snowflake Queries](#snowflake-queries)
|
| 15 |
+
7. [Adding or Changing Things](#adding-or-changing-things)
|
| 16 |
+
8. [Running the App](#running-the-app)
|
| 17 |
+
9. [Configuration Reference](#configuration-reference)
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Project Structure
|
| 22 |
+
|
| 23 |
+
```
|
| 24 |
+
visualization/
|
| 25 |
+
├── app.py # Entry point — routing, sidebar, session state
|
| 26 |
+
├── config/
|
| 27 |
+
│ └── viz_config.json # Colors, query strings, dashboard settings
|
| 28 |
+
├── data/
|
| 29 |
+
│ └── data_loader.py # All Snowflake queries and caching logic
|
| 30 |
+
├── utils/
|
| 31 |
+
│ ├── data_processor.py # Pandas aggregations (intent dist, content summary, etc.)
|
| 32 |
+
│ └── metrics.py # KPI calculations (sentiment score, urgency, etc.)
|
| 33 |
+
├── components/
|
| 34 |
+
│ ├── dashboard.py # Dashboard page renderer
|
| 35 |
+
│ ├── sentiment_analysis.py # Sentiment Analysis page renderer
|
| 36 |
+
│ └── reply_required.py # Reply Required page renderer
|
| 37 |
+
├── visualizations/
|
| 38 |
+
│ ├── sentiment_charts.py # Plotly sentiment chart functions
|
| 39 |
+
│ ├── distribution_charts.py # Plotly distribution / heatmap / scatter functions
|
| 40 |
+
│ ├── demographic_charts.py # Plotly demographic chart functions
|
| 41 |
+
│ └── content_cards.py # Streamlit card components (comment cards, content cards)
|
| 42 |
+
├── agents/
|
| 43 |
+
│ └── content_summary_agent.py # AI analysis agent (OpenAI) for comment summarisation
|
| 44 |
+
├── img/
|
| 45 |
+
│ └── musora.png # Sidebar logo
|
| 46 |
+
└── SnowFlakeConnection.py # Snowflake connection wrapper (Snowpark session)
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## How Data Flows
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
Snowflake
|
| 55 |
+
│
|
| 56 |
+
▼
|
| 57 |
+
data_loader.py ← Three separate loading modes (see below)
|
| 58 |
+
│
|
| 59 |
+
├── load_dashboard_data() ──► st.session_state['dashboard_df']
|
| 60 |
+
│ └─► app.py sidebar (filter options, counts)
|
| 61 |
+
│ └─► dashboard.py (all charts)
|
| 62 |
+
│
|
| 63 |
+
├── load_sa_data() ──► st.session_state['sa_contents']
|
| 64 |
+
│ (on-demand, button) st.session_state['sa_comments']
|
| 65 |
+
│ └─► sentiment_analysis.py
|
| 66 |
+
│
|
| 67 |
+
└── load_reply_required_data() ► st.session_state['rr_df']
|
| 68 |
+
(on-demand, button) └─► reply_required.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
**Key principle:** Data is loaded as little as possible, as late as possible.
|
| 72 |
+
|
| 73 |
+
- The **Dashboard** uses a lightweight query (no text columns, no content join) cached for 24 hours.
|
| 74 |
+
- The **Sentiment Analysis** and **Reply Required** pages never load data automatically — they wait for the user to click **Fetch Data**.
|
| 75 |
+
- All data is stored in `st.session_state` so page navigation and widget interactions do not re-trigger Snowflake queries.
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
## Data Loading Strategy
|
| 80 |
+
|
| 81 |
+
All loading logic lives in **`data/data_loader.py`** (`SentimentDataLoader` class).
|
| 82 |
+
|
| 83 |
+
### `load_dashboard_data()`
|
| 84 |
+
- Uses `dashboard_query` from `viz_config.json`.
|
| 85 |
+
- Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`.
|
| 86 |
+
- No text columns, no `DIM_CONTENT` join — significantly faster than the full query.
|
| 87 |
+
- Also merges demographics data if `demographics_query` is configured.
|
| 88 |
+
- Cached for **24 hours** (`@st.cache_data(ttl=86400)`).
|
| 89 |
+
- Called once by `app.py` at startup; result stored in `st.session_state['dashboard_df']`.
|
| 90 |
+
|
| 91 |
+
### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, date_range)`
|
| 92 |
+
- Runs **two** sequential Snowflake queries:
|
| 93 |
+
1. **Content aggregation** — groups by `content_sk`, counts per sentiment, computes severity score, returns top N.
|
| 94 |
+
2. **Sampled comments** — for the top N `content_sk`s only, fetches up to 50 comments per sentiment group per content (negative, positive, other), using Snowflake `QUALIFY ROW_NUMBER()`. `display_text` is computed in SQL (`CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END`).
|
| 95 |
+
- Returns a tuple `(contents_df, comments_df)`.
|
| 96 |
+
- Cached for **24 hours**.
|
| 97 |
+
- Called only when the user clicks **Fetch Data** on the Sentiment Analysis page.
|
| 98 |
+
|
| 99 |
+
### `load_reply_required_data(platforms, brands, date_range)`
|
| 100 |
+
- Runs a single query filtering `REQUIRES_REPLY = TRUE`.
|
| 101 |
+
- Dynamically includes/excludes the social media table and musora table based on selected platforms.
|
| 102 |
+
- `display_text` computed in SQL.
|
| 103 |
+
- Cached for **24 hours**.
|
| 104 |
+
- Called only when the user clicks **Fetch Data** on the Reply Required page.
|
| 105 |
+
|
| 106 |
+
### Important: SQL Column Qualification
|
| 107 |
+
Both the social media table (`COMMENT_SENTIMENT_FEATURES`) and the content dimension table (`DIM_CONTENT`) share column names. Any `WHERE` clause inside a query that joins these two tables **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`, `s.CHANNEL_NAME`) to avoid Snowflake `ambiguous column name` errors. The musora table (`MUSORA_COMMENT_SENTIMENT_FEATURES`) has no joins so unqualified column names are fine there.
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## Pages
|
| 112 |
+
|
| 113 |
+
### Dashboard (`components/dashboard.py`)
|
| 114 |
+
|
| 115 |
+
**Receives:** `filtered_df` — the lightweight dashboard dataframe (after optional global filter applied by `app.py`).
|
| 116 |
+
|
| 117 |
+
**Does not need:** text, translations, content URLs. All charts work purely on aggregated columns (sentiment_polarity, brand, platform, intent, requires_reply, comment_timestamp).
|
| 118 |
+
|
| 119 |
+
**Key sections:**
|
| 120 |
+
- Summary stats + health indicator
|
| 121 |
+
- Sentiment distribution (pie + gauge)
|
| 122 |
+
- Sentiment by brand and platform (stacked + percentage bar charts)
|
| 123 |
+
- Intent analysis
|
| 124 |
+
- Brand-Platform heatmap
|
| 125 |
+
- Reply requirements + urgency breakdown
|
| 126 |
+
- Demographics (age, timezone, experience level) — only rendered if `author_id` is present and demographics were merged
|
| 127 |
+
|
| 128 |
+
**To add a new chart:** create the chart function in `visualizations/` and call it from `render_dashboard()`. The function receives `filtered_df`.
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
### Sentiment Analysis (`components/sentiment_analysis.py`)
|
| 133 |
+
|
| 134 |
+
**Receives:** `data_loader` instance only (no dataframe).
|
| 135 |
+
|
| 136 |
+
**Flow:**
|
| 137 |
+
1. Reads `st.session_state['dashboard_df']` for filter option lists (platforms, brands, sentiments, intents).
|
| 138 |
+
2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`.
|
| 139 |
+
3. Shows filter controls (platform, brand, sentiment, intent, top_n, min_comments, sort_by).
|
| 140 |
+
4. On **Fetch Data** click: calls `data_loader.load_sa_data(...)` and stores results in `st.session_state['sa_contents']` and `['sa_comments']`.
|
| 141 |
+
5. Renders content cards, per-content sentiment + intent charts, AI analysis buttons, and sampled comment expanders.
|
| 142 |
+
|
| 143 |
+
**Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch.
|
| 144 |
+
|
| 145 |
+
**Comments:** Sampled (up to 50 negative + 50 positive + 50 neutral per content). These are already in memory after the fetch — no extra query is needed when the user expands a comment section.
|
| 146 |
+
|
| 147 |
+
**AI Analysis:** Uses `ContentSummaryAgent` (see `agents/`). Results cached in `st.session_state['content_summaries']`.
|
| 148 |
+
|
| 149 |
---
|
| 150 |
+
|
| 151 |
+
### Reply Required (`components/reply_required.py`)
|
| 152 |
+
|
| 153 |
+
**Receives:** `data_loader` instance only.
|
| 154 |
+
|
| 155 |
+
**Flow:**
|
| 156 |
+
1. Reads `st.session_state['dashboard_df']` for filter option lists.
|
| 157 |
+
2. Pre-populates platform, brand, and date from `st.session_state['global_filters']`.
|
| 158 |
+
3. On **Fetch Data** click: calls `data_loader.load_reply_required_data(...)` and stores result in `st.session_state['rr_df']`.
|
| 159 |
+
4. Shows urgency breakdown, in-page view filters (priority, platform, brand, intent — applied in Python, no new query), paginated comment cards, and a "Reply by Content" summary.
|
| 160 |
+
|
| 161 |
+
**Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch.
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## Global Filters & Session State
|
| 166 |
+
|
| 167 |
+
Global filters live in the sidebar (`app.py`) and are stored in `st.session_state['global_filters']` as a dict:
|
| 168 |
+
|
| 169 |
+
```python
|
| 170 |
+
{
|
| 171 |
+
'platforms': ['facebook', 'instagram'], # list or []
|
| 172 |
+
'brands': ['drumeo'],
|
| 173 |
+
'sentiments': [],
|
| 174 |
+
'date_range': (date(2025, 1, 1), date(2025, 12, 31)), # or None
|
| 175 |
+
}
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
- **Dashboard:** `app.py` applies global filters to `dashboard_df` using `data_loader.apply_filters()` and passes the result to `render_dashboard()`.
|
| 179 |
+
- **Sentiment Analysis / Reply Required:** global filters are used to pre-populate their own filter widgets. The actual Snowflake query uses those values when the user clicks Fetch. The pages do **not** receive a pre-filtered dataframe.
|
| 180 |
+
|
| 181 |
+
### Full session state key reference
|
| 182 |
+
|
| 183 |
+
| Key | Set by | Used by |
|
| 184 |
+
|-----|--------|---------|
|
| 185 |
+
| `dashboard_df` | `app.py` on startup | sidebar (filter options), dashboard, SA + RR (filter option lists) |
|
| 186 |
+
| `global_filters` | sidebar "Apply Filters" button | app.py (dashboard filter), SA + RR (pre-populate widgets) |
|
| 187 |
+
| `filters_applied` | sidebar buttons | app.py (whether to apply filters) |
|
| 188 |
+
| `sa_contents` | SA fetch button | SA page rendering |
|
| 189 |
+
| `sa_comments` | SA fetch button | SA page rendering |
|
| 190 |
+
| `sa_fetch_key` | SA fetch button | SA page (detect stale data) |
|
| 191 |
+
| `rr_df` | RR fetch button | RR page rendering |
|
| 192 |
+
| `rr_fetch_key` | RR fetch button | RR page (detect stale data) |
|
| 193 |
+
| `sentiment_page` | SA page / fetch | SA pagination |
|
| 194 |
+
| `reply_page` | RR page / fetch | RR pagination |
|
| 195 |
+
| `content_summaries` | AI analysis buttons | SA AI analysis display |
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## Snowflake Queries
|
| 200 |
+
|
| 201 |
+
All query strings are either stored in `config/viz_config.json` (static queries) or built dynamically in `data/data_loader.py` (page-specific queries).
|
| 202 |
+
|
| 203 |
+
### Static queries (in `viz_config.json`)
|
| 204 |
+
|
| 205 |
+
| Key | Purpose |
|
| 206 |
+
|-----|---------|
|
| 207 |
+
| `query` | Full query with all columns (legacy, kept for compatibility) |
|
| 208 |
+
| `dashboard_query` | Lightweight query — no text, no DIM_CONTENT join |
|
| 209 |
+
| `demographics_query` | Joins `usora_users` with `preprocessed.users` to get age/timezone/experience |
|
| 210 |
+
|
| 211 |
+
### Dynamic queries (built in `data_loader.py`)
|
| 212 |
+
|
| 213 |
+
| Method | Description |
|
| 214 |
+
|--------|-------------|
|
| 215 |
+
| `_build_sa_content_query()` | Content aggregation for SA page; filters by platform + brand + date |
|
| 216 |
+
| `_build_sa_comments_query()` | Sampled comments for SA page; uses `QUALIFY ROW_NUMBER() <= 50` |
|
| 217 |
+
| `_build_rr_query()` | Reply-required comments; filters by platform/brand/date; conditionally includes social media and/or musora table |
|
| 218 |
+
|
| 219 |
+
### Data source tables
|
| 220 |
+
|
| 221 |
+
| Table | Platform | Notes |
|
| 222 |
+
|-------|----------|-------|
|
| 223 |
+
| `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT` for `PERMALINK_URL` |
|
| 224 |
+
| `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively; platform stored as `'musora'`, mapped to `'musora_app'` in queries |
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## Adding or Changing Things
|
| 229 |
+
|
| 230 |
+
### Add a new chart to the Dashboard
|
| 231 |
+
1. Write the chart function in the appropriate `visualizations/` file.
|
| 232 |
+
2. Call it from `render_dashboard()` in `components/dashboard.py`, passing `filtered_df`.
|
| 233 |
+
3. The chart function receives a lightweight df — it has no text columns but has all the columns listed in `dashboard_query`.
|
| 234 |
+
|
| 235 |
+
### Add a new filter to the Dashboard sidebar
|
| 236 |
+
1. Add the widget in `app.py` under the "Global Filters" section.
|
| 237 |
+
2. Store the selected value in the `global_filters` dict under `st.session_state`.
|
| 238 |
+
3. Pass it to `data_loader.apply_filters()`.
|
| 239 |
+
|
| 240 |
+
### Change what the Sentiment Analysis page queries
|
| 241 |
+
- Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`.
|
| 242 |
+
- If you add new columns to the content aggregation result, also update `_process_sa_content_stats()` so they are available in `contents_df`.
|
| 243 |
+
- If you add new columns to the comments result, update `_process_sa_comments()`.
|
| 244 |
+
|
| 245 |
+
### Change what the Reply Required page queries
|
| 246 |
+
- Edit `_build_rr_query()` in `data_loader.py`.
|
| 247 |
+
- Remember: all column references inside the social media block (which has a `JOIN`) must be prefixed with `s.` to avoid Snowflake ambiguity errors.
|
| 248 |
+
|
| 249 |
+
### Change the cache duration
|
| 250 |
+
- `@st.cache_data(ttl=86400)` is set on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, and `load_demographics_data`.
|
| 251 |
+
- Change `86400` (seconds) to the desired TTL, or set `ttl=None` for no expiry.
|
| 252 |
+
- Users can always force a refresh with the "Reload Data" button in the sidebar (which calls `st.cache_data.clear()` and deletes `st.session_state['dashboard_df']`).
|
| 253 |
+
|
| 254 |
+
### Add a new page
|
| 255 |
+
1. Create `components/new_page.py` with a `render_new_page(data_loader)` function.
|
| 256 |
+
2. Import and add a radio option in `app.py`.
|
| 257 |
+
3. If the page needs its own Snowflake data, add a `load_new_page_data()` method to `SentimentDataLoader` following the same pattern as `load_sa_data`.
|
| 258 |
+
|
| 259 |
+
### Add a new column to the Dashboard query
|
| 260 |
+
- Edit `dashboard_query` in `config/viz_config.json`.
|
| 261 |
+
- Both UNION branches must select the same columns in the same order.
|
| 262 |
+
- `_process_dashboard_dataframe()` in `data_loader.py` handles basic type casting — add processing there if needed.
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
## Running the App
|
| 267 |
+
|
| 268 |
+
```bash
|
| 269 |
+
# From the project root
|
| 270 |
+
streamlit run visualization/app.py
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
**Required environment variables** (in `.env` at project root):
|
| 274 |
+
|
| 275 |
+
```
|
| 276 |
+
SNOWFLAKE_USER
|
| 277 |
+
SNOWFLAKE_PASSWORD
|
| 278 |
+
SNOWFLAKE_ACCOUNT
|
| 279 |
+
SNOWFLAKE_ROLE
|
| 280 |
+
SNOWFLAKE_DATABASE
|
| 281 |
+
SNOWFLAKE_WAREHOUSE
|
| 282 |
+
SNOWFLAKE_SCHEMA
|
| 283 |
+
```
|
| 284 |
+
|
| 285 |
---
|
| 286 |
|
| 287 |
+
## Configuration Reference
|
| 288 |
|
| 289 |
+
`config/viz_config.json` controls:
|
| 290 |
|
| 291 |
+
| Section | What it configures |
|
| 292 |
+
|---------|-------------------|
|
| 293 |
+
| `color_schemes.sentiment_polarity` | Hex colors for each sentiment level |
|
| 294 |
+
| `color_schemes.intent` | Hex colors for each intent label |
|
| 295 |
+
| `color_schemes.platform` | Hex colors for each platform |
|
| 296 |
+
| `color_schemes.brand` | Hex colors for each brand |
|
| 297 |
+
| `sentiment_order` | Display order for sentiment categories in charts |
|
| 298 |
+
| `intent_order` | Display order for intent categories |
|
| 299 |
+
| `negative_sentiments` | Which sentiment values count as "negative" |
|
| 300 |
+
| `dashboard.default_date_range_days` | Default date filter window (days) |
|
| 301 |
+
| `dashboard.max_comments_display` | Max comments shown per pagination page |
|
| 302 |
+
| `dashboard.chart_height` | Default Plotly chart height |
|
| 303 |
+
| `dashboard.top_n_contents` | Default top-N for content ranking |
|
| 304 |
+
| `snowflake.query` | Full query (legacy, all columns) |
|
| 305 |
+
| `snowflake.dashboard_query` | Lightweight dashboard query (no text columns) |
|
| 306 |
+
| `snowflake.demographics_query` | Demographics join query |
|
| 307 |
+
| `demographics.age_groups` | Age bucket definitions (label → [min, max]) |
|
| 308 |
+
| `demographics.experience_groups` | Experience bucket definitions |
|
| 309 |
+
| `demographics.top_timezones_count` | How many timezones to show in the geographic chart |
|
processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Brand Sentiment Analysis - Architecture Redesign Proposal
|
| 2 |
+
|
| 3 |
+
## Executive Summary
|
| 4 |
+
|
| 5 |
+
This document proposes a redesigned multi-agent architecture to address accuracy issues identified during manual evaluation. The new design separates **fact extraction** from **analysis**, adds strict validation, and improves content preprocessing.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Current Issues Analysis
|
| 10 |
+
|
| 11 |
+
| Issue | Root Cause | Impact |
|
| 12 |
+
|-------|------------|--------|
|
| 13 |
+
| **B8X/B8 variation** | Word-boundary matching misses aliases | Missing relevant posts |
|
| 14 |
+
| **Competitor products attributed to Sabian** | LLM lacks competitor awareness, no strict list enforcement | False positives, wrong product attribution |
|
| 15 |
+
| **Short text language detection** | Lingua fails on short brand-heavy text | Skipping valid English posts |
|
| 16 |
+
| **False positive relevance** | Single-pass relevance + no verification | Pizza oven marked as Sabian discussion |
|
| 17 |
+
| **Long posts with overlapping content** | Poor quote separation, raw thread context | Confusing LLM, extraction from wrong content |
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Proposed Architecture
|
| 22 |
+
|
| 23 |
+
### Design Principles
|
| 24 |
+
|
| 25 |
+
1. **Separation of Concerns**: Fact extraction vs. interpretation/analysis
|
| 26 |
+
2. **Strict Validation**: Enforce predefined value lists at every step
|
| 27 |
+
3. **Structured Data Flow**: Each agent receives clean, relevant input
|
| 28 |
+
4. **Fail-Safe Defaults**: Conservative approach - when uncertain, mark as not relevant
|
| 29 |
+
|
| 30 |
+
### New Workflow
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 34 |
+
│ 1. CONTENT PREPROCESSOR │
|
| 35 |
+
│ (No LLM) │
|
| 36 |
+
│ • Enhanced HTML parsing (better quote separation) │
|
| 37 |
+
│ • Text cleaning and normalization │
|
| 38 |
+
│ • Language detection (skip for short texts < 50 chars) │
|
| 39 |
+
│ • Keyword screening with aliases (B8 → B8X) │
|
| 40 |
+
│ • Extract: cleaned_content, quoted_content, raw_thread_context │
|
| 41 |
+
└─────────────────────────────┬───────────────────────────────────┘
|
| 42 |
+
│
|
| 43 |
+
▼
|
| 44 |
+
┌───────────────────────────────┐
|
| 45 |
+
│ Has any Sabian-related │
|
| 46 |
+
│ keywords (primary/contextual)?│
|
| 47 |
+
└───────────────┬───────────────┘
|
| 48 |
+
│ │
|
| 49 |
+
YES NO
|
| 50 |
+
│ │
|
| 51 |
+
▼ ▼
|
| 52 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 53 |
+
│ 2. RELEVANCE & EXTRACTION │ │ Mark as │
|
| 54 |
+
│ AGENT (LLM #1) │ │ NOT RELEVANT │
|
| 55 |
+
│ │ │ (0 LLM calls) │
|
| 56 |
+
│ INPUT: │ └──────────────────┘
|
| 57 |
+
│ • cleaned_content │
|
| 58 |
+
│ • quoted_content │
|
| 59 |
+
│ • raw_thread_context │
|
| 60 |
+
│ • keywords_found │
|
| 61 |
+
│ │
|
| 62 |
+
│ OUTPUT: │
|
| 63 |
+
│ • IS_RELEVANT: boolean │
|
| 64 |
+
│ • RELEVANCE_CONFIDENCE: h/m/l │
|
| 65 |
+
│ • RELEVANCE_REASON: string │
|
| 66 |
+
│ • PRODUCTS_MENTIONED: [] │ ← STRICT: only from predefined list
|
| 67 |
+
│ • SABIAN_MENTION_CONTEXT │
|
| 68 |
+
│ • AUTHOR_ROLE │
|
| 69 |
+
│ • COMPETITORS_MENTIONED: [] │ ← Brand names only, no products
|
| 70 |
+
│ • THREAD_CONTEXT_SUMMARY │ ← 1-2 sentence summary
|
| 71 |
+
└─────────────────┬───────────────┘
|
| 72 |
+
│
|
| 73 |
+
▼
|
| 74 |
+
┌─────────────────┐
|
| 75 |
+
│ IS_RELEVANT? │
|
| 76 |
+
└────────┬────────┘
|
| 77 |
+
│ │
|
| 78 |
+
YES NO
|
| 79 |
+
│ │
|
| 80 |
+
▼ ▼
|
| 81 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 82 |
+
│ 3. SENTIMENT & INTENT │ │ Store with │
|
| 83 |
+
│ ANALYZER (LLM #2) │ │ is_relevant=F │
|
| 84 |
+
│ │ │ (1 LLM call) │
|
| 85 |
+
│ INPUT (structured): │ └──────────────────┘
|
| 86 |
+
│ • cleaned_content │
|
| 87 |
+
│ • PRODUCTS_MENTIONED │ ← Pre-validated list
|
| 88 |
+
│ • SABIAN_MENTION_CONTEXT │
|
| 89 |
+
│ • AUTHOR_ROLE │
|
| 90 |
+
│ • COMPETITORS_MENTIONED │
|
| 91 |
+
│ • THREAD_CONTEXT_SUMMARY │ ← Clean, concise context
|
| 92 |
+
│ │
|
| 93 |
+
│ OUTPUT: │
|
| 94 |
+
│ • SENTIMENT_LEVEL │
|
| 95 |
+
│ • EMOTION_TYPE │
|
| 96 |
+
│ • SENTIMENT_CONFIDENCE │
|
| 97 |
+
│ • SARCASM_DETECTED │
|
| 98 |
+
│ • PRODUCT_ATTRIBUTES: [] │
|
| 99 |
+
│ • COMPETITOR_PRODUCTS_OWNED: []│
|
| 100 |
+
│ • COMPARISON_TYPE │
|
| 101 |
+
│ • INTENTS: [] │
|
| 102 |
+
│ • PURCHASE_STAGE │
|
| 103 |
+
│ • DECISION_DRIVERS: [] │
|
| 104 |
+
│ • PAIN_POINTS: [] │
|
| 105 |
+
│ • DELIGHT_FACTORS: [] │
|
| 106 |
+
│ • ANALYSIS_NOTES │
|
| 107 |
+
└─────────────────┬───────────────┘
|
| 108 |
+
│
|
| 109 |
+
▼
|
| 110 |
+
┌─────────────────────────────────┐
|
| 111 |
+
│ 4. OUTPUT VALIDATOR │
|
| 112 |
+
│ (No LLM - Rule-based) │
|
| 113 |
+
│ │
|
| 114 |
+
│ • Verify all values from lists │
|
| 115 |
+
│ • Check logical consistency │
|
| 116 |
+
│ • Flag anomalies for review │
|
| 117 |
+
│ • Set processing_status │
|
| 118 |
+
└─────────────────────────────────┘
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
---
|
| 122 |
+
|
| 123 |
+
## API Call Summary
|
| 124 |
+
|
| 125 |
+
| Scenario | Current Calls | New Calls | Notes |
|
| 126 |
+
|----------|--------------|-----------|-------|
|
| 127 |
+
| No keywords found | 0 | 0 | Same |
|
| 128 |
+
| Primary keywords, relevant | 1 | 2 | +1 for better extraction |
|
| 129 |
+
| Primary keywords, not relevant | 1 | 1 | Extraction determines not relevant |
|
| 130 |
+
| Ambiguous keywords, relevant | 2 | 2 | Same |
|
| 131 |
+
| Ambiguous keywords, not relevant | 2 | 1 | Early exit after extraction |
|
| 132 |
+
|
| 133 |
+
**Net Impact**: Slight increase for some cases, but significantly better accuracy.
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
|
| 137 |
+
## Agent Specifications
|
| 138 |
+
|
| 139 |
+
### Agent 1: Content Preprocessor (No LLM)
|
| 140 |
+
|
| 141 |
+
**File**: `workflow/agents/content_preprocessor_agent.py`
|
| 142 |
+
|
| 143 |
+
**Improvements over current**:
|
| 144 |
+
1. Enhanced HTML parsing with better quote/reply separation
|
| 145 |
+
2. Product alias mapping (B8 → B8X, etc.)
|
| 146 |
+
3. Skip language detection for texts < 50 characters
|
| 147 |
+
4. Always process if primary Sabian keywords found (regardless of language detection)
|
| 148 |
+
|
| 149 |
+
**Product Aliases** (add to brand_config.json):
|
| 150 |
+
```json
|
| 151 |
+
"product_aliases": {
|
| 152 |
+
"B8": "B8X",
|
| 153 |
+
"sbrs": "SBR",
|
| 154 |
+
"hand hammered": "HH",
|
| 155 |
+
"hand-hammered": "HH"
|
| 156 |
+
}
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
### Agent 2: Relevance & Extraction Agent (LLM #1)
|
| 162 |
+
|
| 163 |
+
**File**: `workflow/agents/relevance_extraction_agent.py`
|
| 164 |
+
|
| 165 |
+
**Purpose**: Determine relevance with HIGH confidence and extract verifiable facts.
|
| 166 |
+
|
| 167 |
+
**Key Design Decisions**:
|
| 168 |
+
|
| 169 |
+
1. **Strict Product Matching**:
|
| 170 |
+
- Provide explicit product list in prompt
|
| 171 |
+
- Instruction: "ONLY return products that EXACTLY match items in this list"
|
| 172 |
+
- Return empty list if no exact matches (not hallucinated guesses)
|
| 173 |
+
|
| 174 |
+
2. **Competitor Awareness**:
|
| 175 |
+
- List competitor BRAND names (not products)
|
| 176 |
+
- Instruction: "Products like '2002', 'Signature', 'K Custom' belong to competitors, NOT Sabian"
|
| 177 |
+
- Prevent cross-brand attribution
|
| 178 |
+
|
| 179 |
+
3. **Thread Context Summarization**:
|
| 180 |
+
- Summarize in 1-2 sentences maximum
|
| 181 |
+
- Focus only on information relevant to understanding the post's context
|
| 182 |
+
|
| 183 |
+
4. **Conservative Relevance**:
|
| 184 |
+
- When uncertain, mark as NOT relevant
|
| 185 |
+
- Require explicit Sabian product/brand mention IN THE POST CONTENT
|
| 186 |
+
- Quoted content mentioning Sabian does NOT make post relevant
|
| 187 |
+
|
| 188 |
+
**System Prompt Structure**:
|
| 189 |
+
```
|
| 190 |
+
You are a brand mention extractor for Sabian cymbals. Your job is to:
|
| 191 |
+
1. Determine if the POST CONTENT discusses Sabian products
|
| 192 |
+
2. Extract ONLY facts, not interpretations
|
| 193 |
+
|
| 194 |
+
## CRITICAL RULES
|
| 195 |
+
|
| 196 |
+
### Rule 1: Relevance Based on POST CONTENT Only
|
| 197 |
+
- The post is relevant ONLY if the POST CONTENT itself mentions Sabian products
|
| 198 |
+
- Quoted/parent content mentioning Sabian does NOT make the post relevant
|
| 199 |
+
- Generic replies ("Thanks!", "Got it!") are NEVER relevant
|
| 200 |
+
|
| 201 |
+
### Rule 2: Strict Product Matching
|
| 202 |
+
SABIAN PRODUCTS (use ONLY these exact values):
|
| 203 |
+
[HHX, HH, AAX, AA, Artisan, FRX, Omni, Chopper, Stratus, XSR, B8X, SBR]
|
| 204 |
+
|
| 205 |
+
- Return ONLY products from this list
|
| 206 |
+
- If you see a product not in this list, do NOT include it
|
| 207 |
+
- "2002", "Signature", "Sound Edge", "Formula 602" are PAISTE products, NOT Sabian
|
| 208 |
+
- "K Custom", "A Custom", "K Zildjian" are ZILDJIAN products, NOT Sabian
|
| 209 |
+
- When uncertain, return empty list []
|
| 210 |
+
|
| 211 |
+
### Rule 3: Competitor Brand Awareness
|
| 212 |
+
COMPETITOR BRANDS: [Zildjian, Paiste, Meinl, Dream Cymbals, Istanbul Agop, Bosphorus]
|
| 213 |
+
|
| 214 |
+
- Only return competitor BRAND names in competitors_mentioned
|
| 215 |
+
- Do NOT guess competitor products
|
| 216 |
+
|
| 217 |
+
### Rule 4: Thread Context Summary
|
| 218 |
+
- Summarize thread context in 1-2 sentences maximum
|
| 219 |
+
- Focus on what helps understand the post's topic
|
| 220 |
+
- If thread is about pizza ovens, say "Thread discusses pizza ovens and cooking"
|
| 221 |
+
|
| 222 |
+
## OUTPUT FORMAT
|
| 223 |
+
Return ONLY valid JSON:
|
| 224 |
+
{
|
| 225 |
+
"is_relevant": boolean,
|
| 226 |
+
"relevance_confidence": "high" | "medium" | "low",
|
| 227 |
+
"relevance_reason": "1-2 sentences explaining decision",
|
| 228 |
+
"products_mentioned": [], // ONLY from Sabian list above
|
| 229 |
+
"sabian_mention_context": "primary_focus" | "significant_mention" | "casual_mention" | "comparison_context" | null,
|
| 230 |
+
"author_role": "current_owner" | "past_owner" | "potential_buyer" | "never_owned" | "unknown",
|
| 231 |
+
"competitors_mentioned": [], // Brand names only
|
| 232 |
+
"thread_context_summary": "1-2 sentence summary"
|
| 233 |
+
}
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
### Agent 3: Sentiment & Intent Analyzer (LLM #2)
|
| 239 |
+
|
| 240 |
+
**File**: `workflow/agents/sentiment_analyzer_agent.py`
|
| 241 |
+
|
| 242 |
+
**Purpose**: Deep analysis on VERIFIED relevant posts with STRUCTURED input.
|
| 243 |
+
|
| 244 |
+
**Key Design Decisions**:
|
| 245 |
+
|
| 246 |
+
1. **Receives Pre-Validated Input**:
|
| 247 |
+
- Products already extracted and validated
|
| 248 |
+
- Thread context already summarized
|
| 249 |
+
- Author role already determined
|
| 250 |
+
|
| 251 |
+
2. **Focused Analysis**:
|
| 252 |
+
- Sentiment TOWARDS SABIAN ONLY
|
| 253 |
+
- Intent classification
|
| 254 |
+
- Pain points / Delights (author's own experience only)
|
| 255 |
+
- Purchase journey (author's own journey only)
|
| 256 |
+
|
| 257 |
+
3. **No Hallucination on Products**:
|
| 258 |
+
- Products are GIVEN in input, not re-extracted
|
| 259 |
+
- Can only discuss attributes of provided products
|
| 260 |
+
|
| 261 |
+
**System Prompt Structure**:
|
| 262 |
+
```
|
| 263 |
+
You are a sentiment analyst for Sabian cymbal discussions.
|
| 264 |
+
|
| 265 |
+
## INPUT CONTEXT (Pre-validated, trust these values)
|
| 266 |
+
- Products mentioned: {products_mentioned}
|
| 267 |
+
- Sabian mention context: {sabian_mention_context}
|
| 268 |
+
- Author role: {author_role}
|
| 269 |
+
- Thread summary: {thread_context_summary}
|
| 270 |
+
- Competitors mentioned: {competitors_mentioned}
|
| 271 |
+
|
| 272 |
+
## YOUR TASK
|
| 273 |
+
Analyze the sentiment, emotions, and intents in this post about Sabian.
|
| 274 |
+
|
| 275 |
+
## CRITICAL RULES
|
| 276 |
+
|
| 277 |
+
### Rule 1: Sabian-Specific Sentiment
|
| 278 |
+
- Sentiment MUST be about Sabian, NOT overall post tone
|
| 279 |
+
- Example: "Love my new kit! The SBR cymbals sound terrible."
|
| 280 |
+
- Overall: positive | Sabian sentiment: NEGATIVE
|
| 281 |
+
|
| 282 |
+
### Rule 2: Author Perspective Only
|
| 283 |
+
These fields are ONLY for author's OWN experience:
|
| 284 |
+
- purchase_stage, decision_drivers, pain_points, delight_factors
|
| 285 |
+
- If author is giving ADVICE to others, these should be null/empty
|
| 286 |
+
|
| 287 |
+
### Rule 3: Use Only Valid Values
|
| 288 |
+
[List all valid values for each field]
|
| 289 |
+
|
| 290 |
+
## OUTPUT FORMAT
|
| 291 |
+
{
|
| 292 |
+
"sentiment_level": "...",
|
| 293 |
+
"emotion_type": "..." or null,
|
| 294 |
+
"sentiment_confidence": "high" | "medium" | "low",
|
| 295 |
+
"sarcasm_detected": boolean,
|
| 296 |
+
"product_attributes": [],
|
| 297 |
+
"competitor_products_owned": [],
|
| 298 |
+
"comparison_type": "..." or null,
|
| 299 |
+
"intents": [],
|
| 300 |
+
"purchase_stage": "..." or null,
|
| 301 |
+
"decision_drivers": [],
|
| 302 |
+
"pain_points": [],
|
| 303 |
+
"delight_factors": [],
|
| 304 |
+
"analysis_notes": "1-2 sentences"
|
| 305 |
+
}
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
---
|
| 309 |
+
|
| 310 |
+
### Agent 4: Output Validator (No LLM)
|
| 311 |
+
|
| 312 |
+
**File**: `workflow/agents/output_validator_agent.py`
|
| 313 |
+
|
| 314 |
+
**Purpose**: Final validation and anomaly detection.
|
| 315 |
+
|
| 316 |
+
**Validation Rules**:
|
| 317 |
+
|
| 318 |
+
1. **List Validation**:
|
| 319 |
+
- All products_mentioned are in Sabian product list
|
| 320 |
+
- All competitors_mentioned are in competitor list
|
| 321 |
+
- All categorical values are from predefined lists
|
| 322 |
+
|
| 323 |
+
2. **Logical Consistency**:
|
| 324 |
+
- If is_relevant=True, products_mentioned should not be empty (flag if empty)
|
| 325 |
+
- If sabian_mention_context="primary_focus", products_mentioned should have items
|
| 326 |
+
- If sentiment_level="very_negative", pain_points should not be empty (warn)
|
| 327 |
+
|
| 328 |
+
3. **Anomaly Flagging**:
|
| 329 |
+
- Flag for manual review if inconsistencies detected
|
| 330 |
+
- Add `validation_flags` field to output
|
| 331 |
+
|
| 332 |
+
---
|
| 333 |
+
|
| 334 |
+
## Configuration Changes
|
| 335 |
+
|
| 336 |
+
### brand_config.json Updates
|
| 337 |
+
|
| 338 |
+
```json
|
| 339 |
+
{
|
| 340 |
+
"brand": {
|
| 341 |
+
"name": "Sabian",
|
| 342 |
+
"products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
|
| 343 |
+
"product_aliases": {
|
| 344 |
+
"B8": "B8X",
|
| 345 |
+
"sbrs": "SBR",
|
| 346 |
+
"hhx's": "HHX",
|
| 347 |
+
"aax's": "AAX"
|
| 348 |
+
},
|
| 349 |
+
"competitor_products_warning": [
|
| 350 |
+
"2002", "Signature", "Sound Edge", "Formula 602", "Giant Beat",
|
| 351 |
+
"K Custom", "A Custom", "K Zildjian", "A Zildjian", "S Family",
|
| 352 |
+
"Byzance", "Pure Alloy", "HCS",
|
| 353 |
+
"Bliss", "Contact", "Energy"
|
| 354 |
+
],
|
| 355 |
+
"competitors": [...]
|
| 356 |
+
},
|
| 357 |
+
"preprocessing": {
|
| 358 |
+
"min_length_for_language_detection": 50,
|
| 359 |
+
"always_process_if_primary_keyword": true
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
+
---
|
| 365 |
+
|
| 366 |
+
## File Structure
|
| 367 |
+
|
| 368 |
+
```
|
| 369 |
+
processing_brand_sentiment/
|
| 370 |
+
├── config_files/
|
| 371 |
+
│ ├── brand_config.json # Updated with aliases, warnings
|
| 372 |
+
│ ├── workflow_config.json # Agent configurations
|
| 373 |
+
│ └── analysis_categories.json # Category definitions (unchanged)
|
| 374 |
+
├── workflow/
|
| 375 |
+
│ ├── orchestrator.py # Updated workflow graph
|
| 376 |
+
│ └── agents/
|
| 377 |
+
│ ├── base_agent.py # Base class (unchanged)
|
| 378 |
+
│ ├── content_preprocessor_agent.py # Enhanced preprocessing
|
| 379 |
+
│ ├── relevance_extraction_agent.py # NEW: Extraction + relevance
|
| 380 |
+
│ ├── sentiment_analyzer_agent.py # NEW: Focused analysis
|
| 381 |
+
│ └── output_validator_agent.py # NEW: Validation
|
| 382 |
+
```
|
| 383 |
+
|
| 384 |
+
---
|
| 385 |
+
|
| 386 |
+
## Migration Path
|
| 387 |
+
|
| 388 |
+
### Phase 1: Configuration Updates
|
| 389 |
+
1. Update brand_config.json with product aliases
|
| 390 |
+
2. Add competitor product warnings
|
| 391 |
+
3. Update preprocessing settings
|
| 392 |
+
|
| 393 |
+
### Phase 2: New Agents
|
| 394 |
+
1. Create relevance_extraction_agent.py
|
| 395 |
+
2. Create sentiment_analyzer_agent.py
|
| 396 |
+
3. Create output_validator_agent.py
|
| 397 |
+
4. Update content_preprocessor_agent.py
|
| 398 |
+
|
| 399 |
+
### Phase 3: Orchestrator Update
|
| 400 |
+
1. Update workflow graph with new flow
|
| 401 |
+
2. Update state definition
|
| 402 |
+
3. Add new routing logic
|
| 403 |
+
|
| 404 |
+
### Phase 4: Testing & Validation
|
| 405 |
+
1. Run on test batch with known issues
|
| 406 |
+
2. Compare accuracy metrics
|
| 407 |
+
3. Fine-tune prompts based on results
|
| 408 |
+
|
| 409 |
+
---
|
| 410 |
+
|
| 411 |
+
## Expected Improvements
|
| 412 |
+
|
| 413 |
+
| Issue | Current Behavior | Expected After |
|
| 414 |
+
|-------|------------------|----------------|
|
| 415 |
+
| B8/B8X | Missed | Caught via alias mapping |
|
| 416 |
+
| Paiste products as Sabian | Attributed to Sabian | Correctly identified as competitor |
|
| 417 |
+
| Short text language | Marked as Latin | Processed as English |
|
| 418 |
+
| False positive (pizza) | Marked relevant | Marked not relevant |
|
| 419 |
+
| Long confusing context | Raw text confuses LLM | Summarized 1-2 sentences |
|
| 420 |
+
|
| 421 |
+
---
|
| 422 |
+
|
| 423 |
+
## Success Metrics
|
| 424 |
+
|
| 425 |
+
1. **Relevance Accuracy**: >99% (currently ~90%)
|
| 426 |
+
2. **Product Attribution Accuracy**: >99% (currently ~85%)
|
| 427 |
+
3. **Sentiment Accuracy**: >95% (currently unknown)
|
| 428 |
+
4. **False Positive Rate**: <1%
|
| 429 |
+
5. **False Negative Rate**: <1%
|
| 430 |
+
|
| 431 |
+
---
|
| 432 |
+
|
| 433 |
+
## Questions for Review
|
| 434 |
+
|
| 435 |
+
1. Should we add a manual review queue for flagged posts?
|
| 436 |
+
2. Should thread_context_summary be stored in output for debugging?
|
| 437 |
+
3. Preferred batch size for re-processing existing data?
|
processing_brand_sentiment/README.md
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Brand Sentiment Analysis Pipeline
|
| 2 |
+
|
| 3 |
+
A modular, scalable system for analyzing forum discussions and social media comments about specific brands using an agentic workflow with LLMs. The initial implementation focuses on **Sabian** (a cymbal manufacturer), but the architecture supports easy addition of new brands through configuration.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The pipeline fetches data from Snowflake (forum posts and/or social media comments), preprocesses them (parsing HTML for forums or cleaning plain text for comments), detects language, validates brand relevance, performs comprehensive sentiment and intelligence extraction using OpenAI's API, and stores enriched results back to Snowflake.
|
| 8 |
+
|
| 9 |
+
## Data Sources
|
| 10 |
+
|
| 11 |
+
| Source | Table | Output Table | Description |
|
| 12 |
+
|--------|-------|--------------|-------------|
|
| 13 |
+
| **Forums** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS` | `SABIAN_BRAND_ANALYSIS` | Forum posts with thread context |
|
| 14 |
+
| **Comments** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` | `SABIAN_BRAND_ANALYSIS_COMMENTS` | Social media comments with content context |
|
| 15 |
+
|
| 16 |
+
## Architecture v4.0
|
| 17 |
+
|
| 18 |
+
The system uses a 4-agent pipeline that separates **fact extraction** from **analysis** for improved accuracy. Both data sources share the same extraction, analysis, and validation agents - only the preprocessor differs.
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 22 |
+
│ 1a. CONTENT PREPROCESSOR (Forums) │
|
| 23 |
+
│ (No LLM) │
|
| 24 |
+
│ - HTML parsing with quote/reply separation │
|
| 25 |
+
│ - Product alias mapping (B8 → B8X) │
|
| 26 |
+
│ - Smart language detection │
|
| 27 |
+
│ - Keyword-based relevance screening │
|
| 28 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 29 |
+
│ 1b. COMMENT PREPROCESSOR (Comments) │
|
| 30 |
+
│ (No LLM) │
|
| 31 |
+
│ - Plain text cleaning (no HTML) │
|
| 32 |
+
│ - Product alias mapping (B8 → B8X) │
|
| 33 |
+
│ - Smart language detection │
|
| 34 |
+
│ - Keyword-based relevance screening │
|
| 35 |
+
│ - Context: content title + description + parent comment │
|
| 36 |
+
└─────────────────────────────┬───────────────────────────────────┘
|
| 37 |
+
│
|
| 38 |
+
▼
|
| 39 |
+
┌───────────────────────────────┐
|
| 40 |
+
│ Has Sabian-related keywords? │
|
| 41 |
+
└───────────────┬───────────────┘
|
| 42 |
+
│ │
|
| 43 |
+
YES NO
|
| 44 |
+
│ │
|
| 45 |
+
▼ ▼
|
| 46 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 47 |
+
│ 2. RELEVANCE & EXTRACTION │ │ Mark as │
|
| 48 |
+
│ AGENT (LLM #1) │ │ NOT RELEVANT │
|
| 49 |
+
│ [SHARED] │ │ (0 LLM calls) │
|
| 50 |
+
│ - Validates relevance │ └──────────────────┘
|
| 51 |
+
│ - Extracts products (strict) │
|
| 52 |
+
│ - Identifies author role │
|
| 53 |
+
│ - Summarizes context │
|
| 54 |
+
│ - Detects competitors │
|
| 55 |
+
└─────────────────┬───────────────┘
|
| 56 |
+
│
|
| 57 |
+
▼
|
| 58 |
+
┌─────────────────┐
|
| 59 |
+
│ IS_RELEVANT? │
|
| 60 |
+
└────────┬────────┘
|
| 61 |
+
│ │
|
| 62 |
+
YES NO
|
| 63 |
+
│ │
|
| 64 |
+
▼ ▼
|
| 65 |
+
┌─────────────────────────────────┐ ┌──────────────────┐
|
| 66 |
+
│ 3. SENTIMENT & INTENT │ │ Store with │
|
| 67 |
+
│ ANALYZER (LLM #2) │ │ is_relevant=F │
|
| 68 |
+
│ [SHARED] │ │ (1 LLM call) │
|
| 69 |
+
│ - Sabian-specific sentiment │ └──────────────────┘
|
| 70 |
+
│ - Intent classification │
|
| 71 |
+
│ - Pain points / Delights │
|
| 72 |
+
��� - Purchase journey (author) │
|
| 73 |
+
│ - Competitor products owned │
|
| 74 |
+
└─────────────────┬───────────────┘
|
| 75 |
+
│
|
| 76 |
+
▼
|
| 77 |
+
┌─────────────────────────────────┐
|
| 78 |
+
│ 4. OUTPUT VALIDATOR │
|
| 79 |
+
│ (No LLM - Rule-based) │
|
| 80 |
+
│ [SHARED] │
|
| 81 |
+
│ - Validates all values │
|
| 82 |
+
│ - Checks logical consistency │
|
| 83 |
+
│ - Flags anomalies for review │
|
| 84 |
+
└─────────────────────────────────┘
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Features
|
| 88 |
+
|
| 89 |
+
- **Multi-Source Support**: Process forums, social media comments, or both
|
| 90 |
+
- **4-Agent Pipeline**: Separation of extraction and analysis for improved accuracy
|
| 91 |
+
- **Strict Product Matching**: Only returns products from predefined list, preventing hallucination
|
| 92 |
+
- **Competitor Awareness**: Knows which products belong to competitors
|
| 93 |
+
- **Smart Language Detection**: Skips detection for short texts, always processes if primary keywords found
|
| 94 |
+
- **Product Alias Mapping**: Handles variations (B8 → B8X, "hand hammered" → HH)
|
| 95 |
+
- **Thread/Comment Context**: LLM summarizes context for clarity
|
| 96 |
+
- **Validation & Anomaly Detection**: Rule-based validator catches errors and flags edge cases
|
| 97 |
+
- **Author Perspective Tracking**: Distinguishes author's own experience from advice to others
|
| 98 |
+
- **Platform Tracking**: Records source platform for each processed item
|
| 99 |
+
|
| 100 |
+
## Project Structure
|
| 101 |
+
|
| 102 |
+
```
|
| 103 |
+
processing_brand_sentiment/
|
| 104 |
+
├── config_files/
|
| 105 |
+
│ ├── brand_config.json # Brand products, aliases, competitors, keywords, data sources
|
| 106 |
+
│ ├── workflow_config.json # LLM settings, batch sizes, output config (forums + comments)
|
| 107 |
+
│ └── analysis_categories.json # Sentiment, intent, pain point categories
|
| 108 |
+
├── database/
|
| 109 |
+
│ ├── __init__.py
|
| 110 |
+
│ ├── snowflake_connection.py # Snowflake connection handler
|
| 111 |
+
│ └── sql/
|
| 112 |
+
│ ├── fetch_forum_posts.sql # Query for forum posts with thread context
|
| 113 |
+
│ ├── fetch_comments.sql # Query for social media comments with content context
|
| 114 |
+
│ ├── create_output_table.sql # Forum output schema with views
|
| 115 |
+
│ ├── init_output_table.sql # Forum table initialization
|
| 116 |
+
│ ├── create_comments_output_table.sql # Comment output schema with views
|
| 117 |
+
│ └── init_comments_output_table.sql # Comment table initialization
|
| 118 |
+
├── workflow/
|
| 119 |
+
│ ├── __init__.py
|
| 120 |
+
│ ├── orchestrator.py # Forum LangGraph workflow coordinator
|
| 121 |
+
│ ├── comment_orchestrator.py # Comment LangGraph workflow coordinator
|
| 122 |
+
│ └── agents/
|
| 123 |
+
│ ├── __init__.py
|
| 124 |
+
│ ├── base_agent.py # Abstract base class
|
| 125 |
+
│ ├── content_preprocessor_agent.py # Forum: HTML parsing, alias mapping
|
| 126 |
+
│ ├── comment_preprocessor_agent.py # Comments: plain text, comment context
|
| 127 |
+
│ ├── sabian_relevance_extraction_agent.py # Shared: relevance + extraction
|
| 128 |
+
│   │   ├── sabian_analyzer_agent.py              # Shared: sentiment analysis
|
| 129 |
+
│ └── output_validator_agent.py # Shared: rule-based validation
|
| 130 |
+
├── utils/
|
| 131 |
+
│ ├── __init__.py
|
| 132 |
+
│ └── html_parser.py # HTML content extraction (forums only)
|
| 133 |
+
├── logs/ # Processing logs (auto-created)
|
| 134 |
+
├── main.py # Main execution script (multi-source)
|
| 135 |
+
├── .env # Environment variables
|
| 136 |
+
└── README.md # This file
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Setup
|
| 140 |
+
|
| 141 |
+
### 1. Install Dependencies
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
pip install langchain-openai langgraph snowflake-snowpark-python python-dotenv pandas beautifulsoup4 lingua-language-detector
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
### 2. Configure Environment Variables
|
| 148 |
+
|
| 149 |
+
Ensure `.env` file contains:
|
| 150 |
+
|
| 151 |
+
```env
|
| 152 |
+
# Snowflake
|
| 153 |
+
SNOWFLAKE_USER=your_user
|
| 154 |
+
SNOWFLAKE_PASSWORD=your_password
|
| 155 |
+
SNOWFLAKE_ACCOUNT=your_account
|
| 156 |
+
SNOWFLAKE_ROLE=your_role
|
| 157 |
+
SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
|
| 158 |
+
SNOWFLAKE_WAREHOUSE=your_warehouse
|
| 159 |
+
SNOWFLAKE_SCHEMA=ML_FEATURES
|
| 160 |
+
|
| 161 |
+
# OpenAI
|
| 162 |
+
OPENAI_API_KEY=your_openai_key
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
### 3. Initialize Snowflake Tables
|
| 166 |
+
|
| 167 |
+
Run the initialization scripts before first processing:
|
| 168 |
+
|
| 169 |
+
```sql
|
| 170 |
+
-- For forums
|
| 171 |
+
database/sql/init_output_table.sql
|
| 172 |
+
|
| 173 |
+
-- For social media comments
|
| 174 |
+
database/sql/init_comments_output_table.sql
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
## Usage
|
| 178 |
+
|
| 179 |
+
### Process All Sources (Default)
|
| 180 |
+
|
| 181 |
+
```bash
|
| 182 |
+
python main.py
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
### Process Forums Only
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
python main.py --data-source forums
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
### Process Social Media Comments Only
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
python main.py --data-source comments
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
### Process Limited Number
|
| 198 |
+
|
| 199 |
+
```bash
|
| 200 |
+
python main.py --limit 100
|
| 201 |
+
python main.py --data-source comments --limit 50
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
### Sequential Processing (Debug Mode)
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
python main.py --limit 50 --sequential
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
### First Run (Overwrite Mode)
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
python main.py --overwrite --limit 100
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### Command-Line Arguments
|
| 217 |
+
|
| 218 |
+
| Argument | Description | Default |
|
| 219 |
+
|----------|-------------|---------|
|
| 220 |
+
| `--limit N` | Process only N items per source | All unprocessed |
|
| 221 |
+
| `--overwrite` | Overwrite existing table | Append mode |
|
| 222 |
+
| `--sequential` | Single-threaded processing | Parallel |
|
| 223 |
+
| `--config-dir PATH` | Custom config directory | config_files/ |
|
| 224 |
+
| `--data-source SOURCE` | Source to process: `forums`, `comments`, `all` | `all` |
|
| 225 |
+
|
| 226 |
+
## Configuration
|
| 227 |
+
|
| 228 |
+
### brand_config.json
|
| 229 |
+
|
| 230 |
+
Key sections:
|
| 231 |
+
|
| 232 |
+
```json
|
| 233 |
+
{
|
| 234 |
+
"brand": {
|
| 235 |
+
"name": "Sabian",
|
| 236 |
+
"products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
|
| 237 |
+
"product_aliases": {
|
| 238 |
+
"b8": "B8X",
|
| 239 |
+
"hand hammered": "HH"
|
| 240 |
+
},
|
| 241 |
+
"competitor_products_warning": {
|
| 242 |
+
"paiste_products": ["2002", "signature", "sound edge", "formula 602"],
|
| 243 |
+
"zildjian_products": ["k custom", "a custom", "k zildjian"]
|
| 244 |
+
},
|
| 245 |
+
"competitors": [...]
|
| 246 |
+
},
|
| 247 |
+
"data_sources": {
|
| 248 |
+
"forums": {
|
| 249 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
|
| 250 |
+
"platform": "musora_forums"
|
| 251 |
+
},
|
| 252 |
+
"comments": {
|
| 253 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
|
| 254 |
+
"platform_column": "PLATFORM"
|
| 255 |
+
}
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
### analysis_categories.json
|
| 261 |
+
|
| 262 |
+
Defines valid values for all categorical fields:
|
| 263 |
+
|
| 264 |
+
- `author_role`: current_owner, past_owner, potential_buyer, never_owned, unknown
|
| 265 |
+
- `sabian_mention_context`: primary_focus, significant_mention, casual_mention, comparison_context
|
| 266 |
+
- `sentiment_level`: very_negative, negative, neutral, positive, very_positive
|
| 267 |
+
- `intents`: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
|
| 268 |
+
- `feedback_aspects`: sound_quality, price_value, durability, playability, versatility, customer_service, availability, aesthetics
|
| 269 |
+
|
| 270 |
+
## Output Tables
|
| 271 |
+
|
| 272 |
+
### Forum Output: `SABIAN_BRAND_ANALYSIS`
|
| 273 |
+
|
| 274 |
+
| Category | Key Columns |
|
| 275 |
+
|----------|-------------|
|
| 276 |
+
| **Identifiers** | POST_ID, THREAD_ID, POST_AUTHOR_ID, PLATFORM |
|
| 277 |
+
| **Content** | ORIGINAL_CONTENT, CLEANED_CONTENT, QUOTED_CONTENT, THREAD_CONTEXT_SUMMARY |
|
| 278 |
+
| **Thread** | THREAD_TITLE, THREAD_FIRST_POST, POST_CREATED_AT, THREAD_STARTED_AT |
|
| 279 |
+
| **Category** | CATEGORY_TITLE, CATEGORY_TOPIC |
|
| 280 |
+
|
| 281 |
+
### Comment Output: `SABIAN_BRAND_ANALYSIS_COMMENTS`
|
| 282 |
+
|
| 283 |
+
| Category | Key Columns |
|
| 284 |
+
|----------|-------------|
|
| 285 |
+
| **Identifiers** | COMMENT_SK, COMMENT_ID, PLATFORM, AUTHOR_NAME, AUTHOR_ID |
|
| 286 |
+
| **Content** | ORIGINAL_TEXT, COMMENT_TIMESTAMP |
|
| 287 |
+
| **Context** | CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT |
|
| 288 |
+
| **Channel** | CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME |
|
| 289 |
+
|
| 290 |
+
### Shared Analysis Columns (Both Tables)
|
| 291 |
+
|
| 292 |
+
| Category | Fields | Notes |
|
| 293 |
+
|----------|--------|-------|
|
| 294 |
+
| **Language** | DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH | Language detection |
|
| 295 |
+
| **Relevance** | IS_RELEVANT, RELEVANCE_CONFIDENCE, RELEVANCE_REASON | Brand relevance |
|
| 296 |
+
| **Extraction** | PRODUCTS_MENTIONED, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT | From Agent 1 |
|
| 297 |
+
| **Sentiment** | SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_CONFIDENCE | Sabian-specific |
|
| 298 |
+
| **Intents** | INTENTS (multi-label) | What author is trying to accomplish |
|
| 299 |
+
| **Journey** | PURCHASE_STAGE, DECISION_DRIVERS | Author perspective only |
|
| 300 |
+
| **Feedback** | PAIN_POINTS, DELIGHT_FACTORS | Author's own experience |
|
| 301 |
+
| **Competitive** | COMPETITORS_MENTIONED, COMPETITOR_PRODUCTS_OWNED, COMPARISON_TYPE | Competitive intel |
|
| 302 |
+
| **Validation** | VALIDATION_FLAGS, PROCESSING_STATUS | Anomaly detection |
|
| 303 |
+
|
| 304 |
+
### Processing Status Values
|
| 305 |
+
|
| 306 |
+
| Status | Description |
|
| 307 |
+
|--------|-------------|
|
| 308 |
+
| `completed` | Successfully processed, no issues |
|
| 309 |
+
| `completed_with_flags` | Processed but has anomalies to review |
|
| 310 |
+
| `validation_failed` | Validation errors detected |
|
| 311 |
+
| `workflow_error` | Unexpected error during processing |
|
| 312 |
+
|
| 313 |
+
### Available Views
|
| 314 |
+
|
| 315 |
+
#### Forum Views
|
| 316 |
+
|
| 317 |
+
| View | Description |
|
| 318 |
+
|------|-------------|
|
| 319 |
+
| `VW_SABIAN_RELEVANT_ANALYSIS` | Only relevant, successfully processed posts |
|
| 320 |
+
| `VW_SABIAN_FLAGGED_POSTS` | Posts with validation flags for review |
|
| 321 |
+
| `VW_SABIAN_SENTIMENT_DISTRIBUTION` | Sentiment breakdown statistics |
|
| 322 |
+
| `VW_SABIAN_PRODUCT_MENTIONS` | Product mention summary |
|
| 323 |
+
| `VW_SABIAN_COMPETITOR_ANALYSIS` | Competitor comparison analysis |
|
| 324 |
+
| `VW_SABIAN_PAIN_POINTS` | Pain point frequency analysis |
|
| 325 |
+
| `VW_SABIAN_AUTHOR_ROLES` | Author role distribution |
|
| 326 |
+
| `VW_SABIAN_COMPETITOR_OWNERSHIP` | Competitor brands owned by authors |
|
| 327 |
+
| `VW_SABIAN_VALIDATION_SUMMARY` | Processing status breakdown |
|
| 328 |
+
|
| 329 |
+
#### Comment Views
|
| 330 |
+
|
| 331 |
+
| View | Description |
|
| 332 |
+
|------|-------------|
|
| 333 |
+
| `VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS` | Relevant, successful comments |
|
| 334 |
+
| `VW_SABIAN_COMMENTS_FLAGGED` | Comments with validation flags |
|
| 335 |
+
| `VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION` | Sentiment by platform |
|
| 336 |
+
| `VW_SABIAN_COMMENTS_PRODUCT_MENTIONS` | Product mentions by platform |
|
| 337 |
+
| `VW_SABIAN_COMMENTS_VALIDATION_SUMMARY` | Processing status by platform |
|
| 338 |
+
|
| 339 |
+
## API Call Efficiency
|
| 340 |
+
|
| 341 |
+
| Scenario | LLM Calls | Notes |
|
| 342 |
+
|----------|-----------|-------|
|
| 343 |
+
| No keywords found | 0 | Early exit in preprocessor |
|
| 344 |
+
| Primary keywords, relevant | 2 | Extraction + Analysis |
|
| 345 |
+
| Primary keywords, not relevant | 1 | Only Extraction |
|
| 346 |
+
| Non-English content | 0 | Skipped |
|
| 347 |
+
|
| 348 |
+
## Key Design Decisions
|
| 349 |
+
|
| 350 |
+
### Why Separate Forum and Comment Preprocessors?
|
| 351 |
+
|
| 352 |
+
1. **Different input formats**: Forums use HTML (quotes, blockquotes), comments are plain text
|
| 353 |
+
2. **Different context sources**: Forums have thread title + first post + category; comments have content title + description + parent comment
|
| 354 |
+
3. **Shared analysis**: Both feed into the same extraction and analysis agents
|
| 355 |
+
|
| 356 |
+
### Why Separate Output Tables?
|
| 357 |
+
|
| 358 |
+
1. **Different identifiers**: Forums use POST_ID/THREAD_ID; comments use COMMENT_SK/COMMENT_ID/PLATFORM
|
| 359 |
+
2. **Different metadata**: Forums have thread context; comments have content/channel metadata
|
| 360 |
+
3. **Clean separation**: Avoids NULL columns and schema confusion
|
| 361 |
+
4. **Shared analysis columns**: All extracted intelligence fields are identical
|
| 362 |
+
|
| 363 |
+
### Why Platform Column for Forums?
|
| 364 |
+
|
| 365 |
+
The `PLATFORM` column was added to `SABIAN_BRAND_ANALYSIS` (defaulting to `musora_forums`) to enable cross-source analysis and maintain consistency with the comments table which uses the dynamic platform value from the source data.
|
| 366 |
+
|
| 367 |
+
## Troubleshooting
|
| 368 |
+
|
| 369 |
+
### "Table does not exist" on First Run
|
| 370 |
+
|
| 371 |
+
Run the appropriate init SQL in Snowflake first:
|
| 372 |
+
- Forums: `database/sql/init_output_table.sql`
|
| 373 |
+
- Comments: `database/sql/init_comments_output_table.sql`
|
| 374 |
+
|
| 375 |
+
### No Comments Being Processed
|
| 376 |
+
|
| 377 |
+
Check that `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` table exists and contains data. The query joins with `DIM_CONTENT` and `DIM_CHANNEL` - verify these dimension tables have matching records.
|
| 378 |
+
|
| 379 |
+
### Competitor Products Attributed to Sabian
|
| 380 |
+
|
| 381 |
+
Check `brand_config.json` for `competitor_products_warning` section. Add any missing competitor products.
|
| 382 |
+
|
| 383 |
+
### API Rate Limits
|
| 384 |
+
|
| 385 |
+
Use `--sequential` mode or reduce `--limit`:
|
| 386 |
+
```bash
|
| 387 |
+
python main.py --sequential --limit 50
|
| 388 |
+
```
|
| 389 |
+
|
| 390 |
+
## Schema Version History
|
| 391 |
+
|
| 392 |
+
| Version | Changes |
|
| 393 |
+
|---------|---------|
|
| 394 |
+
| 1.0 | Initial release |
|
| 395 |
+
| 2.0 | Added author_role, post_type, sabian_mention_context |
|
| 396 |
+
| 3.0 | Removed post_type (merged into intents), unified feedback_aspects |
|
| 397 |
+
| 4.0 | 4-agent pipeline, thread_context_summary, validation flags, product aliases |
|
| 398 |
+
| 4.0+ | Added social media comments support, PLATFORM column, separate comment output table |
|
| 399 |
+
|
| 400 |
+
## License
|
| 401 |
+
|
| 402 |
+
Internal use only - Brand sentiment analysis project.
|
processing_brand_sentiment/config_files/analysis_categories.json
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"author_role": {
|
| 3 |
+
"description": "Author's relationship to Sabian products",
|
| 4 |
+
"categories": [
|
| 5 |
+
{"value": "current_owner", "description": "Currently owns/uses Sabian"},
|
| 6 |
+
{"value": "past_owner", "description": "Previously owned, sold/replaced"},
|
| 7 |
+
{"value": "potential_buyer", "description": "Considering purchasing Sabian"},
|
| 8 |
+
{"value": "never_owned", "description": "Explicitly doesn't own Sabian"},
|
| 9 |
+
{"value": "unknown", "description": "Cannot determine from post"}
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
"sabian_mention_context": {
|
| 13 |
+
"description": "How prominently Sabian is discussed",
|
| 14 |
+
"categories": [
|
| 15 |
+
{"value": "primary_focus", "description": "Sabian is the main topic"},
|
| 16 |
+
{"value": "significant_mention", "description": "Discussed with detail, not main focus"},
|
| 17 |
+
{"value": "casual_mention", "description": "Brief mention among other topics"},
|
| 18 |
+
{"value": "comparison_context", "description": "Mentioned while comparing to competitors"}
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"sentiment": {
|
| 22 |
+
"brand_specific": true,
|
| 23 |
+
"description": "Sentiment TOWARDS SABIAN ONLY (not overall post tone)",
|
| 24 |
+
"levels": [
|
| 25 |
+
{"value": "very_negative", "description": "Strong criticism, anger, severe disappointment"},
|
| 26 |
+
{"value": "negative", "description": "Complaints, dissatisfaction, mild criticism"},
|
| 27 |
+
{"value": "neutral", "description": "Factual mention, balanced, no clear sentiment"},
|
| 28 |
+
{"value": "positive", "description": "Satisfaction, appreciation, mild praise"},
|
| 29 |
+
{"value": "very_positive", "description": "Enthusiasm, strong praise, highly recommend"}
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
"emotions": {
|
| 33 |
+
"brand_specific": true,
|
| 34 |
+
"description": "Emotion towards SABIAN specifically",
|
| 35 |
+
"categories": [
|
| 36 |
+
{"value": "frustration", "description": "Annoyance with product issues"},
|
| 37 |
+
{"value": "disappointment", "description": "Unmet expectations"},
|
| 38 |
+
{"value": "anger", "description": "Strong negative emotion"},
|
| 39 |
+
{"value": "satisfaction", "description": "Expectations met, content"},
|
| 40 |
+
{"value": "excitement", "description": "Eagerness, anticipation"},
|
| 41 |
+
{"value": "curiosity", "description": "Interest, wanting to know more"},
|
| 42 |
+
{"value": "indifference", "description": "No strong feelings"}
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
"intents": {
|
| 46 |
+
"multi_label": true,
|
| 47 |
+
"description": "What the author is trying to accomplish (can select multiple)",
|
| 48 |
+
"categories": [
|
| 49 |
+
{"value": "seeking_information", "description": "Asking questions, seeking advice/recommendations"},
|
| 50 |
+
{"value": "providing_information", "description": "Answering questions, giving advice, helping others"},
|
| 51 |
+
{"value": "sharing_experience", "description": "Personal experience, review, testimonial, purchase announcement"},
|
| 52 |
+
{"value": "comparing", "description": "Comparing brands/products against each other"},
|
| 53 |
+
{"value": "praising", "description": "Actively endorsing, recommending, advocating for Sabian"},
|
| 54 |
+
{"value": "criticizing", "description": "Actively complaining, warning others, reporting issues"},
|
| 55 |
+
{"value": "buying_selling", "description": "Listing gear for sale, looking to buy/trade"},
|
| 56 |
+
{"value": "general_discussion", "description": "General conversation not fitting above"}
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
"purchase_stage": {
|
| 60 |
+
"author_perspective_only": true,
|
| 61 |
+
"description": "Author's own purchase journey stage (null if giving advice to others)",
|
| 62 |
+
"categories": [
|
| 63 |
+
{"value": "researching", "description": "Gathering info before buying"},
|
| 64 |
+
{"value": "deciding", "description": "Actively comparing, about to decide"},
|
| 65 |
+
{"value": "recently_purchased", "description": "Just bought the product"},
|
| 66 |
+
{"value": "long_term_owner", "description": "Owned for extended period"},
|
| 67 |
+
{"value": "selling_replacing", "description": "Selling or replacing gear"}
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
"comparison_type": {
|
| 71 |
+
"description": "Type of competitive comparison (if comparing)",
|
| 72 |
+
"categories": [
|
| 73 |
+
{"value": "direct_comparison", "description": "Side-by-side evaluation"},
|
| 74 |
+
{"value": "preference_statement", "description": "Stating brand preference"},
|
| 75 |
+
{"value": "switching_to_sabian", "description": "Moving or moved from a competitor to Sabian"},
|
| 76 |
+
{"value": "switching_from_sabian", "description": "Moving or moved from Sabian to a competitor"}
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
"feedback_aspects": {
|
| 80 |
+
"description": "Product/brand aspects discussed. Used for BOTH pain_points (negative) and delight_factors (positive)",
|
| 81 |
+
"categories": [
|
| 82 |
+
{"value": "sound_quality", "description": "Sound, tone, character, audio qualities"},
|
| 83 |
+
{"value": "price_value", "description": "Cost, value for money, deals"},
|
| 84 |
+
{"value": "durability", "description": "Build quality, longevity, cracking/wear"},
|
| 85 |
+
{"value": "playability", "description": "Feel, response, ease of playing"},
|
| 86 |
+
{"value": "versatility", "description": "Range of genres/applications, flexibility"},
|
| 87 |
+
{"value": "customer_service", "description": "Support, warranty, brand interaction"},
|
| 88 |
+
{"value": "availability", "description": "Stock, ease of finding/purchasing"},
|
| 89 |
+
{"value": "aesthetics", "description": "Appearance, finish, visual appeal"}
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
"decision_drivers": {
|
| 93 |
+
"author_perspective_only": true,
|
| 94 |
+
"description": "What influenced AUTHOR's own purchase decision (empty if giving advice)",
|
| 95 |
+
"categories": [
|
| 96 |
+
{"value": "sound_quality", "description": "Sound characteristics"},
|
| 97 |
+
{"value": "price", "description": "Cost/budget considerations"},
|
| 98 |
+
{"value": "durability", "description": "Build quality, longevity"},
|
| 99 |
+
{"value": "artist_endorsement", "description": "Influenced by endorsed artists"},
|
| 100 |
+
{"value": "peer_recommendation", "description": "Friends/community recommended"},
|
| 101 |
+
{"value": "hands_on_testing", "description": "Tried before buying"},
|
| 102 |
+
{"value": "brand_loyalty", "description": "Previous positive experience"},
|
| 103 |
+
{"value": "versatility", "description": "Multi-genre/application use"},
|
| 104 |
+
{"value": "online_reviews", "description": "Influenced by reviews read online"}
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
"product_attributes": {
|
| 108 |
+
"description": "Attributes being discussed about Sabian products",
|
| 109 |
+
"categories": [
|
| 110 |
+
{"value": "sound_quality", "description": "Tone, character, audio qualities"},
|
| 111 |
+
{"value": "durability", "description": "Build quality, longevity"},
|
| 112 |
+
{"value": "price", "description": "Cost and value"},
|
| 113 |
+
{"value": "playability", "description": "Feel, response"},
|
| 114 |
+
{"value": "aesthetics", "description": "Appearance, finish"},
|
| 115 |
+
{"value": "volume", "description": "Loudness, projection"},
|
| 116 |
+
{"value": "sustain", "description": "How long sound lasts"},
|
| 117 |
+
{"value": "versatility", "description": "Range of applications"}
|
| 118 |
+
]
|
| 119 |
+
},
|
| 120 |
+
"analysis_notes_guidelines": {
|
| 121 |
+
"description": "Keep to 1-2 sentences. Focus on Sabian-specific insights not captured by other fields."
|
| 122 |
+
}
|
| 123 |
+
}
|
processing_brand_sentiment/config_files/brand_config.json
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"brand": {
|
| 3 |
+
"name": "Sabian",
|
| 4 |
+
"description": "Sabian is a Canadian manufacturer of cymbals founded in 1981",
|
| 5 |
+
"products": [
|
| 6 |
+
"HHX",
|
| 7 |
+
"AAX",
|
| 8 |
+
"Artisan",
|
| 9 |
+
"FRX",
|
| 10 |
+
"Omni",
|
| 11 |
+
"Chopper",
|
| 12 |
+
"Stratus",
|
| 13 |
+
"XSR",
|
| 14 |
+
"B8X",
|
| 15 |
+
"SBR"
|
| 16 |
+
],
|
| 17 |
+
"product_aliases": {
|
| 18 |
+
"b8": "B8X",
|
| 19 |
+
"sbrs": "SBR",
|
| 20 |
+
"hhxs": "HHX",
|
| 21 |
+
"aaxs": "AAX",
|
| 22 |
+
"hhx's": "HHX",
|
| 23 |
+
"aax's": "AAX"
|
| 24 |
+
},
|
| 25 |
+
"product_descriptions": {
|
| 26 |
+
"HHX": "Hand Hammered Xtreme - Professional series with dark, complex tones",
|
| 27 |
+
"AAX": "Bright, cutting cymbals for modern music",
|
| 28 |
+
"Artisan": "Premium hand-crafted cymbals with unique character",
|
| 29 |
+
"FRX": "Frequency Reduced Xtreme - Lower volume cymbals",
|
| 30 |
+
"Omni": "Multi-purpose cymbals for various playing styles",
|
| 31 |
+
"Chopper": "Effect cymbals with unique sound",
|
| 32 |
+
"Stratus": "Dark, complex sounds for jazz and fusion",
|
| 33 |
+
"XSR": "Entry-level professional cymbals",
|
| 34 |
+
"B8X": "Bronze entry-level cymbals",
|
| 35 |
+
"SBR": "Entry-level brass cymbals"
|
| 36 |
+
},
|
| 37 |
+
"competitor_products_warning": {
|
| 38 |
+
"description": "Products that belong to competitors - DO NOT attribute to Sabian",
|
| 39 |
+
"paiste_products": ["2002", "signature", "sound edge", "formula 602", "giant beat", "pst", "rude", "masters", "traditionals", "twenty", "dark energy"],
|
| 40 |
+
"zildjian_products": ["k custom", "a custom", "k zildjian", "a zildjian", "s family", "i family", "l80", "kerope", "constantinople", "k sweet"],
|
| 41 |
+
"meinl_products": ["byzance", "pure alloy", "hcs", "classics custom", "mb20", "mb10", "soundcaster"],
|
| 42 |
+
"dream_products": ["bliss", "contact", "energy", "dark matter", "vintage bliss", "eclipse"],
|
| 43 |
+
"istanbul_products": ["agop", "xist", "traditional", "sultan", "mehmet"]
|
| 44 |
+
},
|
| 45 |
+
"competitors": [
|
| 46 |
+
{
|
| 47 |
+
"name": "Zildjian",
|
| 48 |
+
"aliases": ["zildjian", "zil", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "Meinl",
|
| 52 |
+
"aliases": ["meinl", "byzance", "classics"]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "Paiste",
|
| 56 |
+
"aliases": ["paiste", "2002", "signature", "formula 602", "sound edge"]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"name": "Dream Cymbals",
|
| 60 |
+
"aliases": ["dream", "dream cymbals", "bliss"]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "Istanbul Agop",
|
| 64 |
+
"aliases": ["istanbul", "agop", "istanbul agop", "istanbul mehmet"]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "Bosphorus",
|
| 68 |
+
"aliases": ["bosphorus"]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"relevance_keywords": {
|
| 73 |
+
"primary": {
|
| 74 |
+
"description": "Keywords that definitively indicate Sabian content",
|
| 75 |
+
"keywords": ["sabian", "hhx", "aax", "artisan", "frx", "omni", "chopper", "stratus", "xsr", "b8x", "sbr"]
|
| 76 |
+
},
|
| 77 |
+
"contextual": {
|
| 78 |
+
"description": "Ambiguous keywords that need context verification",
|
| 79 |
+
"keywords": ["b8"]
|
| 80 |
+
},
|
| 81 |
+
"cymbal_context": {
|
| 82 |
+
"description": "Keywords that provide cymbal-related context for disambiguation",
|
| 83 |
+
"keywords": ["cymbal", "cymbals", "crash", "ride", "hi-hat", "hihat", "hi hat", "splash", "china", "bell", "stack", "effects"]
|
| 84 |
+
}
|
| 85 |
+
},
|
| 86 |
+
"preprocessing": {
|
| 87 |
+
"min_length_for_language_detection": 50,
|
| 88 |
+
"default_language_for_short_text": "English",
|
| 89 |
+
"always_process_if_primary_keyword": true,
|
| 90 |
+
"min_content_length": 3
|
| 91 |
+
},
|
| 92 |
+
"filter_conditions": {
|
| 93 |
+
"exclude_access_levels": ["team", "house-coach"],
|
| 94 |
+
"exclude_post_states": ["deleted", "spam"],
|
| 95 |
+
"require_content_length_min": 3
|
| 96 |
+
},
|
| 97 |
+
"data_sources": {
|
| 98 |
+
"forums": {
|
| 99 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
|
| 100 |
+
"description": "Forum posts mentioning Sabian and their products",
|
| 101 |
+
"sql_query_file": "database/sql/fetch_forum_posts.sql",
|
| 102 |
+
"platform": "musora_forums"
|
| 103 |
+
},
|
| 104 |
+
"comments": {
|
| 105 |
+
"table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
|
| 106 |
+
"description": "Social media comments potentially related to Sabian brand",
|
| 107 |
+
"sql_query_file": "database/sql/fetch_comments.sql",
|
| 108 |
+
"platform_column": "PLATFORM"
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
processing_brand_sentiment/config_files/workflow_config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm": {
|
| 3 |
+
"default_model": "gpt-5-nano",
|
| 4 |
+
"default_temperature": 0.2,
|
| 5 |
+
"max_retries": 3,
|
| 6 |
+
"timeout": 60
|
| 7 |
+
},
|
| 8 |
+
"agents": {
|
| 9 |
+
"preprocessor": {
|
| 10 |
+
"name": "PreprocessorAgent",
|
| 11 |
+
"description": "Deterministic agent for HTML parsing, text cleaning, language detection",
|
| 12 |
+
"model": "gpt-5-nano",
|
| 13 |
+
"temperature": 0.0,
|
| 14 |
+
"uses_llm": false
|
| 15 |
+
},
|
| 16 |
+
"relevance_validator": {
|
| 17 |
+
"name": "RelevanceValidatorAgent",
|
| 18 |
+
"description": "Lightweight LLM for disambiguation of ambiguous terms (HH, AA)",
|
| 19 |
+
"model": "gpt-5-nano",
|
| 20 |
+
"temperature": 0.0,
|
| 21 |
+
"max_retries": 2
|
| 22 |
+
},
|
| 23 |
+
"brand_analyzer": {
|
| 24 |
+
"name": "SabianAnalyzerAgent",
|
| 25 |
+
"description": "Comprehensive brand analysis for Sabian products",
|
| 26 |
+
"model": "gpt-5-nano",
|
| 27 |
+
"temperature": 0.2,
|
| 28 |
+
"max_retries": 3
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"workflow": {
|
| 32 |
+
"parallel_processing": {
|
| 33 |
+
"enabled": true,
|
| 34 |
+
"worker_calculation": "CPU count - 2, max 5 workers",
|
| 35 |
+
"max_workers": 5,
|
| 36 |
+
"min_batch_size": 20,
|
| 37 |
+
"max_batch_size": 500
|
| 38 |
+
},
|
| 39 |
+
"thread_context": {
|
| 40 |
+
"enabled": true,
|
| 41 |
+
"include_thread_title": true,
|
| 42 |
+
"include_first_post": true
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"output": {
|
| 46 |
+
"table_name": "SABIAN_BRAND_ANALYSIS",
|
| 47 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 48 |
+
"schema": "ML_FEATURES"
|
| 49 |
+
},
|
| 50 |
+
"comments_output": {
|
| 51 |
+
"table_name": "SABIAN_BRAND_ANALYSIS_COMMENTS",
|
| 52 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 53 |
+
"schema": "ML_FEATURES"
|
| 54 |
+
},
|
| 55 |
+
"logging": {
|
| 56 |
+
"level": "INFO",
|
| 57 |
+
"log_directory": "logs",
|
| 58 |
+
"log_file_prefix": "brand_sentiment_processing"
|
| 59 |
+
}
|
| 60 |
+
}
|
processing_brand_sentiment/database/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database module for brand sentiment analysis.
|
| 3 |
+
Contains Snowflake connection handler and SQL query utilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .snowflake_connection import SnowFlakeConn
|
| 7 |
+
|
| 8 |
+
__all__ = ['SnowFlakeConn']
|
processing_brand_sentiment/database/snowflake_connection.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Snowflake connection handler for brand sentiment analysis.
|
| 3 |
+
Provides methods for reading data, executing queries, and storing results.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from snowflake.snowpark import Session
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import logging
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from typing import Optional, List, Any
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# Load environment variables
|
| 16 |
+
load_dotenv()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SnowFlakeConn:
    """
    Handles Snowflake database connections and operations for brand sentiment analysis.

    Wraps a Snowpark ``Session`` built from environment variables (loaded via
    dotenv at module import) and provides helpers to:
      - read query results into pandas DataFrames (column names lower-cased),
      - write DataFrames back to Snowflake (column names upper-cased),
      - execute ad-hoc SQL statements or SQL files,
      - fetch forum posts / social media comments from prepared SQL files.
    """

    def __init__(self):
        """Initialize Snowflake connection."""
        self.session = self.connect_to_snowflake()

    def connect_to_snowflake(self) -> "Session":
        """
        Create a connection to Snowflake using environment variables.

        Returns:
            Snowflake Session object

        Raises:
            Exception: propagated from the session builder if connecting fails.
        """
        # NOTE(review): any missing env var yields a None entry here; the
        # session builder surfaces the resulting error at connect time.
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        logger.info("Successfully connected to Snowflake")
        return session

    def get_credential(self, key: str) -> Optional[str]:
        """
        Get credential from environment variables.

        Args:
            key: Environment variable name

        Returns:
            Credential value, or None when the variable is not set.
        """
        return os.getenv(key)

    def run_read_query(self, query: str, description: str = "data") -> pd.DataFrame:
        """
        Execute a SQL query that fetches data.

        Args:
            query: SQL query string
            description: Description of what data is being fetched (for logging)

        Returns:
            Pandas DataFrame containing query results, with all column names
            lower-cased.

        Raises:
            Exception: re-raised after logging if the query fails.
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Normalize to lowercase so downstream code does not depend on
            # Snowflake's default upper-case identifiers.
            dataframe.columns = dataframe.columns.str.lower()
            logger.info(f"Successfully read {len(dataframe)} rows for {description}")
            return dataframe
        except Exception as e:
            logger.error(f"Error reading {description}: {e}")
            raise

    def store_df_to_snowflake(
        self,
        table_name: str,
        dataframe: pd.DataFrame,
        database: str = "SOCIAL_MEDIA_DB",
        schema: str = "ML_FEATURES",
        overwrite: bool = False
    ) -> None:
        """
        Store a DataFrame to Snowflake.

        Args:
            table_name: Target table name (auto-created if it does not exist)
            dataframe: DataFrame to store
            database: Target database
            schema: Target schema
            overwrite: If True, overwrite existing data; if False, append

        Raises:
            Exception: re-raised after logging if the write fails.
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            # Reset the index so write_pandas does not persist a stale pandas
            # index; upper-case columns to match Snowflake's default casing.
            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(
                df=dataframe,
                table_name=table_name.strip().upper(),
                auto_create_table=True,
                overwrite=overwrite,
                use_logical_type=True
            )
            logger.info(f"Successfully stored {len(dataframe)} rows to {table_name}")

        except Exception as e:
            logger.error(f"Error storing data to {table_name}: {e}")
            raise

    def execute_sql_file(self, file_path: str) -> Optional[List[Any]]:
        """
        Execute SQL queries from a file.

        Note: unlike run_read_query, failures are logged and swallowed;
        callers must check for a None return.

        Args:
            file_path: Path to SQL file

        Returns:
            Query result rows, None for DDL/DML, or None on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            logger.info(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            logger.error(f"Error executing SQL file {file_path}: {e}")
            return None

    def execute_query(self, query: str, description: str = "query") -> Optional[List[Any]]:
        """
        Execute a SQL query and return results.

        Note: failures are logged and swallowed; callers must check for None.

        Args:
            query: SQL query string
            description: Description of the query for logging

        Returns:
            Query results, or None on failure.
        """
        try:
            result = self.session.sql(query).collect()
            logger.info(f"Successfully executed {description}")
            return result
        except Exception as e:
            logger.error(f"Error executing {description}: {e}")
            return None

    def _fetch_from_sql_file(
        self,
        sql_file_path: str,
        limit: Optional[int],
        description: str,
        required_cols: List[str]
    ) -> pd.DataFrame:
        """
        Shared implementation for the fetch_* helpers: read a query from a
        file, optionally append a LIMIT clause, execute it, and warn about
        any missing expected columns.
        """
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            query = f.read()

        # Add limit if specified (falsy values such as 0 mean "no limit",
        # matching the pre-refactor behavior of both fetch helpers).
        if limit:
            # Strip whitespace first, then semicolon, to handle Windows line endings
            query = query.strip().rstrip(';') + f"\nLIMIT {limit};"

        df = self.run_read_query(query, description)

        # Validate required columns (run_read_query lower-cases column names)
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.warning(f"Missing expected columns: {missing_cols}")

        return df

    def fetch_forum_posts_with_context(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch forum posts with thread context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of posts to fetch

        Returns:
            DataFrame containing forum posts with context

        Raises:
            Exception: re-raised after logging if the fetch fails.
        """
        try:
            return self._fetch_from_sql_file(
                sql_file_path,
                limit,
                "forum posts with context",
                ['post_id', 'post_content', 'thread_id']
            )
        except Exception as e:
            logger.error(f"Error fetching forum posts: {e}")
            raise

    def fetch_comments(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch social media comments with context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of comments to fetch

        Returns:
            DataFrame containing comments with context

        Raises:
            Exception: re-raised after logging if the fetch fails.
        """
        try:
            return self._fetch_from_sql_file(
                sql_file_path,
                limit,
                "social media comments with context",
                ['comment_sk', 'comment_id', 'comment_text', 'platform']
            )
        except Exception as e:
            logger.error(f"Error fetching comments: {e}")
            raise

    def close_connection(self) -> None:
        """Close the Snowflake session, logging (not raising) on failure."""
        try:
            self.session.close()
            logger.info("Snowflake connection closed")
        except Exception as e:
            logger.error(f"Error closing connection: {e}")
|
processing_brand_sentiment/database/sql/create_comments_output_table.sql
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Output table for Sabian brand sentiment analysis on social media comments.
-- Holds one row per processed comment with the extracted brand intelligence.
-- Schema v4.0: same analysis fields as the forum table, comment-specific source identifiers.
-- NOTE(review): unlike SABIAN_BRAND_ANALYSIS (forum table), no PRIMARY KEY is
-- declared here — confirm whether COMMENT_SK should carry one.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (JSON arrays serialized as text)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR perspective only — NULL when the comment gives advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
|
| 79 |
+
|
| 80 |
+
-- FIX: Snowflake does not support CREATE INDEX on standard tables (indexes
-- exist only for hybrid tables), so the previous CREATE INDEX statements would
-- fail at execution time. Snowflake prunes automatically via micro-partition
-- metadata; for selective point lookups (COMMENT_SK, PLATFORM, AUTHOR_ROLE,
-- etc.) enable search optimization on the table instead.
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
    ADD SEARCH OPTIMIZATION;
|
| 89 |
+
|
| 90 |
+
-- Convenience view: only comments that were judged brand-relevant AND were
-- processed without errors — the slice downstream consumers should query.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;
|
| 96 |
+
|
| 97 |
+
-- Review queue: comments the validator flagged or failed outright.
-- Rows where VALIDATION_PASSED is NULL are excluded by three-valued logic.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_FLAGGED AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    PLATFORM,
    ORIGINAL_TEXT,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE (PROCESSING_STATUS = 'completed_with_flags'
       OR VALIDATION_PASSED = FALSE)
-- NOTE: ORDER BY in a view is not guaranteed to survive outer queries;
-- kept for ad-hoc SELECTs on the view itself.
ORDER BY PROCESSED_AT DESC;
|
| 117 |
+
|
| 118 |
+
-- Sentiment distribution across relevant, successfully processed comments,
-- broken down per platform / sentiment / emotion / target.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION AS
SELECT
    PLATFORM,
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*) AS COMMENT_COUNT,
    -- COUNT_IF is Snowflake's native conditional count; equivalent to
    -- COUNT(CASE WHEN cond THEN 1 END) (counts only rows where cond is TRUE).
    COUNT_IF(SARCASM_DETECTED = TRUE) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY PLATFORM, SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY COMMENT_COUNT DESC;
|
| 132 |
+
|
| 133 |
+
-- Product mention counts per platform/sentiment, exploding the
-- PRODUCTS_MENTIONED JSON array with LATERAL FLATTEN.
-- TRY_PARSE_JSON returns NULL (no rows flattened) for malformed JSON,
-- so bad rows are silently skipped rather than erroring.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_PRODUCT_MENTIONS AS
SELECT
    PLATFORM,
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    -- COUNT_IF: Snowflake-native conditional count, equivalent to
    -- COUNT(CASE WHEN cond THEN 1 END).
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY PLATFORM, TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;
|
| 149 |
+
|
| 150 |
+
-- Validation summary: row counts per platform / processing status /
-- validation outcome, split into relevant vs not-relevant.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_VALIDATION_SUMMARY AS
SELECT
    PLATFORM,
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS COMMENT_COUNT,
    -- COUNT_IF replaces COUNT(CASE WHEN ... THEN 1 END); rows where
    -- IS_RELEVANT is NULL are counted by neither bucket.
    COUNT_IF(IS_RELEVANT = TRUE) AS RELEVANT_COUNT,
    COUNT_IF(IS_RELEVANT = FALSE) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
GROUP BY PLATFORM, PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY COMMENT_COUNT DESC;
|
processing_brand_sentiment/database/sql/create_output_table.sql
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Output table for Sabian brand sentiment analysis of forum posts.
-- One row per processed post with the extracted brand intelligence.
-- Schema v4.0: adds THREAD_CONTEXT_SUMMARY, validation fields, and processing status.
-- NOTE(review): COMMENT_TIMESTAMP in the comments table is TIMESTAMP_NTZ while
-- these post timestamps are TIMESTAMP_LTZ — confirm the inconsistency is intended.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: summarized thread context used by analysis

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Category information
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (JSON arrays serialized as text)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR perspective only — NULL when advising others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results (NEW v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags (e.g. "sarcasm_detected", "low_confidence_relevant")

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: 4-agent pipeline with extraction/analysis separation, thread context summarization, and validation.';
|
| 89 |
+
|
| 90 |
+
-- FIX: Snowflake does not support CREATE INDEX on standard tables (indexes
-- exist only for hybrid tables), so the previous CREATE INDEX statements would
-- fail at execution time. Snowflake prunes automatically via micro-partition
-- metadata; for selective point lookups (THREAD_ID, AUTHOR_ROLE, etc.) enable
-- search optimization on the table instead.
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
    ADD SEARCH OPTIMIZATION;
|
| 99 |
+
|
| 100 |
+
-- Convenience view: only posts judged brand-relevant AND processed without
-- errors — the slice downstream consumers should query.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;
|
| 106 |
+
|
| 107 |
+
-- Review queue: posts the validator flagged or failed outright.
-- Rows where VALIDATION_PASSED is NULL are excluded by three-valued logic.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_FLAGGED_POSTS AS
SELECT
    POST_ID,
    THREAD_ID,
    CLEANED_CONTENT,
    THREAD_CONTEXT_SUMMARY,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE (PROCESSING_STATUS = 'completed_with_flags'
       OR VALIDATION_PASSED = FALSE)
-- NOTE: ORDER BY in a view is not guaranteed to survive outer queries;
-- kept for ad-hoc SELECTs on the view itself.
ORDER BY PROCESSED_AT DESC;
|
| 127 |
+
|
| 128 |
+
-- Sentiment distribution across relevant, successfully processed posts.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_SENTIMENT_DISTRIBUTION AS
SELECT
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*) AS POST_COUNT,
    -- COUNT_IF is Snowflake's native conditional count; equivalent to
    -- COUNT(CASE WHEN cond THEN 1 END) (counts only rows where cond is TRUE).
    COUNT_IF(SARCASM_DETECTED = TRUE) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY POST_COUNT DESC;
|
| 141 |
+
|
| 142 |
+
-- Product mention counts per sentiment, exploding the PRODUCTS_MENTIONED JSON
-- array via LATERAL FLATTEN. TRY_PARSE_JSON yields NULL (no rows) on
-- malformed JSON, silently skipping bad rows instead of erroring.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PRODUCT_MENTIONS AS
SELECT
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    -- COUNT_IF: Snowflake-native conditional count, equivalent to
    -- COUNT(CASE WHEN cond THEN 1 END).
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;
|
| 157 |
+
|
| 158 |
+
-- Competitor mention analysis: how often each competitor is named, under what
-- comparison/switching context, and the Sabian sentiment of those posts.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_ANALYSIS AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR,
    COMPARISON_TYPE,
    BRAND_SWITCHING,
    COUNT(*) AS MENTION_COUNT,
    -- COUNT_IF replaces COUNT(CASE WHEN ... THEN 1 END) (Snowflake-native).
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_SENTIMENT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_SENTIMENT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITORS_MENTIONED)) AS competitor
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITORS_MENTIONED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), COMPARISON_TYPE, BRAND_SWITCHING
ORDER BY MENTION_COUNT DESC;
|
| 174 |
+
|
| 175 |
+
-- Pain-point frequency: explodes the PAIN_POINTS JSON array and counts how
-- often each pain point occurs, along with the distinct sentiment levels
-- it co-occurs with.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PAIN_POINTS AS
SELECT
    TRIM(pain_point.VALUE::STRING) AS PAIN_POINT,
    COUNT(*) AS OCCURRENCE_COUNT,
    ARRAY_AGG(DISTINCT SENTIMENT_LEVEL) AS SENTIMENT_LEVELS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PAIN_POINTS)) AS pain_point
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PAIN_POINTS IS NOT NULL
GROUP BY TRIM(pain_point.VALUE::STRING)
ORDER BY OCCURRENCE_COUNT DESC;
|
| 188 |
+
|
| 189 |
+
-- Post counts and sentiment split per author role and mention context.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_AUTHOR_ROLES AS
SELECT
    AUTHOR_ROLE,
    SABIAN_MENTION_CONTEXT,
    COUNT(*) AS POST_COUNT,
    -- COUNT_IF replaces COUNT(CASE WHEN ... THEN 1 END) (Snowflake-native).
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_COUNT,
    COUNT_IF(SENTIMENT_LEVEL = 'neutral') AS NEUTRAL_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY AUTHOR_ROLE, SABIAN_MENTION_CONTEXT
ORDER BY POST_COUNT DESC;
|
| 203 |
+
|
| 204 |
+
-- Sentiment toward Sabian broken down by which competitor brands the author owns.
-- NOTE(review): AUTHOR_COUNT counts posts, not distinct authors — confirm
-- whether COUNT(DISTINCT POST_AUTHOR_ID) was intended.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_OWNERSHIP AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR_OWNED,
    AUTHOR_ROLE,
    COUNT(*) AS AUTHOR_COUNT,
    -- COUNT_IF replaces COUNT(CASE WHEN ... THEN 1 END) (Snowflake-native).
    COUNT_IF(SENTIMENT_LEVEL IN ('positive', 'very_positive')) AS POSITIVE_TOWARD_SABIAN,
    COUNT_IF(SENTIMENT_LEVEL IN ('negative', 'very_negative')) AS NEGATIVE_TOWARD_SABIAN
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITOR_PRODUCTS_OWNED)) AS competitor
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITOR_PRODUCTS_OWNED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), AUTHOR_ROLE
ORDER BY AUTHOR_COUNT DESC;
|
| 219 |
+
|
| 220 |
+
-- Sentiment by mention depth: maps SENTIMENT_LEVEL to a -2..+2 score and
-- averages it per mention context (very_positive=2 ... very_negative=-2;
-- unknown values score 0 via the ELSE branch).
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_MENTION_DEPTH AS
SELECT
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    COUNT(*) AS POST_COUNT,
    AVG(CASE
            WHEN SENTIMENT_LEVEL = 'very_positive' THEN 2
            WHEN SENTIMENT_LEVEL = 'positive'      THEN 1
            WHEN SENTIMENT_LEVEL = 'neutral'       THEN 0
            WHEN SENTIMENT_LEVEL = 'negative'      THEN -1
            WHEN SENTIMENT_LEVEL = 'very_negative' THEN -2
            ELSE 0
        END) AS AVG_SENTIMENT_SCORE
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SABIAN_MENTION_CONTEXT, SENTIMENT_LEVEL
ORDER BY SABIAN_MENTION_CONTEXT, POST_COUNT DESC;
|
| 239 |
+
|
| 240 |
+
-- Validation summary (NEW v4.0): row counts per processing status and
-- validation outcome, split into relevant vs not-relevant.
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_VALIDATION_SUMMARY AS
SELECT
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS POST_COUNT,
    -- COUNT_IF replaces COUNT(CASE WHEN ... THEN 1 END); rows where
    -- IS_RELEVANT is NULL are counted by neither bucket.
    COUNT_IF(IS_RELEVANT = TRUE) AS RELEVANT_COUNT,
    COUNT_IF(IS_RELEVANT = FALSE) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
GROUP BY PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY POST_COUNT DESC;
|
processing_brand_sentiment/database/sql/fetch_comments.sql
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Query to fetch social media comments with context for brand sentiment analysis
-- Source: SOCIAL_MEDIA_DB.brand_sentiment.SABIAN_comments (same structure as CORE.FACT_COMMENTS)
-- Includes: comment content, parent comment text, content metadata, channel info
-- Excludes: official accounts, already-processed comments, empty comments

SELECT
    -- Comment identifiers
    fc.COMMENT_SK,
    fc.COMMENT_ID,
    fc.PLATFORM,
    fc.MESSAGE AS COMMENT_TEXT,
    fc.CREATED_TIME AS COMMENT_TIMESTAMP,
    fc.AUTHOR_NAME,
    fc.AUTHOR_ID,
    fc.LIKE_COUNT,
    fc.PARENT_COMMENT_ID,
    fc.REPLIES_COUNT,
    fc.COMMENT_LENGTH,
    fc.IS_ACTIVE AS COMMENT_IS_ACTIVE,

    -- Parent comment text via self-join (NULL for top-level comments)
    parent_fc.MESSAGE AS PARENT_COMMENT_TEXT,

    -- Content the comment was left on
    dc.CONTENT_SK,
    dc.CONTENT_ID,
    dc.CONTENT_TYPE,
    dc.MESSAGE AS CONTENT_DESCRIPTION,
    dc.TITLE AS CONTENT_TITLE,
    dc.PERMALINK_URL,
    dc.CREATED_TIME AS CONTENT_TIMESTAMP,

    -- Channel that owns the content
    dch.CHANNEL_SK,
    dch.CHANNEL_NAME,
    dch.CHANNEL_DISPLAY_NAME

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS fc

-- Self-join to attach the parent comment's text when one exists
LEFT JOIN
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS parent_fc
    ON fc.PARENT_COMMENT_ID = parent_fc.COMMENT_ID
    AND fc.PLATFORM = parent_fc.PLATFORM

INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CONTENT dc
    ON fc.CONTENT_SK = dc.CONTENT_SK

INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL dch
    ON dc.CHANNEL_NAME = dch.CHANNEL_NAME
    AND dc.PLATFORM = dch.PLATFORM

-- Anti-join against the output table to skip already-processed comments
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS sba
    ON fc.COMMENT_SK = sba.COMMENT_SK

WHERE
    -- Active records only
    fc.IS_ACTIVE = TRUE
    AND dc.IS_ACTIVE = TRUE
    AND dch.IS_ACTIVE = TRUE

    -- Exclude official accounts (comments with NULL author names are kept)
    AND (fc.AUTHOR_NAME IS NULL OR fc.AUTHOR_NAME NOT IN (
        'Musora', 'Drumeo', 'Pianote',
        '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial'
    ))

    -- Anti-join completion: keep only comments with no analysis row yet
    AND sba.COMMENT_SK IS NULL

    -- Ensure comment has non-blank content
    -- (fix: removed LENGTH(TRIM(fc.MESSAGE)) > 0, which was an exact
    -- duplicate of the TRIM(fc.MESSAGE) != '' condition)
    AND fc.MESSAGE IS NOT NULL
    AND TRIM(fc.MESSAGE) != ''

ORDER BY
    fc.CREATED_TIME DESC;
|
processing_brand_sentiment/database/sql/fetch_forum_posts.sql
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Query to fetch forum posts with thread context for brand sentiment analysis
-- Includes: post content, thread context (title, first post), parent relationships
-- Excludes: team/house-coach posts, already-processed posts, deleted posts

WITH thread_first_posts AS (
    -- First post (by creation date) of each thread, used as thread context.
    -- QUALIFY filters on the window result directly (Snowflake), replacing
    -- the previous nested ROW_NUMBER subquery. POST_ID is added as a
    -- tiebreaker so ties on POST_CREATED_AT resolve deterministically.
    SELECT
        THREAD_ID,
        POST_CONTENT AS FIRST_POST_CONTENT,
        POST_AUTHOR_ID AS FIRST_POST_AUTHOR_ID,
        POST_CREATED_AT AS FIRST_POST_CREATED_AT
    FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS
    WHERE POST_CONTENT IS NOT NULL
      AND TRIM(POST_CONTENT) != ''
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY THREAD_ID
        ORDER BY POST_CREATED_AT ASC, POST_ID ASC
    ) = 1
)

SELECT
    -- Post identifiers
    fp.POST_ID,
    fp.POST_AUTHOR_ID,
    fp.THREAD_ID,

    -- Post content (may contain HTML with quoted parent)
    fp.POST_CONTENT,

    -- Post timestamps
    fp.POST_CREATED_AT,
    fp.POST_EDITED_ON,
    fp.POST_PUBLISHED_ON,
    fp.POST_STATE,

    -- Parent/Child relationships (for context)
    fp.PROMPTING_POST_ID,
    fp.PARENT_ID,
    fp.PARENT_CONTENT,
    fp.PARENT_AUTHOR_ID,
    fp.PARENT_CREATED_AT,
    fp.CHILD_ID,
    fp.CHILD_CONTENT,

    -- Thread context
    fp.THREAD_TITLE,
    fp.THREAD_SLUG,
    fp.THREAD_STATE,
    fp.THREAD_LOCKED,
    fp.THREAD_PINNED,
    fp.THREAD_POST_COUNT,
    fp.THREAD_PUBLISHED_ON,

    -- First post of the thread (for context)
    tfp.FIRST_POST_CONTENT AS THREAD_FIRST_POST,
    tfp.FIRST_POST_CREATED_AT AS THREAD_STARTED_AT,

    -- Category information
    fp.CATEGORY_ID,
    fp.CATEGORY_BRAND,
    fp.CATEGORY_DESCRIPTION,
    fp.CATEGORY_TITLE,
    fp.CATEGORY_TOPIC,
    fp.CATEGORY_SLUG,

    -- Access levels (for filtering)
    fp.POST_AUTHOR_ACCESS_LEVEL,
    fp.PARENT_AUTHOR_ACCESS_LEVEL,
    fp.CHILD_AUTHOR_ACCESS_LEVEL

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS fp

-- Join to get thread's first post for context
LEFT JOIN
    thread_first_posts tfp ON fp.THREAD_ID = tfp.THREAD_ID

-- Anti-join against the output table to skip already-processed posts
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS sba
    ON fp.POST_ID = sba.POST_ID

WHERE
    -- Exclude team and house-coach posts (internal comments)
    (fp.POST_AUTHOR_ACCESS_LEVEL IS NULL OR fp.POST_AUTHOR_ACCESS_LEVEL NOT IN ('team', 'house-coach'))

    -- Exclude deleted posts
    AND (fp.POST_STATE IS NULL OR fp.POST_STATE != 'deleted')
    AND fp.POST_DELETED_AT IS NULL

    -- Anti-join completion: keep only posts with no analysis row yet
    AND sba.POST_ID IS NULL

    -- Ensure post has non-blank content
    -- (fix: removed LENGTH(TRIM(fp.POST_CONTENT)) > 0, which duplicated
    -- the TRIM(fp.POST_CONTENT) != '' condition)
    AND fp.POST_CONTENT IS NOT NULL
    AND TRIM(fp.POST_CONTENT) != ''

ORDER BY
    fp.POST_CREATED_AT DESC;
|
processing_brand_sentiment/database/sql/init_comments_output_table.sql
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Initialize empty output table for Sabian brand sentiment analysis on social media comments
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed comments
--
-- NOTE(review): unlike SABIAN_BRAND_ANALYSIS (forum posts, which declares POST_ID
-- as PRIMARY KEY), no key is declared here; deduplication presumably relies on the
-- fetch query's anti-join against this table — confirm whether COMMENT_SK or
-- COMMENT_ID should carry an (informational) PRIMARY KEY for consistency.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    COMMENT_SK NUMBER(38,0),                      -- surrogate key from the source warehouse
    COMMENT_ID VARCHAR(16777216),                 -- platform-native comment identifier
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),          -- null for top-level comments
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays serialized to text)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
|
processing_brand_sentiment/database/sql/init_output_table.sql
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Initialize empty output table for Sabian brand sentiment analysis
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed posts
-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status
--
-- NOTE(review): Snowflake PRIMARY KEY constraints are informational (not enforced);
-- the fetch query's anti-join on POST_ID is the actual deduplication mechanism.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    POST_ID NUMBER(38,0) PRIMARY KEY,             -- one analysis row per forum post
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Category information
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays serialized to text)
    PRODUCTS_MENTIONED VARCHAR(16777216),
    PRODUCT_ATTRIBUTES VARCHAR(16777216),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216),
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- Multi-label: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
    PURCHASE_STAGE VARCHAR(50), -- AUTHOR's own stage only
    DECISION_DRIVERS VARCHAR(16777216), -- AUTHOR's own decision drivers only
    PAIN_POINTS VARCHAR(16777216), -- AUTHOR's negative feedback aspects (uses feedback_aspects categories)
    DELIGHT_FACTORS VARCHAR(16777216), -- AUTHOR's positive feedback aspects (uses feedback_aspects categories)

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results (NEW v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: Added thread_context_summary, validation fields, and processing status.';
|
processing_brand_sentiment/main.py
ADDED
|
@@ -0,0 +1,1088 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main execution script for brand sentiment analysis workflow.
|
| 3 |
+
Orchestrates data fetching, processing, and storage using an agentic workflow.
|
| 4 |
+
Supports parallel processing with multiprocessing for improved performance.
|
| 5 |
+
Supports multiple data sources: forums, social media comments, or both.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import logging
|
| 11 |
+
import argparse
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
from multiprocessing import Pool, cpu_count
|
| 16 |
+
import traceback
|
| 17 |
+
from typing import Dict, Any, List
|
| 18 |
+
|
| 19 |
+
from database.snowflake_connection import SnowFlakeConn
|
| 20 |
+
from workflow.orchestrator import BrandAnalysisWorkflow
|
| 21 |
+
from workflow.comment_orchestrator import CommentAnalysisWorkflow
|
| 22 |
+
|
| 23 |
+
# Get the directory where this script is located
|
| 24 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 25 |
+
|
| 26 |
+
# Load environment variables
|
| 27 |
+
load_dotenv(os.path.join(SCRIPT_DIR, '.env'))
|
| 28 |
+
|
| 29 |
+
# Ensure logs directory exists
|
| 30 |
+
LOGS_DIR = os.path.join(SCRIPT_DIR, 'logs')
|
| 31 |
+
os.makedirs(LOGS_DIR, exist_ok=True)
|
| 32 |
+
|
| 33 |
+
# Configure logging
|
| 34 |
+
logging.basicConfig(
|
| 35 |
+
level=logging.INFO,
|
| 36 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 37 |
+
handlers=[
|
| 38 |
+
logging.FileHandler(
|
| 39 |
+
os.path.join(LOGS_DIR, f'brand_sentiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
|
| 40 |
+
),
|
| 41 |
+
logging.StreamHandler()
|
| 42 |
+
]
|
| 43 |
+
)
|
| 44 |
+
logger = logging.getLogger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ============================================================
|
| 48 |
+
# Configuration Loading
|
| 49 |
+
# ============================================================
|
| 50 |
+
|
| 51 |
+
def load_configs(config_dir: str = None) -> Dict[str, Dict]:
    """
    Load all configuration files.

    Args:
        config_dir: Directory containing config files. Defaults to the
            'config_files' directory next to this script.

    Returns:
        Dictionary with keys 'workflow', 'brand', and 'categories', each
        mapping to the parsed JSON contents of the corresponding file.

    Raises:
        FileNotFoundError: If any expected config file is missing.
        json.JSONDecodeError: If any config file contains invalid JSON.
    """
    if config_dir is None:
        config_dir = os.path.join(SCRIPT_DIR, 'config_files')

    # Map output keys to their backing files; a single loop replaces the
    # three copy-pasted open/load blocks and keeps the load sites identical.
    config_files = {
        'workflow': 'workflow_config.json',
        'brand': 'brand_config.json',
        'categories': 'analysis_categories.json',
    }

    configs = {}
    for key, filename in config_files.items():
        with open(os.path.join(config_dir, filename), 'r') as f:
            configs[key] = json.load(f)

    return configs
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ============================================================
|
| 82 |
+
# Batch Processing Utilities
|
| 83 |
+
# ============================================================
|
| 84 |
+
|
| 85 |
+
def calculate_optimal_batch_size(
    total_posts: int,
    num_workers: int,
    min_batch: int = 20,
    max_batch: int = 500
) -> int:
    """
    Calculate optimal batch size based on total posts and workers.

    Args:
        total_posts: Total number of posts to process
        num_workers: Number of parallel workers (values < 1 are treated as 1)
        min_batch: Minimum batch size
        max_batch: Maximum batch size

    Returns:
        Optimal batch size: the whole workload when it fits in one minimum
        batch, otherwise an even per-worker split clamped to
        [min_batch, max_batch].
    """
    # Small workloads are processed as a single batch (may be 0 for no posts).
    if total_posts <= min_batch:
        return total_posts

    # Guard against a zero/negative worker count, which would raise
    # ZeroDivisionError or produce a nonsensical batch size.
    workers = max(1, num_workers)

    # Evenly split posts across workers, then clamp to the configured bounds.
    batch_size = total_posts // workers
    return max(min_batch, min(max_batch, batch_size))
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def safe_to_json(value: Any) -> Any:
    """
    Safely convert a value to JSON string.
    Handles None, NaN, lists, and already-string values.

    Args:
        value: Value to convert

    Returns:
        JSON string if list, None if null/empty, original value otherwise
    """
    # Null-ish inputs (None or float NaN) normalize to None.
    if value is None:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    # Lists serialize to JSON text; an empty list becomes None.
    if isinstance(value, list):
        return json.dumps(value) if value else None
    # Non-empty strings pass through unchanged; empty string becomes None.
    if isinstance(value, str):
        return value or None
    # Anything else (numbers, booleans, timestamps, ...) is returned as-is.
    return value
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def safe_json_list_length(value: Any) -> int:
    """
    Safely get the length of a JSON array string.
    Handles None, NaN, empty strings, and invalid JSON.

    Args:
        value: Value to parse (expected JSON string of array)

    Returns:
        Length of the array, or 0 if invalid/empty
    """
    # Anything that is not JSON text (None, NaN, numbers, lists, ...) counts as 0.
    if not isinstance(value, str):
        return 0
    # Fast path for empty / trivially-empty payloads.
    if value in ('', '[]', 'null'):
        return 0
    # Parse and count only when the payload decodes to an actual list.
    try:
        decoded = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return 0
    return len(decoded) if isinstance(decoded, list) else 0
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def calculate_batch_stats(df: pd.DataFrame) -> Dict[str, int]:
    """
    Calculate statistics from batch results.
    Handles null values safely for all fields.

    Args:
        df: DataFrame with processed results

    Returns:
        Dictionary with statistics (all counters default to 0)
    """
    stats = {
        'relevant_count': 0,
        'not_relevant_count': 0,
        'products_mentioned_count': 0,
        'competitors_mentioned_count': 0,
        'positive_sentiment_count': 0,
        'negative_sentiment_count': 0,
        # Author role stats
        'current_owner_count': 0,
        'potential_buyer_count': 0,
        'primary_focus_count': 0,
    }

    # Nothing to tally for an empty batch.
    if df.empty:
        return stats

    # Relevance split: only non-null flags are counted on either side.
    if 'IS_RELEVANT' in df.columns:
        flags = df['IS_RELEVANT'].dropna()
        if len(flags):
            as_bool = flags.astype(bool)
            stats['relevant_count'] = int(as_bool.sum())
            stats['not_relevant_count'] = int((~as_bool).sum())

    # JSON-array columns: total item count across all rows (0 for bad/empty JSON).
    for col, key in (
        ('PRODUCTS_MENTIONED', 'products_mentioned_count'),
        ('COMPETITORS_MENTIONED', 'competitors_mentioned_count'),
    ):
        if col in df.columns:
            stats[key] = int(df[col].apply(safe_json_list_length).sum())

    # Sentiment distribution over the non-null levels.
    if 'SENTIMENT_LEVEL' in df.columns:
        sentiments = df['SENTIMENT_LEVEL'].dropna()
        if not sentiments.empty:
            stats['positive_sentiment_count'] = int(
                sentiments.isin(['positive', 'very_positive']).sum()
            )
            stats['negative_sentiment_count'] = int(
                sentiments.isin(['negative', 'very_negative']).sum()
            )

    # Author role breakdown.
    if 'AUTHOR_ROLE' in df.columns:
        roles = df['AUTHOR_ROLE'].dropna()
        if not roles.empty:
            stats['current_owner_count'] = int((roles == 'current_owner').sum())
            stats['potential_buyer_count'] = int((roles == 'potential_buyer').sum())

    # How often Sabian was the primary focus of the post.
    if 'SABIAN_MENTION_CONTEXT' in df.columns:
        contexts = df['SABIAN_MENTION_CONTEXT'].dropna()
        if not contexts.empty:
            stats['primary_focus_count'] = int((contexts == 'primary_focus').sum())

    return stats
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate results from multiple batches.

    Args:
        results: List of batch result dictionaries

    Returns:
        Aggregated statistics dictionary (missing keys count as 0)
    """
    # Every counter is simply summed across all batch dicts.
    counter_keys = (
        'total_processed', 'total_stored', 'failed_count',
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        'current_owner_count', 'potential_buyer_count', 'primary_focus_count',
    )
    aggregated = {key: sum(r.get(key, 0) for r in results) for key in counter_keys}

    # A batch whose 'success' flag is missing or falsy counts as failed.
    failed = [r for r in results if not r.get('success', False)]
    aggregated['failed_batches'] = len(failed)

    # Surface every failed batch so the run log pinpoints what to retry.
    if failed:
        logger.error(f"{len(failed)} batch(es) failed:")
        for batch in failed:
            logger.error(f" Batch {batch.get('batch_num')}: {batch.get('error')}")

    return aggregated
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ============================================================
|
| 276 |
+
# Forum Processing (existing functionality)
|
| 277 |
+
# ============================================================
|
| 278 |
+
|
| 279 |
+
# Columns that should be converted from lists to JSON strings
# (applied by prepare_forum_output_dataframe before Snowflake storage;
# keys here are the lowercase workflow-state field names).
FORUM_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from forum workflow state to output table.
# Keys are workflow-state fields; values are the uppercase column names of
# SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS. Mapping order determines
# the column order of the output DataFrame.
FORUM_COLUMN_MAPPING = {
    # Source identifiers
    'post_id': 'POST_ID',
    'thread_id': 'THREAD_ID',
    'post_author_id': 'POST_AUTHOR_ID',
    # Original and processed content
    'original_content': 'ORIGINAL_CONTENT',
    'cleaned_content': 'CLEANED_CONTENT',
    'quoted_content': 'QUOTED_CONTENT',
    'raw_thread_context': 'THREAD_CONTEXT',
    'thread_context_summary': 'THREAD_CONTEXT_SUMMARY',
    # Thread metadata and timestamps
    'thread_title': 'THREAD_TITLE',
    'thread_first_post': 'THREAD_FIRST_POST',
    'post_created_at': 'POST_CREATED_AT',
    'thread_started_at': 'THREAD_STARTED_AT',
    # Category information
    'category_title': 'CATEGORY_TITLE',
    'category_topic': 'CATEGORY_TOPIC',
    # Language detection
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    # Relevance assessment
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    # Author classification
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    # Sentiment analysis
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    # Product information (JSON-array fields)
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    # Competitive intelligence
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    # Customer journey
    'intents': 'INTENTS',
    'purchase_stage': 'PURCHASE_STAGE',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    # Analysis notes
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    # Validation results
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Processing metadata
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def prepare_forum_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare forum output DataFrame with proper column mapping.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    # Rename workflow-state columns to their Snowflake target names; columns
    # absent from the input are still emitted (all-null) so the schema is stable.
    for source_col, target_col in FORUM_COLUMN_MAPPING.items():
        if source_col not in df.columns:
            prepared[target_col] = None
            continue
        series = df[source_col].copy()
        if source_col in FORUM_JSON_ARRAY_COLUMNS:
            # List-valued fields are stored as JSON text.
            series = series.apply(safe_to_json)
        prepared[target_col] = series

    # Constant metadata columns appended after the mapped fields.
    prepared['PLATFORM'] = 'musora_forums'
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def process_forum_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of forum posts.
    Runs in a separate process.

    Args:
        batch_data: Tuple containing (batch_num, posts, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. On failure the dictionary carries
        'success': False and an 'error' message instead of analysis stats.
    """
    batch_num, posts, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"ForumWorker-{batch_num}")

    snowflake = None
    try:
        worker_logger.info(f"Forum Batch {batch_num}: Starting processing of {len(posts)} posts")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = BrandAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process posts
        results = workflow.process_batch(posts)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results. Guard against an empty result list:
        # pd.DataFrame([]) has no 'success' column, and indexing it would
        # raise KeyError and misreport the whole batch as failed.
        initial_count = len(results_df)
        if 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Forum Batch {batch_num}: Processed {initial_count} posts, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_forum_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the first batch may truncate the target table.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Forum Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Forum Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Forum Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(posts),
            'total_stored': 0,
            'failed_count': len(posts),
            'error': error_msg
        }

    finally:
        # Close the connection on both success and failure paths; previously
        # it was closed only on success, leaking it whenever an exception
        # occurred anywhere after SnowFlakeConn() was created.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(f"Forum Batch {batch_num}: Failed to close Snowflake connection")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
# ============================================================
|
| 461 |
+
# Comment Processing (new functionality)
|
| 462 |
+
# ============================================================
|
| 463 |
+
|
| 464 |
+
# Columns that should be converted from lists to JSON strings (same analysis fields)
# Serialization is done via safe_to_json in prepare_comment_output_dataframe
# before the DataFrame is written to Snowflake.
COMMENT_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]
|
| 471 |
+
|
| 472 |
+
# Column mapping from comment workflow state to output table
# Keys are workflow-state field names; values are the corresponding
# Snowflake column names in the comments output table.
COMMENT_COLUMN_MAPPING = {
    # Comment-specific identifiers
    'comment_sk': 'COMMENT_SK',
    'comment_id': 'COMMENT_ID',
    'original_text': 'ORIGINAL_TEXT',
    'platform': 'PLATFORM',
    'comment_timestamp': 'COMMENT_TIMESTAMP',
    'author_name': 'AUTHOR_NAME',
    'author_id': 'AUTHOR_ID',
    'content_sk': 'CONTENT_SK',
    'content_id': 'CONTENT_ID',
    'content_description': 'CONTENT_DESCRIPTION',
    'channel_sk': 'CHANNEL_SK',
    'channel_name': 'CHANNEL_NAME',
    'channel_display_name': 'CHANNEL_DISPLAY_NAME',
    'parent_comment_id': 'PARENT_COMMENT_ID',
    'parent_comment_text': 'PARENT_COMMENT_TEXT',
    # Analysis fields (same as forums)
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'purchase_stage': 'PURCHASE_STAGE',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Note the rename: workflow 'success' becomes PROCESSING_SUCCESS.
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def prepare_comment_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map comment processing results onto the Snowflake output schema.

    Columns listed in COMMENT_COLUMN_MAPPING that are missing from the
    input are emitted as all-NULL columns; list-valued analysis fields
    are serialized to JSON strings before storage.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    for workflow_field, snowflake_column in COMMENT_COLUMN_MAPPING.items():
        if workflow_field not in df.columns:
            # Keep the output schema stable even when a field is absent.
            prepared[snowflake_column] = None
            continue
        column = df[workflow_field].copy()
        if workflow_field in COMMENT_JSON_ARRAY_COLUMNS:
            column = column.apply(safe_to_json)
        prepared[snowflake_column] = column

    # Metadata columns appended after the mapped fields.
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
def process_comment_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of social media comments.
    Runs in a separate process.

    Args:
        batch_data: Tuple containing (batch_num, comments, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics. On failure the dictionary carries
        'success': False and an 'error' message instead of analysis stats.
    """
    batch_num, comments, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"CommentWorker-{batch_num}")

    snowflake = None
    try:
        worker_logger.info(f"Comment Batch {batch_num}: Starting processing of {len(comments)} comments")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize comment workflow for this worker
        workflow = CommentAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process comments
        results = workflow.process_batch(comments)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results. Guard against an empty result list:
        # pd.DataFrame([]) has no 'success' column, and indexing it would
        # raise KeyError and misreport the whole batch as failed.
        initial_count = len(results_df)
        if 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Comment Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_comment_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the first batch may truncate the target table.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Comment Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Comment Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Comment Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(comments),
            'total_stored': 0,
            'failed_count': len(comments),
            'error': error_msg
        }

    finally:
        # Close the connection on both success and failure paths; previously
        # it was closed only on success, leaking it whenever an exception
        # occurred anywhere after SnowFlakeConn() was created.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(f"Comment Batch {batch_num}: Failed to close Snowflake connection")
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
# ============================================================
|
| 648 |
+
# Main Processor Class
|
| 649 |
+
# ============================================================
|
| 650 |
+
|
| 651 |
+
class BrandSentimentProcessor:
    """
    Main processor class that orchestrates the entire workflow.
    Supports processing forums, social media comments, or both.

    Lifecycle: __init__ loads configuration and opens a Snowflake connection;
    run() fetches data, dispatches batch workers (parallel or sequential),
    logs a summary, and closes the connection in its finally block.
    """

    def __init__(self, config_dir: str = None):
        """
        Initialize the processor.

        Args:
            config_dir: Directory containing configuration files

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment.
        """
        # Load configurations
        self.configs = load_configs(config_dir)

        # Initialize Snowflake connection (closed in run()'s finally block)
        self.snowflake = SnowFlakeConn()

        # Get OpenAI API key
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Get output configurations; fall back to hard-coded defaults when the
        # workflow config omits the 'output' / 'comments_output' sections.
        self.forum_output_config = self.configs['workflow'].get('output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        self.comment_output_config = self.configs['workflow'].get('comments_output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS_COMMENTS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        logger.info("BrandSentimentProcessor initialized successfully")

    def fetch_forum_posts(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch forum posts from Snowflake.

        Args:
            limit: Optional limit on number of posts

        Returns:
            DataFrame containing post data
        """
        logger.info("Fetching forum posts...")

        # SQL lives next to this script under database/sql/.
        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_forum_posts.sql')
        df = self.snowflake.fetch_forum_posts_with_context(sql_path, limit)

        logger.info(f"Fetched {len(df)} forum posts")
        return df

    def fetch_comments(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch social media comments from Snowflake.

        Args:
            limit: Optional limit on number of comments

        Returns:
            DataFrame containing comment data
        """
        logger.info("Fetching social media comments...")

        # SQL lives next to this script under database/sql/.
        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_comments.sql')
        df = self.snowflake.fetch_comments(sql_path, limit)

        logger.info(f"Fetched {len(df)} social media comments")
        return df

    def calculate_num_workers(self) -> int:
        """
        Calculate number of parallel workers.

        Capped by the configured max_workers (default 5) and by CPU count
        minus two, with a floor of one worker.

        Returns:
            Number of workers
        """
        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        max_workers = parallel_config.get('max_workers', 5)

        num_cpus = cpu_count()
        # Leave two CPUs free for the parent process / OS; never go below 1.
        num_workers = max(1, min(max_workers, num_cpus - 2))

        logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
        return num_workers

    # ---- Forum Processing ----

    def process_forums_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts using parallel workers.

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        posts = df.to_dict('records')
        total_posts = len(posts)

        logger.info(f"Processing {total_posts} forum posts using parallel processing...")

        num_workers = self.calculate_num_workers()

        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        min_batch = parallel_config.get('min_batch_size', 20)
        max_batch = parallel_config.get('max_batch_size', 400)

        batch_size = calculate_optimal_batch_size(total_posts, num_workers, min_batch, max_batch)
        logger.info(f"Forum batch size: {batch_size}")

        # Create batches
        # NOTE(review): 'overwrite' is honored only by batch 1 inside the
        # workers; with parallel execution other batches may write before
        # batch 1 truncates the table — confirm intended ordering.
        batches = []
        for i in range(0, total_posts, batch_size):
            batch = posts[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.forum_output_config))

        total_batches = len(batches)
        logger.info(f"Split into {total_batches} forum batches")

        # Process in parallel
        with Pool(processes=num_workers) as pool:
            results = pool.map(process_forum_batch_worker, batches)

        return aggregate_results(results)

    def process_forums_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts sequentially (for debugging).

        Runs everything as a single batch (batch_num=1) in this process.

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        logger.info(f"Processing {len(df)} forum posts using sequential processing...")

        posts = df.to_dict('records')
        batch_data = (1, posts, self.configs, self.api_key, overwrite, self.forum_output_config)
        result = process_forum_batch_worker(batch_data)

        # Normalize the single-batch result into the aggregated-stats shape
        # returned by process_forums_parallel.
        return {
            'total_processed': result.get('total_processed', 0),
            'total_stored': result.get('total_stored', 0),
            'failed_count': result.get('failed_count', 0),
            'relevant_count': result.get('relevant_count', 0),
            'not_relevant_count': result.get('not_relevant_count', 0),
            'products_mentioned_count': result.get('products_mentioned_count', 0),
            'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
            'positive_sentiment_count': result.get('positive_sentiment_count', 0),
            'negative_sentiment_count': result.get('negative_sentiment_count', 0),
            'current_owner_count': result.get('current_owner_count', 0),
            'potential_buyer_count': result.get('potential_buyer_count', 0),
            'primary_focus_count': result.get('primary_focus_count', 0),
            'failed_batches': 0 if result.get('success', False) else 1
        }

    # ---- Comment Processing ----

    def process_comments_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments using parallel workers.

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        comments = df.to_dict('records')
        total_comments = len(comments)

        logger.info(f"Processing {total_comments} comments using parallel processing...")

        num_workers = self.calculate_num_workers()

        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        min_batch = parallel_config.get('min_batch_size', 20)
        max_batch = parallel_config.get('max_batch_size', 400)

        batch_size = calculate_optimal_batch_size(total_comments, num_workers, min_batch, max_batch)
        logger.info(f"Comment batch size: {batch_size}")

        # Create batches
        # NOTE(review): same first-batch overwrite ordering caveat as
        # process_forums_parallel.
        batches = []
        for i in range(0, total_comments, batch_size):
            batch = comments[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.comment_output_config))

        total_batches = len(batches)
        logger.info(f"Split into {total_batches} comment batches")

        # Process in parallel
        with Pool(processes=num_workers) as pool:
            results = pool.map(process_comment_batch_worker, batches)

        return aggregate_results(results)

    def process_comments_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments sequentially (for debugging).

        Runs everything as a single batch (batch_num=1) in this process.

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        logger.info(f"Processing {len(df)} comments using sequential processing...")

        comments = df.to_dict('records')
        batch_data = (1, comments, self.configs, self.api_key, overwrite, self.comment_output_config)
        result = process_comment_batch_worker(batch_data)

        # Normalize the single-batch result into the aggregated-stats shape
        # returned by process_comments_parallel.
        return {
            'total_processed': result.get('total_processed', 0),
            'total_stored': result.get('total_stored', 0),
            'failed_count': result.get('failed_count', 0),
            'relevant_count': result.get('relevant_count', 0),
            'not_relevant_count': result.get('not_relevant_count', 0),
            'products_mentioned_count': result.get('products_mentioned_count', 0),
            'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
            'positive_sentiment_count': result.get('positive_sentiment_count', 0),
            'negative_sentiment_count': result.get('negative_sentiment_count', 0),
            'current_owner_count': result.get('current_owner_count', 0),
            'potential_buyer_count': result.get('potential_buyer_count', 0),
            'primary_focus_count': result.get('primary_focus_count', 0),
            'failed_batches': 0 if result.get('success', False) else 1
        }

    # ---- Unified Processing ----

    def _log_source_summary(self, source_name: str, stats: Dict[str, Any], processing_time: float) -> None:
        """
        Log processing summary for a data source.

        Args:
            source_name: Name of the data source
            stats: Processing statistics
            processing_time: Time taken in seconds
        """
        logger.info(f"  --- {source_name} ---")
        logger.info(f"  Total processed: {stats.get('total_processed', 0)}")
        logger.info(f"  Successfully stored: {stats.get('total_stored', 0)}")
        logger.info(f"  Failed: {stats.get('failed_count', 0)}")
        logger.info(f"  Relevant: {stats.get('relevant_count', 0)}")
        logger.info(f"  Not relevant: {stats.get('not_relevant_count', 0)}")
        logger.info(f"  Product mentions: {stats.get('products_mentioned_count', 0)}")
        logger.info(f"  Competitor mentions: {stats.get('competitors_mentioned_count', 0)}")
        logger.info(f"  Positive sentiment: {stats.get('positive_sentiment_count', 0)}")
        logger.info(f"  Negative sentiment: {stats.get('negative_sentiment_count', 0)}")
        logger.info(f"  Current owners: {stats.get('current_owner_count', 0)}")
        logger.info(f"  Potential buyers: {stats.get('potential_buyer_count', 0)}")
        logger.info(f"  Primary focus: {stats.get('primary_focus_count', 0)}")
        if stats.get('failed_batches', 0) > 0:
            logger.info(f"  Failed batches: {stats['failed_batches']}")
        logger.info(f"  Processing time: {processing_time:.2f} seconds")
        if stats.get('total_processed', 0) > 0:
            logger.info(f"  Average per item: {processing_time / stats['total_processed']:.2f} seconds")

    def run(
        self,
        limit: int = None,
        overwrite: bool = False,
        sequential: bool = False,
        data_source: str = 'all'
    ):
        """
        Run the complete processing pipeline.

        Closes the Snowflake connection on exit (success or failure), so a
        processor instance cannot be reused after run() returns.

        Args:
            limit: Optional limit on items to process per source
            overwrite: Whether to overwrite existing table
            sequential: Use sequential processing instead of parallel
            data_source: Which data source to process ('forums', 'comments', 'all')
        """
        try:
            logger.info("=" * 80)
            logger.info("Starting Brand Sentiment Analysis Workflow")
            logger.info(f"Brand: {self.configs['brand'].get('brand', {}).get('name', 'Unknown')}")
            logger.info(f"Mode: {'SEQUENTIAL' if sequential else 'PARALLEL'}")
            logger.info(f"Data source: {data_source}")
            logger.info("=" * 80)

            process_forums = data_source in ('forums', 'all')
            process_comments = data_source in ('comments', 'all')

            # Track results for summary
            forum_stats = None
            forum_time = 0.0
            comment_stats = None
            comment_time = 0.0

            # ---- Process Forums ----
            if process_forums:
                logger.info("-" * 40)
                logger.info("Processing FORUMS")
                logger.info("-" * 40)

                df_posts = self.fetch_forum_posts(limit)

                if df_posts.empty:
                    logger.warning("No forum posts to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        forum_stats = self.process_forums_sequential(df_posts, overwrite)
                    else:
                        forum_stats = self.process_forums_parallel(df_posts, overwrite)

                    forum_time = (datetime.now() - start_time).total_seconds()

            # ---- Process Comments ----
            if process_comments:
                logger.info("-" * 40)
                logger.info("Processing SOCIAL MEDIA COMMENTS")
                logger.info("-" * 40)

                df_comments = self.fetch_comments(limit)

                if df_comments.empty:
                    logger.warning("No social media comments to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        comment_stats = self.process_comments_sequential(df_comments, overwrite)
                    else:
                        comment_stats = self.process_comments_parallel(df_comments, overwrite)

                    comment_time = (datetime.now() - start_time).total_seconds()

            # ---- Summary ----
            logger.info("=" * 80)
            logger.info("Processing Summary:")
            logger.info(f"  Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f"  Data source: {data_source}")

            if forum_stats is not None:
                self._log_source_summary("Forums", forum_stats, forum_time)

            if comment_stats is not None:
                self._log_source_summary("Social Media Comments", comment_stats, comment_time)

            logger.info("=" * 80)

        except Exception as e:
            logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
            raise

        finally:
            self.snowflake.close_connection()
            logger.info("Snowflake connection closed")
|
| 1018 |
+
|
| 1019 |
+
|
| 1020 |
+
# ============================================================
|
| 1021 |
+
# Legacy compatibility - keep old function names working
|
| 1022 |
+
# ============================================================
|
| 1023 |
+
|
| 1024 |
+
def prepare_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Backward-compatible alias kept for callers of the pre-4.0 API.

    Delegates directly to prepare_forum_output_dataframe().
    """
    prepared = prepare_forum_output_dataframe(df)
    return prepared
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def process_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """Backward-compatible alias kept for callers of the pre-4.0 API.

    Delegates directly to process_forum_batch_worker().
    """
    stats = process_forum_batch_worker(batch_data)
    return stats
|
| 1032 |
+
|
| 1033 |
+
|
| 1034 |
+
# ============================================================
|
| 1035 |
+
# Main Entry Point
|
| 1036 |
+
# ============================================================
|
| 1037 |
+
|
| 1038 |
+
def main():
    """Command-line entry point: parse arguments and run the processor."""
    parser = argparse.ArgumentParser(
        description="Brand Sentiment Analysis - Analyze forum posts and social media comments for brand intelligence"
    )
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of items to process per source (default: all unprocessed)')
    parser.add_argument('--overwrite', action='store_true', default=False,
                        help='Overwrite existing Snowflake table (default: append)')
    parser.add_argument('--sequential', action='store_true', default=False,
                        help='Use sequential processing instead of parallel (for debugging)')
    parser.add_argument('--config-dir', type=str, default=None,
                        help='Path to configuration directory (default: config_files/)')
    parser.add_argument('--data-source', type=str,
                        choices=['forums', 'comments', 'all'], default='all',
                        help='Data source to process: forums, comments, or all (default: all)')

    args = parser.parse_args()

    # Build the processor and hand the parsed options straight through.
    processor = BrandSentimentProcessor(config_dir=args.config_dir)
    processor.run(
        limit=args.limit,
        overwrite=args.overwrite,
        sequential=args.sequential,
        data_source=args.data_source
    )


if __name__ == "__main__":
    main()
|
processing_brand_sentiment/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities module for brand sentiment analysis.
|
| 3 |
+
Contains HTML parsing and other helper functions.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .html_parser import HTMLParser
|
| 7 |
+
|
| 8 |
+
__all__ = ['HTMLParser']
|
processing_brand_sentiment/utils/html_parser.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HTML Parser utility for extracting content from forum posts.
|
| 3 |
+
Handles the complex HTML structure where replies contain quoted parent content.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import html
|
| 8 |
+
from typing import Dict, Optional, Tuple
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class HTMLParser:
    """
    Parses HTML content from forum posts to extract the actual reply content
    and the quoted parent content separately.

    Forum posts embed the quoted parent inside <blockquote> elements; any
    text outside those elements is the poster's own reply.
    """

    # Leading " - " separator that precedes the date in a quote heading,
    # e.g. " - Feb 3, 2015" -> "Feb 3, 2015".
    _DATE_SEPARATOR_RE = re.compile(r'^[\s-]+')

    def __init__(self):
        """Initialize the HTML parser. The parser is stateless."""
        pass

    def parse_post_content(self, html_content: str) -> Dict[str, Optional[str]]:
        """
        Parse HTML post content to extract reply and quoted content.

        The forum posts have a structure where:
        - <blockquote> contains the quoted parent post
        - Content outside blockquote is the actual reply

        Example input:
            <blockquote><span class="post-id">125015</span>
            <p class="quote-heading"><strong>JackO</strong><em> - Feb 3, 2015</em></p>
            <br /><p>Parent content here...</p></blockquote>
            <br /><p>Actual reply content here...</p>

        When a post contains several blockquotes, all quoted texts are joined
        into ``quoted_content`` and the author/date of the FIRST quote are
        reported. (Previously each blockquote silently overwrote the
        author/date, so the metadata belonged to the last quote while the
        content concatenated all of them.)

        Args:
            html_content: Raw HTML content from POST_CONTENT field

        Returns:
            Dictionary with:
            - reply_content: The actual reply text (cleaned)
            - quoted_content: The quoted parent text (cleaned), if any
            - quoted_author: Author of the quoted post, if any
            - quoted_date: Date of the quoted post, if any
            - has_quote: Boolean indicating if post contains a quote
        """
        if not html_content or not html_content.strip():
            return {
                "reply_content": "",
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            quoted_content = None
            quoted_author = None
            quoted_date = None
            has_quote = False

            blockquotes = soup.find_all('blockquote')

            if blockquotes:
                has_quote = True
                quote_parts = []

                for blockquote in blockquotes:
                    # Extract quote heading info (author and date). Keep the
                    # metadata of the FIRST quote only, so it stays consistent
                    # with the order of the concatenated quoted_content.
                    quote_heading = blockquote.find('p', class_='quote-heading')
                    if quote_heading:
                        author_tag = quote_heading.find('strong')
                        if author_tag and quoted_author is None:
                            quoted_author = author_tag.get_text(strip=True)

                        date_tag = quote_heading.find('em')
                        if date_tag and quoted_date is None:
                            # Drop the leading " - " separator before the date.
                            quoted_date = self._DATE_SEPARATOR_RE.sub(
                                '', date_tag.get_text(strip=True)
                            )

                        # Remove the heading so only the quote body remains.
                        quote_heading.decompose()

                    # Remove post-id spans (internal markers, not content).
                    for post_id_span in blockquote.find_all('span', class_='post-id'):
                        post_id_span.decompose()

                    quote_text = self._clean_text(blockquote.get_text())
                    if quote_text:
                        quote_parts.append(quote_text)

                    # Remove the blockquote so the remaining soup is the reply.
                    blockquote.decompose()

                quoted_content = " ".join(quote_parts) if quote_parts else None

            # Whatever text is left after removing all quotes is the reply.
            reply_content = self._clean_text(soup.get_text())

            return {
                "reply_content": reply_content,
                "quoted_content": quoted_content,
                "quoted_author": quoted_author,
                "quoted_date": quoted_date,
                "has_quote": has_quote
            }

        except Exception as e:
            logger.warning(f"Error parsing HTML content: {e}")
            # Fallback: strip tags with a regex and treat everything as reply.
            return {
                "reply_content": self._clean_text(self._strip_html_tags(html_content)),
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

    def _clean_text(self, text: str) -> str:
        """
        Clean extracted text by decoding HTML entities and collapsing all
        runs of whitespace (including newlines) into single spaces.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned single-line text ("" for falsy input)
        """
        if not text:
            return ""

        # Decode HTML entities (&amp; -> &, etc.).
        text = html.unescape(text)

        # Replace multiple whitespace with a single space.
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace.
        text = text.strip()

        return text

    def _strip_html_tags(self, html_content: str) -> str:
        """
        Fallback method to strip HTML tags if BeautifulSoup fails.

        Tags are replaced by spaces (not removed outright) so that adjacent
        words in separate elements do not get glued together.

        Args:
            html_content: HTML content

        Returns:
            Text without HTML tags
        """
        # Remove HTML tags.
        clean = re.sub(r'<[^>]+>', ' ', html_content)
        # Decode entities.
        clean = html.unescape(clean)
        # Collapse whitespace.
        clean = re.sub(r'\s+', ' ', clean)
        return clean.strip()

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extract plain text from HTML content.

        <br> and <p> boundaries are converted to newlines before extraction;
        note that the final _clean_text pass then normalizes ALL whitespace
        (including those newlines) to single spaces, so the result is a
        single-line string.

        Args:
            html_content: HTML content

        Returns:
            Plain text version ("" for falsy input)
        """
        if not html_content:
            return ""

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Mark block boundaries so words from different elements
            # do not run together.
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for p in soup.find_all('p'):
                p.append('\n')

            text = soup.get_text()
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Error extracting plain text: {e}")
            # Fallback to the regex-based tag stripper.
            return self._clean_text(self._strip_html_tags(html_content))

    def build_thread_context(
        self,
        thread_title: Optional[str],
        first_post_content: Optional[str],
        category_title: Optional[str] = None,
        category_topic: Optional[str] = None
    ) -> str:
        """
        Build a " | "-separated context string from thread information.

        Args:
            thread_title: Title of the discussion thread
            first_post_content: Content of the first post in the thread
                (HTML; parsed and truncated to 500 characters)
            category_title: Category title
            category_topic: Category topic

        Returns:
            Formatted context string ("" when nothing is available)
        """
        context_parts = []

        if category_title:
            context_parts.append(f"Category: {category_title}")

        if category_topic:
            context_parts.append(f"Topic: {category_topic}")

        if thread_title:
            context_parts.append(f"Thread: {thread_title}")

        if first_post_content:
            # Parse and clean the first post content; quotes are excluded.
            parsed = self.parse_post_content(first_post_content)
            first_post_text = parsed.get("reply_content", "")
            if first_post_text:
                # Truncate to keep the context compact for downstream prompts.
                if len(first_post_text) > 500:
                    first_post_text = first_post_text[:500] + "..."
                context_parts.append(f"Original discussion: {first_post_text}")

        return " | ".join(context_parts) if context_parts else ""

    def is_empty_content(self, html_content: str) -> bool:
        """
        Check if HTML content is effectively empty.

        Args:
            html_content: HTML content to check

        Returns:
            True if content is empty or contains no meaningful text
        """
        if not html_content:
            return True

        text = self.extract_plain_text(html_content)
        return len(text.strip()) == 0
|
processing_brand_sentiment/workflow/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Workflow module for brand sentiment analysis.
|
| 3 |
+
Contains the LangGraph orchestrators and agent implementations.
|
| 4 |
+
Supports both forum posts and social media comments.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .orchestrator import BrandAnalysisWorkflow
|
| 8 |
+
from .comment_orchestrator import CommentAnalysisWorkflow
|
| 9 |
+
|
| 10 |
+
__all__ = ['BrandAnalysisWorkflow', 'CommentAnalysisWorkflow']
|
processing_brand_sentiment/workflow/agents/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agents module for brand sentiment analysis v4.0.
|
| 3 |
+
|
| 4 |
+
Contains specialized agents for the 4-stage pipeline:
|
| 5 |
+
1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (forums)
|
| 6 |
+
CommentPreprocessorAgent - Plain text cleaning, keyword detection (comments)
|
| 7 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction
|
| 8 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis
|
| 9 |
+
4. OutputValidatorAgent - Rule-based validation
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from .base_agent import BaseAgent
|
| 13 |
+
from .content_preprocessor_agent import ContentPreprocessorAgent
|
| 14 |
+
from .comment_preprocessor_agent import CommentPreprocessorAgent
|
| 15 |
+
from .sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 16 |
+
from .sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 17 |
+
from .output_validator_agent import OutputValidatorAgent
|
| 18 |
+
|
| 19 |
+
# Legacy imports for backward compatibility
|
| 20 |
+
from .preprocessor_agent import PreprocessorAgent
|
| 21 |
+
from .relevance_validator_agent import RelevanceValidatorAgent
|
| 22 |
+
from .sabian_analyzer_agent import SabianAnalyzerAgent
|
| 23 |
+
|
| 24 |
+
__all__ = [
|
| 25 |
+
# Base
|
| 26 |
+
'BaseAgent',
|
| 27 |
+
|
| 28 |
+
# New agents (v4.0)
|
| 29 |
+
'ContentPreprocessorAgent',
|
| 30 |
+
'CommentPreprocessorAgent',
|
| 31 |
+
'SabianRelevanceExtractionAgent',
|
| 32 |
+
'SabianSentimentAnalyzerAgent',
|
| 33 |
+
'OutputValidatorAgent',
|
| 34 |
+
|
| 35 |
+
# Legacy agents (for backward compatibility)
|
| 36 |
+
'PreprocessorAgent',
|
| 37 |
+
'RelevanceValidatorAgent',
|
| 38 |
+
'SabianAnalyzerAgent'
|
| 39 |
+
]
|
processing_brand_sentiment/workflow/agents/base_agent.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Agent class for all agents in the brand sentiment analysis workflow.
|
| 3 |
+
Provides a common interface and structure for extensibility.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseAgent(ABC):
    """
    Abstract base class for all agents in the brand sentiment analysis workflow.
    Provides common functionality and enforces a consistent interface.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        """
        Initialize the base agent.

        Args:
            name: Name of the agent
            config: Configuration dictionary for the agent
        """
        self.name = name
        self.config = config
        # LLM settings with defaults applied when missing from config.
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.
        This method must be implemented by all concrete agent classes.

        Args:
            input_data: Dictionary containing input data for processing

        Returns:
            Dictionary containing processing results
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.

        Args:
            input_data: Dictionary containing input data

        Returns:
            True if input is valid, False otherwise
        """
        pass

    def get_name(self) -> str:
        """Get the agent name."""
        return self.name

    def get_config(self) -> Dict[str, Any]:
        """Get the agent configuration."""
        return self.config

    def log_processing(self, message: str, level: str = "info"):
        """
        Log processing information, prefixed with the agent name.

        Args:
            message: Log message
            level: Log level (info, warning, error, debug)
        """
        # Guard against level names that resolve to a non-callable logger
        # attribute (e.g. "name" or "level"); fall back to info in that case.
        log_method = getattr(logger, level, None)
        if not callable(log_method):
            log_method = logger.info
        log_method(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Handle errors consistently across all agents.

        Args:
            error: The exception that occurred
            context: Additional context about the error

        Returns:
            Error dictionary with details (success=False, error, agent, context)
        """
        error_msg = f"Error in {self.name}"
        if context:
            error_msg += f" ({context})"
        error_msg += f": {str(error)}"

        logger.error(error_msg)

        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context
        }

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse an LLM response that may contain JSON wrapped in a markdown
        code fence (```json ... ``` or ``` ... ```, any case for the tag).

        Args:
            response_content: Raw response content from the LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        content = response_content.strip()

        if content.startswith("```"):
            # Drop the opening fence.
            content = content[3:]
            # Strip an optional language tag (e.g. "json"/"JSON") that
            # immediately follows the fence.
            if content[:4].lower() == "json":
                content = content[4:]
            # Drop the closing fence when present.
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

        # Parse the cleaned JSON.
        return json.loads(content)

    def _safe_get(self, data: Dict[str, Any], key: str, default: Any = None) -> Any:
        """
        Safely get a value from a dictionary with a default.

        Args:
            data: Dictionary to get value from
            key: Key to look up
            default: Default value if key not found

        Returns:
            Value from dictionary or default
        """
        return data.get(key, default)

    def _ensure_list(self, value: Any) -> list:
        """
        Ensure a value is a list.

        Strings are split on commas (empty items dropped); None becomes [];
        any other scalar is wrapped in a single-element list.

        Args:
            value: Value to convert

        Returns:
            List version of value
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            # Treat the string as a comma-separated list.
            return [v.strip() for v in value.split(",") if v.strip()]
        return [value]
|
processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comment Preprocessor Agent for brand sentiment analysis on social media comments.
|
| 3 |
+
|
| 4 |
+
Extends ContentPreprocessorAgent but handles plain text (no HTML parsing).
|
| 5 |
+
Builds context from content title, content description, and parent comment text
|
| 6 |
+
instead of thread title and first post.
|
| 7 |
+
|
| 8 |
+
Reuses: keyword sets, product alias mapping, language detection, relevance screening.
|
| 9 |
+
Overrides: process() method for plain text handling and comment-specific context building.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Dict, Any, Optional
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
from .content_preprocessor_agent import ContentPreprocessorAgent
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CommentPreprocessorAgent(ContentPreprocessorAgent):
    """
    Agent that preprocesses social media comments for brand sentiment analysis.

    Inherits keyword detection, product alias mapping, language detection,
    and relevance screening from ContentPreprocessorAgent.

    Key differences from the forum preprocessor:
    - No HTML parsing (comments are plain text)
    - Context built from content title + description + parent comment
    - Different input field names (comment_text vs post_content)
    """

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Comment Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords,
                products, and aliases
        """
        super().__init__(config, brand_config)
        # Override the name set by the parent constructor.
        self.name = "CommentPreprocessorAgent"

        logger.info(
            f"CommentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields for comment processing.

        Note: only checks field PRESENCE; None/empty values are handled
        gracefully by process() via the empty-content path.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required_fields = ["comment_sk", "comment_text"]
        return all(field in input_data for field in required_fields)

    def _build_comment_context(
        self,
        content_title: Optional[str] = None,
        content_description: Optional[str] = None,
        parent_comment_text: Optional[str] = None
    ) -> str:
        """
        Build a " | "-separated context string from social media content and
        parent comment information. Long fields are truncated to 500 chars.

        Args:
            content_title: Title of the social media post/content
            content_description: Description/message of the social media post
            parent_comment_text: Text of the parent comment (if this is a reply)

        Returns:
            Formatted context string ("" when nothing is available)
        """
        context_parts = []

        if content_title:
            context_parts.append(f"Post title: {content_title}")

        if content_description:
            # Truncate to keep the context compact for downstream prompts.
            truncated = content_description[:500] + "..." if len(content_description) > 500 else content_description
            context_parts.append(f"Post description: {truncated}")

        if parent_comment_text:
            truncated = parent_comment_text[:500] + "..." if len(parent_comment_text) > 500 else parent_comment_text
            context_parts.append(f"Parent comment: {truncated}")

        return " | ".join(context_parts) if context_parts else ""

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a social media comment through the preprocessing pipeline.

        Unlike forum posts, comments are plain text (no HTML parsing needed).
        Context is built from content title, description, and parent comment.

        Args:
            input_data: Dictionary containing comment data with at least:
                - comment_sk: Comment surrogate key
                - comment_text: Raw comment text (plain text)
                - content_title: Title of the post (optional)
                - content_description: Description of the post (optional)
                - parent_comment_text: Parent comment text if reply (optional)

        Returns:
            Dictionary with preprocessing results. On the empty-content path
            the result now carries the same core keys as the main path
            (original_text, has_quote, quoted_author) so downstream consumers
            do not need to special-case it.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_sk, comment_text)",
                    **input_data
                }

            comment_text = input_data.get("comment_text", "")

            # Step 1: Clean text (plain text - no HTML parsing needed)
            cleaned_content = comment_text.strip() if comment_text else ""

            # Check for empty / too-short content: short-circuit as irrelevant.
            if not cleaned_content or len(cleaned_content) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": cleaned_content,
                    "quoted_content": None,
                    # Keys added for consistency with the main-path result:
                    "has_quote": False,
                    "quoted_author": None,
                    "original_text": comment_text,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "comment_text"}
                }

            # Step 2: Check relevance (reused from parent class)
            relevance_result = self._check_relevance(cleaned_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build comment context
            raw_thread_context = self._build_comment_context(
                content_title=input_data.get("content_title"),
                content_description=input_data.get("content_description"),
                parent_comment_text=input_data.get("parent_comment_text")
            )

            # Step 4: Detect language (reused from parent class)
            lang_result = self._detect_language(cleaned_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions (reused from parent class)
            products_found = self._extract_mentioned_products(cleaned_content)
            competitors_found = self._extract_mentioned_competitors(cleaned_content)

            # Determine quoted content (parent comment serves as quoted context)
            parent_comment = input_data.get("parent_comment_text")
            has_parent = parent_comment is not None and str(parent_comment).strip() != ""

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": cleaned_content,
                "quoted_content": parent_comment if has_parent else None,
                "has_quote": has_parent,
                "quoted_author": None,
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data (exclude raw text to avoid duplication)
                **{k: v for k, v in input_data.items() if k not in ["comment_text"]}
            }

            # Keep original content for reference
            result["original_text"] = comment_text

            self.log_processing(
                f"Processed comment {input_data.get('comment_sk')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing comment {input_data.get('comment_sk')}")
|
processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py
ADDED
|
@@ -0,0 +1,570 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Content Preprocessor Agent for brand sentiment analysis.
|
| 3 |
+
Handles HTML parsing, text cleaning, language detection, product alias mapping,
|
| 4 |
+
and initial relevance screening. This is a deterministic agent (no LLM calls).
|
| 5 |
+
|
| 6 |
+
Enhanced version with:
|
| 7 |
+
- Product alias mapping (B8 -> B8X)
|
| 8 |
+
- Smart language detection (skip for short texts)
|
| 9 |
+
- Always process if primary keywords found
|
| 10 |
+
- Better content separation
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
from typing import Dict, Any, List, Optional, Set
|
| 15 |
+
from lingua import Language, LanguageDetectorBuilder
|
| 16 |
+
import logging
|
| 17 |
+
|
| 18 |
+
from .base_agent import BaseAgent
|
| 19 |
+
from utils.html_parser import HTMLParser
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ContentPreprocessorAgent(BaseAgent):
    """
    Agent that preprocesses forum posts:
    - Parses HTML to extract reply and quoted content
    - Cleans and normalizes text
    - Maps product aliases to canonical names
    - Detects language (with smart handling for short texts)
    - Performs initial keyword-based relevance screening

    This is a deterministic agent: no LLM calls are made anywhere in this class.
    """

    # Lingua to ISO 639-1 language code mapping.
    # Languages not listed here fall back to "unknown" in _detect_language.
    LINGUA_TO_ISO: Dict[Any, str] = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }
|
| 53 |
+
|
| 54 |
+
def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
|
| 55 |
+
"""
|
| 56 |
+
Initialize the Content Preprocessor Agent.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
config: Agent configuration
|
| 60 |
+
brand_config: Brand-specific configuration with keywords, products, and aliases
|
| 61 |
+
"""
|
| 62 |
+
super().__init__("ContentPreprocessorAgent", config)
|
| 63 |
+
self.brand_config = brand_config
|
| 64 |
+
self.html_parser = HTMLParser()
|
| 65 |
+
|
| 66 |
+
# Get preprocessing settings
|
| 67 |
+
preprocessing_config = brand_config.get("preprocessing", {})
|
| 68 |
+
self.min_length_for_lang_detection = preprocessing_config.get(
|
| 69 |
+
"min_length_for_language_detection", 50
|
| 70 |
+
)
|
| 71 |
+
self.default_language = preprocessing_config.get(
|
| 72 |
+
"default_language_for_short_text", "English"
|
| 73 |
+
)
|
| 74 |
+
self.always_process_primary = preprocessing_config.get(
|
| 75 |
+
"always_process_if_primary_keyword", True
|
| 76 |
+
)
|
| 77 |
+
self.min_content_length = preprocessing_config.get("min_content_length", 3)
|
| 78 |
+
|
| 79 |
+
# Initialize lingua detector
|
| 80 |
+
self.language_detector = LanguageDetectorBuilder.from_all_languages().build()
|
| 81 |
+
|
| 82 |
+
# Build keyword sets and alias mappings
|
| 83 |
+
self._build_keyword_sets()
|
| 84 |
+
self._build_alias_mappings()
|
| 85 |
+
|
| 86 |
+
logger.info(
|
| 87 |
+
f"ContentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
|
| 88 |
+
f"{len(self.product_aliases)} product aliases"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
def _build_keyword_sets(self) -> None:
|
| 92 |
+
"""Build keyword sets from brand configuration for efficient relevance checking."""
|
| 93 |
+
relevance_config = self.brand_config.get("relevance_keywords", {})
|
| 94 |
+
|
| 95 |
+
# Primary keywords - definitive Sabian mentions
|
| 96 |
+
primary = relevance_config.get("primary", {}).get("keywords", [])
|
| 97 |
+
self.primary_keywords: Set[str] = set(k.lower() for k in primary)
|
| 98 |
+
|
| 99 |
+
# Contextual keywords - need disambiguation (HH, AA)
|
| 100 |
+
contextual = relevance_config.get("contextual", {}).get("keywords", [])
|
| 101 |
+
self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)
|
| 102 |
+
|
| 103 |
+
# Cymbal context keywords - help disambiguate contextual terms
|
| 104 |
+
cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
|
| 105 |
+
self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)
|
| 106 |
+
|
| 107 |
+
# Competitor names and aliases for detection
|
| 108 |
+
competitors = self.brand_config.get("brand", {}).get("competitors", [])
|
| 109 |
+
self.competitor_keywords: Set[str] = set()
|
| 110 |
+
self.competitor_name_map: Dict[str, str] = {} # alias -> canonical name
|
| 111 |
+
|
| 112 |
+
for comp in competitors:
|
| 113 |
+
if isinstance(comp, dict):
|
| 114 |
+
name = comp.get("name", "")
|
| 115 |
+
self.competitor_keywords.add(name.lower())
|
| 116 |
+
self.competitor_name_map[name.lower()] = name
|
| 117 |
+
for alias in comp.get("aliases", []):
|
| 118 |
+
alias_lower = alias.lower()
|
| 119 |
+
self.competitor_keywords.add(alias_lower)
|
| 120 |
+
self.competitor_name_map[alias_lower] = name
|
| 121 |
+
else:
|
| 122 |
+
comp_str = str(comp).lower()
|
| 123 |
+
self.competitor_keywords.add(comp_str)
|
| 124 |
+
self.competitor_name_map[comp_str] = str(comp)
|
| 125 |
+
|
| 126 |
+
# Product names
|
| 127 |
+
products = self.brand_config.get("brand", {}).get("products", [])
|
| 128 |
+
self.product_keywords: Set[str] = set(p.lower() for p in products)
|
| 129 |
+
self.products_list = products # Keep original case
|
| 130 |
+
|
| 131 |
+
logger.debug(
|
| 132 |
+
f"Built keyword sets: {len(self.primary_keywords)} primary, "
|
| 133 |
+
f"{len(self.contextual_keywords)} contextual, "
|
| 134 |
+
f"{len(self.product_keywords)} products, "
|
| 135 |
+
f"{len(self.competitor_keywords)} competitor terms"
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
def _build_alias_mappings(self) -> None:
|
| 139 |
+
"""Build product alias mappings from brand configuration."""
|
| 140 |
+
aliases = self.brand_config.get("brand", {}).get("product_aliases", {})
|
| 141 |
+
|
| 142 |
+
# Build alias -> canonical product mapping
|
| 143 |
+
self.product_aliases: Dict[str, str] = {}
|
| 144 |
+
for alias, canonical in aliases.items():
|
| 145 |
+
self.product_aliases[alias.lower()] = canonical
|
| 146 |
+
|
| 147 |
+
# Also add primary keywords that are aliases to contextual keywords
|
| 148 |
+
# e.g., "b8" should trigger contextual check since it maps to "B8X"
|
| 149 |
+
for alias in self.product_aliases.keys():
|
| 150 |
+
if alias not in self.primary_keywords:
|
| 151 |
+
self.contextual_keywords.add(alias)
|
| 152 |
+
|
| 153 |
+
logger.debug(f"Built {len(self.product_aliases)} product alias mappings")
|
| 154 |
+
|
| 155 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 156 |
+
"""
|
| 157 |
+
Validate that input contains required fields.
|
| 158 |
+
|
| 159 |
+
Args:
|
| 160 |
+
input_data: Input dictionary
|
| 161 |
+
|
| 162 |
+
Returns:
|
| 163 |
+
True if valid, False otherwise
|
| 164 |
+
"""
|
| 165 |
+
required_fields = ["post_id", "post_content"]
|
| 166 |
+
return all(field in input_data for field in required_fields)
|
| 167 |
+
|
| 168 |
+
    def _detect_language(self, text: str, has_primary_keywords: bool = False) -> Dict[str, Any]:
        """
        Detect the language of text using lingua library.

        Enhanced logic:
        - Skip detection for short texts (< min_length_for_lang_detection chars)
        - Always return English if primary Sabian keywords are found

        Args:
            text: Text to analyze
            has_primary_keywords: Whether primary Sabian keywords were found

        Returns:
            Dictionary with language detection results. Always contains
            "language", "language_code", "is_english", "confidence",
            "detection_skipped", and "skip_reason"; some branches add extra
            keys ("original_detected_language", "override_reason",
            "detection_error").
        """
        try:
            cleaned_text = text.strip()

            # If text is too short, default to English
            # (short texts give lingua too little signal to be reliable).
            if len(cleaned_text) < self.min_length_for_lang_detection:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": True,
                    "skip_reason": f"Text too short ({len(cleaned_text)} < {self.min_length_for_lang_detection} chars)"
                }

            # If primary keywords found and always_process_primary is True, treat as English
            if has_primary_keywords and self.always_process_primary:
                # Still try to detect, but override if non-English
                detected = self.language_detector.detect_language_of(cleaned_text)

                if detected == Language.ENGLISH:
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "high",
                        "detection_skipped": False,
                        "skip_reason": None
                    }
                else:
                    # Primary keyword found but detected as non-English
                    # Force to English since Sabian is explicitly mentioned
                    lang_name = detected.name.capitalize() if detected else "Unknown"
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "medium",
                        "detection_skipped": False,
                        "skip_reason": None,
                        "original_detected_language": lang_name,
                        "override_reason": "Primary Sabian keyword found, treating as English"
                    }

            # Standard detection
            detected = self.language_detector.detect_language_of(cleaned_text)

            if detected is None:
                # Detector could not decide; fall back to the configured default.
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            if detected == Language.ENGLISH:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            # Non-English: map to ISO 639-1 where the class table knows the
            # language, "unknown" otherwise.
            lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
            lang_name = detected.name.capitalize()

            return {
                "language": lang_name,
                "language_code": lang_code,
                "is_english": False,
                "confidence": "high",
                "detection_skipped": False,
                "skip_reason": None
            }

        except Exception as e:
            # Fail open: treat as English at low confidence rather than
            # dropping the post from the pipeline.
            logger.warning(f"Language detection failed: {e}")
            return {
                "language": self.default_language,
                "language_code": "en",
                "is_english": True,
                "confidence": "low",
                "detection_skipped": False,
                "skip_reason": None,
                "detection_error": str(e)
            }
|
| 272 |
+
|
| 273 |
+
def _normalize_product_mentions(self, found_products: List[str]) -> List[str]:
|
| 274 |
+
"""
|
| 275 |
+
Normalize product mentions using alias mappings.
|
| 276 |
+
|
| 277 |
+
Args:
|
| 278 |
+
found_products: List of product terms found
|
| 279 |
+
|
| 280 |
+
Returns:
|
| 281 |
+
List of canonical product names
|
| 282 |
+
"""
|
| 283 |
+
normalized = []
|
| 284 |
+
for product in found_products:
|
| 285 |
+
product_lower = product.lower()
|
| 286 |
+
|
| 287 |
+
# Check if it's an alias
|
| 288 |
+
if product_lower in self.product_aliases:
|
| 289 |
+
canonical = self.product_aliases[product_lower]
|
| 290 |
+
if canonical not in normalized:
|
| 291 |
+
normalized.append(canonical)
|
| 292 |
+
# Check if it's a direct product match
|
| 293 |
+
elif product_lower in self.product_keywords:
|
| 294 |
+
# Find the original case version
|
| 295 |
+
for p in self.products_list:
|
| 296 |
+
if p.lower() == product_lower:
|
| 297 |
+
if p not in normalized:
|
| 298 |
+
normalized.append(p)
|
| 299 |
+
break
|
| 300 |
+
|
| 301 |
+
return normalized
|
| 302 |
+
|
| 303 |
+
    def _check_relevance(self, text: str) -> Dict[str, Any]:
        """
        Check if text is relevant to the brand using keyword matching.

        Enhanced to handle product aliases.

        The checks run in priority order: primary/alias match (definitive),
        then contextual keywords (need LLM validation), then competitor-only
        mentions (possible comparison, low confidence), then "none".

        Returns:
            Dictionary with relevance assessment
        """
        text_lower = text.lower()

        # Tokenize for word boundary matching
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        # Also check for multi-word phrases (for aliases like "hand hammered")
        all_aliases = set(self.product_aliases.keys())

        # Check for primary keywords (definitive matches)
        found_primary = self.primary_keywords.intersection(words)

        # Check for product aliases in text
        found_aliases = []
        for alias in all_aliases:
            if ' ' in alias:
                # Multi-word alias - check in full text
                # NOTE(review): plain substring check, no word boundaries - a
                # multi-word alias embedded in a longer phrase also matches;
                # presumably acceptable for these alias strings - confirm.
                if alias in text_lower:
                    found_aliases.append(alias)
            elif alias in words:
                found_aliases.append(alias)

        # Map aliases to canonical products
        alias_products = []
        for alias in found_aliases:
            if alias in self.product_aliases:
                canonical = self.product_aliases[alias]
                if canonical not in alias_products:
                    alias_products.append(canonical)

        if found_primary or alias_products:
            all_found = list(found_primary) + found_aliases
            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": False,
                "found_keywords": all_found,
                "mapped_products": alias_products,
                "relevance_type": "primary",
                "relevance_confidence": "high",
                "has_primary_keywords": True
            }

        # Check for contextual keywords (need validation)
        found_contextual = self.contextual_keywords.intersection(words)
        if found_contextual:
            # Check if there's cymbal context
            # (cymbal vocabulary nearby raises confidence from low to medium).
            found_cymbal_context = self.cymbal_context_keywords.intersection(words)
            has_cymbal_context = len(found_cymbal_context) > 0

            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": True,
                "found_keywords": list(found_contextual),
                "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
                "has_cymbal_context": has_cymbal_context,
                "mapped_products": [],
                "relevance_type": "contextual",
                "relevance_confidence": "medium" if has_cymbal_context else "low",
                "has_primary_keywords": False
            }

        # Check for competitor mentions (might be comparative discussion)
        found_competitors = self.competitor_keywords.intersection(words)
        if found_competitors:
            return {
                "preliminary_relevant": False,
                "needs_relevance_validation": True,
                "found_keywords": list(found_competitors),
                "mapped_products": [],
                "relevance_type": "competitor_only",
                "relevance_confidence": "low",
                "has_primary_keywords": False
            }

        # No relevant keywords found
        return {
            "preliminary_relevant": False,
            "needs_relevance_validation": False,
            "found_keywords": [],
            "mapped_products": [],
            "relevance_type": "none",
            "relevance_confidence": "high",
            "has_primary_keywords": False
        }
|
| 395 |
+
|
| 396 |
+
def _extract_mentioned_products(self, text: str) -> List[str]:
|
| 397 |
+
"""
|
| 398 |
+
Extract product names mentioned in the text, including aliases.
|
| 399 |
+
|
| 400 |
+
Args:
|
| 401 |
+
text: Text to search
|
| 402 |
+
|
| 403 |
+
Returns:
|
| 404 |
+
List of canonical product names found
|
| 405 |
+
"""
|
| 406 |
+
text_lower = text.lower()
|
| 407 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 408 |
+
|
| 409 |
+
found_products = []
|
| 410 |
+
|
| 411 |
+
# Check direct product mentions
|
| 412 |
+
for product in self.products_list:
|
| 413 |
+
if product.lower() in words:
|
| 414 |
+
if product not in found_products:
|
| 415 |
+
found_products.append(product)
|
| 416 |
+
|
| 417 |
+
# Check aliases
|
| 418 |
+
for alias, canonical in self.product_aliases.items():
|
| 419 |
+
if ' ' in alias:
|
| 420 |
+
# Multi-word alias
|
| 421 |
+
if alias in text_lower:
|
| 422 |
+
if canonical not in found_products:
|
| 423 |
+
found_products.append(canonical)
|
| 424 |
+
elif alias in words:
|
| 425 |
+
if canonical not in found_products:
|
| 426 |
+
found_products.append(canonical)
|
| 427 |
+
|
| 428 |
+
return found_products
|
| 429 |
+
|
| 430 |
+
def _extract_mentioned_competitors(self, text: str) -> List[str]:
|
| 431 |
+
"""
|
| 432 |
+
Extract competitor brand names mentioned in the text.
|
| 433 |
+
|
| 434 |
+
Args:
|
| 435 |
+
text: Text to search
|
| 436 |
+
|
| 437 |
+
Returns:
|
| 438 |
+
List of canonical competitor names found
|
| 439 |
+
"""
|
| 440 |
+
text_lower = text.lower()
|
| 441 |
+
words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
|
| 442 |
+
|
| 443 |
+
found_competitors = set()
|
| 444 |
+
|
| 445 |
+
for alias in self.competitor_keywords:
|
| 446 |
+
if ' ' in alias:
|
| 447 |
+
# Multi-word check
|
| 448 |
+
if alias in text_lower:
|
| 449 |
+
canonical = self.competitor_name_map.get(alias, alias)
|
| 450 |
+
found_competitors.add(canonical)
|
| 451 |
+
elif alias in words:
|
| 452 |
+
canonical = self.competitor_name_map.get(alias, alias)
|
| 453 |
+
found_competitors.add(canonical)
|
| 454 |
+
|
| 455 |
+
return list(found_competitors)
|
| 456 |
+
|
| 457 |
+
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results. "post_content" is replaced
            by "cleaned_content"/"original_content"; all other input keys are
            passed through unchanged.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content
            # (empty posts short-circuit as non-relevant; success is still True).
            if not reply_content or len(reply_content.strip()) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Check relevance FIRST (needed for language detection logic)
            relevance_result = self._check_relevance(reply_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build thread context (raw - will be summarized by extraction agent)
            raw_thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 4: Detect language (with smart handling)
            lang_result = self._detect_language(reply_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions from actual post content
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data
                # (spread last, so an input key like "success" would override
                # the computed value - presumably inputs never carry such keys;
                # TODO confirm with callers).
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
|
processing_brand_sentiment/workflow/agents/output_validator_agent.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Output Validator Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs rule-based validation on the final output to ensure:
|
| 5 |
+
1. All values are from predefined lists
|
| 6 |
+
2. Logical consistency between fields
|
| 7 |
+
3. Anomaly detection for manual review flagging
|
| 8 |
+
|
| 9 |
+
This is a deterministic agent (no LLM calls) that acts as a quality gate.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Dict, Any, List, Set
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
from .base_agent import BaseAgent
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class OutputValidatorAgent(BaseAgent):
    """
    Agent that validates the final output for consistency and quality.

    Performs rule-based checks without LLM calls to ensure data quality
    and flag posts that may need manual review.
    """
|
| 27 |
+
|
| 28 |
+
    def __init__(
        self,
        config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Output Validator Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration
            analysis_categories: Category definitions for validation
        """
        super().__init__("OutputValidatorAgent", config)
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # Build valid value sets for validation
        # (done once here so per-post validation is a cheap set lookup).
        self._build_valid_value_sets()

        logger.info("OutputValidatorAgent initialized")
|
| 50 |
+
|
| 51 |
+
def _build_valid_value_sets(self) -> None:
|
| 52 |
+
"""Build sets of valid values for efficient validation."""
|
| 53 |
+
brand = self.brand_config.get("brand", {})
|
| 54 |
+
|
| 55 |
+
# Products
|
| 56 |
+
self.valid_products: Set[str] = set(
|
| 57 |
+
p.lower() for p in brand.get("products", [])
|
| 58 |
+
)
|
| 59 |
+
self.products_canonical = {p.lower(): p for p in brand.get("products", [])}
|
| 60 |
+
|
| 61 |
+
# Competitors
|
| 62 |
+
self.valid_competitors: Set[str] = set()
|
| 63 |
+
self.competitors_canonical = {}
|
| 64 |
+
for comp in brand.get("competitors", []):
|
| 65 |
+
if isinstance(comp, dict):
|
| 66 |
+
name = comp.get("name", "")
|
| 67 |
+
self.valid_competitors.add(name.lower())
|
| 68 |
+
self.competitors_canonical[name.lower()] = name
|
| 69 |
+
|
| 70 |
+
# Extract all category values
|
| 71 |
+
self.valid_values = {}
|
| 72 |
+
|
| 73 |
+
category_configs = {
|
| 74 |
+
"author_role": self.analysis_categories.get("author_role", {}),
|
| 75 |
+
"sabian_mention_context": self.analysis_categories.get("sabian_mention_context", {}),
|
| 76 |
+
"sentiment_level": self.analysis_categories.get("sentiment", {}),
|
| 77 |
+
"emotion_type": self.analysis_categories.get("emotions", {}),
|
| 78 |
+
"intents": self.analysis_categories.get("intents", {}),
|
| 79 |
+
"purchase_stage": self.analysis_categories.get("purchase_stage", {}),
|
| 80 |
+
"comparison_type": self.analysis_categories.get("comparison_type", {}),
|
| 81 |
+
"feedback_aspects": self.analysis_categories.get("feedback_aspects", {}),
|
| 82 |
+
"decision_drivers": self.analysis_categories.get("decision_drivers", {}),
|
| 83 |
+
"product_attributes": self.analysis_categories.get("product_attributes", {}),
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
for key, config in category_configs.items():
|
| 87 |
+
if "categories" in config:
|
| 88 |
+
self.valid_values[key] = set(
|
| 89 |
+
c["value"].lower() for c in config["categories"]
|
| 90 |
+
)
|
| 91 |
+
elif "levels" in config:
|
| 92 |
+
self.valid_values[key] = set(
|
| 93 |
+
c["value"].lower() for c in config["levels"]
|
| 94 |
+
)
|
| 95 |
+
else:
|
| 96 |
+
self.valid_values[key] = set()
|
| 97 |
+
|
| 98 |
+
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Validate that input contains required fields."""
        # The validator accepts any input - it will validate what's there
        return True
|
| 102 |
+
|
| 103 |
+
def _validate_list_values(
|
| 104 |
+
self,
|
| 105 |
+
values: List[Any],
|
| 106 |
+
valid_set: Set[str],
|
| 107 |
+
field_name: str
|
| 108 |
+
) -> Dict[str, Any]:
|
| 109 |
+
"""
|
| 110 |
+
Validate list values against a set of valid values.
|
| 111 |
+
|
| 112 |
+
Returns:
|
| 113 |
+
Dictionary with validation results
|
| 114 |
+
"""
|
| 115 |
+
if not values:
|
| 116 |
+
return {"valid": True, "invalid_values": [], "field": field_name}
|
| 117 |
+
|
| 118 |
+
invalid = []
|
| 119 |
+
for v in values:
|
| 120 |
+
if isinstance(v, str) and v.lower() not in valid_set:
|
| 121 |
+
invalid.append(v)
|
| 122 |
+
|
| 123 |
+
return {
|
| 124 |
+
"valid": len(invalid) == 0,
|
| 125 |
+
"invalid_values": invalid,
|
| 126 |
+
"field": field_name
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
def _validate_single_value(
|
| 130 |
+
self,
|
| 131 |
+
value: Any,
|
| 132 |
+
valid_set: Set[str],
|
| 133 |
+
field_name: str,
|
| 134 |
+
allow_none: bool = True
|
| 135 |
+
) -> Dict[str, Any]:
|
| 136 |
+
"""
|
| 137 |
+
Validate a single value against a set of valid values.
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
Dictionary with validation results
|
| 141 |
+
"""
|
| 142 |
+
if value is None:
|
| 143 |
+
return {"valid": allow_none, "invalid_value": None if allow_none else value, "field": field_name}
|
| 144 |
+
|
| 145 |
+
if isinstance(value, str) and value.lower() in valid_set:
|
| 146 |
+
return {"valid": True, "invalid_value": None, "field": field_name}
|
| 147 |
+
|
| 148 |
+
return {"valid": False, "invalid_value": value, "field": field_name}
|
| 149 |
+
|
| 150 |
+
def _check_logical_consistency(self, data: Dict[str, Any]) -> List[str]:
|
| 151 |
+
"""
|
| 152 |
+
Check for logical consistency between fields.
|
| 153 |
+
|
| 154 |
+
Note: Empty products_mentioned is OK even when relevant - users may
|
| 155 |
+
discuss the Sabian brand generally without specific products.
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
List of inconsistency warnings
|
| 159 |
+
"""
|
| 160 |
+
warnings = []
|
| 161 |
+
is_relevant = data.get("is_relevant", False)
|
| 162 |
+
|
| 163 |
+
# Check 1: If not relevant, certain fields should be empty/null
|
| 164 |
+
if not is_relevant:
|
| 165 |
+
if data.get("sabian_mention_context"):
|
| 166 |
+
warnings.append(
|
| 167 |
+
"sabian_mention_context should be null when is_relevant=False"
|
| 168 |
+
)
|
| 169 |
+
if data.get("sentiment_level") and data.get("sentiment_level") != "neutral":
|
| 170 |
+
warnings.append(
|
| 171 |
+
"sentiment_level should be null/neutral when is_relevant=False"
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# Check 2: Comparison type should only be set if comparing intent exists
|
| 175 |
+
if data.get("comparison_type"):
|
| 176 |
+
intents = data.get("intents", [])
|
| 177 |
+
if "comparing" not in intents:
|
| 178 |
+
warnings.append(
|
| 179 |
+
"comparison_type is set but 'comparing' not in intents"
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# Check 3: Author perspective fields consistency
|
| 183 |
+
# If author is giving advice (providing_information) without sharing experience,
|
| 184 |
+
# pain_points and delight_factors should typically be empty
|
| 185 |
+
intents = data.get("intents", [])
|
| 186 |
+
if "providing_information" in intents and "sharing_experience" not in intents:
|
| 187 |
+
if data.get("pain_points") or data.get("delight_factors"):
|
| 188 |
+
warnings.append(
|
| 189 |
+
"pain_points/delight_factors set for advice-giving post without sharing_experience intent"
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
return warnings
|
| 193 |
+
|
| 194 |
+
def _fix_overlapping_feedback(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
| 195 |
+
"""
|
| 196 |
+
Fix overlapping values between pain_points and delight_factors.
|
| 197 |
+
|
| 198 |
+
Rule: The same aspect cannot be both a pain point and a delight factor.
|
| 199 |
+
Resolution: Use sentiment to determine which to keep, or clear both if neutral.
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
data: Dictionary with analysis results
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
Updated dictionary with fixed pain_points and delight_factors
|
| 206 |
+
"""
|
| 207 |
+
pain_points = data.get("pain_points", []) or []
|
| 208 |
+
delight_factors = data.get("delight_factors", []) or []
|
| 209 |
+
|
| 210 |
+
if not pain_points or not delight_factors:
|
| 211 |
+
return data
|
| 212 |
+
|
| 213 |
+
# Find overlapping values
|
| 214 |
+
pain_set = set(p.lower() if isinstance(p, str) else p for p in pain_points)
|
| 215 |
+
delight_set = set(d.lower() if isinstance(d, str) else d for d in delight_factors)
|
| 216 |
+
overlap = pain_set.intersection(delight_set)
|
| 217 |
+
|
| 218 |
+
if not overlap:
|
| 219 |
+
return data
|
| 220 |
+
|
| 221 |
+
# Get sentiment to determine which to keep
|
| 222 |
+
sentiment = data.get("sentiment_level", "neutral")
|
| 223 |
+
|
| 224 |
+
# Create new lists without overlapping values
|
| 225 |
+
if sentiment in ["positive", "very_positive"]:
|
| 226 |
+
# Keep in delight_factors, remove from pain_points
|
| 227 |
+
new_pain_points = [p for p in pain_points if p.lower() not in overlap]
|
| 228 |
+
new_delight_factors = delight_factors
|
| 229 |
+
elif sentiment in ["negative", "very_negative"]:
|
| 230 |
+
# Keep in pain_points, remove from delight_factors
|
| 231 |
+
new_pain_points = pain_points
|
| 232 |
+
new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
|
| 233 |
+
else:
|
| 234 |
+
# Neutral sentiment - clear both (can't determine intent)
|
| 235 |
+
new_pain_points = [p for p in pain_points if p.lower() not in overlap]
|
| 236 |
+
new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
|
| 237 |
+
|
| 238 |
+
# Update data
|
| 239 |
+
data["pain_points"] = new_pain_points
|
| 240 |
+
data["delight_factors"] = new_delight_factors
|
| 241 |
+
|
| 242 |
+
logger.debug(
|
| 243 |
+
f"Fixed overlapping feedback: removed {overlap} from "
|
| 244 |
+
f"{'pain_points' if sentiment in ['positive', 'very_positive'] else 'delight_factors' if sentiment in ['negative', 'very_negative'] else 'both'}"
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
return data
|
| 248 |
+
|
| 249 |
+
def _detect_anomalies(self, data: Dict[str, Any]) -> List[str]:
|
| 250 |
+
"""
|
| 251 |
+
Detect anomalies that might need manual review.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
List of anomaly flags
|
| 255 |
+
"""
|
| 256 |
+
anomalies = []
|
| 257 |
+
|
| 258 |
+
# Anomaly 1: Low confidence relevance
|
| 259 |
+
if data.get("is_relevant") and data.get("relevance_confidence") == "low":
|
| 260 |
+
anomalies.append("low_confidence_relevant")
|
| 261 |
+
|
| 262 |
+
# Anomaly 2: Sarcasm detected - sentiment might be inverted
|
| 263 |
+
if data.get("sarcasm_detected"):
|
| 264 |
+
anomalies.append("sarcasm_detected")
|
| 265 |
+
|
| 266 |
+
# Anomaly 3: Very short content marked as relevant
|
| 267 |
+
content = data.get("cleaned_content", "")
|
| 268 |
+
if data.get("is_relevant") and len(content) < 20:
|
| 269 |
+
anomalies.append("short_relevant_content")
|
| 270 |
+
|
| 271 |
+
# Anomaly 4: Switching behavior detected
|
| 272 |
+
comparison_type = data.get("comparison_type", "")
|
| 273 |
+
if comparison_type in ["switching_to_sabian", "switching_from_sabian"]:
|
| 274 |
+
anomalies.append(f"brand_switching_{comparison_type}")
|
| 275 |
+
|
| 276 |
+
return anomalies
|
| 277 |
+
|
| 278 |
+
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process and validate the analysis output.

        Runs value validation against the configured valid sets, repairs
        overlapping pain/delight aspects, checks cross-field consistency,
        and detects anomalies needing manual review.

        Args:
            input_data: Dictionary with all analysis results

        Returns:
            The input dictionary plus:
            - validation_passed (bool): True when no hard errors were found
            - validation_errors (list): hard failures (invalid categorical/list values)
            - validation_warnings (list): soft issues (list-field values, consistency)
            - validation_flags (list): anomaly flags for manual review
            - processing_status (str): "completed", "completed_with_flags",
              or "validation_failed"
            On unexpected exceptions, whatever self.handle_error returns.
        """
        try:
            validation_errors = []
            validation_warnings = []

            # Skip detailed validation for non-relevant or skipped posts
            # (they carry no analysis fields worth checking).
            if not input_data.get("is_relevant", False) or input_data.get("analysis_skipped", False):
                return {
                    **input_data,
                    "validation_passed": True,
                    "validation_errors": [],
                    "validation_warnings": [],
                    "validation_flags": [],
                    "processing_status": "completed"
                }

            # Fix overlapping pain_points and delight_factors (safety net)
            # before any list-value validation sees them.
            input_data = self._fix_overlapping_feedback(input_data)

            # Validate products_mentioned (invalid values are hard errors)
            products_result = self._validate_list_values(
                input_data.get("products_mentioned", []),
                self.valid_products,
                "products_mentioned"
            )
            if not products_result["valid"]:
                validation_errors.append(
                    f"Invalid products: {products_result['invalid_values']}"
                )

            # Validate competitors_mentioned (invalid values are hard errors)
            competitors_result = self._validate_list_values(
                input_data.get("competitors_mentioned", []),
                self.valid_competitors,
                "competitors_mentioned"
            )
            if not competitors_result["valid"]:
                validation_errors.append(
                    f"Invalid competitors: {competitors_result['invalid_values']}"
                )

            # Validate categorical fields: (field, key into self.valid_values, allow_none)
            categorical_validations = [
                ("author_role", "author_role", True),
                ("sabian_mention_context", "sabian_mention_context", True),
                ("sentiment_level", "sentiment_level", True),
                ("emotion_type", "emotion_type", True),
                ("purchase_stage", "purchase_stage", True),
                ("comparison_type", "comparison_type", True),
            ]

            for field, valid_key, allow_none in categorical_validations:
                result = self._validate_single_value(
                    input_data.get(field),
                    self.valid_values.get(valid_key, set()),
                    field,
                    allow_none
                )
                if not result["valid"]:
                    validation_errors.append(
                        f"Invalid {field}: {result['invalid_value']}"
                    )

            # Validate list fields: (field, key into self.valid_values).
            # Note pain_points/delight_factors share the "feedback_aspects" set.
            list_validations = [
                ("intents", "intents"),
                ("product_attributes", "product_attributes"),
                ("pain_points", "feedback_aspects"),
                ("delight_factors", "feedback_aspects"),
                ("decision_drivers", "decision_drivers"),
            ]

            # Invalid list-field values are warnings, not errors.
            for field, valid_key in list_validations:
                result = self._validate_list_values(
                    input_data.get(field, []),
                    self.valid_values.get(valid_key, set()),
                    field
                )
                if not result["valid"]:
                    validation_warnings.append(
                        f"Invalid values in {field}: {result['invalid_values']}"
                    )

            # Check logical consistency (soft warnings only)
            consistency_warnings = self._check_logical_consistency(input_data)
            validation_warnings.extend(consistency_warnings)

            # Detect anomalies for manual review
            anomalies = self._detect_anomalies(input_data)

            # Determine overall validation status: only hard errors fail validation
            validation_passed = len(validation_errors) == 0

            # Set processing status (errors take precedence over anomaly flags)
            if validation_errors:
                processing_status = "validation_failed"
            elif anomalies:
                processing_status = "completed_with_flags"
            else:
                processing_status = "completed"

            result = {
                **input_data,
                "validation_passed": validation_passed,
                "validation_errors": validation_errors,
                "validation_warnings": validation_warnings,
                "validation_flags": anomalies,
                "processing_status": processing_status
            }

            # Only log when there is something noteworthy to report
            if validation_errors or validation_warnings or anomalies:
                self.log_processing(
                    f"Validation complete: passed={validation_passed}, "
                    f"errors={len(validation_errors)}, warnings={len(validation_warnings)}, "
                    f"flags={anomalies}",
                    "debug"
                )

            return result

        except Exception as e:
            return self.handle_error(e, "output validation")
|
processing_brand_sentiment/workflow/agents/preprocessor_agent.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Preprocessor Agent for brand sentiment analysis.
|
| 3 |
+
Handles HTML parsing, text cleaning, language detection, and initial relevance screening.
|
| 4 |
+
This is a deterministic agent (no LLM calls except for language detection fallback).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
from typing import Dict, Any, List, Optional, Set
|
| 9 |
+
from lingua import Language, LanguageDetectorBuilder
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
from .base_agent import BaseAgent
|
| 13 |
+
from utils.html_parser import HTMLParser
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PreprocessorAgent(BaseAgent):
    """
    Agent that preprocesses forum posts:
    - Parses HTML to extract reply and quoted content
    - Cleans and normalizes text
    - Detects language
    - Performs initial keyword-based relevance screening

    This agent is deterministic (no LLM calls).
    """

    # Lingua to ISO 639-1 language code mapping
    LINGUA_TO_ISO = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords and products
        """
        super().__init__("PreprocessorAgent", config)
        self.brand_config = brand_config
        self.html_parser = HTMLParser()

        # Initialize lingua detector once (building it loads language models)
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

        # Build keyword sets for efficient lookup
        self._build_keyword_sets()

        logger.info("PreprocessorAgent initialized")

    def _build_keyword_sets(self) -> None:
        """Build lowercase keyword sets from brand configuration for relevance checking."""
        relevance_config = self.brand_config.get("relevance_keywords", {})

        # Primary keywords - definitive Sabian mentions
        primary = relevance_config.get("primary", {}).get("keywords", [])
        self.primary_keywords: Set[str] = set(k.lower() for k in primary)

        # Contextual keywords - need disambiguation (HH, AA)
        contextual = relevance_config.get("contextual", {}).get("keywords", [])
        self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)

        # Cymbal context keywords - help disambiguate contextual terms
        cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
        self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)

        # Competitor names and aliases for detection
        competitors = self.brand_config.get("brand", {}).get("competitors", [])
        self.competitor_keywords: Set[str] = set()
        for comp in competitors:
            if isinstance(comp, dict):
                self.competitor_keywords.add(comp.get("name", "").lower())
                for alias in comp.get("aliases", []):
                    self.competitor_keywords.add(alias.lower())
            else:
                self.competitor_keywords.add(str(comp).lower())

        # Product names
        products = self.brand_config.get("brand", {}).get("products", [])
        self.product_keywords: Set[str] = set(p.lower() for p in products)

        logger.info(f"Built keyword sets: {len(self.primary_keywords)} primary, "
                    f"{len(self.contextual_keywords)} contextual, "
                    f"{len(self.product_keywords)} products")

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required_fields = ["post_id", "post_content"]
        return all(field in input_data for field in required_fields)

    def _detect_language(self, text: str) -> Dict[str, Any]:
        """
        Detect the language of text using the lingua library.

        Falls back to English with "low" confidence for empty/tiny text,
        undetectable text, or detector errors.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with keys: language, language_code, is_english, confidence
        """
        try:
            cleaned_text = text.strip()
            # Too little signal to detect anything reliably
            if not cleaned_text or len(cleaned_text) < 3:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low"
                }

            detected = self.language_detector.detect_language_of(cleaned_text)

            if detected is None:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low"
                }

            if detected == Language.ENGLISH:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high"
                }

            lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
            lang_name = detected.name.capitalize()

            return {
                "language": lang_name,
                "language_code": lang_code,
                "is_english": False,
                "confidence": "high"
            }

        except Exception as e:
            logger.warning(f"Language detection failed: {e}")
            return {
                "language": "English",
                "language_code": "en",
                "is_english": True,
                "confidence": "low"
            }

    @staticmethod
    def _keyword_matches(keyword: str, text_lower: str, words: Set[str]) -> bool:
        """
        Check whether a (lowercase) keyword occurs in the text.

        Single alphanumeric tokens are matched against the pre-tokenized word
        set (fast path). Keywords containing spaces/punctuation (e.g. multi-word
        product names) are matched as whole phrases with word boundaries -
        a plain set lookup could never match them, which was the original bug.
        """
        if re.fullmatch(r'[a-z0-9]+', keyword):
            return keyword in words
        return re.search(r'\b' + re.escape(keyword) + r'\b', text_lower) is not None

    def _find_keywords(self, keywords: Set[str], text_lower: str, words: Set[str]) -> Set[str]:
        """Return the subset of `keywords` that occur in the text (phrase-aware)."""
        return {kw for kw in keywords if kw and self._keyword_matches(kw, text_lower, words)}

    def _check_relevance(self, text: str) -> Dict[str, Any]:
        """
        Check if text is relevant to the brand using keyword matching.

        Returns:
            Dictionary with relevance assessment:
            - preliminary_relevant: Initial relevance assessment
            - needs_relevance_validation: True if contains ambiguous terms needing LLM check
            - found_keywords: Keywords found in the text
            - relevance_type: 'primary', 'contextual', 'competitor_only', or 'none'
            - relevance_confidence: 'high', 'medium', or 'low'
        """
        text_lower = text.lower()

        # Tokenize once for the single-token fast path of keyword matching
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        # Check for primary keywords (definitive matches)
        found_primary = self._find_keywords(self.primary_keywords, text_lower, words)
        if found_primary:
            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": False,
                "found_keywords": list(found_primary),
                "relevance_type": "primary",
                "relevance_confidence": "high"
            }

        # Check for contextual keywords (need validation)
        found_contextual = self._find_keywords(self.contextual_keywords, text_lower, words)
        if found_contextual:
            # Cymbal-related vocabulary nearby raises confidence that the
            # ambiguous term (HH, AA) refers to Sabian
            found_cymbal_context = self._find_keywords(self.cymbal_context_keywords, text_lower, words)
            has_cymbal_context = len(found_cymbal_context) > 0

            return {
                "preliminary_relevant": True,  # Potentially relevant
                "needs_relevance_validation": True,  # Needs LLM confirmation
                "found_keywords": list(found_contextual),
                "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
                "has_cymbal_context": has_cymbal_context,
                "relevance_type": "contextual",
                "relevance_confidence": "medium" if has_cymbal_context else "low"
            }

        # Check for competitor mentions (might be comparative discussion)
        found_competitors = self._find_keywords(self.competitor_keywords, text_lower, words)
        if found_competitors:
            # Competitor mention without a Sabian mention - could still be
            # relevant in a comparison context, so let the LLM check
            return {
                "preliminary_relevant": False,
                "needs_relevance_validation": True,
                "found_keywords": list(found_competitors),
                "relevance_type": "competitor_only",
                "relevance_confidence": "low"
            }

        # No relevant keywords found
        return {
            "preliminary_relevant": False,
            "needs_relevance_validation": False,
            "found_keywords": [],
            "relevance_type": "none",
            "relevance_confidence": "high"
        }

    def _extract_mentioned_products(self, text: str) -> List[str]:
        """
        Extract product names mentioned in the text (phrase-aware,
        case-insensitive; multi-word product names are supported).

        Args:
            text: Text to search

        Returns:
            List of product names found (original config casing)
        """
        text_lower = text.lower()
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        found_products = []
        products = self.brand_config.get("brand", {}).get("products", [])

        for product in products:
            if self._keyword_matches(product.lower(), text_lower, words):
                found_products.append(product)

        return found_products

    def _extract_mentioned_competitors(self, text: str) -> List[str]:
        """
        Extract competitor names mentioned in the text.

        Alias hits are reported under the competitor's canonical name, and
        each competitor appears at most once. Multi-word names/aliases are
        matched as whole phrases.

        Args:
            text: Text to search

        Returns:
            List of canonical competitor names found
        """
        text_lower = text.lower()
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        found_competitors = []
        competitors = self.brand_config.get("brand", {}).get("competitors", [])

        for comp in competitors:
            if isinstance(comp, dict):
                name = comp.get("name", "")
                aliases = comp.get("aliases", [])

                # Check the canonical name first, then fall back to aliases
                if self._keyword_matches(name.lower(), text_lower, words):
                    if name not in found_competitors:
                        found_competitors.append(name)
                else:
                    for alias in aliases:
                        if self._keyword_matches(alias.lower(), text_lower, words):
                            if name not in found_competitors:
                                found_competitors.append(name)
                            break
            else:
                if self._keyword_matches(str(comp).lower(), text_lower, words):
                    found_competitors.append(str(comp))

        return found_competitors

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through the preprocessing pipeline.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results (cleaned content, language,
            relevance assessment, detected products/competitors); on failure,
            whatever self.handle_error returns.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content into reply body and quoted portion
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Short-circuit: posts with effectively empty replies are never relevant
            if not reply_content or len(reply_content.strip()) < 3:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Build thread context (for downstream understanding only)
            thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 3: Detect language of the author's own words
            lang_result = self._detect_language(reply_content)

            # Step 4: Check relevance - ONLY on the actual post content, NOT quoted/context.
            # The quoted content and thread context are for understanding,
            # not for relevance determination.
            relevance_result = self._check_relevance(reply_content)

            # Step 5: Extract product and competitor mentions - ONLY from actual
            # post content; quoted content will be processed separately.
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "thread_context": thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original raw HTML for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
|
processing_brand_sentiment/workflow/agents/relevance_validator_agent.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Relevance Validator Agent for brand sentiment analysis.
|
| 3 |
+
Lightweight LLM-based agent that confirms whether ambiguous terms (HH, AA)
|
| 4 |
+
refer to Sabian products or generic terms.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
import json
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from .base_agent import BaseAgent
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RelevanceValidatorAgent(BaseAgent):
    """
    Agent that validates whether posts with ambiguous terms (like HH, AA)
    are actually referring to Sabian products or generic terms.

    This is a lightweight LLM call specifically for disambiguation.
    """

    def __init__(self, config: Dict[str, Any], api_key: str, brand_config: Dict[str, Any]):
        """
        Initialize the Relevance Validator Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with product info
        """
        super().__init__("RelevanceValidatorAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config

        # NOTE(review): self.model / self.temperature are presumably populated
        # by BaseAgent from `config` — confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Build disambiguation context from brand config
        self._build_disambiguation_context()

        logger.info("RelevanceValidatorAgent initialized")

    def _build_disambiguation_context(self) -> None:
        """Build context strings for disambiguation from brand config.

        Populates:
            self.disambiguation_info: term -> {"description", "context_clues"}
            self.product_descriptions: product name -> description text
        """
        brand = self.brand_config.get("brand", {})
        ambiguous = brand.get("ambiguous_terms", {})

        self.disambiguation_info = {}
        for term, info in ambiguous.items():
            if isinstance(info, dict):
                self.disambiguation_info[term] = {
                    "description": info.get("description", ""),
                    "context_clues": info.get("disambiguation_context", [])
                }
            else:
                # Config may store a bare string; normalize to the dict shape.
                self.disambiguation_info[term] = {
                    "description": str(info),
                    "context_clues": []
                }

        # Product descriptions for context
        self.product_descriptions = brand.get("product_descriptions", {})

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required = ["cleaned_content", "relevance_keywords_found"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance validation."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        products = self.brand_config.get("brand", {}).get("products", [])

        # Build disambiguation rules
        disambiguation_rules = []
        for term, info in self.disambiguation_info.items():
            desc = info.get("description", "")
            clues = info.get("context_clues", [])
            rule = f"- '{term}': {desc}"
            if clues:
                rule += f" Context clues for {brand_name}: {', '.join(clues)}"
            disambiguation_rules.append(rule)

        disambiguation_text = "\n".join(disambiguation_rules) if disambiguation_rules else "No specific disambiguation rules."

        system_prompt = f"""You are an expert at identifying brand mentions in drum/cymbal forum discussions.

Your task is to determine if the POST CONTENT itself discusses {brand_name} products.

**CRITICAL RULE:**
- You must determine relevance based ONLY on the POST CONTENT
- The context (thread info, quoted/parent content) is provided to help you understand ambiguous terms
- But if the POST CONTENT itself does not mention or discuss {brand_name}, it is NOT relevant
- Example: If quoted content mentions Sabian but the post just says "Got it! Thanks!" → NOT relevant

**{brand_name} Product Lines:**
{', '.join(products)}

**Ambiguous Terms to Watch For:**
{disambiguation_text}

**Key Disambiguation Rules:**
- "HH" alone usually means "Hi-Hat" (a type of cymbal), NOT Sabian HH series
- "HH" WITH Sabian context IN THE POST (e.g., "Sabian HH", "HH crashes", "my HH ride") likely refers to Sabian
- "AA" alone might be a general abbreviation, NOT Sabian AA series
- "AA" WITH Sabian context IN THE POST (e.g., "Sabian AA", "AA cymbals", "AA medium ride") likely refers to Sabian
- Generic replies like "Thanks!", "Got it!", "Good point!" are NOT relevant even if context mentions {brand_name}

**Return JSON with:**
- is_relevant: boolean - true ONLY if the POST CONTENT itself discusses {brand_name} products
- confidence: "high", "medium", or "low"
- reason: brief explanation (1-2 sentences) - explain what IN THE POST made you decide
- detected_products: list of {brand_name} products mentioned IN THE POST (empty if none)

Return only valid JSON."""

        return system_prompt

    def validate_relevance(
        self,
        content: str,
        keywords_found: list,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """
        Validate whether content is relevant to the brand.

        Args:
            content: The cleaned post content
            keywords_found: Keywords that triggered validation
            thread_context: Thread context for additional context
            quoted_content: Quoted content if any

        Returns:
            Dictionary with validation results
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build context for the LLM
        context_parts = []
        if thread_context:
            context_parts.append(f"Thread context: {thread_context}")
        if quoted_content:
            # Quoted content is truncated to keep the prompt small.
            context_parts.append(f"Replying to: {quoted_content[:300]}...")

        context_str = "\n".join(context_parts) if context_parts else "No additional context."

        user_prompt = f"""Determine if this POST CONTENT discusses {brand_name} cymbal products.

**Keywords found in post:** {', '.join(keywords_found)}

**CONTEXT (for understanding ambiguous terms only - do NOT base relevance on this):**
{context_str}

**POST CONTENT TO EVALUATE (base your relevance decision ONLY on this):**
"{content}"

Does the POST CONTENT itself discuss {brand_name} products? Remember: generic replies are NOT relevant even if context mentions {brand_name}. Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                "is_relevant": result.get("is_relevant", False),
                "relevance_confidence": result.get("confidence", "low"),
                "relevance_reason": result.get("reason", ""),
                "detected_products": result.get("detected_products", [])
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error in relevance validation: {e}", "warning")
            # Default to relevant if we can't determine
            return {
                "success": True,
                "is_relevant": True,
                "relevance_confidence": "low",
                "relevance_reason": "Could not parse LLM response, defaulting to relevant",
                "detected_products": []
            }

        except Exception as e:
            self.log_processing(f"Relevance validation error: {e}", "error")
            return {
                "success": False,
                "is_relevant": True,  # Default to relevant on error
                "relevance_confidence": "low",
                "relevance_reason": f"Error during validation: {str(e)}",
                "detected_products": [],
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post to validate its relevance to the brand.

        Args:
            input_data: Dictionary containing:
                - cleaned_content: Cleaned post text
                - relevance_keywords_found: Keywords that triggered validation
                - thread_context: Optional thread context
                - quoted_content: Optional quoted content

        Returns:
            Dictionary with validation results and original data
        """
        try:
            if not self.validate_input(input_data):
                # BUGFIX: spread input_data FIRST so the explicit status fields
                # below cannot be silently clobbered by keys already present in
                # the input (matches the merge order used in the main path).
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": True,  # Default to relevant
                    "relevance_confidence": "low",
                }

            # Check if validation is actually needed
            if not input_data.get("needs_relevance_validation", False):
                # No validation needed, use preliminary assessment.
                # BUGFIX: input_data spread first for the same reason as above —
                # otherwise e.g. an existing "is_relevant" key would override
                # the preliminary assessment computed here.
                return {
                    **input_data,
                    "success": True,
                    "is_relevant": input_data.get("preliminary_relevant", False),
                    "relevance_confidence": input_data.get("relevance_confidence", "high"),
                    "relevance_reason": "No validation needed - preliminary assessment used",
                    "validation_performed": False,
                }

            # Perform LLM validation
            validation_result = self.validate_relevance(
                content=input_data.get("cleaned_content", ""),
                keywords_found=input_data.get("relevance_keywords_found", []),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Merge results (validation output takes precedence over input keys)
            result = {
                **input_data,
                "is_relevant": validation_result["is_relevant"],
                "relevance_confidence": validation_result["relevance_confidence"],
                "relevance_reason": validation_result["relevance_reason"],
                "validation_performed": True,
                "success": validation_result["success"]
            }

            # Update products detected if LLM found any
            if validation_result.get("detected_products"):
                existing_products = input_data.get("products_detected", [])
                llm_products = validation_result["detected_products"]
                # Merge without duplicates
                all_products = list(set(existing_products + llm_products))
                result["products_detected"] = all_products

            if "error" in validation_result:
                result["validation_error"] = validation_result["error"]

            self.log_processing(
                f"Validated relevance for post: is_relevant={result['is_relevant']}, "
                f"confidence={result['relevance_confidence']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "relevance validation")
|
processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Analyzer Agent for comprehensive brand sentiment analysis.
|
| 3 |
+
LLM-based agent that extracts products, competitors, sentiment, intents,
|
| 4 |
+
pain points, and other brand intelligence from forum posts.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Any, List
|
| 8 |
+
import json
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from .base_agent import BaseAgent
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SabianAnalyzerAgent(BaseAgent):
    """
    Comprehensive brand analysis agent for Sabian cymbal discussions.

    Extracts products, competitors, sentiment, intents, pain points, and
    other brand intelligence from a post, then validates every extracted
    value against the predefined category lists from configuration.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        api_key: str,
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the analyzer.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration (products, competitors)
            analysis_categories: Category definitions used for validation
        """
        super().__init__("SabianAnalyzerAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # NOTE(review): self.model / self.temperature presumably come from
        # BaseAgent's handling of `config` — confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Pre-compute valid values for validation
        self._valid_values = self._compute_valid_values()
        logger.info("SabianAnalyzerAgent initialized")

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Return True if the input carries the fields this agent needs."""
        required = ["cleaned_content", "is_relevant"]
        return all(field in input_data for field in required)

    def _compute_valid_values(self) -> Dict[str, List[str]]:
        """Pre-compute all valid values from config for validation.

        Returns:
            Mapping of field name -> list of canonical allowed values.
            Every key referenced by _validate_and_normalize is guaranteed
            to be present (possibly as an empty list).
        """
        valid = {}

        # Products from brand config
        valid["products"] = self.brand_config.get("brand", {}).get("products", [])

        # Competitors
        competitor_names = []
        for comp in self.brand_config.get("brand", {}).get("competitors", []):
            if isinstance(comp, dict):
                competitor_names.append(comp.get("name", ""))
        valid["competitors"] = competitor_names

        # Extract category values from analysis_categories.
        # Maps our field name -> key in the analysis_categories config.
        category_map = {
            "author_role": "author_role",
            "sabian_mention_context": "sabian_mention_context",
            "sentiment_level": "sentiment",
            "emotion_type": "emotions",
            "intents": "intents",
            "purchase_stage": "purchase_stage",
            "comparison_type": "comparison_type",
            "feedback_aspects": "feedback_aspects",
            "decision_drivers": "decision_drivers",
            "product_attributes": "product_attributes",
        }

        for key, config_key in category_map.items():
            config_section = self.analysis_categories.get(config_key, {})
            # Config sections use either "categories" or "levels" for their
            # value lists (e.g. sentiment uses "levels").
            if "categories" in config_section:
                valid[key] = [c["value"] for c in config_section["categories"]]
            elif "levels" in config_section:
                valid[key] = [c["value"] for c in config_section["levels"]]
            else:
                valid[key] = []

        return valid

    def _get_category_list(self, key: str) -> List[str]:
        """Get list of valid values for a category."""
        return self._valid_values.get(key, [])

    def _build_system_prompt(self) -> str:
        """Build optimized system prompt for brand analysis."""
        brand = self.brand_config.get("brand", {})
        brand_name = brand.get("name", "Sabian")
        products = brand.get("products", [])

        competitors = [c.get("name", "") for c in brand.get("competitors", []) if isinstance(c, dict)]

        # Get all valid values
        v = self._valid_values

        return f"""You are a brand analyst extracting insights from forum posts about {brand_name} cymbals.

## STRICT RULES
1. Extract ONLY from POST CONTENT, never from quoted/context text
2. Use ONLY values from the lists below - return null/[] if no match
3. Sentiment must be about {brand_name} specifically, NOT overall post tone
4. pain_points/delight_factors use SAME value list (feedback_aspects) - classification determines positive vs negative

## VALID VALUES

**{brand_name} Products:** {products}
**Competitors:** {competitors}

| Field | Valid Values |
|-------|--------------|
| author_role | {v.get('author_role', [])} |
| sabian_mention_context | {v.get('sabian_mention_context', [])} |
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |

## KEY DISTINCTIONS

**Sentiment vs Intent:**
- sentiment_level = How author FEELS about {brand_name} (positive/negative/neutral)
- praising/criticizing intent = Author is actively ENDORSING or WARNING others

**Author-only fields (null if giving advice to others):**
- purchase_stage, decision_drivers, pain_points, delight_factors

**Example - Sabian-specific sentiment:**
Post: "Love my new drum kit! The SBR cymbals sound terrible though."
- Overall post: positive (happy about kit)
- {brand_name} sentiment: NEGATIVE (dislikes SBR sound)
- pain_points: ["sound_quality"]

## OUTPUT JSON
```json
{{
"author_role": "value from list",
"sabian_mention_context": "value from list",
"sentiment_level": "value from list",
"emotion_type": "value or null",
"sentiment_confidence": "high|medium|low",
"sarcasm_detected": false,
"products_mentioned": [],
"product_attributes": [],
"competitors_mentioned": [],
"competitor_products_owned": [],
"comparison_type": "value or null",
"intents": [],
"purchase_stage": "value or null",
"decision_drivers": [],
"pain_points": [],
"delight_factors": [],
"analysis_notes": "1-2 sentences on key {brand_name}-specific insights"
}}
```

Return ONLY valid JSON."""

    def analyze_post(
        self,
        content: str,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """Perform brand analysis on a post.

        Args:
            content: Cleaned post text to analyze
            thread_context: Optional thread-level context (truncated for prompt)
            quoted_content: Optional quoted/parent post text (truncated)

        Returns:
            On success: {"success": True, **validated analysis fields}.
            On failure: {"success": False, "error": ...} plus safe defaults
            for sentiment/intents when the failure was a JSON parse error.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        context_str = ""
        if thread_context:
            context_str += f"[Thread: {thread_context[:200]}] "
        if quoted_content:
            context_str += f"[Replying to: {quoted_content[:200]}...]"

        user_prompt = f"""Analyze this post about {brand_name}.

CONTEXT (for understanding only, DO NOT extract from): {context_str or "None"}

POST CONTENT (extract from THIS only):
"{content}"

Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)
            validated = self._validate_and_normalize(result)

            return {"success": True, **validated}

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {
                "success": False,
                "error": f"JSON parse error: {str(e)}",
                "sentiment_level": "neutral",
                "intents": ["general_discussion"]
            }
        except Exception as e:
            self.log_processing(f"Analysis error: {e}", "error")
            return {"success": False, "error": str(e)}

    def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
        """Validate single value against list, return canonical form or default.

        Matching is case-insensitive; the canonical spelling from valid_list
        is returned on a match.
        """
        if value is None:
            return default
        if isinstance(value, str):
            val_lower = value.lower()
            for v in valid_list:
                if v.lower() == val_lower:
                    return v
        # BUGFIX: previously a non-None, non-str value (e.g. an int the LLM
        # returned) fell off the end and produced None instead of `default`.
        return default

    def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
        """Validate list values, return only valid items in canonical form."""
        if not values:
            return []
        if not isinstance(values, list):
            # Tolerate a scalar where a list was expected.
            values = [values]

        validated = []
        valid_lower = {v.lower(): v for v in valid_list}
        for val in values:
            if isinstance(val, str) and val.lower() in valid_lower:
                validated.append(valid_lower[val.lower()])
        return validated

    def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate all fields against predefined values and normalize."""
        v = self._valid_values

        normalized = {
            # Classification
            "author_role": self._validate_single(
                result.get("author_role"), v["author_role"], "unknown"
            ),
            "sabian_mention_context": self._validate_single(
                result.get("sabian_mention_context"), v["sabian_mention_context"], "casual_mention"
            ),

            # Sentiment
            "sentiment_level": self._validate_single(
                result.get("sentiment_level"), v["sentiment_level"], "neutral"
            ),
            "emotion_type": self._validate_single(
                result.get("emotion_type"), v["emotion_type"], None
            ),
            "sentiment_confidence": result.get("sentiment_confidence", "medium"),
            "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

            # Products
            "products_mentioned": self._validate_list(
                result.get("products_mentioned"), v["products"]
            ),
            "product_attributes": self._validate_list(
                result.get("product_attributes"), v["product_attributes"]
            ),

            # Competitors
            "competitors_mentioned": self._validate_list(
                result.get("competitors_mentioned"), v["competitors"]
            ),
            "competitor_products_owned": self._validate_list(
                result.get("competitor_products_owned"), v["competitors"]
            ),
            "comparison_type": self._validate_single(
                result.get("comparison_type"), v["comparison_type"], None
            ),

            # Intents - never empty; fall back to general_discussion
            "intents": self._validate_list(
                result.get("intents"), v["intents"]
            ) or ["general_discussion"],

            # Author journey (null if advising others)
            "purchase_stage": self._validate_single(
                result.get("purchase_stage"), v["purchase_stage"], None
            ),
            "decision_drivers": self._validate_list(
                result.get("decision_drivers"), v["decision_drivers"]
            ),

            # Feedback - both use feedback_aspects
            "pain_points": self._validate_list(
                result.get("pain_points"), v["feedback_aspects"]
            ),
            "delight_factors": self._validate_list(
                result.get("delight_factors"), v["feedback_aspects"]
            ),

            # Notes
            "analysis_notes": result.get("analysis_notes", ""),
        }

        # Log filtered values for debugging
        for field in ["products_mentioned", "product_attributes", "pain_points", "delight_factors"]:
            original = result.get(field, [])
            if isinstance(original, list) and len(original) > len(normalized[field]):
                filtered = set(str(x) for x in original) - set(normalized[field])
                if filtered:
                    logger.debug(f"Filtered invalid {field}: {filtered}")

        return normalized

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a post through brand analysis."""
        try:
            if not self.validate_input(input_data):
                # BUGFIX: spread input_data FIRST so the explicit status keys
                # cannot be clobbered by keys already present in the input
                # (matches the merge order used in the main path below).
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                }

            # Skip non-relevant posts
            if not input_data.get("is_relevant", False):
                return {
                    **input_data,
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": "Post marked as not relevant",
                    "author_role": None,
                    "sabian_mention_context": None,
                    "sentiment_level": None,
                    "emotion_type": None,
                    "products_mentioned": [],
                    "competitors_mentioned": [],
                    "competitor_products_owned": [],
                    "intents": [],
                    "purchase_stage": None,
                    "decision_drivers": [],
                    "pain_points": [],
                    "delight_factors": [],
                }

            # Skip non-English posts
            if not input_data.get("is_english", True):
                return {
                    **input_data,
                    "success": True,
                    "analysis_skipped": True,
                    "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                    "author_role": None,
                    "sabian_mention_context": None,
                    "sentiment_level": None,
                    "emotion_type": None,
                    "intents": [],
                    "competitor_products_owned": [],
                }

            # Perform analysis
            analysis_result = self.analyze_post(
                content=input_data.get("cleaned_content", ""),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            result = {
                **input_data,
                **analysis_result,
                "analysis_skipped": False
            }

            self.log_processing(
                f"Analyzed: sentiment={result.get('sentiment_level')}, "
                f"products={len(result.get('products_mentioned', []))}, "
                f"intents={result.get('intents', [])}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "brand analysis")
|
processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Relevance & Extraction Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs two critical functions:
|
| 5 |
+
1. Determines relevance with HIGH confidence using strict rules
|
| 6 |
+
2. Extracts verifiable facts (products, author role, context summary)
|
| 7 |
+
|
| 8 |
+
Key Design Principles:
|
| 9 |
+
- Strict product matching: ONLY return products from predefined list
|
| 10 |
+
- Competitor awareness: Know what products belong to competitors
|
| 11 |
+
- Conservative relevance: When uncertain, mark as NOT relevant
|
| 12 |
+
- Thread context summarization: Provide clean, concise context for next agent
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from typing import Dict, Any, List
|
| 16 |
+
import json
|
| 17 |
+
from langchain_openai import ChatOpenAI
|
| 18 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
from .base_agent import BaseAgent
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SabianRelevanceExtractionAgent(BaseAgent):
|
| 27 |
+
"""
|
| 28 |
+
Agent that validates relevance and extracts key facts from posts.
|
| 29 |
+
|
| 30 |
+
This agent is the first LLM call in the pipeline and serves as the
|
| 31 |
+
gatekeeper for relevance while also extracting structured information
|
| 32 |
+
for downstream analysis.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(
|
| 36 |
+
self,
|
| 37 |
+
config: Dict[str, Any],
|
| 38 |
+
api_key: str,
|
| 39 |
+
brand_config: Dict[str, Any],
|
| 40 |
+
analysis_categories: Dict[str, Any]
|
| 41 |
+
):
|
| 42 |
+
"""
|
| 43 |
+
Initialize the Relevance & Extraction Agent.
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
config: Agent configuration
|
| 47 |
+
api_key: OpenAI API key
|
| 48 |
+
brand_config: Brand-specific configuration with products and competitors
|
| 49 |
+
analysis_categories: Category definitions for validation
|
| 50 |
+
"""
|
| 51 |
+
super().__init__("SabianRelevanceExtractionAgent", config)
|
| 52 |
+
self.api_key = api_key
|
| 53 |
+
self.brand_config = brand_config
|
| 54 |
+
self.analysis_categories = analysis_categories
|
| 55 |
+
|
| 56 |
+
self.llm = ChatOpenAI(
|
| 57 |
+
model=self.model,
|
| 58 |
+
temperature=self.temperature,
|
| 59 |
+
api_key=self.api_key
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
# Pre-compute valid values
|
| 63 |
+
self._build_valid_values()
|
| 64 |
+
self._build_competitor_product_warnings()
|
| 65 |
+
|
| 66 |
+
logger.info("SabianRelevanceExtractionAgent initialized")
|
| 67 |
+
|
| 68 |
+
def _build_valid_values(self) -> None:
|
| 69 |
+
"""Build valid value lists for validation."""
|
| 70 |
+
brand = self.brand_config.get("brand", {})
|
| 71 |
+
|
| 72 |
+
# Products
|
| 73 |
+
self.valid_products = brand.get("products", [])
|
| 74 |
+
|
| 75 |
+
# Competitors (brand names only)
|
| 76 |
+
self.valid_competitors = []
|
| 77 |
+
for comp in brand.get("competitors", []):
|
| 78 |
+
if isinstance(comp, dict):
|
| 79 |
+
self.valid_competitors.append(comp.get("name", ""))
|
| 80 |
+
else:
|
| 81 |
+
self.valid_competitors.append(str(comp))
|
| 82 |
+
|
| 83 |
+
# Author roles from categories
|
| 84 |
+
author_role_config = self.analysis_categories.get("author_role", {})
|
| 85 |
+
self.valid_author_roles = [
|
| 86 |
+
c["value"] for c in author_role_config.get("categories", [])
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# Sabian mention context from categories
|
| 90 |
+
mention_context_config = self.analysis_categories.get("sabian_mention_context", {})
|
| 91 |
+
self.valid_mention_contexts = [
|
| 92 |
+
c["value"] for c in mention_context_config.get("categories", [])
|
| 93 |
+
]
|
| 94 |
+
|
| 95 |
+
def _build_competitor_product_warnings(self) -> None:
|
| 96 |
+
"""Build list of competitor products to warn about in prompts."""
|
| 97 |
+
warnings = self.brand_config.get("brand", {}).get("competitor_products_warning", {})
|
| 98 |
+
|
| 99 |
+
self.competitor_products_by_brand = {}
|
| 100 |
+
for key, products in warnings.items():
|
| 101 |
+
if key == "description":
|
| 102 |
+
continue
|
| 103 |
+
# Extract brand name from key (e.g., "paiste_products" -> "Paiste")
|
| 104 |
+
brand_name = key.replace("_products", "").capitalize()
|
| 105 |
+
self.competitor_products_by_brand[brand_name] = products
|
| 106 |
+
|
| 107 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 108 |
+
"""Validate input contains required fields."""
|
| 109 |
+
required = ["cleaned_content"]
|
| 110 |
+
return all(field in input_data for field in required)
|
| 111 |
+
|
| 112 |
+
    def _build_system_prompt(self) -> str:
        """
        Build the system prompt for relevance and extraction.

        Returns:
            A prompt string embedding the strict relevance rules, the exact
            product whitelist, competitor-product warnings, and the required
            JSON output schema. Literal JSON braces are escaped as {{ }}.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build competitor product warnings (one bullet per competitor brand).
        competitor_warnings = []
        for brand, products in self.competitor_products_by_brand.items():
            products_str = ", ".join(f'"{p}"' for p in products[:5])  # Limit to 5 examples
            if len(products) > 5:
                products_str += f" (and {len(products)-5} more)"
            competitor_warnings.append(f"- {brand}: {products_str}")

        competitor_warnings_text = "\n".join(competitor_warnings) if competitor_warnings else "None specified"

        return f"""You are a brand mention extractor for {brand_name} cymbals. Your job is to:
1. Determine if the POST CONTENT discusses {brand_name} products or brand
2. Extract ONLY verifiable facts, not interpretations

## CRITICAL RULES

### Rule 1: Relevance Based on POST CONTENT Only
- The post is relevant ONLY if the POST CONTENT itself mentions {brand_name} brand or products
- Quoted/parent content mentioning {brand_name} does NOT make the post relevant
- Generic replies ("Thanks!", "Got it!", "Good point!") are NEVER relevant
- Posts can be relevant even without specific product mentions if they discuss the {brand_name} brand

### Rule 2: Strict Product Matching
{brand_name.upper()} PRODUCTS (use ONLY these exact values):
{self.valid_products}

CRITICAL:
- Return ONLY products from this exact list above
- If you see a product not in this list, do NOT include it
- Return empty list [] if no products from the list are mentioned
- It's OK to have empty products_mentioned if the post discusses {brand_name} brand generally

### Rule 3: Competitor Product Awareness
These products belong to COMPETITORS, NOT {brand_name}:
{competitor_warnings_text}

COMPETITOR BRANDS: {self.valid_competitors}
- Only return competitor BRAND names in competitors_mentioned (not their products)
- If you see "2002", "Signature", "Sound Edge", "Formula 602" - these are PAISTE, not {brand_name}
- If you see "K Custom", "A Custom" - these are ZILDJIAN, not {brand_name}

### Rule 4: Thread Context Summary
- Summarize thread context in 1-2 sentences MAXIMUM
- Focus only on what helps understand what the post is responding to
- If thread is about unrelated topics (pizza, general life), say so briefly
- Keep it factual and concise

### Rule 5: Author Role Classification
Determine the author's relationship to {brand_name}:
- current_owner: Currently owns/uses {brand_name} products
- past_owner: Previously owned but sold/replaced
- potential_buyer: Considering purchasing {brand_name}
- never_owned: Explicitly states they don't own {brand_name}
- unknown: Cannot determine from post content

### Rule 6: Mention Context Classification
How prominently is {brand_name} discussed IN THE POST CONTENT:
- primary_focus: {brand_name} is the main topic of the post
- significant_mention: {brand_name} discussed with some detail, but not main focus
- casual_mention: Brief mention among other topics
- comparison_context: Mentioned while comparing to competitors
- null: Not relevant (use when is_relevant=false)

## OUTPUT FORMAT
Return ONLY valid JSON with these exact fields:
```json
{{
    "is_relevant": true/false,
    "relevance_confidence": "high" | "medium" | "low",
    "relevance_reason": "1-2 sentences explaining your decision",
    "products_mentioned": [],
    "sabian_mention_context": "value from list" | null,
    "author_role": "value from list",
    "competitors_mentioned": [],
    "thread_context_summary": "1-2 sentence summary of thread context"
}}
```

IMPORTANT: Return ONLY the JSON object, no additional text."""
|
| 195 |
+
|
| 196 |
+
def _build_user_prompt(
|
| 197 |
+
self,
|
| 198 |
+
content: str,
|
| 199 |
+
quoted_content: str,
|
| 200 |
+
raw_thread_context: str,
|
| 201 |
+
keywords_found: List[str]
|
| 202 |
+
) -> str:
|
| 203 |
+
"""Build the user prompt with post content and context."""
|
| 204 |
+
brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
|
| 205 |
+
|
| 206 |
+
context_section = ""
|
| 207 |
+
if raw_thread_context:
|
| 208 |
+
# Truncate if too long
|
| 209 |
+
truncated_context = raw_thread_context[:1000] if len(raw_thread_context) > 1000 else raw_thread_context
|
| 210 |
+
context_section += f"THREAD CONTEXT (for understanding only):\n{truncated_context}\n\n"
|
| 211 |
+
|
| 212 |
+
if quoted_content:
|
| 213 |
+
truncated_quote = quoted_content[:500] if len(quoted_content) > 500 else quoted_content
|
| 214 |
+
context_section += f"QUOTED/PARENT CONTENT (for understanding only):\n{truncated_quote}\n\n"
|
| 215 |
+
|
| 216 |
+
keywords_info = ""
|
| 217 |
+
if keywords_found:
|
| 218 |
+
keywords_info = f"Keywords detected by preprocessor: {', '.join(keywords_found)}\n\n"
|
| 219 |
+
|
| 220 |
+
return f"""Analyze this post for {brand_name} relevance and extract facts.
|
| 221 |
+
|
| 222 |
+
{keywords_info}{context_section}POST CONTENT TO EVALUATE (base your decision ONLY on this):
|
| 223 |
+
\"\"\"{content}\"\"\"
|
| 224 |
+
|
| 225 |
+
Remember:
|
| 226 |
+
- is_relevant=true ONLY if POST CONTENT discusses {brand_name}
|
| 227 |
+
- products_mentioned must be from the exact product list provided
|
| 228 |
+
- competitors_mentioned should be brand names only (Zildjian, Paiste, etc.)
|
| 229 |
+
- thread_context_summary should be 1-2 sentences max
|
| 230 |
+
|
| 231 |
+
Return JSON only."""
|
| 232 |
+
|
| 233 |
+
def extract_and_validate(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 234 |
+
"""
|
| 235 |
+
Perform relevance check and fact extraction.
|
| 236 |
+
|
| 237 |
+
Args:
|
| 238 |
+
input_data: Preprocessed post data
|
| 239 |
+
|
| 240 |
+
Returns:
|
| 241 |
+
Dictionary with extraction results
|
| 242 |
+
"""
|
| 243 |
+
content = input_data.get("cleaned_content", "")
|
| 244 |
+
quoted_content = input_data.get("quoted_content", "")
|
| 245 |
+
raw_thread_context = input_data.get("raw_thread_context", "")
|
| 246 |
+
keywords_found = input_data.get("relevance_keywords_found", [])
|
| 247 |
+
|
| 248 |
+
try:
|
| 249 |
+
messages = [
|
| 250 |
+
SystemMessage(content=self._build_system_prompt()),
|
| 251 |
+
HumanMessage(content=self._build_user_prompt(
|
| 252 |
+
content, quoted_content, raw_thread_context, keywords_found
|
| 253 |
+
))
|
| 254 |
+
]
|
| 255 |
+
|
| 256 |
+
response = self.llm.invoke(messages)
|
| 257 |
+
result = self._parse_llm_json_response(response.content)
|
| 258 |
+
|
| 259 |
+
# Validate and normalize the response
|
| 260 |
+
validated = self._validate_response(result)
|
| 261 |
+
|
| 262 |
+
return {
|
| 263 |
+
"success": True,
|
| 264 |
+
**validated
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
except json.JSONDecodeError as e:
|
| 268 |
+
self.log_processing(f"JSON decode error: {e}", "warning")
|
| 269 |
+
return {
|
| 270 |
+
"success": False,
|
| 271 |
+
"error": f"JSON parse error: {str(e)}",
|
| 272 |
+
"is_relevant": False,
|
| 273 |
+
"relevance_confidence": "low",
|
| 274 |
+
"relevance_reason": "Failed to parse LLM response"
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
except Exception as e:
|
| 278 |
+
self.log_processing(f"Extraction error: {e}", "error")
|
| 279 |
+
return {
|
| 280 |
+
"success": False,
|
| 281 |
+
"error": str(e),
|
| 282 |
+
"is_relevant": False,
|
| 283 |
+
"relevance_confidence": "low",
|
| 284 |
+
"relevance_reason": f"Error during extraction: {str(e)}"
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
def _validate_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
|
| 288 |
+
"""Validate and normalize LLM response against allowed values."""
|
| 289 |
+
|
| 290 |
+
# Validate products
|
| 291 |
+
products = result.get("products_mentioned", [])
|
| 292 |
+
if not isinstance(products, list):
|
| 293 |
+
products = []
|
| 294 |
+
valid_products = [
|
| 295 |
+
p for p in products
|
| 296 |
+
if any(p.lower() == vp.lower() for vp in self.valid_products)
|
| 297 |
+
]
|
| 298 |
+
# Normalize to canonical case
|
| 299 |
+
normalized_products = []
|
| 300 |
+
for p in valid_products:
|
| 301 |
+
for vp in self.valid_products:
|
| 302 |
+
if p.lower() == vp.lower():
|
| 303 |
+
normalized_products.append(vp)
|
| 304 |
+
break
|
| 305 |
+
|
| 306 |
+
# Validate competitors
|
| 307 |
+
competitors = result.get("competitors_mentioned", [])
|
| 308 |
+
if not isinstance(competitors, list):
|
| 309 |
+
competitors = []
|
| 310 |
+
valid_competitors = [
|
| 311 |
+
c for c in competitors
|
| 312 |
+
if any(c.lower() == vc.lower() for vc in self.valid_competitors)
|
| 313 |
+
]
|
| 314 |
+
# Normalize to canonical case
|
| 315 |
+
normalized_competitors = []
|
| 316 |
+
for c in valid_competitors:
|
| 317 |
+
for vc in self.valid_competitors:
|
| 318 |
+
if c.lower() == vc.lower():
|
| 319 |
+
normalized_competitors.append(vc)
|
| 320 |
+
break
|
| 321 |
+
|
| 322 |
+
# Validate author_role
|
| 323 |
+
author_role = result.get("author_role", "unknown")
|
| 324 |
+
if author_role not in self.valid_author_roles:
|
| 325 |
+
author_role = "unknown"
|
| 326 |
+
|
| 327 |
+
# Validate sabian_mention_context
|
| 328 |
+
mention_context = result.get("sabian_mention_context")
|
| 329 |
+
is_relevant = result.get("is_relevant", False)
|
| 330 |
+
|
| 331 |
+
if not is_relevant:
|
| 332 |
+
mention_context = None
|
| 333 |
+
elif mention_context and mention_context not in self.valid_mention_contexts:
|
| 334 |
+
mention_context = "casual_mention" # Default for relevant posts
|
| 335 |
+
|
| 336 |
+
# Validate confidence
|
| 337 |
+
confidence = result.get("relevance_confidence", "medium")
|
| 338 |
+
if confidence not in ["high", "medium", "low"]:
|
| 339 |
+
confidence = "medium"
|
| 340 |
+
|
| 341 |
+
return {
|
| 342 |
+
"is_relevant": bool(is_relevant),
|
| 343 |
+
"relevance_confidence": confidence,
|
| 344 |
+
"relevance_reason": result.get("relevance_reason", ""),
|
| 345 |
+
"products_mentioned": normalized_products,
|
| 346 |
+
"sabian_mention_context": mention_context,
|
| 347 |
+
"author_role": author_role,
|
| 348 |
+
"competitors_mentioned": normalized_competitors,
|
| 349 |
+
"thread_context_summary": result.get("thread_context_summary", "")
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 353 |
+
"""
|
| 354 |
+
Process a post through relevance validation and fact extraction.
|
| 355 |
+
|
| 356 |
+
Args:
|
| 357 |
+
input_data: Dictionary from preprocessor containing:
|
| 358 |
+
- cleaned_content: Cleaned post text
|
| 359 |
+
- quoted_content: Quoted content if any
|
| 360 |
+
- raw_thread_context: Raw thread context
|
| 361 |
+
- relevance_keywords_found: Keywords from preprocessor
|
| 362 |
+
- preliminary_relevant: Preprocessor's relevance assessment
|
| 363 |
+
- needs_relevance_validation: Whether LLM validation needed
|
| 364 |
+
|
| 365 |
+
Returns:
|
| 366 |
+
Dictionary with extraction results and original data
|
| 367 |
+
"""
|
| 368 |
+
try:
|
| 369 |
+
if not self.validate_input(input_data):
|
| 370 |
+
return {
|
| 371 |
+
"success": False,
|
| 372 |
+
"error": "Invalid input: missing required fields",
|
| 373 |
+
"is_relevant": False,
|
| 374 |
+
**input_data
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
# Skip if already determined not relevant and no validation needed
|
| 378 |
+
if (not input_data.get("preliminary_relevant", False) and
|
| 379 |
+
not input_data.get("needs_relevance_validation", False)):
|
| 380 |
+
return {
|
| 381 |
+
"success": True,
|
| 382 |
+
"is_relevant": False,
|
| 383 |
+
"relevance_confidence": "high",
|
| 384 |
+
"relevance_reason": "No Sabian-related keywords found in post",
|
| 385 |
+
"products_mentioned": [],
|
| 386 |
+
"sabian_mention_context": None,
|
| 387 |
+
"author_role": "unknown",
|
| 388 |
+
"competitors_mentioned": input_data.get("competitors_detected", []),
|
| 389 |
+
"thread_context_summary": "",
|
| 390 |
+
"extraction_performed": False,
|
| 391 |
+
**input_data
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
# Skip non-English posts
|
| 395 |
+
if not input_data.get("is_english", True):
|
| 396 |
+
return {
|
| 397 |
+
"success": True,
|
| 398 |
+
"is_relevant": False,
|
| 399 |
+
"relevance_confidence": "high",
|
| 400 |
+
"relevance_reason": f"Non-English post: {input_data.get('detected_language')}",
|
| 401 |
+
"products_mentioned": [],
|
| 402 |
+
"sabian_mention_context": None,
|
| 403 |
+
"author_role": "unknown",
|
| 404 |
+
"competitors_mentioned": [],
|
| 405 |
+
"thread_context_summary": "",
|
| 406 |
+
"extraction_performed": False,
|
| 407 |
+
**input_data
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
# Perform LLM extraction
|
| 411 |
+
extraction_result = self.extract_and_validate(input_data)
|
| 412 |
+
|
| 413 |
+
# Merge results
|
| 414 |
+
result = {
|
| 415 |
+
**input_data,
|
| 416 |
+
**extraction_result,
|
| 417 |
+
"extraction_performed": True
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
# Log the result
|
| 421 |
+
self.log_processing(
|
| 422 |
+
f"Extraction complete: is_relevant={result.get('is_relevant')}, "
|
| 423 |
+
f"products={result.get('products_mentioned')}, "
|
| 424 |
+
f"context={result.get('sabian_mention_context')}",
|
| 425 |
+
"debug"
|
| 426 |
+
)
|
| 427 |
+
|
| 428 |
+
return result
|
| 429 |
+
|
| 430 |
+
except Exception as e:
|
| 431 |
+
return self.handle_error(e, "relevance extraction")
|
processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sabian Sentiment & Intent Analyzer Agent for brand sentiment analysis.
|
| 3 |
+
|
| 4 |
+
This agent performs deep analysis on VERIFIED relevant posts with STRUCTURED input.
|
| 5 |
+
It receives pre-validated data from the Relevance Extraction Agent including:
|
| 6 |
+
- Products already extracted and validated
|
| 7 |
+
- Thread context already summarized
|
| 8 |
+
- Author role already determined
|
| 9 |
+
|
| 10 |
+
Key Design Principles:
|
| 11 |
+
- Focused analysis: Only sentiment, intents, and customer journey
|
| 12 |
+
- No re-extraction: Products are given, not re-detected
|
| 13 |
+
- Sabian-specific sentiment: How author feels about Sabian, not overall post tone
|
| 14 |
+
- Author perspective: Pain points/delights only from author's own experience
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List
|
| 18 |
+
import json
|
| 19 |
+
from langchain_openai import ChatOpenAI
|
| 20 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 21 |
+
import logging
|
| 22 |
+
|
| 23 |
+
from .base_agent import BaseAgent
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SabianSentimentAnalyzerAgent(BaseAgent):
|
| 29 |
+
"""
|
| 30 |
+
Agent that performs deep sentiment and intent analysis on relevant posts.
|
| 31 |
+
|
| 32 |
+
This agent is the second LLM call in the pipeline and focuses purely on
|
| 33 |
+
analysis, not extraction. It receives structured input from the extraction
|
| 34 |
+
agent and produces sentiment, intent, and customer journey insights.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
config: Dict[str, Any],
|
| 40 |
+
api_key: str,
|
| 41 |
+
brand_config: Dict[str, Any],
|
| 42 |
+
analysis_categories: Dict[str, Any]
|
| 43 |
+
):
|
| 44 |
+
"""
|
| 45 |
+
Initialize the Sentiment Analyzer Agent.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
config: Agent configuration
|
| 49 |
+
api_key: OpenAI API key
|
| 50 |
+
brand_config: Brand-specific configuration
|
| 51 |
+
analysis_categories: Category definitions for analysis
|
| 52 |
+
"""
|
| 53 |
+
super().__init__("SabianSentimentAnalyzerAgent", config)
|
| 54 |
+
self.api_key = api_key
|
| 55 |
+
self.brand_config = brand_config
|
| 56 |
+
self.analysis_categories = analysis_categories
|
| 57 |
+
|
| 58 |
+
self.llm = ChatOpenAI(
|
| 59 |
+
model=self.model,
|
| 60 |
+
temperature=self.temperature,
|
| 61 |
+
api_key=self.api_key
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# Pre-compute valid values for validation
|
| 65 |
+
self._valid_values = self._compute_valid_values()
|
| 66 |
+
|
| 67 |
+
logger.info("SabianSentimentAnalyzerAgent initialized")
|
| 68 |
+
|
| 69 |
+
def _compute_valid_values(self) -> Dict[str, List[str]]:
|
| 70 |
+
"""Pre-compute all valid values from config for validation."""
|
| 71 |
+
valid = {}
|
| 72 |
+
|
| 73 |
+
# Products from brand config
|
| 74 |
+
valid["products"] = self.brand_config.get("brand", {}).get("products", [])
|
| 75 |
+
|
| 76 |
+
# Competitors
|
| 77 |
+
competitor_names = []
|
| 78 |
+
for comp in self.brand_config.get("brand", {}).get("competitors", []):
|
| 79 |
+
if isinstance(comp, dict):
|
| 80 |
+
competitor_names.append(comp.get("name", ""))
|
| 81 |
+
valid["competitors"] = competitor_names
|
| 82 |
+
|
| 83 |
+
# Extract category values from analysis_categories
|
| 84 |
+
category_map = {
|
| 85 |
+
"sentiment_level": "sentiment",
|
| 86 |
+
"emotion_type": "emotions",
|
| 87 |
+
"intents": "intents",
|
| 88 |
+
"purchase_stage": "purchase_stage",
|
| 89 |
+
"comparison_type": "comparison_type",
|
| 90 |
+
"feedback_aspects": "feedback_aspects",
|
| 91 |
+
"decision_drivers": "decision_drivers",
|
| 92 |
+
"product_attributes": "product_attributes",
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
for key, config_key in category_map.items():
|
| 96 |
+
config_section = self.analysis_categories.get(config_key, {})
|
| 97 |
+
if "categories" in config_section:
|
| 98 |
+
valid[key] = [c["value"] for c in config_section["categories"]]
|
| 99 |
+
elif "levels" in config_section:
|
| 100 |
+
valid[key] = [c["value"] for c in config_section["levels"]]
|
| 101 |
+
else:
|
| 102 |
+
valid[key] = []
|
| 103 |
+
|
| 104 |
+
return valid
|
| 105 |
+
|
| 106 |
+
def _get_valid_list(self, key: str) -> List[str]:
|
| 107 |
+
"""Get list of valid values for a category."""
|
| 108 |
+
return self._valid_values.get(key, [])
|
| 109 |
+
|
| 110 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 111 |
+
"""Validate that input contains required fields."""
|
| 112 |
+
required = ["cleaned_content", "is_relevant"]
|
| 113 |
+
return all(field in input_data for field in required)
|
| 114 |
+
|
| 115 |
+
    def _build_system_prompt(self) -> str:
        """
        Build the system prompt for sentiment analysis.

        Returns:
            A prompt string embedding the analysis rules and the per-field
            whitelists from `_valid_values`. Literal JSON braces in the
            output-format example are escaped as {{ }}.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        # Short alias so the whitelist table below stays readable.
        v = self._valid_values

        return f"""You are a sentiment analyst for {brand_name} cymbal discussions.

## YOUR TASK
Analyze the sentiment, emotions, and intents in posts about {brand_name}.
You will receive PRE-VALIDATED context (products, author role, etc.) - trust these values.

## CRITICAL RULES

### Rule 1: Neutral by Default
Sentiment defaults to NEUTRAL unless there is EXPLICIT positive or negative language toward {brand_name}.
- Factual statements = neutral
- Comparative statements ("sounds different", "not the same as") = neutral (different ≠ worse)
- Advice-giving without personal opinion = neutral

Only assign positive/negative sentiment when the author CLEARLY expresses satisfaction or dissatisfaction with {brand_name}.

### Rule 2: {brand_name}-Specific Sentiment
Sentiment MUST be about {brand_name} specifically, NOT overall post tone or other products.

EXAMPLE:
Post: "I have SBR cymbals and bought a Pearl crash. The Pearl sounds different from the SBR. Go with what feels best!"
- This is NEUTRAL toward {brand_name} - "different" is not criticism
- The author owns SBR (no complaint), is giving advice
- pain_points: [] (no negative experience expressed)
- delight_factors: [] (no positive experience expressed)

### Rule 3: Mutually Exclusive Feedback
pain_points and delight_factors CANNOT contain the same values.
- If an aspect is positive → delight_factors only
- If an aspect is negative → pain_points only
- Never both

### Rule 4: Author Perspective Only
These fields are ONLY for author's OWN experience, not advice to others:
- purchase_stage, decision_drivers, pain_points, delight_factors

If author is primarily giving ADVICE to someone else, these should be null/empty.

### Rule 5: Valid Values

| Field | Valid Values |
|-------|--------------|
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi-select) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |
| competitor brands | {v.get('competitors', [])} |

### Rule 6: Intent Classification
- seeking_information: Asking questions, seeking advice
- providing_information: Answering questions, giving advice
- sharing_experience: Personal experience, review, testimonial
- comparing: Comparing brands/products
- praising: Actively endorsing {brand_name}
- criticizing: Actively complaining about {brand_name}
- buying_selling: Listing gear for sale/trade
- general_discussion: General conversation

## OUTPUT FORMAT
```json
{{
    "sentiment_level": "neutral unless explicit positive/negative",
    "emotion_type": "value or null",
    "sentiment_confidence": "high" | "medium" | "low",
    "sarcasm_detected": false,
    "product_attributes": [],
    "competitor_products_owned": [],
    "comparison_type": "value or null",
    "intents": [],
    "purchase_stage": "value or null",
    "decision_drivers": [],
    "pain_points": [],
    "delight_factors": [],
    "analysis_notes": "1-2 sentences"
}}
```

Return ONLY valid JSON."""
|
| 202 |
+
|
| 203 |
+
    def _build_user_prompt(self, input_data: Dict[str, Any]) -> str:
        """
        Build the user prompt with structured, pre-validated context.

        Args:
            input_data: Output of the Relevance Extraction Agent; the fields
                read here (products_mentioned, author_role, ...) were already
                validated upstream and are injected verbatim.

        Returns:
            Prompt string: context block followed by the post content.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        content = input_data.get("cleaned_content", "")
        products_mentioned = input_data.get("products_mentioned", [])
        sabian_context = input_data.get("sabian_mention_context", "")
        author_role = input_data.get("author_role", "unknown")
        thread_summary = input_data.get("thread_context_summary", "")
        competitors_mentioned = input_data.get("competitors_mentioned", [])

        # Human-readable summary of the upstream extraction results.
        context_section = f"""## PRE-VALIDATED CONTEXT (trust these values)
- Products mentioned: {products_mentioned if products_mentioned else 'None specific'}
- {brand_name} mention context: {sabian_context}
- Author role: {author_role}
- Competitors mentioned: {competitors_mentioned if competitors_mentioned else 'None'}
- Thread summary: {thread_summary if thread_summary else 'Not available'}
"""

        return f"""Analyze this post about {brand_name} for sentiment and intents.

{context_section}
## POST CONTENT TO ANALYZE:
\"\"\"{content}\"\"\"

Remember:
- Sentiment is about {brand_name} ONLY, not overall post tone
- pain_points/delight_factors only from author's OWN experience
- Use only values from the valid lists provided

Return JSON only."""
|
| 234 |
+
|
| 235 |
+
def analyze_post(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform sentiment and intent analysis.

    Args:
        input_data: Structured data from extraction agent

    Returns:
        Dictionary with analysis results; always contains a "success" flag,
        and on failure an "error" message.
    """
    try:
        chat = [
            SystemMessage(content=self._build_system_prompt()),
            HumanMessage(content=self._build_user_prompt(input_data)),
        ]

        reply = self.llm.invoke(chat)
        parsed = self._parse_llm_json_response(reply.content)

        # Clamp every field to its allowed vocabulary before returning.
        clean = self._validate_and_normalize(parsed)

        return {"success": True, **clean}

    except json.JSONDecodeError as e:
        # Unparseable LLM output: degrade to a neutral placeholder result.
        self.log_processing(f"JSON decode error: {e}", "warning")
        return {
            "success": False,
            "error": f"JSON parse error: {str(e)}",
            "sentiment_level": "neutral",
            "intents": ["general_discussion"],
        }

    except Exception as e:
        self.log_processing(f"Analysis error: {e}", "error")
        return {"success": False, "error": str(e)}
|
| 271 |
+
|
| 272 |
+
def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
|
| 273 |
+
"""Validate single value against list, return canonical form or default."""
|
| 274 |
+
if value is None:
|
| 275 |
+
return default
|
| 276 |
+
if isinstance(value, str):
|
| 277 |
+
val_lower = value.lower()
|
| 278 |
+
for v in valid_list:
|
| 279 |
+
if v.lower() == val_lower:
|
| 280 |
+
return v
|
| 281 |
+
return default
|
| 282 |
+
|
| 283 |
+
def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
|
| 284 |
+
"""Validate list values, return only valid items in canonical form."""
|
| 285 |
+
if not values:
|
| 286 |
+
return []
|
| 287 |
+
if not isinstance(values, list):
|
| 288 |
+
values = [values]
|
| 289 |
+
|
| 290 |
+
validated = []
|
| 291 |
+
valid_lower = {v.lower(): v for v in valid_list}
|
| 292 |
+
for val in values:
|
| 293 |
+
if isinstance(val, str) and val.lower() in valid_lower:
|
| 294 |
+
validated.append(valid_lower[val.lower()])
|
| 295 |
+
return validated
|
| 296 |
+
|
| 297 |
+
def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
    """Validate all fields against predefined values and normalize.

    Every field of the raw LLM result is clamped to its allowed vocabulary
    via _validate_single / _validate_list; unknown values are dropped or
    replaced with a safe default so downstream consumers never see
    out-of-vocabulary data.

    Args:
        result: Parsed JSON dict returned by the LLM.

    Returns:
        Dict with the full analysis schema, all values validated.
    """
    # Allowed-value lists, keyed by field group (built elsewhere in the class).
    v = self._valid_values

    normalized = {
        # Sentiment: level defaults to "neutral"; emotion may legitimately be null.
        "sentiment_level": self._validate_single(
            result.get("sentiment_level"), v["sentiment_level"], "neutral"
        ),
        "emotion_type": self._validate_single(
            result.get("emotion_type"), v["emotion_type"], None
        ),
        # Confidence is range-checked below rather than via _validate_single.
        "sentiment_confidence": result.get("sentiment_confidence", "medium"),
        "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

        # Product info
        "product_attributes": self._validate_list(
            result.get("product_attributes"), v["product_attributes"]
        ),

        # Competitors
        "competitor_products_owned": self._validate_list(
            result.get("competitor_products_owned"), v["competitors"]
        ),
        "comparison_type": self._validate_single(
            result.get("comparison_type"), v["comparison_type"], None
        ),

        # Intents: never left empty — fall back to "general_discussion".
        "intents": self._validate_list(
            result.get("intents"), v["intents"]
        ) or ["general_discussion"],

        # Author journey (null if advising others)
        "purchase_stage": self._validate_single(
            result.get("purchase_stage"), v["purchase_stage"], None
        ),
        "decision_drivers": self._validate_list(
            result.get("decision_drivers"), v["decision_drivers"]
        ),

        # Feedback - both pain and delight draw from the same feedback_aspects vocabulary
        "pain_points": self._validate_list(
            result.get("pain_points"), v["feedback_aspects"]
        ),
        "delight_factors": self._validate_list(
            result.get("delight_factors"), v["feedback_aspects"]
        ),

        # Notes: free text, passed through unvalidated.
        "analysis_notes": result.get("analysis_notes", ""),
    }

    # Confidence must be one of the three levels; anything else becomes "medium".
    if normalized["sentiment_confidence"] not in ["high", "medium", "low"]:
        normalized["sentiment_confidence"] = "medium"

    return normalized
|
| 355 |
+
|
| 356 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a post through sentiment and intent analysis.

    Skips analysis (returning a fully-populated null result) for invalid,
    non-relevant, or non-English input; otherwise delegates to analyze_post.

    Args:
        input_data: Dictionary from extraction agent containing:
            - cleaned_content: Post text
            - is_relevant: Relevance determination
            - products_mentioned: Pre-validated products
            - sabian_mention_context: How Sabian is discussed
            - author_role: Author's relationship to Sabian
            - thread_context_summary: Summarized context
            - competitors_mentioned: Competitor brands

    Returns:
        Dictionary with analysis results and original data
    """
    try:
        if not self.validate_input(input_data):
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                **input_data
            }

        # Skip non-relevant posts.
        # NOTE: **input_data is spread LAST in these skip branches, so any
        # analysis-field keys already present on the input would override
        # the null defaults above them — presumably intentional; confirm.
        if not input_data.get("is_relevant", False):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": "Post marked as not relevant",
                "sentiment_level": None,
                "emotion_type": None,
                "sentiment_confidence": None,
                "sarcasm_detected": False,
                "product_attributes": [],
                "competitor_products_owned": [],
                "comparison_type": None,
                "intents": [],
                "purchase_stage": None,
                "decision_drivers": [],
                "pain_points": [],
                "delight_factors": [],
                "analysis_notes": "",
                **input_data
            }

        # Skip non-English posts (should already be filtered, but double-check)
        if not input_data.get("is_english", True):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                "sentiment_level": None,
                "emotion_type": None,
                "intents": [],
                **input_data
            }

        # Perform analysis (LLM call + validation).
        analysis_result = self.analyze_post(input_data)

        # Here the analysis result is spread AFTER the input, so analysis
        # fields win over any same-named keys on the input.
        result = {
            **input_data,
            **analysis_result,
            "analysis_skipped": False
        }

        self.log_processing(
            f"Analyzed: sentiment={result.get('sentiment_level')}, "
            f"intents={result.get('intents')}, "
            f"pain_points={result.get('pain_points')}",
            "debug"
        )

        return result

    except Exception as e:
        return self.handle_error(e, "sentiment analysis")
|
processing_brand_sentiment/workflow/comment_orchestrator.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comment Analysis Workflow Orchestrator using LangGraph.
|
| 3 |
+
|
| 4 |
+
Coordinates the 4-agent pipeline for social media comments:
|
| 5 |
+
1. CommentPreprocessorAgent - Plain text cleaning, keyword detection (no LLM)
|
| 6 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) [shared]
|
| 7 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) [shared]
|
| 8 |
+
4. OutputValidatorAgent - Rule-based validation (no LLM) [shared]
|
| 9 |
+
|
| 10 |
+
Architecture v4.0:
|
| 11 |
+
- Same analysis pipeline as forums, different preprocessing and state
|
| 12 |
+
- Plain text input (no HTML parsing)
|
| 13 |
+
- Context from social media content metadata and parent comments
|
| 14 |
+
- Comment-specific identifiers (comment_sk, comment_id, platform, etc.)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List, TypedDict, Annotated, Optional
|
| 18 |
+
import operator
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
from langgraph.graph import StateGraph, END
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
from .agents.comment_preprocessor_agent import CommentPreprocessorAgent
|
| 25 |
+
from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 26 |
+
from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 27 |
+
from .agents.output_validator_agent import OutputValidatorAgent
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class CommentAnalysisState(TypedDict):
    """
    State definition for the comment analysis workflow v4.0.

    Uses comment-specific identifiers but shares the same analysis fields
    as the forum workflow for consistent output. Fields are grouped by the
    pipeline stage that writes them.
    """
    # ============== Source Identifiers (Comment-specific) ==============
    comment_sk: int
    comment_id: str
    platform: str
    comment_timestamp: Any  # timestamp type depends on the source DB driver
    author_name: str
    author_id: str
    parent_comment_id: str
    parent_comment_text: str

    # Content metadata (the post/video the comment was left on)
    content_sk: int
    content_id: str
    content_description: str
    content_title: str
    channel_sk: int
    channel_name: str
    channel_display_name: str

    # ============== Original Content ==============
    comment_text: str
    original_text: str

    # ============== Preprocessor Output ==============
    cleaned_content: str
    quoted_content: str
    has_quote: bool
    quoted_author: str
    raw_thread_context: str  # Comment context (reuses field name for agent compatibility)
    is_empty: bool

    # Language detection
    detected_language: str
    language_code: str
    is_english: bool
    language_confidence: str
    language_detection_skipped: bool

    # Preliminary relevance (keyword-based, pre-LLM)
    preliminary_relevant: bool
    needs_relevance_validation: bool
    relevance_keywords_found: List[str]
    relevance_type: str
    has_primary_keywords: bool

    # Initial detections (keyword matches, not yet LLM-validated)
    products_detected: List[str]
    competitors_detected: List[str]

    # ============== Extraction Agent Output ==============
    is_relevant: bool
    relevance_confidence: str
    relevance_reason: str
    extraction_performed: bool

    # Extracted facts
    products_mentioned: List[str]
    sabian_mention_context: str
    author_role: str
    competitors_mentioned: List[str]
    thread_context_summary: str

    # ============== Sentiment Analyzer Output ==============
    sentiment_level: str
    emotion_type: str
    sentiment_confidence: str
    sarcasm_detected: bool

    # Product information
    product_attributes: List[str]

    # Competitive intelligence
    competitor_products_owned: List[str]
    comparison_type: str

    # Customer journey (AUTHOR PERSPECTIVE ONLY)
    intents: List[str]
    purchase_stage: str
    decision_drivers: List[str]
    pain_points: List[str]
    delight_factors: List[str]

    # Analysis notes
    analysis_notes: str
    analysis_skipped: bool
    analysis_skip_reason: str

    # ============== Validator Output ==============
    validation_passed: bool
    validation_errors: List[str]
    validation_warnings: List[str]
    validation_flags: List[str]
    processing_status: str

    # ============== Processing Metadata ==============
    # Annotated with operator.add so LangGraph concatenates error lists
    # across nodes instead of overwriting them.
    processing_errors: Annotated[List[str], operator.add]
    success: bool
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class CommentAnalysisWorkflow:
|
| 139 |
+
"""
|
| 140 |
+
LangGraph-based workflow for comment brand sentiment analysis v4.0.
|
| 141 |
+
|
| 142 |
+
Pipeline:
|
| 143 |
+
1. Comment Preprocessor (no LLM) - plain text, comment context
|
| 144 |
+
2. Relevance & Extraction Agent (LLM #1) - shared with forums
|
| 145 |
+
3. Sentiment Analyzer Agent (LLM #2) - shared with forums
|
| 146 |
+
4. Output Validator (no LLM) - shared with forums
|
| 147 |
+
"""
|
| 148 |
+
|
| 149 |
+
def __init__(
    self,
    workflow_config: Dict[str, Any],
    brand_config: Dict[str, Any],
    analysis_categories: Dict[str, Any],
    api_key: str
):
    """
    Initialize the workflow with agents and configuration.

    Args:
        workflow_config: Workflow and agent configuration
        brand_config: Brand-specific configuration
        analysis_categories: Analysis category definitions
        api_key: OpenAI API key
    """
    self.workflow_config = workflow_config
    self.brand_config = brand_config
    self.analysis_categories = analysis_categories
    self.api_key = api_key

    # Instantiate the 4 pipeline agents from the stored configuration.
    self._init_agents()

    # Build and compile the LangGraph workflow graph.
    self.workflow = self._build_workflow()

    logger.info("CommentAnalysisWorkflow v4.0 initialized successfully")
|
| 177 |
+
|
| 178 |
+
def _init_agents(self) -> None:
    """Initialize all agents with their configurations.

    Each agent reads its own section of workflow_config["agents"]; the two
    LLM agents accept an alternate config key as a fallback.
    """
    agents_config = self.workflow_config.get("agents", {})

    # 1. Comment Preprocessor Agent (no LLM) - comment-specific
    self.preprocessor = CommentPreprocessorAgent(
        agents_config.get("preprocessor", {}),
        self.brand_config,
    )

    # 2. Relevance & Extraction Agent (LLM #1) - shared with forums.
    # Falls back to the "relevance_validator" key when
    # "relevance_extraction" is absent from the config.
    self.extraction_agent = SabianRelevanceExtractionAgent(
        agents_config.get(
            "relevance_extraction",
            agents_config.get("relevance_validator", {}),
        ),
        self.api_key,
        self.brand_config,
        self.analysis_categories,
    )

    # 3. Sentiment Analyzer Agent (LLM #2) - shared with forums.
    # Falls back to the "brand_analyzer" key when "sentiment_analyzer"
    # is absent from the config.
    self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
        agents_config.get(
            "sentiment_analyzer",
            agents_config.get("brand_analyzer", {}),
        ),
        self.api_key,
        self.brand_config,
        self.analysis_categories,
    )

    # 4. Output Validator Agent (no LLM) - shared with forums
    self.output_validator = OutputValidatorAgent(
        agents_config.get("output_validator", {}),
        self.brand_config,
        self.analysis_categories,
    )

    logger.info("All 4 agents initialized for comment processing")
|
| 220 |
+
|
| 221 |
+
def _build_workflow(self) -> StateGraph:
    """
    Build the LangGraph workflow.

    Flow:
        preprocessing -> extraction -> (analysis if relevant) -> validation -> END

    Returns:
        Compiled StateGraph workflow
    """
    graph = StateGraph(CommentAnalysisState)

    # Register the four pipeline stages as nodes.
    graph.add_node("preprocessing", self._preprocessing_node)
    graph.add_node("extraction", self._extraction_node)
    graph.add_node("analysis", self._analysis_node)
    graph.add_node("validation", self._validation_node)

    graph.set_entry_point("preprocessing")

    # Preprocessing may short-circuit straight to validation (empty,
    # non-English, or keyword-irrelevant comments).
    graph.add_conditional_edges(
        "preprocessing",
        self._route_after_preprocessing,
        {"extract": "extraction", "skip_to_validation": "validation"},
    )

    # Extraction gates the second (expensive) LLM call on relevance.
    graph.add_conditional_edges(
        "extraction",
        self._route_after_extraction,
        {"analyze": "analysis", "skip_to_validation": "validation"},
    )

    graph.add_edge("analysis", "validation")
    graph.add_edge("validation", END)

    return graph.compile()
|
| 265 |
+
|
| 266 |
+
def _preprocessing_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
    """
    Preprocessing node: Plain text cleaning, language detection, keyword check.

    Copies the CommentPreprocessorAgent's output fields into the workflow
    state; on failure, appends to processing_errors and flags success=False.
    """
    try:
        # Only the fields the preprocessor needs are forwarded.
        input_data = {
            "comment_sk": state.get("comment_sk"),
            "comment_text": state.get("comment_text", ""),
            "content_title": state.get("content_title"),
            "content_description": state.get("content_description"),
            "parent_comment_text": state.get("parent_comment_text")
        }

        result = self.preprocessor.process(input_data)

        if result.get("success", False):
            # Content
            state["cleaned_content"] = result.get("cleaned_content", "")
            state["quoted_content"] = result.get("quoted_content")
            state["has_quote"] = result.get("has_quote", False)
            state["quoted_author"] = result.get("quoted_author")
            state["raw_thread_context"] = result.get("raw_thread_context", "")
            state["is_empty"] = result.get("is_empty", False)
            # Fall back to the raw comment text when the preprocessor did
            # not echo the original back.
            state["original_text"] = result.get("original_text", state.get("comment_text", ""))

            # Language
            state["detected_language"] = result.get("detected_language", "English")
            state["language_code"] = result.get("language_code", "en")
            state["is_english"] = result.get("is_english", True)
            state["language_confidence"] = result.get("language_confidence", "low")
            state["language_detection_skipped"] = result.get("language_detection_skipped", False)

            # Relevance (keyword-based preliminary pass)
            state["preliminary_relevant"] = result.get("preliminary_relevant", False)
            state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
            state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
            state["relevance_type"] = result.get("relevance_type", "none")
            state["has_primary_keywords"] = result.get("has_primary_keywords", False)

            # Detections
            state["products_detected"] = result.get("products_detected", [])
            state["competitors_detected"] = result.get("competitors_detected", [])

            state["success"] = True
        else:
            error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False

        logger.debug(f"Preprocessing complete for comment {state.get('comment_sk')}")
        return state

    except Exception as e:
        # Record the failure but return state so the graph can still finish.
        error_msg = f"Preprocessing node error: {str(e)}"
        logger.error(error_msg)
        state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
        state["success"] = False
        return state
|
| 324 |
+
|
| 325 |
+
def _extraction_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
    """
    Extraction node: LLM-based relevance validation and fact extraction.
    Reuses the same extraction agent as forums.

    On exception, marks the comment not relevant with low confidence so the
    pipeline degrades to the skip path instead of aborting.
    """
    try:
        # Forward preprocessor output plus language info to the LLM agent.
        input_data = {
            "cleaned_content": state.get("cleaned_content", ""),
            "quoted_content": state.get("quoted_content"),
            "raw_thread_context": state.get("raw_thread_context", ""),
            "relevance_keywords_found": state.get("relevance_keywords_found", []),
            "preliminary_relevant": state.get("preliminary_relevant", False),
            "needs_relevance_validation": state.get("needs_relevance_validation", True),
            "products_detected": state.get("products_detected", []),
            "competitors_detected": state.get("competitors_detected", []),
            "is_english": state.get("is_english", True),
            "detected_language": state.get("detected_language", "English")
        }

        result = self.extraction_agent.process(input_data)

        # Update state with extraction results
        state["is_relevant"] = result.get("is_relevant", False)
        state["relevance_confidence"] = result.get("relevance_confidence", "low")
        state["relevance_reason"] = result.get("relevance_reason", "")
        state["extraction_performed"] = result.get("extraction_performed", True)

        # Extracted facts (sabian_mention_context has no default: may be None)
        state["products_mentioned"] = result.get("products_mentioned", [])
        state["sabian_mention_context"] = result.get("sabian_mention_context")
        state["author_role"] = result.get("author_role", "unknown")
        state["competitors_mentioned"] = result.get("competitors_mentioned", [])
        state["thread_context_summary"] = result.get("thread_context_summary", "")

        # Agent-level failures are recorded but do not halt the workflow.
        if not result.get("success", False) and result.get("error"):
            state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]

        logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
        return state

    except Exception as e:
        error_msg = f"Extraction node error: {str(e)}"
        logger.error(error_msg)
        state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
        state["is_relevant"] = False
        state["relevance_confidence"] = "low"
        return state
|
| 372 |
+
|
| 373 |
+
def _analysis_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
    """
    Analysis node: Deep sentiment and intent analysis for relevant comments.
    Reuses the same sentiment analyzer as forums.

    Only reached when the extraction node marked the comment relevant
    (see _route_after_extraction).
    """
    try:
        # Assemble the analyzer input from extraction-stage facts.
        input_data = {
            "cleaned_content": state.get("cleaned_content", ""),
            "is_relevant": state.get("is_relevant", True),
            "is_english": state.get("is_english", True),
            "detected_language": state.get("detected_language", "English"),
            "products_mentioned": state.get("products_mentioned", []),
            "sabian_mention_context": state.get("sabian_mention_context"),
            "author_role": state.get("author_role", "unknown"),
            "competitors_mentioned": state.get("competitors_mentioned", []),
            "thread_context_summary": state.get("thread_context_summary", "")
        }

        result = self.sentiment_analyzer.process(input_data)

        if result.get("success", False):
            # Sentiment (level/emotion have no defaults: may be None)
            state["sentiment_level"] = result.get("sentiment_level")
            state["emotion_type"] = result.get("emotion_type")
            state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
            state["sarcasm_detected"] = result.get("sarcasm_detected", False)

            # Products
            state["product_attributes"] = result.get("product_attributes", [])

            # Competitive
            state["competitor_products_owned"] = result.get("competitor_products_owned", [])
            state["comparison_type"] = result.get("comparison_type")

            # Journey
            state["intents"] = result.get("intents", [])
            state["purchase_stage"] = result.get("purchase_stage")
            state["decision_drivers"] = result.get("decision_drivers", [])
            state["pain_points"] = result.get("pain_points", [])
            state["delight_factors"] = result.get("delight_factors", [])

            # Notes
            state["analysis_notes"] = result.get("analysis_notes", "")
            state["analysis_skipped"] = result.get("analysis_skipped", False)
            state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
        else:
            error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]

        logger.debug(f"Analysis complete for comment {state.get('comment_sk')}")
        return state

    except Exception as e:
        error_msg = f"Analysis node error: {str(e)}"
        logger.error(error_msg)
        state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
        return state
|
| 430 |
+
|
| 431 |
+
def _validation_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
    """
    Validation node: Rule-based validation and anomaly detection.
    Reuses the same output validator as forums.

    Terminal node: also computes the workflow-level success flag.
    """
    try:
        # The validator inspects the full accumulated state.
        result = self.output_validator.process(dict(state))

        state["validation_passed"] = result.get("validation_passed", True)
        state["validation_errors"] = result.get("validation_errors", [])
        state["validation_warnings"] = result.get("validation_warnings", [])
        state["validation_flags"] = result.get("validation_flags", [])
        state["processing_status"] = result.get("processing_status", "completed")

        # Set overall success: no errors, OR a relevance verdict was still
        # reached despite errors. NOTE(review): `is_relevant is not None`
        # makes success True whenever extraction produced any verdict even
        # with accumulated errors — confirm this is intentional.
        has_errors = len(state.get("processing_errors", [])) > 0
        state["success"] = not has_errors or state.get("is_relevant") is not None

        logger.debug(f"Validation complete: status={state['processing_status']}")
        return state

    except Exception as e:
        error_msg = f"Validation node error: {str(e)}"
        logger.error(error_msg)
        state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
        state["validation_passed"] = False
        state["processing_status"] = "validation_failed"
        state["success"] = False
        return state
|
| 460 |
+
|
| 461 |
+
def _route_after_preprocessing(self, state: CommentAnalysisState) -> str:
    """Determine routing after preprocessing.

    Returns "skip_to_validation" for empty, non-English, or
    keyword-irrelevant comments; otherwise "extract".

    NOTE(review): this router also mutates state (is_relevant,
    relevance_reason). Conditional-edge functions in LangGraph select a
    branch by return value — confirm that in-place mutations made here are
    actually propagated to the graph state in the LangGraph version in use.
    """
    # Nothing left to analyze after cleaning.
    if state.get("is_empty", False):
        state["is_relevant"] = False
        state["relevance_reason"] = "Empty content"
        return "skip_to_validation"

    # Pipeline analyzes English content only.
    if not state.get("is_english", True):
        state["is_relevant"] = False
        state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
        return "skip_to_validation"

    # No keyword hit and nothing ambiguous enough to need LLM validation.
    if (not state.get("preliminary_relevant", False) and
        not state.get("needs_relevance_validation", False)):
        state["is_relevant"] = False
        state["relevance_reason"] = "No relevant keywords found"
        return "skip_to_validation"

    return "extract"
|
| 480 |
+
|
| 481 |
+
def _route_after_extraction(self, state: CommentAnalysisState) -> str:
|
| 482 |
+
"""Determine routing after extraction."""
|
| 483 |
+
if state.get("is_relevant", False):
|
| 484 |
+
return "analyze"
|
| 485 |
+
return "skip_to_validation"
|
| 486 |
+
|
| 487 |
+
def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 488 |
+
"""
|
| 489 |
+
Process a single social media comment through the workflow.
|
| 490 |
+
|
| 491 |
+
Args:
|
| 492 |
+
comment_data: Dictionary containing comment data
|
| 493 |
+
|
| 494 |
+
Returns:
|
| 495 |
+
Dictionary with processed results
|
| 496 |
+
"""
|
| 497 |
+
try:
|
| 498 |
+
initial_state = {
|
| 499 |
+
# Comment identifiers
|
| 500 |
+
"comment_sk": comment_data.get("comment_sk"),
|
| 501 |
+
"comment_id": comment_data.get("comment_id"),
|
| 502 |
+
"platform": comment_data.get("platform"),
|
| 503 |
+
"comment_timestamp": comment_data.get("comment_timestamp"),
|
| 504 |
+
"author_name": comment_data.get("author_name"),
|
| 505 |
+
"author_id": comment_data.get("author_id"),
|
| 506 |
+
"parent_comment_id": comment_data.get("parent_comment_id"),
|
| 507 |
+
"parent_comment_text": comment_data.get("parent_comment_text"),
|
| 508 |
+
|
| 509 |
+
# Content metadata
|
| 510 |
+
"content_sk": comment_data.get("content_sk"),
|
| 511 |
+
"content_id": comment_data.get("content_id"),
|
| 512 |
+
"content_description": comment_data.get("content_description"),
|
| 513 |
+
"content_title": comment_data.get("content_title"),
|
| 514 |
+
"channel_sk": comment_data.get("channel_sk"),
|
| 515 |
+
"channel_name": comment_data.get("channel_name"),
|
| 516 |
+
"channel_display_name": comment_data.get("channel_display_name"),
|
| 517 |
+
|
| 518 |
+
# Comment text
|
| 519 |
+
"comment_text": comment_data.get("comment_text", ""),
|
| 520 |
+
|
| 521 |
+
# Processing metadata
|
| 522 |
+
"processing_errors": [],
|
| 523 |
+
"success": True
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
final_state = self.workflow.invoke(initial_state)
|
| 527 |
+
|
| 528 |
+
return dict(final_state)
|
| 529 |
+
|
| 530 |
+
except Exception as e:
|
| 531 |
+
logger.error(f"Workflow execution error: {str(e)}")
|
| 532 |
+
return {
|
| 533 |
+
**comment_data,
|
| 534 |
+
"success": False,
|
| 535 |
+
"processing_errors": [str(e)],
|
| 536 |
+
"processing_status": "workflow_error"
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 540 |
+
"""
|
| 541 |
+
Process a batch of social media comments.
|
| 542 |
+
|
| 543 |
+
Args:
|
| 544 |
+
comments: List of comment dictionaries
|
| 545 |
+
|
| 546 |
+
Returns:
|
| 547 |
+
List of processed comment dictionaries
|
| 548 |
+
"""
|
| 549 |
+
results = []
|
| 550 |
+
total = len(comments)
|
| 551 |
+
|
| 552 |
+
for idx, comment in enumerate(comments, 1):
|
| 553 |
+
logger.info(f"Processing comment {idx}/{total} (SK: {comment.get('comment_sk')})")
|
| 554 |
+
result = self.process_comment(comment)
|
| 555 |
+
results.append(result)
|
| 556 |
+
|
| 557 |
+
logger.info(f"Batch processing complete: {total} comments processed")
|
| 558 |
+
return results
|
processing_brand_sentiment/workflow/orchestrator.py
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Brand Analysis Workflow Orchestrator using LangGraph.
|
| 3 |
+
|
| 4 |
+
Coordinates the 4-agent pipeline:
|
| 5 |
+
1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (no LLM)
|
| 6 |
+
2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1)
|
| 7 |
+
3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2)
|
| 8 |
+
4. OutputValidatorAgent - Rule-based validation (no LLM)
|
| 9 |
+
|
| 10 |
+
Architecture v4.0:
|
| 11 |
+
- Separation of concerns: extraction vs analysis
|
| 12 |
+
- Strict validation at every step
|
| 13 |
+
- Structured data flow between agents
|
| 14 |
+
- Conservative relevance determination
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Any, List, TypedDict, Annotated, Optional
|
| 18 |
+
import operator
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
from langgraph.graph import StateGraph, END
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
from .agents.content_preprocessor_agent import ContentPreprocessorAgent
|
| 25 |
+
from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
|
| 26 |
+
from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
|
| 27 |
+
from .agents.output_validator_agent import OutputValidatorAgent
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class BrandAnalysisState(TypedDict):
|
| 33 |
+
"""
|
| 34 |
+
State definition for the brand analysis workflow v4.0.
|
| 35 |
+
|
| 36 |
+
This state flows through all agents, accumulating data at each step.
|
| 37 |
+
"""
|
| 38 |
+
# ============== Source Identifiers ==============
|
| 39 |
+
post_id: int
|
| 40 |
+
thread_id: int
|
| 41 |
+
post_author_id: int
|
| 42 |
+
|
| 43 |
+
# ============== Original Content ==============
|
| 44 |
+
post_content: str
|
| 45 |
+
original_content: str
|
| 46 |
+
|
| 47 |
+
# ============== Thread Context ==============
|
| 48 |
+
thread_title: str
|
| 49 |
+
thread_first_post: str
|
| 50 |
+
thread_started_at: Any
|
| 51 |
+
category_title: str
|
| 52 |
+
category_topic: str
|
| 53 |
+
|
| 54 |
+
# ============== Timestamps ==============
|
| 55 |
+
post_created_at: Any
|
| 56 |
+
|
| 57 |
+
# ============== Preprocessor Output ==============
|
| 58 |
+
cleaned_content: str
|
| 59 |
+
quoted_content: str
|
| 60 |
+
has_quote: bool
|
| 61 |
+
quoted_author: str
|
| 62 |
+
raw_thread_context: str # Raw context for extraction agent
|
| 63 |
+
is_empty: bool
|
| 64 |
+
|
| 65 |
+
# Language detection
|
| 66 |
+
detected_language: str
|
| 67 |
+
language_code: str
|
| 68 |
+
is_english: bool
|
| 69 |
+
language_confidence: str
|
| 70 |
+
language_detection_skipped: bool
|
| 71 |
+
|
| 72 |
+
# Preliminary relevance (keyword-based)
|
| 73 |
+
preliminary_relevant: bool
|
| 74 |
+
needs_relevance_validation: bool
|
| 75 |
+
relevance_keywords_found: List[str]
|
| 76 |
+
relevance_type: str
|
| 77 |
+
has_primary_keywords: bool
|
| 78 |
+
|
| 79 |
+
# Initial detections
|
| 80 |
+
products_detected: List[str]
|
| 81 |
+
competitors_detected: List[str]
|
| 82 |
+
|
| 83 |
+
# ============== Extraction Agent Output ==============
|
| 84 |
+
is_relevant: bool
|
| 85 |
+
relevance_confidence: str
|
| 86 |
+
relevance_reason: str
|
| 87 |
+
extraction_performed: bool
|
| 88 |
+
|
| 89 |
+
# Extracted facts
|
| 90 |
+
products_mentioned: List[str]
|
| 91 |
+
sabian_mention_context: str # primary_focus, significant_mention, casual_mention, comparison_context
|
| 92 |
+
author_role: str # current_owner, past_owner, potential_buyer, never_owned, unknown
|
| 93 |
+
competitors_mentioned: List[str]
|
| 94 |
+
thread_context_summary: str # NEW: Summarized context for storage and analysis
|
| 95 |
+
|
| 96 |
+
# ============== Sentiment Analyzer Output ==============
|
| 97 |
+
sentiment_level: str
|
| 98 |
+
emotion_type: str
|
| 99 |
+
sentiment_confidence: str
|
| 100 |
+
sarcasm_detected: bool
|
| 101 |
+
|
| 102 |
+
# Product information
|
| 103 |
+
product_attributes: List[str]
|
| 104 |
+
|
| 105 |
+
# Competitive intelligence
|
| 106 |
+
competitor_products_owned: List[str]
|
| 107 |
+
comparison_type: str
|
| 108 |
+
|
| 109 |
+
# Customer journey (AUTHOR PERSPECTIVE ONLY)
|
| 110 |
+
intents: List[str]
|
| 111 |
+
purchase_stage: str
|
| 112 |
+
decision_drivers: List[str]
|
| 113 |
+
pain_points: List[str]
|
| 114 |
+
delight_factors: List[str]
|
| 115 |
+
|
| 116 |
+
# Analysis notes
|
| 117 |
+
analysis_notes: str
|
| 118 |
+
analysis_skipped: bool
|
| 119 |
+
analysis_skip_reason: str
|
| 120 |
+
|
| 121 |
+
# ============== Validator Output ==============
|
| 122 |
+
validation_passed: bool
|
| 123 |
+
validation_errors: List[str]
|
| 124 |
+
validation_warnings: List[str]
|
| 125 |
+
validation_flags: List[str]
|
| 126 |
+
processing_status: str # completed, completed_with_flags, validation_failed
|
| 127 |
+
|
| 128 |
+
# ============== Processing Metadata ==============
|
| 129 |
+
processing_errors: Annotated[List[str], operator.add]
|
| 130 |
+
success: bool
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class BrandAnalysisWorkflow:
|
| 134 |
+
"""
|
| 135 |
+
LangGraph-based workflow for brand sentiment analysis v4.0.
|
| 136 |
+
|
| 137 |
+
Pipeline:
|
| 138 |
+
1. Content Preprocessor (no LLM)
|
| 139 |
+
2. Relevance & Extraction Agent (LLM #1)
|
| 140 |
+
3. Sentiment Analyzer Agent (LLM #2) - only for relevant posts
|
| 141 |
+
4. Output Validator (no LLM)
|
| 142 |
+
"""
|
| 143 |
+
|
| 144 |
+
def __init__(
|
| 145 |
+
self,
|
| 146 |
+
workflow_config: Dict[str, Any],
|
| 147 |
+
brand_config: Dict[str, Any],
|
| 148 |
+
analysis_categories: Dict[str, Any],
|
| 149 |
+
api_key: str
|
| 150 |
+
):
|
| 151 |
+
"""
|
| 152 |
+
Initialize the workflow with agents and configuration.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
workflow_config: Workflow and agent configuration
|
| 156 |
+
brand_config: Brand-specific configuration
|
| 157 |
+
analysis_categories: Analysis category definitions
|
| 158 |
+
api_key: OpenAI API key
|
| 159 |
+
"""
|
| 160 |
+
self.workflow_config = workflow_config
|
| 161 |
+
self.brand_config = brand_config
|
| 162 |
+
self.analysis_categories = analysis_categories
|
| 163 |
+
self.api_key = api_key
|
| 164 |
+
|
| 165 |
+
# Initialize agents
|
| 166 |
+
self._init_agents()
|
| 167 |
+
|
| 168 |
+
# Build the workflow graph
|
| 169 |
+
self.workflow = self._build_workflow()
|
| 170 |
+
|
| 171 |
+
logger.info("BrandAnalysisWorkflow v4.0 initialized successfully")
|
| 172 |
+
|
| 173 |
+
def _init_agents(self) -> None:
|
| 174 |
+
"""Initialize all agents with their configurations."""
|
| 175 |
+
agents_config = self.workflow_config.get("agents", {})
|
| 176 |
+
|
| 177 |
+
# 1. Content Preprocessor Agent (no LLM)
|
| 178 |
+
preprocessor_config = agents_config.get("preprocessor", {})
|
| 179 |
+
self.preprocessor = ContentPreprocessorAgent(
|
| 180 |
+
preprocessor_config,
|
| 181 |
+
self.brand_config
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# 2. Relevance & Extraction Agent (LLM #1)
|
| 185 |
+
extraction_config = agents_config.get("relevance_extraction",
|
| 186 |
+
agents_config.get("relevance_validator", {}) # Fallback to old config
|
| 187 |
+
)
|
| 188 |
+
self.extraction_agent = SabianRelevanceExtractionAgent(
|
| 189 |
+
extraction_config,
|
| 190 |
+
self.api_key,
|
| 191 |
+
self.brand_config,
|
| 192 |
+
self.analysis_categories
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# 3. Sentiment Analyzer Agent (LLM #2)
|
| 196 |
+
analyzer_config = agents_config.get("sentiment_analyzer",
|
| 197 |
+
agents_config.get("brand_analyzer", {}) # Fallback to old config
|
| 198 |
+
)
|
| 199 |
+
self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
|
| 200 |
+
analyzer_config,
|
| 201 |
+
self.api_key,
|
| 202 |
+
self.brand_config,
|
| 203 |
+
self.analysis_categories
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# 4. Output Validator Agent (no LLM)
|
| 207 |
+
validator_config = agents_config.get("output_validator", {})
|
| 208 |
+
self.output_validator = OutputValidatorAgent(
|
| 209 |
+
validator_config,
|
| 210 |
+
self.brand_config,
|
| 211 |
+
self.analysis_categories
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
logger.info("All 4 agents initialized")
|
| 215 |
+
|
| 216 |
+
def _build_workflow(self) -> StateGraph:
|
| 217 |
+
"""
|
| 218 |
+
Build the LangGraph workflow.
|
| 219 |
+
|
| 220 |
+
Flow:
|
| 221 |
+
preprocessing -> extraction -> (analysis if relevant) -> validation -> END
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
Compiled StateGraph workflow
|
| 225 |
+
"""
|
| 226 |
+
workflow = StateGraph(BrandAnalysisState)
|
| 227 |
+
|
| 228 |
+
# Add nodes
|
| 229 |
+
workflow.add_node("preprocessing", self._preprocessing_node)
|
| 230 |
+
workflow.add_node("extraction", self._extraction_node)
|
| 231 |
+
workflow.add_node("analysis", self._analysis_node)
|
| 232 |
+
workflow.add_node("validation", self._validation_node)
|
| 233 |
+
|
| 234 |
+
# Set entry point
|
| 235 |
+
workflow.set_entry_point("preprocessing")
|
| 236 |
+
|
| 237 |
+
# Define edges
|
| 238 |
+
# Preprocessing -> conditional routing
|
| 239 |
+
workflow.add_conditional_edges(
|
| 240 |
+
"preprocessing",
|
| 241 |
+
self._route_after_preprocessing,
|
| 242 |
+
{
|
| 243 |
+
"extract": "extraction",
|
| 244 |
+
"skip_to_validation": "validation"
|
| 245 |
+
}
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
# Extraction -> conditional routing
|
| 249 |
+
workflow.add_conditional_edges(
|
| 250 |
+
"extraction",
|
| 251 |
+
self._route_after_extraction,
|
| 252 |
+
{
|
| 253 |
+
"analyze": "analysis",
|
| 254 |
+
"skip_to_validation": "validation"
|
| 255 |
+
}
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
# Analysis -> validation
|
| 259 |
+
workflow.add_edge("analysis", "validation")
|
| 260 |
+
|
| 261 |
+
# Validation -> END
|
| 262 |
+
workflow.add_edge("validation", END)
|
| 263 |
+
|
| 264 |
+
return workflow.compile()
|
| 265 |
+
|
| 266 |
+
def _preprocessing_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 267 |
+
"""
|
| 268 |
+
Preprocessing node: HTML parsing, cleaning, language detection, keyword check.
|
| 269 |
+
"""
|
| 270 |
+
try:
|
| 271 |
+
input_data = {
|
| 272 |
+
"post_id": state.get("post_id"),
|
| 273 |
+
"post_content": state.get("post_content", ""),
|
| 274 |
+
"thread_title": state.get("thread_title"),
|
| 275 |
+
"thread_first_post": state.get("thread_first_post"),
|
| 276 |
+
"category_title": state.get("category_title"),
|
| 277 |
+
"category_topic": state.get("category_topic")
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
result = self.preprocessor.process(input_data)
|
| 281 |
+
|
| 282 |
+
if result.get("success", False):
|
| 283 |
+
# Content
|
| 284 |
+
state["cleaned_content"] = result.get("cleaned_content", "")
|
| 285 |
+
state["quoted_content"] = result.get("quoted_content")
|
| 286 |
+
state["has_quote"] = result.get("has_quote", False)
|
| 287 |
+
state["quoted_author"] = result.get("quoted_author")
|
| 288 |
+
state["raw_thread_context"] = result.get("raw_thread_context", "")
|
| 289 |
+
state["is_empty"] = result.get("is_empty", False)
|
| 290 |
+
state["original_content"] = result.get("original_content", state.get("post_content", ""))
|
| 291 |
+
|
| 292 |
+
# Language
|
| 293 |
+
state["detected_language"] = result.get("detected_language", "English")
|
| 294 |
+
state["language_code"] = result.get("language_code", "en")
|
| 295 |
+
state["is_english"] = result.get("is_english", True)
|
| 296 |
+
state["language_confidence"] = result.get("language_confidence", "low")
|
| 297 |
+
state["language_detection_skipped"] = result.get("language_detection_skipped", False)
|
| 298 |
+
|
| 299 |
+
# Relevance
|
| 300 |
+
state["preliminary_relevant"] = result.get("preliminary_relevant", False)
|
| 301 |
+
state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
|
| 302 |
+
state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
|
| 303 |
+
state["relevance_type"] = result.get("relevance_type", "none")
|
| 304 |
+
state["has_primary_keywords"] = result.get("has_primary_keywords", False)
|
| 305 |
+
|
| 306 |
+
# Detections
|
| 307 |
+
state["products_detected"] = result.get("products_detected", [])
|
| 308 |
+
state["competitors_detected"] = result.get("competitors_detected", [])
|
| 309 |
+
|
| 310 |
+
state["success"] = True
|
| 311 |
+
else:
|
| 312 |
+
error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
|
| 313 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 314 |
+
state["success"] = False
|
| 315 |
+
|
| 316 |
+
logger.debug(f"Preprocessing complete for post {state.get('post_id')}")
|
| 317 |
+
return state
|
| 318 |
+
|
| 319 |
+
except Exception as e:
|
| 320 |
+
error_msg = f"Preprocessing node error: {str(e)}"
|
| 321 |
+
logger.error(error_msg)
|
| 322 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 323 |
+
state["success"] = False
|
| 324 |
+
return state
|
| 325 |
+
|
| 326 |
+
def _extraction_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 327 |
+
"""
|
| 328 |
+
Extraction node: LLM-based relevance validation and fact extraction.
|
| 329 |
+
"""
|
| 330 |
+
try:
|
| 331 |
+
input_data = {
|
| 332 |
+
"cleaned_content": state.get("cleaned_content", ""),
|
| 333 |
+
"quoted_content": state.get("quoted_content"),
|
| 334 |
+
"raw_thread_context": state.get("raw_thread_context", ""),
|
| 335 |
+
"relevance_keywords_found": state.get("relevance_keywords_found", []),
|
| 336 |
+
"preliminary_relevant": state.get("preliminary_relevant", False),
|
| 337 |
+
"needs_relevance_validation": state.get("needs_relevance_validation", True),
|
| 338 |
+
"products_detected": state.get("products_detected", []),
|
| 339 |
+
"competitors_detected": state.get("competitors_detected", []),
|
| 340 |
+
"is_english": state.get("is_english", True),
|
| 341 |
+
"detected_language": state.get("detected_language", "English")
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
result = self.extraction_agent.process(input_data)
|
| 345 |
+
|
| 346 |
+
# Update state with extraction results
|
| 347 |
+
state["is_relevant"] = result.get("is_relevant", False)
|
| 348 |
+
state["relevance_confidence"] = result.get("relevance_confidence", "low")
|
| 349 |
+
state["relevance_reason"] = result.get("relevance_reason", "")
|
| 350 |
+
state["extraction_performed"] = result.get("extraction_performed", True)
|
| 351 |
+
|
| 352 |
+
# Extracted facts
|
| 353 |
+
state["products_mentioned"] = result.get("products_mentioned", [])
|
| 354 |
+
state["sabian_mention_context"] = result.get("sabian_mention_context")
|
| 355 |
+
state["author_role"] = result.get("author_role", "unknown")
|
| 356 |
+
state["competitors_mentioned"] = result.get("competitors_mentioned", [])
|
| 357 |
+
state["thread_context_summary"] = result.get("thread_context_summary", "")
|
| 358 |
+
|
| 359 |
+
if not result.get("success", False) and result.get("error"):
|
| 360 |
+
state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]
|
| 361 |
+
|
| 362 |
+
logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
|
| 363 |
+
return state
|
| 364 |
+
|
| 365 |
+
except Exception as e:
|
| 366 |
+
error_msg = f"Extraction node error: {str(e)}"
|
| 367 |
+
logger.error(error_msg)
|
| 368 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 369 |
+
state["is_relevant"] = False
|
| 370 |
+
state["relevance_confidence"] = "low"
|
| 371 |
+
return state
|
| 372 |
+
|
| 373 |
+
def _analysis_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 374 |
+
"""
|
| 375 |
+
Analysis node: Deep sentiment and intent analysis for relevant posts.
|
| 376 |
+
"""
|
| 377 |
+
try:
|
| 378 |
+
input_data = {
|
| 379 |
+
"cleaned_content": state.get("cleaned_content", ""),
|
| 380 |
+
"is_relevant": state.get("is_relevant", True),
|
| 381 |
+
"is_english": state.get("is_english", True),
|
| 382 |
+
"detected_language": state.get("detected_language", "English"),
|
| 383 |
+
"products_mentioned": state.get("products_mentioned", []),
|
| 384 |
+
"sabian_mention_context": state.get("sabian_mention_context"),
|
| 385 |
+
"author_role": state.get("author_role", "unknown"),
|
| 386 |
+
"competitors_mentioned": state.get("competitors_mentioned", []),
|
| 387 |
+
"thread_context_summary": state.get("thread_context_summary", "")
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
result = self.sentiment_analyzer.process(input_data)
|
| 391 |
+
|
| 392 |
+
if result.get("success", False):
|
| 393 |
+
# Sentiment
|
| 394 |
+
state["sentiment_level"] = result.get("sentiment_level")
|
| 395 |
+
state["emotion_type"] = result.get("emotion_type")
|
| 396 |
+
state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
|
| 397 |
+
state["sarcasm_detected"] = result.get("sarcasm_detected", False)
|
| 398 |
+
|
| 399 |
+
# Products
|
| 400 |
+
state["product_attributes"] = result.get("product_attributes", [])
|
| 401 |
+
|
| 402 |
+
# Competitive
|
| 403 |
+
state["competitor_products_owned"] = result.get("competitor_products_owned", [])
|
| 404 |
+
state["comparison_type"] = result.get("comparison_type")
|
| 405 |
+
|
| 406 |
+
# Journey
|
| 407 |
+
state["intents"] = result.get("intents", [])
|
| 408 |
+
state["purchase_stage"] = result.get("purchase_stage")
|
| 409 |
+
state["decision_drivers"] = result.get("decision_drivers", [])
|
| 410 |
+
state["pain_points"] = result.get("pain_points", [])
|
| 411 |
+
state["delight_factors"] = result.get("delight_factors", [])
|
| 412 |
+
|
| 413 |
+
# Notes
|
| 414 |
+
state["analysis_notes"] = result.get("analysis_notes", "")
|
| 415 |
+
state["analysis_skipped"] = result.get("analysis_skipped", False)
|
| 416 |
+
state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
|
| 417 |
+
else:
|
| 418 |
+
error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
|
| 419 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 420 |
+
|
| 421 |
+
logger.debug(f"Analysis complete for post {state.get('post_id')}")
|
| 422 |
+
return state
|
| 423 |
+
|
| 424 |
+
except Exception as e:
|
| 425 |
+
error_msg = f"Analysis node error: {str(e)}"
|
| 426 |
+
logger.error(error_msg)
|
| 427 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 428 |
+
return state
|
| 429 |
+
|
| 430 |
+
def _validation_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
|
| 431 |
+
"""
|
| 432 |
+
Validation node: Rule-based validation and anomaly detection.
|
| 433 |
+
"""
|
| 434 |
+
try:
|
| 435 |
+
result = self.output_validator.process(dict(state))
|
| 436 |
+
|
| 437 |
+
state["validation_passed"] = result.get("validation_passed", True)
|
| 438 |
+
state["validation_errors"] = result.get("validation_errors", [])
|
| 439 |
+
state["validation_warnings"] = result.get("validation_warnings", [])
|
| 440 |
+
state["validation_flags"] = result.get("validation_flags", [])
|
| 441 |
+
state["processing_status"] = result.get("processing_status", "completed")
|
| 442 |
+
|
| 443 |
+
# Set overall success
|
| 444 |
+
has_errors = len(state.get("processing_errors", [])) > 0
|
| 445 |
+
state["success"] = not has_errors or state.get("is_relevant") is not None
|
| 446 |
+
|
| 447 |
+
logger.debug(f"Validation complete: status={state['processing_status']}")
|
| 448 |
+
return state
|
| 449 |
+
|
| 450 |
+
except Exception as e:
|
| 451 |
+
error_msg = f"Validation node error: {str(e)}"
|
| 452 |
+
logger.error(error_msg)
|
| 453 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 454 |
+
state["validation_passed"] = False
|
| 455 |
+
state["processing_status"] = "validation_failed"
|
| 456 |
+
state["success"] = False
|
| 457 |
+
return state
|
| 458 |
+
|
| 459 |
+
def _route_after_preprocessing(self, state: BrandAnalysisState) -> str:
|
| 460 |
+
"""
|
| 461 |
+
Determine routing after preprocessing.
|
| 462 |
+
"""
|
| 463 |
+
# If empty content, skip to validation
|
| 464 |
+
if state.get("is_empty", False):
|
| 465 |
+
state["is_relevant"] = False
|
| 466 |
+
state["relevance_reason"] = "Empty content"
|
| 467 |
+
return "skip_to_validation"
|
| 468 |
+
|
| 469 |
+
# If not English, skip to validation
|
| 470 |
+
if not state.get("is_english", True):
|
| 471 |
+
state["is_relevant"] = False
|
| 472 |
+
state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
|
| 473 |
+
return "skip_to_validation"
|
| 474 |
+
|
| 475 |
+
# If no keywords found and no need for validation, skip
|
| 476 |
+
if (not state.get("preliminary_relevant", False) and
|
| 477 |
+
not state.get("needs_relevance_validation", False)):
|
| 478 |
+
state["is_relevant"] = False
|
| 479 |
+
state["relevance_reason"] = "No relevant keywords found"
|
| 480 |
+
return "skip_to_validation"
|
| 481 |
+
|
| 482 |
+
# Otherwise, go to extraction
|
| 483 |
+
return "extract"
|
| 484 |
+
|
| 485 |
+
def _route_after_extraction(self, state: BrandAnalysisState) -> str:
|
| 486 |
+
"""
|
| 487 |
+
Determine routing after extraction.
|
| 488 |
+
"""
|
| 489 |
+
if state.get("is_relevant", False):
|
| 490 |
+
return "analyze"
|
| 491 |
+
return "skip_to_validation"
|
| 492 |
+
|
| 493 |
+
def process_post(self, post_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 494 |
+
"""
|
| 495 |
+
Process a single forum post through the workflow.
|
| 496 |
+
|
| 497 |
+
Args:
|
| 498 |
+
post_data: Dictionary containing post data
|
| 499 |
+
|
| 500 |
+
Returns:
|
| 501 |
+
Dictionary with processed results
|
| 502 |
+
"""
|
| 503 |
+
try:
|
| 504 |
+
initial_state = {
|
| 505 |
+
"post_id": post_data.get("post_id"),
|
| 506 |
+
"thread_id": post_data.get("thread_id"),
|
| 507 |
+
"post_author_id": post_data.get("post_author_id"),
|
| 508 |
+
"post_content": post_data.get("post_content", ""),
|
| 509 |
+
"thread_title": post_data.get("thread_title"),
|
| 510 |
+
"thread_first_post": post_data.get("thread_first_post"),
|
| 511 |
+
"thread_started_at": post_data.get("thread_started_at"),
|
| 512 |
+
"category_title": post_data.get("category_title"),
|
| 513 |
+
"category_topic": post_data.get("category_topic"),
|
| 514 |
+
"post_created_at": post_data.get("post_created_at"),
|
| 515 |
+
"processing_errors": [],
|
| 516 |
+
"success": True
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
final_state = self.workflow.invoke(initial_state)
|
| 520 |
+
|
| 521 |
+
return dict(final_state)
|
| 522 |
+
|
| 523 |
+
except Exception as e:
|
| 524 |
+
logger.error(f"Workflow execution error: {str(e)}")
|
| 525 |
+
return {
|
| 526 |
+
**post_data,
|
| 527 |
+
"success": False,
|
| 528 |
+
"processing_errors": [str(e)],
|
| 529 |
+
"processing_status": "workflow_error"
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
def process_batch(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 533 |
+
"""
|
| 534 |
+
Process a batch of forum posts.
|
| 535 |
+
|
| 536 |
+
Args:
|
| 537 |
+
posts: List of post dictionaries
|
| 538 |
+
|
| 539 |
+
Returns:
|
| 540 |
+
List of processed post dictionaries
|
| 541 |
+
"""
|
| 542 |
+
results = []
|
| 543 |
+
total = len(posts)
|
| 544 |
+
|
| 545 |
+
for idx, post in enumerate(posts, 1):
|
| 546 |
+
logger.info(f"Processing post {idx}/{total} (ID: {post.get('post_id')})")
|
| 547 |
+
result = self.process_post(post)
|
| 548 |
+
results.append(result)
|
| 549 |
+
|
| 550 |
+
logger.info(f"Batch processing complete: {total} posts processed")
|
| 551 |
+
return results
|
processing_comments/.dockerignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**/__pycache__/
|
| 2 |
+
**/*.pyc
|
| 3 |
+
.git
|
| 4 |
+
.gitignore
|
| 5 |
+
.env
|
| 6 |
+
*.log
|
| 7 |
+
dist
|
| 8 |
+
build
|
processing_comments/LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
processing_comments/README.md
ADDED
|
@@ -0,0 +1,726 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Comment Processing with Agentic Workflow
|
| 2 |
+
|
| 3 |
+
A scalable, modular system for processing comments from multiple data sources using OpenAI API, LangChain, and LangGraph. The system performs language detection, translation, and context-aware sentiment analysis using an agentic workflow architecture.
|
| 4 |
+
|
| 5 |
+
## Data Sources Supported
|
| 6 |
+
|
| 7 |
+
- **Social Media Comments**: External platforms (Facebook, Instagram, YouTube, etc.)
|
| 8 |
+
- **Musora Internal Comments**: Comments from Musora internal applications
|
| 9 |
+
- **Extensible Architecture**: Easily add new data sources via configuration
|
| 10 |
+
|
| 11 |
+
## Features
|
| 12 |
+
|
| 13 |
+
- **Multi-Source Support**: Process comments from multiple data sources with a single codebase
|
| 14 |
+
- **Configuration-Driven**: Add new data sources without code changes
|
| 15 |
+
- **Parent Comment Context**: Automatically includes parent comment text for reply analysis
|
| 16 |
+
- **Modular Agent Architecture**: Extensible base classes for easy addition of new agents
|
| 17 |
+
- **Language Detection**: Hybrid approach using lingua library for fast English detection, with LLM fallback for non-English languages
|
| 18 |
+
- **Translation**: High-quality translation for non-English comments using OpenAI models
|
| 19 |
+
- **Context-Aware Sentiment Analysis**:
|
| 20 |
+
- Uses content description for context
|
| 21 |
+
- Includes parent comment text when analyzing replies
|
| 22 |
+
- Multi-label intent classification
|
| 23 |
+
- **LangGraph Workflow**: Flexible graph-based orchestration of agent operations
|
| 24 |
+
- **Snowflake Integration**: Seamless data fetching and storage with source-specific tables
|
| 25 |
+
- **Parallel Processing**: Multiprocessing support for high-performance batch processing
|
| 26 |
+
- **Dynamic Batch Sizing**: Intelligent batch size calculation based on workload and available resources
|
| 27 |
+
- **Independent Batch Execution**: Each batch processes and stores results independently
|
| 28 |
+
- **Comprehensive Logging**: Detailed logging for monitoring and debugging
|
| 29 |
+
- **Scalable Configuration**: Easy-to-modify sentiment categories and intents via JSON config
|
| 30 |
+
|
| 31 |
+
## Project Structure
|
| 32 |
+
|
| 33 |
+
```
|
| 34 |
+
musora-sentiment-analysis/
|
| 35 |
+
├── agents/
|
| 36 |
+
│ ├── __init__.py
|
| 37 |
+
│ ├── base_agent.py # Base class for all agents
|
| 38 |
+
│ ├── language_detection_agent.py # Language detection agent
|
| 39 |
+
│ ├── translation_agent.py # Translation agent
|
| 40 |
+
│ └── sentiment_analysis_agent.py # Sentiment analysis agent (parent context support)
|
| 41 |
+
├── workflow/
|
| 42 |
+
│ ├── __init__.py
|
| 43 |
+
│ └── comment_processor.py # LangGraph workflow orchestrator
|
| 44 |
+
├── sql/
|
| 45 |
+
│ ├── fetch_comments.sql # Query for social media comments (with parent join)
|
| 46 |
+
│ ├── fetch_musora_comments.sql # Query for Musora internal comments (with parent join)
|
| 47 |
+
│ ├── create_ml_features_table.sql # Schema for social media table (with parent fields)
|
| 48 |
+
│ ├── init_musora_table.sql # Initialize empty Musora table (run first!)
|
| 49 |
+
│ └── create_musora_ml_features_table.sql # Full Musora schema with views (optional)
|
| 50 |
+
├── config_files/
|
| 51 |
+
│ ├── data_sources_config.json # Data source configuration (NEW)
|
| 52 |
+
│ ├── sentiment_config.json # Configuration for agents and workflow
|
| 53 |
+
│ └── sentiment_analysis_config.json # Sentiment categories and intents
|
| 54 |
+
├── logs/ # Processing logs (auto-created)
|
| 55 |
+
├── LLM.py # LLM utility class
|
| 56 |
+
├── SnowFlakeConnection.py # Snowflake connection handler
|
| 57 |
+
├── main.py # Main execution script (multi-source support)
|
| 58 |
+
├── requirements.txt # Python dependencies
|
| 59 |
+
├── .env # Environment variables (not in git)
|
| 60 |
+
├── README.md # This file
|
| 61 |
+
└── CLAUDE.md # Detailed technical documentation
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Setup
|
| 65 |
+
|
| 66 |
+
### 1. Install Dependencies
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
pip install -r requirements.txt
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### 2. Configure Environment Variables
|
| 73 |
+
|
| 74 |
+
Ensure your `.env` file contains the required credentials:
|
| 75 |
+
|
| 76 |
+
```env
|
| 77 |
+
# Snowflake
|
| 78 |
+
SNOWFLAKE_USER=your_user
|
| 79 |
+
SNOWFLAKE_PASSWORD=your_password
|
| 80 |
+
SNOWFLAKE_ACCOUNT=your_account
|
| 81 |
+
SNOWFLAKE_ROLE=your_role
|
| 82 |
+
SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
|
| 83 |
+
SNOWFLAKE_WAREHOUSE=your_warehouse
|
| 84 |
+
SNOWFLAKE_SCHEMA=ML_FEATURES
|
| 85 |
+
|
| 86 |
+
# OpenAI
|
| 87 |
+
OPENAI_API_KEY=your_openai_key
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
### 3. Create Snowflake Tables
|
| 91 |
+
|
| 92 |
+
Run the SQL scripts to create the output tables:
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
# Execute the SQL files in Snowflake
|
| 96 |
+
# For social media comments (if not already exists)
|
| 97 |
+
sql/create_ml_features_table.sql
|
| 98 |
+
|
| 99 |
+
# For Musora internal comments - INITIAL SETUP (First time only)
|
| 100 |
+
# This creates the empty table structure
|
| 101 |
+
sql/init_musora_table.sql
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Note**: Run `init_musora_table.sql` before the first Musora comments processing run. After that, you can optionally run `create_musora_ml_features_table.sql` to create the additional views if needed.
|
| 105 |
+
|
| 106 |
+
## Usage
|
| 107 |
+
|
| 108 |
+
### Basic Usage (Process All Data Sources)
|
| 109 |
+
|
| 110 |
+
Process unprocessed comments from all enabled data sources:
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
python main.py
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
This will:
|
| 117 |
+
- Process all enabled data sources (social media and Musora comments)
|
| 118 |
+
- Fetch only comments that haven't been processed yet
|
| 119 |
+
- Process them through the workflow using parallel workers (CPU count - 2, max 5)
|
| 120 |
+
- Each batch processes and stores to Snowflake independently
|
| 121 |
+
- Append new results to the existing tables (no overwrite)
|
| 122 |
+
|
| 123 |
+
### Process Specific Data Source
|
| 124 |
+
|
| 125 |
+
Process only social media comments:
|
| 126 |
+
|
| 127 |
+
```bash
|
| 128 |
+
python main.py --data-source social_media
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
Process only Musora internal comments:
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
python main.py --data-source musora_comments
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
### Process Limited Number of Comments
|
| 138 |
+
|
| 139 |
+
Limit applies per data source:
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
# Process 100 comments from each enabled data source
|
| 143 |
+
python main.py --limit 100
|
| 144 |
+
|
| 145 |
+
# Process 100 comments from only Musora source
|
| 146 |
+
python main.py --limit 100 --data-source musora_comments
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
### Sequential Processing (Debug Mode)
|
| 150 |
+
|
| 151 |
+
For debugging purposes, use sequential processing:
|
| 152 |
+
|
| 153 |
+
```bash
|
| 154 |
+
python main.py --limit 100 --sequential
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
This processes all comments in a single batch, making it easier to debug issues.
|
| 158 |
+
|
| 159 |
+
### First Run for New Data Source
|
| 160 |
+
|
| 161 |
+
For the first run of Musora comments:
|
| 162 |
+
|
| 163 |
+
1. **First**: Run the initialization SQL script in Snowflake:
|
| 164 |
+
```sql
|
| 165 |
+
-- Execute in Snowflake
|
| 166 |
+
sql/init_musora_table.sql
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
2. **Then**: Run the processing with overwrite flag:
|
| 170 |
+
```bash
|
| 171 |
+
python main.py --overwrite --data-source musora_comments --limit 100
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
**Why two steps?**
|
| 175 |
+
- The fetch query checks for already-processed comments by querying the output table
|
| 176 |
+
- On first run, that table doesn't exist, causing an error
|
| 177 |
+
- The init script creates the empty table structure first
|
| 178 |
+
- Then processing can run normally
|
| 179 |
+
|
| 180 |
+
**Warning**: Overwrite will replace all existing data in the output table. Only use for initial table creation or when reprocessing from scratch.
|
| 181 |
+
|
| 182 |
+
### Custom Configuration File
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
python main.py --config path/to/custom_config.json
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Command-Line Arguments
|
| 189 |
+
|
| 190 |
+
- `--limit N`: Process only N comments per data source (default: 10000)
|
| 191 |
+
- `--overwrite`: Overwrite existing Snowflake table (default: append mode)
|
| 192 |
+
- `--config PATH`: Custom configuration file path
|
| 193 |
+
- `--sequential`: Use sequential processing instead of parallel (for debugging)
|
| 194 |
+
- `--data-source SOURCE`: Process only specific data source (e.g., social_media, musora_comments)
|
| 195 |
+
|
| 196 |
+
### Parallel Processing
|
| 197 |
+
|
| 198 |
+
The system uses multiprocessing to process comments in parallel:
|
| 199 |
+
|
| 200 |
+
**Worker Calculation**:
|
| 201 |
+
- Number of workers: `CPU count - 2` (max 5 workers)
|
| 202 |
+
- Leaves CPU cores available for system operations
|
| 203 |
+
- Example: 8-core system → 5 workers (capped at max)
|
| 204 |
+
|
| 205 |
+
**Dynamic Batch Sizing**:
|
| 206 |
+
- Batch size calculated as: `total_comments / num_workers`
|
| 207 |
+
- Minimum batch size: 20 comments
|
| 208 |
+
- Maximum batch size: 1000 comments
|
| 209 |
+
- Batches ≤ 20 comments are not split
|
| 210 |
+
|
| 211 |
+
**Independent Execution**:
|
| 212 |
+
- Each batch runs in a separate process
|
| 213 |
+
- Batches store to Snowflake immediately upon completion
|
| 214 |
+
- No waiting for all batches to complete
|
| 215 |
+
- Failed batches don't affect successful ones
|
| 216 |
+
|
| 217 |
+
**Performance**:
|
| 218 |
+
- Expected speedup: ~1.8-4.5x depending on number of workers
|
| 219 |
+
- Real-time progress reporting as batches complete
|
| 220 |
+
- Processing time and average per comment displayed in summary
|
| 221 |
+
|
| 222 |
+
### Incremental Processing
|
| 223 |
+
|
| 224 |
+
The pipeline is designed for incremental processing:
|
| 225 |
+
- **Automatic deduplication**: SQL query excludes comments already in `COMMENT_SENTIMENT_FEATURES`
|
| 226 |
+
- **Append-only by default**: New results are added without overwriting existing data
|
| 227 |
+
- **Failed comment retry**: Comments with `success=False` are not stored and will be retried in future runs
|
| 228 |
+
- **Run regularly**: Safe to run daily/weekly to process new comments
|
| 229 |
+
|
| 230 |
+
## Configuration
|
| 231 |
+
|
| 232 |
+
### Data Sources Configuration
|
| 233 |
+
|
| 234 |
+
The `config_files/data_sources_config.json` file defines available data sources:
|
| 235 |
+
|
| 236 |
+
```json
|
| 237 |
+
{
|
| 238 |
+
"data_sources": {
|
| 239 |
+
"social_media": {
|
| 240 |
+
"name": "Social Media Comments",
|
| 241 |
+
"enabled": true,
|
| 242 |
+
"sql_query_file": "sql/fetch_comments.sql",
|
| 243 |
+
"output_config": {
|
| 244 |
+
"table_name": "COMMENT_SENTIMENT_FEATURES",
|
| 245 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 246 |
+
"schema": "ML_FEATURES"
|
| 247 |
+
}
|
| 248 |
+
},
|
| 249 |
+
"musora_comments": {
|
| 250 |
+
"name": "Musora Internal Comments",
|
| 251 |
+
"enabled": true,
|
| 252 |
+
"sql_query_file": "sql/fetch_musora_comments.sql",
|
| 253 |
+
"output_config": {
|
| 254 |
+
"table_name": "MUSORA_COMMENT_SENTIMENT_FEATURES",
|
| 255 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 256 |
+
"schema": "ML_FEATURES"
|
| 257 |
+
},
|
| 258 |
+
"additional_fields": [
|
| 259 |
+
"PERMALINK_URL",
|
| 260 |
+
"THUMBNAIL_URL"
|
| 261 |
+
]
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
}
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
**To add a new data source**: Simply add a new entry to this config file and create the corresponding SQL query file.
|
| 268 |
+
|
| 269 |
+
### Agent Configuration
|
| 270 |
+
|
| 271 |
+
The `config_files/sentiment_config.json` file controls agent behavior:
|
| 272 |
+
|
| 273 |
+
```json
|
| 274 |
+
{
|
| 275 |
+
"agents": {
|
| 276 |
+
"language_detection": {
|
| 277 |
+
"model": "gpt-5-nano",
|
| 278 |
+
"temperature": 0.0,
|
| 279 |
+
"max_retries": 3
|
| 280 |
+
},
|
| 281 |
+
"translation": {
|
| 282 |
+
"model": "gpt-5-nano",
|
| 283 |
+
"temperature": 0.3,
|
| 284 |
+
"max_retries": 3
|
| 285 |
+
},
|
| 286 |
+
"sentiment_analysis": {
|
| 287 |
+
"model": "gpt-5-nano",
|
| 288 |
+
"temperature": 0.2,
|
| 289 |
+
"max_retries": 3
|
| 290 |
+
}
|
| 291 |
+
},
|
| 292 |
+
"workflow": {
|
| 293 |
+
"description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)",
|
| 294 |
+
"parallel_processing": {
|
| 295 |
+
"enabled": true,
|
| 296 |
+
"worker_calculation": "CPU count - 2, max 5 workers",
|
| 297 |
+
"min_batch_size": 20,
|
| 298 |
+
"max_batch_size": 1000
|
| 299 |
+
}
|
| 300 |
+
},
|
| 301 |
+
"snowflake": {
|
| 302 |
+
"output_table": "COMMENT_SENTIMENT_FEATURES",
|
| 303 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 304 |
+
"schema": "ML_FEATURES"
|
| 305 |
+
}
|
| 306 |
+
}
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
**Note**: Batch size is now calculated dynamically and no longer needs to be configured manually.
|
| 310 |
+
|
| 311 |
+
### Sentiment Categories Configuration
|
| 312 |
+
|
| 313 |
+
The `config_files/sentiment_analysis_config.json` file defines sentiment categories and intents (easily extensible):
|
| 314 |
+
|
| 315 |
+
```json
|
| 316 |
+
{
|
| 317 |
+
"sentiment_polarity": {
|
| 318 |
+
"categories": [
|
| 319 |
+
{"value": "very_positive", "label": "Very Positive", "description": "..."},
|
| 320 |
+
{"value": "positive", "label": "Positive", "description": "..."},
|
| 321 |
+
{"value": "neutral", "label": "Neutral", "description": "..."},
|
| 322 |
+
{"value": "negative", "label": "Negative", "description": "..."},
|
| 323 |
+
{"value": "very_negative", "label": "Very Negative", "description": "..."}
|
| 324 |
+
]
|
| 325 |
+
},
|
| 326 |
+
"intent": {
|
| 327 |
+
"categories": [
|
| 328 |
+
{"value": "praise", "label": "Praise", "description": "..."},
|
| 329 |
+
{"value": "question", "label": "Question", "description": "..."},
|
| 330 |
+
{"value": "request", "label": "Request", "description": "..."},
|
| 331 |
+
{"value": "feedback_negative", "label": "Negative Feedback", "description": "..."},
|
| 332 |
+
{"value": "suggestion", "label": "Suggestion", "description": "..."},
|
| 333 |
+
{"value": "humor_sarcasm", "label": "Humor/Sarcasm", "description": "..."},
|
| 334 |
+
{"value": "off_topic", "label": "Off Topic", "description": "..."},
|
| 335 |
+
{"value": "spam_selfpromo", "label": "Spam/Self-Promotion", "description": "..."}
|
| 336 |
+
]
|
| 337 |
+
},
|
| 338 |
+
"reply_policy": {
|
| 339 |
+
"requires_reply_intents": ["question", "request"],
|
| 340 |
+
"description": "Comments with these intents should be flagged for reply"
|
| 341 |
+
},
|
| 342 |
+
"intent_settings": {
|
| 343 |
+
"multi_label": true,
|
| 344 |
+
"description": "Intent can have multiple labels as a comment can express multiple intents"
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
```
|
| 348 |
+
|
| 349 |
+
## Adding New Agents
|
| 350 |
+
|
| 351 |
+
The system is designed for easy extensibility. To add a new agent:
|
| 352 |
+
|
| 353 |
+
### 1. Create Agent Class
|
| 354 |
+
|
| 355 |
+
```python
|
| 356 |
+
from agents.base_agent import BaseAgent
|
| 357 |
+
from typing import Dict, Any
|
| 358 |
+
|
| 359 |
+
class MyNewAgent(BaseAgent):
|
| 360 |
+
def __init__(self, config: Dict[str, Any], api_key: str):
|
| 361 |
+
super().__init__("MyNewAgent", config)
|
| 362 |
+
# Initialize your agent-specific components
|
| 363 |
+
|
| 364 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 365 |
+
# Validate input data
|
| 366 |
+
return True
|
| 367 |
+
|
| 368 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 369 |
+
# Implement your agent logic
|
| 370 |
+
return {"success": True, "result": "..."}
|
| 371 |
+
```
|
| 372 |
+
|
| 373 |
+
### 2. Update Workflow
|
| 374 |
+
|
| 375 |
+
Add the agent to `workflow/comment_processor.py`:
|
| 376 |
+
|
| 377 |
+
```python
|
| 378 |
+
# Add to CommentState TypedDict
|
| 379 |
+
new_agent_result: str
|
| 380 |
+
|
| 381 |
+
# Add node
|
| 382 |
+
workflow.add_node("my_new_agent", self._my_new_agent_node)
|
| 383 |
+
|
| 384 |
+
# Add edges
|
| 385 |
+
workflow.add_edge("translation", "my_new_agent")
|
| 386 |
+
workflow.add_edge("my_new_agent", END)
|
| 387 |
+
```
|
| 388 |
+
|
| 389 |
+
### 3. Update Configuration
|
| 390 |
+
|
| 391 |
+
Add agent config to `sentiment_config.json`:
|
| 392 |
+
|
| 393 |
+
```json
|
| 394 |
+
{
|
| 395 |
+
"agents": {
|
| 396 |
+
"my_new_agent": {
|
| 397 |
+
"name": "MyNewAgent",
|
| 398 |
+
"model": "gpt-4o-mini",
|
| 399 |
+
"temperature": 0.5,
|
| 400 |
+
"max_retries": 3
|
| 401 |
+
}
|
| 402 |
+
}
|
| 403 |
+
}
|
| 404 |
+
```
|
| 405 |
+
|
| 406 |
+
## Output Schema
|
| 407 |
+
|
| 408 |
+
### Social Media Comments Table
|
| 409 |
+
Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES`
|
| 410 |
+
|
| 411 |
+
### Musora Comments Table
|
| 412 |
+
Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES`
|
| 413 |
+
|
| 414 |
+
### Common Columns (Both Tables)
|
| 415 |
+
|
| 416 |
+
| Column | Type | Description |
|
| 417 |
+
|--------|------|-------------|
|
| 418 |
+
| COMMENT_SK | NUMBER | Surrogate key from source |
|
| 419 |
+
| COMMENT_ID | VARCHAR | Platform comment ID |
|
| 420 |
+
| ORIGINAL_TEXT | VARCHAR | Original comment text |
|
| 421 |
+
| **PARENT_COMMENT_ID** | **VARCHAR** | **ID of parent comment if this is a reply** |
|
| 422 |
+
| **PARENT_COMMENT_TEXT** | **VARCHAR** | **Text of parent comment for context** |
|
| 423 |
+
| DETECTED_LANGUAGE | VARCHAR | Detected language name |
|
| 424 |
+
| LANGUAGE_CODE | VARCHAR | ISO 639-1 code |
|
| 425 |
+
| IS_ENGLISH | BOOLEAN | Is comment in English |
|
| 426 |
+
| TRANSLATED_TEXT | VARCHAR | English translation |
|
| 427 |
+
| TRANSLATION_PERFORMED | BOOLEAN | Was translation performed |
|
| 428 |
+
| SENTIMENT_POLARITY | VARCHAR | Sentiment (very_positive, positive, neutral, negative, very_negative) |
|
| 429 |
+
| INTENT | VARCHAR | Multi-label intents (comma-separated) |
|
| 430 |
+
| REQUIRES_REPLY | BOOLEAN | Does comment need a response |
|
| 431 |
+
| SENTIMENT_CONFIDENCE | VARCHAR | Analysis confidence (high, medium, low) |
|
| 432 |
+
| PROCESSING_SUCCESS | BOOLEAN | Processing status |
|
| 433 |
+
| PROCESSED_AT | TIMESTAMP | Processing timestamp |
|
| 434 |
+
|
| 435 |
+
### Musora-Specific Additional Columns
|
| 436 |
+
|
| 437 |
+
| Column | Type | Description |
|
| 438 |
+
|--------|------|-------------|
|
| 439 |
+
| PERMALINK_URL | VARCHAR | Web URL path of the content |
|
| 440 |
+
| THUMBNAIL_URL | VARCHAR | Thumbnail URL of the content |
|
| 441 |
+
|
| 442 |
+
### Available Views
|
| 443 |
+
|
| 444 |
+
**Social Media:**
|
| 445 |
+
- `VW_COMMENTS_REQUIRING_REPLY`: Comments that need responses (includes parent comment info)
|
| 446 |
+
- `VW_SENTIMENT_DISTRIBUTION`: Sentiment and intent statistics by channel (includes reply comment count)
|
| 447 |
+
- `VW_NON_ENGLISH_COMMENTS`: Filtered view of non-English comments
|
| 448 |
+
|
| 449 |
+
**Musora:**
|
| 450 |
+
- `VW_MUSORA_COMMENTS_REQUIRING_REPLY`: Musora comments needing responses
|
| 451 |
+
- `VW_MUSORA_SENTIMENT_DISTRIBUTION`: Musora sentiment and intent statistics
|
| 452 |
+
- `VW_MUSORA_NON_ENGLISH_COMMENTS`: Non-English Musora comments
|
| 453 |
+
|
| 454 |
+
## Workflow Architecture
|
| 455 |
+
|
| 456 |
+
The system uses LangGraph to create a flexible, state-based workflow:
|
| 457 |
+
|
| 458 |
+
```
|
| 459 |
+
┌─────────────────────┐
|
| 460 |
+
│ Fetch Comments │
|
| 461 |
+
│ from Snowflake │
|
| 462 |
+
│ (Unprocessed Only) │
|
| 463 |
+
└──────────┬──────────┘
|
| 464 |
+
│
|
| 465 |
+
▼
|
| 466 |
+
┌─────────────────────┐
|
| 467 |
+
│ Language Detection │
|
| 468 |
+
│ Agent │
|
| 469 |
+
└──────────┬──────────┘
|
| 470 |
+
│
|
| 471 |
+
▼
|
| 472 |
+
┌────┴────┐
|
| 473 |
+
│ English?│
|
| 474 |
+
└────┬────┘
|
| 475 |
+
│
|
| 476 |
+
┌─────┴─────┐
|
| 477 |
+
│ │
|
| 478 |
+
Yes No
|
| 479 |
+
│ │
|
| 480 |
+
│ ▼
|
| 481 |
+
│ ┌─────────────┐
|
| 482 |
+
│ │ Translation │
|
| 483 |
+
│ │ Agent │
|
| 484 |
+
│ └──────┬──────┘
|
| 485 |
+
│ │
|
| 486 |
+
└─────┬─────┘
|
| 487 |
+
│
|
| 488 |
+
▼
|
| 489 |
+
┌──────────────────┐
|
| 490 |
+
│ Sentiment │
|
| 491 |
+
│ Analysis Agent │
|
| 492 |
+
└─────────┬────────┘
|
| 493 |
+
│
|
| 494 |
+
▼
|
| 495 |
+
┌──────────────┐
|
| 496 |
+
│Store Results │
|
| 497 |
+
│to Snowflake │
|
| 498 |
+
│(Append Mode) │
|
| 499 |
+
└──────────────┘
|
| 500 |
+
```
|
| 501 |
+
|
| 502 |
+
**Note**: The fetch step automatically excludes comments already present in `COMMENT_SENTIMENT_FEATURES`, enabling incremental processing.
|
| 503 |
+
|
| 504 |
+
## Logging
|
| 505 |
+
|
| 506 |
+
Logs are automatically created in the `logs/` directory with timestamps:
|
| 507 |
+
|
| 508 |
+
```
|
| 509 |
+
logs/comment_processing_20251001_143022.log
|
| 510 |
+
```
|
| 511 |
+
|
| 512 |
+
## Adding New Data Sources
|
| 513 |
+
|
| 514 |
+
The system is designed to make adding new data sources easy:
|
| 515 |
+
|
| 516 |
+
### Steps to Add a New Source:
|
| 517 |
+
|
| 518 |
+
1. **Update Configuration** (`config_files/data_sources_config.json`):
|
| 519 |
+
```json
|
| 520 |
+
"your_new_source": {
|
| 521 |
+
"name": "Your New Source Name",
|
| 522 |
+
"enabled": true,
|
| 523 |
+
"sql_query_file": "sql/fetch_your_source.sql",
|
| 524 |
+
"output_config": {
|
| 525 |
+
"table_name": "YOUR_SOURCE_SENTIMENT_FEATURES",
|
| 526 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 527 |
+
"schema": "ML_FEATURES"
|
| 528 |
+
},
|
| 529 |
+
"additional_fields": ["FIELD1", "FIELD2"] // Optional
|
| 530 |
+
}
|
| 531 |
+
```
|
| 532 |
+
|
| 533 |
+
2. **Create SQL Query File** (`sql/fetch_your_source.sql`):
|
| 534 |
+
- Fetch comments with consistent column names
|
| 535 |
+
- Include self-join for parent comments if available
|
| 536 |
+
- Exclude already-processed comments (LEFT JOIN with output table)
|
| 537 |
+
|
| 538 |
+
3. **Create Table Initialization Script** (`sql/init_your_source_table.sql`):
|
| 539 |
+
- Creates empty table structure
|
| 540 |
+
- Base the schema on `init_musora_table.sql`
|
| 541 |
+
- Add source-specific fields as needed
|
| 542 |
+
- **Run this in Snowflake FIRST before processing**
|
| 543 |
+
|
| 544 |
+
4. **Create Full Schema** (optional):
|
| 545 |
+
- Base the schema on `create_musora_ml_features_table.sql`
|
| 546 |
+
- Include views and indexes
|
| 547 |
+
|
| 548 |
+
5. **Run First Time**:
|
| 549 |
+
```bash
|
| 550 |
+
# Step 1: Run init script in Snowflake
|
| 551 |
+
sql/init_your_source_table.sql
|
| 552 |
+
|
| 553 |
+
# Step 2: Process first batch
|
| 554 |
+
python main.py --overwrite --data-source your_new_source --limit 100
|
| 555 |
+
```
|
| 556 |
+
|
| 557 |
+
**No code changes required!**
|
| 558 |
+
|
| 559 |
+
## Best Practices
|
| 560 |
+
|
| 561 |
+
1. **Testing**: Always test with `--limit` flag first (e.g., `--limit 100`)
|
| 562 |
+
2. **New Data Sources**: Test new sources with `--sequential --limit 100` first
|
| 563 |
+
3. **Debugging**: Use `--sequential` flag for easier debugging of processing issues
|
| 564 |
+
4. **Incremental Processing**: Run regularly without `--overwrite` to process only new comments
|
| 565 |
+
5. **Monitoring**: Check logs for processing errors and batch completion
|
| 566 |
+
6. **Performance**: Use default parallel mode for production workloads
|
| 567 |
+
7. **Extensibility**: Follow the base agent pattern for consistency
|
| 568 |
+
8. **Error Handling**: All agents include robust error handling
|
| 569 |
+
9. **Failed Comments**: Review logs for failed comments - they'll be automatically retried in future runs
|
| 570 |
+
10. **Resource Management**: System automatically adapts to available CPU resources
|
| 571 |
+
11. **Parent Comments**: Ensure SQL queries include parent comment joins for best accuracy
|
| 572 |
+
|
| 573 |
+
## Sentiment Analysis Features
|
| 574 |
+
|
| 575 |
+
### Multi-Label Intent Classification
|
| 576 |
+
|
| 577 |
+
The system supports **multi-label intent classification**, meaning a single comment can have multiple intents:
|
| 578 |
+
|
| 579 |
+
- **Example**: "This is amazing! What scale are you using?" → `["praise", "question"]`
|
| 580 |
+
- **Example**: "Love this but can you make a tutorial on it?" → `["praise", "request"]`
|
| 581 |
+
|
| 582 |
+
### Context-Aware Analysis with Parent Comment Support
|
| 583 |
+
|
| 584 |
+
The sentiment analysis agent provides rich context understanding:
|
| 585 |
+
|
| 586 |
+
1. **Content Context**: Uses the `content_description` field to understand what the comment is about
|
| 587 |
+
2. **Parent Comment Context** (NEW): When analyzing reply comments, the system:
|
| 588 |
+
- Automatically detects when a comment is a reply
|
| 589 |
+
- Fetches the parent comment text from the database
|
| 590 |
+
- Includes parent comment in the LLM prompt
|
| 591 |
+
- Explicitly instructs the LLM that this is a reply comment
|
| 592 |
+
- Results in more accurate sentiment and intent classification
|
| 593 |
+
|
| 594 |
+
**Example**:
|
| 595 |
+
- Parent Comment: "Does anyone know how to play this riff?"
|
| 596 |
+
- Reply Comment: "Yes!"
|
| 597 |
+
- Without parent context: Might be classified as unclear/off-topic
|
| 598 |
+
- With parent context: Correctly classified as answering a question
|
| 599 |
+
|
| 600 |
+
This dramatically improves accuracy for:
|
| 601 |
+
- Short reply comments ("Yes", "Thanks!", "Agreed")
|
| 602 |
+
- Sarcastic replies (context crucial for understanding)
|
| 603 |
+
- Continuation of discussions
|
| 604 |
+
- Agreement/disagreement comments
|
| 605 |
+
|
| 606 |
+
### Failure Handling & Reprocessing
|
| 607 |
+
|
| 608 |
+
Comments that fail sentiment analysis (missing critical fields like sentiment_polarity or intents) are:
|
| 609 |
+
- Marked as `success=False` in the workflow
|
| 610 |
+
- **NOT stored in Snowflake**
|
| 611 |
+
- **Automatically available for reprocessing** in future runs
|
| 612 |
+
|
| 613 |
+
This ensures only successfully processed comments are stored, while failed comments remain available for retry.
|
| 614 |
+
|
| 615 |
+
### Incremental Processing & Deduplication
|
| 616 |
+
|
| 617 |
+
The pipeline automatically handles incremental processing:
|
| 618 |
+
- **SQL-level deduplication**: Query excludes comments already in `COMMENT_SENTIMENT_FEATURES` using `LEFT JOIN`
|
| 619 |
+
- **Automatic retry**: Failed comments (not stored) are automatically retried on next run
|
| 620 |
+
- **Append-only mode**: Default behavior appends new records without overwriting
|
| 621 |
+
- **Production-ready**: Safe to run daily/weekly/monthly to process new comments
|
| 622 |
+
|
| 623 |
+
### Scalable Configuration
|
| 624 |
+
|
| 625 |
+
To add or modify sentiment categories or intents:
|
| 626 |
+
|
| 627 |
+
1. Edit `config_files/sentiment_analysis_config.json`
|
| 628 |
+
2. Add/modify categories in the `sentiment_polarity` or `intent` sections
|
| 629 |
+
3. Update `reply_policy.requires_reply_intents` if needed
|
| 630 |
+
4. No code changes required!
|
| 631 |
+
|
| 632 |
+
## Future Extensions
|
| 633 |
+
|
| 634 |
+
The modular architecture supports easy addition of:
|
| 635 |
+
|
| 636 |
+
- Topic classification agent
|
| 637 |
+
- Entity extraction agent
|
| 638 |
+
- Engagement score prediction agent
|
| 639 |
+
- Named entity recognition agent
|
| 640 |
+
|
| 641 |
+
Simply create a new agent inheriting from `BaseAgent` and add it to the workflow graph.
|
| 642 |
+
|
| 643 |
+
## Troubleshooting
|
| 644 |
+
|
| 645 |
+
### Issue: "Object does not exist or not authorized" on First Run
|
| 646 |
+
|
| 647 |
+
**Error**: `Object 'SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES' does not exist or not authorized`
|
| 648 |
+
|
| 649 |
+
**Cause**: The fetch query tries to check for already-processed comments, but the output table doesn't exist yet on first run.
|
| 650 |
+
|
| 651 |
+
**Solution**:
|
| 652 |
+
1. Run the initialization script first:
|
| 653 |
+
```sql
|
| 654 |
+
-- Execute in Snowflake
|
| 655 |
+
sql/init_musora_table.sql
|
| 656 |
+
```
|
| 657 |
+
2. Then run the processing:
|
| 658 |
+
```bash
|
| 659 |
+
python main.py --overwrite --data-source musora_comments --limit 100
|
| 660 |
+
```
|
| 661 |
+
|
| 662 |
+
### Issue: API Rate Limits
|
| 663 |
+
|
| 664 |
+
If hitting API rate limits, reduce the number of parallel workers or process fewer comments:
|
| 665 |
+
```bash
|
| 666 |
+
# Process fewer comments at a time
|
| 667 |
+
python main.py --limit 500
|
| 668 |
+
|
| 669 |
+
# Or use sequential mode
|
| 670 |
+
python main.py --sequential --limit 100
|
| 671 |
+
```
|
| 672 |
+
|
| 673 |
+
### Issue: Memory Issues
|
| 674 |
+
|
| 675 |
+
Process in smaller batches using `--limit`:
|
| 676 |
+
```bash
|
| 677 |
+
python main.py --limit 500
|
| 678 |
+
```
|
| 679 |
+
|
| 680 |
+
### Issue: Debugging Processing Errors
|
| 681 |
+
|
| 682 |
+
Use sequential mode to debug issues more easily:
|
| 683 |
+
```bash
|
| 684 |
+
python main.py --sequential --limit 50
|
| 685 |
+
```
|
| 686 |
+
|
| 687 |
+
This processes all comments in a single batch with clearer error messages.
|
| 688 |
+
|
| 689 |
+
### Issue: Connection Timeouts
|
| 690 |
+
|
| 691 |
+
Check Snowflake credentials in `.env` and network connectivity.
|
| 692 |
+
|
| 693 |
+
### Issue: Parallel Processing Not Working
|
| 694 |
+
|
| 695 |
+
If multiprocessing issues occur, use sequential mode:
|
| 696 |
+
```bash
|
| 697 |
+
python main.py --sequential
|
| 698 |
+
```
|
| 699 |
+
|
| 700 |
+
## Performance
|
| 701 |
+
|
| 702 |
+
### Expected Speedup
|
| 703 |
+
|
| 704 |
+
Parallel processing provides significant performance improvements:
|
| 705 |
+
|
| 706 |
+
- **Sequential**: 1x (baseline)
|
| 707 |
+
- **2 workers**: ~1.8-1.9x faster
|
| 708 |
+
- **5 workers**: ~4-4.5x faster
|
| 709 |
+
|
| 710 |
+
Speedup isn't perfectly linear due to:
|
| 711 |
+
- Snowflake connection overhead
|
| 712 |
+
- LLM API rate limits (shared across workers)
|
| 713 |
+
- I/O operations
|
| 714 |
+
|
| 715 |
+
### Monitoring Performance
|
| 716 |
+
|
| 717 |
+
The processing summary includes:
|
| 718 |
+
- Total processing time
|
| 719 |
+
- Average time per comment
|
| 720 |
+
- Number of workers used
|
| 721 |
+
- Batch size calculations
|
| 722 |
+
- Failed batches (if any)
|
| 723 |
+
|
| 724 |
+
## License
|
| 725 |
+
|
| 726 |
+
Internal use only - Musora sentiment analysis project.
|
processing_comments/SnowFlakeConnection.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Snowflake connection helper.

Opens a Snowpark Session from SNOWFLAKE_* environment variables (loaded via
.env) and provides small wrappers to run read queries, write pandas
DataFrames, and execute ad-hoc SQL.
"""
import json
import os
from snowflake.snowpark import Session
from dotenv import load_dotenv
import logging

logger = logging.getLogger()
load_dotenv()


class SnowFlakeConn:
    """Thin wrapper around a Snowpark Session configured from environment variables."""

    def __init__(self):
        # The session is opened eagerly at construction time; credentials are
        # read from the environment (populated by load_dotenv() above).
        self.session = self.connect_to_snowflake()

    # =========================================================
    def connect_to_snowflake(self):
        """
        Build and open a Snowpark Session from SNOWFLAKE_* env vars.

        :return: An open ``snowflake.snowpark.Session``.
        :raises Exception: Propagates any connection error from Snowpark.
        """
        # --- Snowflake connection via env vars ---
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        return session

    # =========================================================
    def get_credential(self, key):
        """Return the environment variable ``key`` (None if unset)."""
        return os.getenv(key)

    # =========================================================
    def run_read_query(self, query, data):
        """
        Execute a read-only SQL query on Snowflake.

        :param query: SQL query string to execute.
        :param data: Human-readable name of the data/table being read (used
            for logging only).
        :return: Pandas DataFrame with lower-cased column names, or None if
            the query failed.
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Normalize column names to lower case for downstream consumers.
            dataframe.columns = dataframe.columns.str.lower()
            print(f"reading {data} table successfully")
            return dataframe
        except Exception as e:
            # Fixed copy-paste bug: the previous message said
            # "Error in creating/updating table" for a read failure.
            print(f"Error reading {data} table: {e}")
            # Explicitly return None so the failure path is visible to callers.
            return None

    # =========================================================
    def store_df_to_snowflake(self, table_name, dataframe, database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False):
        """
        Write a pandas DataFrame to a Snowflake table.

        :param table_name: Target table name (normalized to upper case).
        :param dataframe: Pandas DataFrame to persist.
        :param database: Target database (default SOCIAL_MEDIA_DB).
        :param schema: Target schema (default ML_FEATURES).
        :param overwrite: If True, replace existing table contents;
            otherwise append.
        :return: None
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            # Snowflake identifiers are upper case by default; reset the index
            # so write_pandas does not persist a stale pandas index.
            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(df=dataframe,
                                      table_name=table_name.strip().upper(),
                                      auto_create_table=True,
                                      overwrite=overwrite,
                                      use_logical_type=True)
            print(f"Data inserted into {table_name} successfully.")

        except Exception as e:
            print(f"Error in creating/updating/inserting table: {e}")

    # =========================================================
    def execute_sql_file(self, file_path):
        """
        Execute the SQL contained in a file.

        NOTE(review): the file content is submitted as a single statement;
        files containing multiple ';'-separated statements will likely fail —
        confirm against how the init scripts are structured.

        :param file_path: Path to the SQL file.
        :return: Query result rows, or None on error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            print(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            print(f"Error executing SQL file {file_path}: {e}")
            return None

    # =========================================================
    def execute_query(self, query, description="query"):
        """
        Execute an arbitrary SQL query and return its results.

        :param query: SQL query string.
        :param description: Short description of the query for logging.
        :return: Query result rows, or None on error.
        """
        try:
            result = self.session.sql(query).collect()
            print(f"Successfully executed {description}")
            return result
        except Exception as e:
            print(f"Error executing {description}: {e}")
            return None

    # =========================================================
    def get_data(self, data):
        # Placeholder: fetch arbitrary data (comments, contents, etc.) based
        # on the requested kind. Not yet implemented.
        pass

    # =========================================================
    def close_connection(self):
        """Close the underlying Snowpark session."""
        self.session.close()
processing_comments/agents/README.md
ADDED
|
@@ -0,0 +1,1571 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agents Architecture Documentation
|
| 2 |
+
|
| 3 |
+
## Table of Contents
|
| 4 |
+
- [Overview](#overview)
|
| 5 |
+
- [Agent Architecture](#agent-architecture)
|
| 6 |
+
- [Existing Agents](#existing-agents)
|
| 7 |
+
- [How Agents Work](#how-agents-work)
|
| 8 |
+
- [Adding New Agents](#adding-new-agents)
|
| 9 |
+
- [Modifying Existing Agents](#modifying-existing-agents)
|
| 10 |
+
- [Configuration System](#configuration-system)
|
| 11 |
+
- [Best Practices](#best-practices)
|
| 12 |
+
- [Troubleshooting](#troubleshooting)
|
| 13 |
+
|
| 14 |
+
## Overview
|
| 15 |
+
|
| 16 |
+
The agent system in this project is built on a modular, extensible architecture that processes social media comments through a series of specialized agents. Each agent performs a specific task (language detection, translation, sentiment analysis) and is orchestrated through a LangGraph workflow.
|
| 17 |
+
|
| 18 |
+
### Key Design Principles
|
| 19 |
+
|
| 20 |
+
1. **Modularity**: Each agent handles a single responsibility
|
| 21 |
+
2. **Extensibility**: Easy to add new agents without modifying existing code
|
| 22 |
+
3. **Consistency**: All agents inherit from a common base class
|
| 23 |
+
4. **Configuration-Driven**: Agent behavior controlled through JSON config files
|
| 24 |
+
5. **Error Resilience**: Robust error handling at every level
|
| 25 |
+
|
| 26 |
+
### Technology Stack
|
| 27 |
+
|
| 28 |
+
- **LangChain**: For LLM interactions and agent framework
|
| 29 |
+
- **LangGraph**: For workflow orchestration
|
| 30 |
+
- **OpenAI API**: LLM backend for NLP tasks
|
| 31 |
+
- **Lingua**: Fast language detection library
|
| 32 |
+
- **Python 3.x**: Core language
|
| 33 |
+
|
| 34 |
+
## Agent Architecture
|
| 35 |
+
|
| 36 |
+
### Directory Structure
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
agents/
|
| 40 |
+
├── __init__.py # Module exports
|
| 41 |
+
├── base_agent.py # Abstract base class
|
| 42 |
+
├── language_detection_agent.py # Language detection agent
|
| 43 |
+
├── translation_agent.py # Translation agent
|
| 44 |
+
├── sentiment_analysis_agent.py # Sentiment analysis agent
|
| 45 |
+
└── README.md # This file
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Base Agent Class
|
| 49 |
+
|
| 50 |
+
All agents inherit from `BaseAgent` (`base_agent.py`), which provides:
|
| 51 |
+
|
| 52 |
+
```python
|
| 53 |
+
class BaseAgent(ABC):
|
| 54 |
+
"""Abstract base class for all agents"""
|
| 55 |
+
|
| 56 |
+
# Common attributes
|
| 57 |
+
- name: str # Agent name
|
| 58 |
+
- config: Dict[str, Any] # Configuration dictionary
|
| 59 |
+
- model: str # LLM model to use
|
| 60 |
+
- temperature: float # LLM temperature
|
| 61 |
+
- max_retries: int # Maximum retry attempts
|
| 62 |
+
|
| 63 |
+
# Abstract methods (must be implemented)
|
| 64 |
+
@abstractmethod
|
| 65 |
+
def process(input_data: Dict) -> Dict
|
| 66 |
+
@abstractmethod
|
| 67 |
+
def validate_input(input_data: Dict) -> bool
|
| 68 |
+
|
| 69 |
+
# Common methods (inherited)
|
| 70 |
+
def get_name() -> str
|
| 71 |
+
def get_config() -> Dict
|
| 72 |
+
def log_processing(message: str, level: str)
|
| 73 |
+
def handle_error(error: Exception, context: str) -> Dict
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### Workflow Integration
|
| 77 |
+
|
| 78 |
+
Agents are orchestrated through `workflow/comment_processor.py` using LangGraph:
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
┌─────────────────────┐
|
| 82 |
+
│ Language Detection │
|
| 83 |
+
│ Agent │
|
| 84 |
+
└──────────┬──────────┘
|
| 85 |
+
│
|
| 86 |
+
▼
|
| 87 |
+
┌────┴────┐
|
| 88 |
+
│ English?│
|
| 89 |
+
└────┬────┘
|
| 90 |
+
│
|
| 91 |
+
┌─────┴─────┐
|
| 92 |
+
│ │
|
| 93 |
+
Yes No
|
| 94 |
+
│ │
|
| 95 |
+
│ ▼
|
| 96 |
+
│ ┌─────────────┐
|
| 97 |
+
│ │ Translation │
|
| 98 |
+
│ │ Agent │
|
| 99 |
+
│ └──────┬──────┘
|
| 100 |
+
│ │
|
| 101 |
+
└─────┬─────┘
|
| 102 |
+
│
|
| 103 |
+
▼
|
| 104 |
+
┌──────────────────┐
|
| 105 |
+
│ Sentiment │
|
| 106 |
+
│ Analysis Agent │
|
| 107 |
+
└──────────────────┘
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
## Existing Agents
|
| 111 |
+
|
| 112 |
+
### 1. Language Detection Agent
|
| 113 |
+
|
| 114 |
+
**File**: `language_detection_agent.py`
|
| 115 |
+
|
| 116 |
+
**Purpose**: Detects the language of comment text using a hybrid approach.
|
| 117 |
+
|
| 118 |
+
**Strategy**:
|
| 119 |
+
- Uses **Lingua library** for fast English detection
|
| 120 |
+
- Falls back to **LLM** for non-English languages (higher accuracy)
|
| 121 |
+
- Returns language name, ISO code, and confidence level
|
| 122 |
+
|
| 123 |
+
**Key Methods**:
|
| 124 |
+
```python
|
| 125 |
+
def detect_with_lingua(text: str) -> tuple[str, str, bool]
|
| 126 |
+
# Fast detection using lingua library
|
| 127 |
+
# Returns: (language_code, language_name, is_english)
|
| 128 |
+
|
| 129 |
+
def detect_with_llm(text: str) -> Dict[str, Any]
|
| 130 |
+
# LLM-based detection for nuanced analysis
|
| 131 |
+
# Returns: {language, language_code, confidence, has_text}
|
| 132 |
+
|
| 133 |
+
def process(input_data: Dict) -> Dict
|
| 134 |
+
# Main processing: lingua first, LLM if not English
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
**Configuration** (`sentiment_config.json`):
|
| 138 |
+
```json
|
| 139 |
+
{
|
| 140 |
+
"language_detection": {
|
| 141 |
+
"model": "gpt-5-nano",
|
| 142 |
+
"temperature": 0.0,
|
| 143 |
+
"max_retries": 3
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
**Input Requirements**:
|
| 149 |
+
- `comment_text`: str
|
| 150 |
+
|
| 151 |
+
**Output**:
|
| 152 |
+
- `language`: str (e.g., "English", "Spanish")
|
| 153 |
+
- `language_code`: str (ISO 639-1, e.g., "en", "es")
|
| 154 |
+
- `is_english`: bool
|
| 155 |
+
- `confidence`: str ("high", "medium", "low")
|
| 156 |
+
- `detection_method`: str ("lingua", "llm", "default")
|
| 157 |
+
- `has_text`: bool
|
| 158 |
+
|
| 159 |
+
### 2. Translation Agent
|
| 160 |
+
|
| 161 |
+
**File**: `translation_agent.py`
|
| 162 |
+
|
| 163 |
+
**Purpose**: Translates non-English comments to English using LLM.
|
| 164 |
+
|
| 165 |
+
**Strategy**:
|
| 166 |
+
- Skips translation if already English
|
| 167 |
+
- Uses LLM for context-aware, high-quality translation
|
| 168 |
+
- Preserves tone, intent, emojis, and special characters
|
| 169 |
+
- Specialized for music/education social media content
|
| 170 |
+
|
| 171 |
+
**Key Methods**:
|
| 172 |
+
```python
|
| 173 |
+
def translate_text(text: str, source_language: str) -> Dict
|
| 174 |
+
# LLM-based translation with context preservation
|
| 175 |
+
# Returns: {translated_text, translation_confidence, notes}
|
| 176 |
+
|
| 177 |
+
def process(input_data: Dict) -> Dict
|
| 178 |
+
# Main processing: checks is_english, translates if needed
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
**Configuration**:
|
| 182 |
+
```json
|
| 183 |
+
{
|
| 184 |
+
"translation": {
|
| 185 |
+
"model": "gpt-5-nano",
|
| 186 |
+
"temperature": 0.3,
|
| 187 |
+
"max_retries": 3
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
**Input Requirements**:
|
| 193 |
+
- `comment_text`: str
|
| 194 |
+
- `is_english`: bool
|
| 195 |
+
- `language`: str (optional, for context)
|
| 196 |
+
|
| 197 |
+
**Output**:
|
| 198 |
+
- `translated_text`: str
|
| 199 |
+
- `translation_performed`: bool
|
| 200 |
+
- `translation_confidence`: str
|
| 201 |
+
- `translation_notes`: str
|
| 202 |
+
|
| 203 |
+
### 3. Sentiment Analysis Agent
|
| 204 |
+
|
| 205 |
+
**File**: `sentiment_analysis_agent.py`
|
| 206 |
+
|
| 207 |
+
**Purpose**: Analyzes sentiment polarity, intent, and determines if reply is needed.
|
| 208 |
+
|
| 209 |
+
**Strategy**:
|
| 210 |
+
- Uses content description for context
|
| 211 |
+
- Supports parent comment context for reply analysis
|
| 212 |
+
- Multi-label intent classification
|
| 213 |
+
- Differentiates genuine vs rhetorical/sarcastic questions
|
| 214 |
+
- Platform-aware analysis (YouTube, Facebook, Instagram)
|
| 215 |
+
|
| 216 |
+
**Key Features**:
|
| 217 |
+
- **Context-Aware**: Uses content description and parent comment
|
| 218 |
+
- **Multi-Label**: Can assign multiple intents to a single comment
|
| 219 |
+
- **Reply Policy**: Flags comments requiring responses
|
| 220 |
+
- **Rhetorical Detection**: Identifies sarcastic/rhetorical questions
|
| 221 |
+
|
| 222 |
+
**Key Methods**:
|
| 223 |
+
```python
|
| 224 |
+
def _build_context_string(
|
| 225 |
+
content_description: str,
|
| 226 |
+
parent_comment_text: str = None,
|
| 227 |
+
platform: str = None,
|
| 228 |
+
content_title: str = None
|
| 229 |
+
) -> str
|
| 230 |
+
# Builds context for LLM prompt
|
| 231 |
+
# Handles YouTube title+description vs other platforms
|
| 232 |
+
|
| 233 |
+
def analyze_sentiment(
|
| 234 |
+
comment_text: str,
|
| 235 |
+
content_description: str,
|
| 236 |
+
parent_comment_text: str = None,
|
| 237 |
+
platform: str = None,
|
| 238 |
+
content_title: str = None
|
| 239 |
+
) -> Dict
|
| 240 |
+
# Performs sentiment analysis with full context
|
| 241 |
+
# Returns: {sentiment_polarity, intent, requires_reply, confidence, analysis_notes}
|
| 242 |
+
|
| 243 |
+
def process(input_data: Dict) -> Dict
|
| 244 |
+
# Main processing: validates input, analyzes sentiment
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
**Configuration**:
|
| 248 |
+
Uses two config files:
|
| 249 |
+
|
| 250 |
+
1. **Agent Config** (`sentiment_config.json`):
|
| 251 |
+
```json
|
| 252 |
+
{
|
| 253 |
+
"sentiment_analysis": {
|
| 254 |
+
"model": "gpt-5-nano",
|
| 255 |
+
"temperature": 0.2,
|
| 256 |
+
"max_retries": 3
|
| 257 |
+
}
|
| 258 |
+
}
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
2. **Categories Config** (`sentiment_analysis_config.json`):
|
| 262 |
+
```json
|
| 263 |
+
{
|
| 264 |
+
"sentiment_polarity": {
|
| 265 |
+
"categories": [
|
| 266 |
+
{"value": "very_positive", "label": "Very Positive", "description": "..."},
|
| 267 |
+
{"value": "positive", "label": "Positive", "description": "..."},
|
| 268 |
+
{"value": "neutral", "label": "Neutral", "description": "..."},
|
| 269 |
+
{"value": "negative", "label": "Negative", "description": "..."},
|
| 270 |
+
{"value": "very_negative", "label": "Very Negative", "description": "..."}
|
| 271 |
+
]
|
| 272 |
+
},
|
| 273 |
+
"intent": {
|
| 274 |
+
"categories": [
|
| 275 |
+
{"value": "praise", "description": "..."},
|
| 276 |
+
{"value": "question", "description": "..."},
|
| 277 |
+
{"value": "request", "description": "..."},
|
| 278 |
+
{"value": "feedback_negative", "description": "..."},
|
| 279 |
+
{"value": "suggestion", "description": "..."},
|
| 280 |
+
{"value": "humor_sarcasm", "description": "..."},
|
| 281 |
+
{"value": "off_topic", "description": "..."},
|
| 282 |
+
{"value": "spam_selfpromo", "description": "..."}
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
"reply_policy": {
|
| 286 |
+
"requires_reply_intents": ["question", "request"],
|
| 287 |
+
"not_include": ["humor_sarcasm"]
|
| 288 |
+
}
|
| 289 |
+
}
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
**Input Requirements**:
|
| 293 |
+
- `comment_text`: str
|
| 294 |
+
- `content_description`: str
|
| 295 |
+
- `parent_comment_text`: str (optional)
|
| 296 |
+
- `platform`: str (optional, e.g., "youtube", "facebook")
|
| 297 |
+
- `content_title`: str (optional, mainly for YouTube)
|
| 298 |
+
|
| 299 |
+
**Output**:
|
| 300 |
+
- `sentiment_polarity`: str (one of: very_positive, positive, neutral, negative, very_negative)
|
| 301 |
+
- `intent`: str (comma-separated list, e.g., "praise, question")
|
| 302 |
+
- `requires_reply`: bool
|
| 303 |
+
- `sentiment_confidence`: str ("high", "medium", "low")
|
| 304 |
+
- `analysis_notes`: str (1-2 sentence summary)
|
| 305 |
+
- `success`: bool (False if critical fields missing)
|
| 306 |
+
|
| 307 |
+
### Common Patterns Across All Agents
|
| 308 |
+
|
| 309 |
+
1. **JSON Response Parsing**: All agents have a `_parse_llm_json_response()` method to handle markdown-wrapped JSON
|
| 310 |
+
2. **Error Handling**: All use `handle_error()` from base class
|
| 311 |
+
3. **Logging**: All use `log_processing()` for consistent logging
|
| 312 |
+
4. **Validation**: All implement `validate_input()` before processing
|
| 313 |
+
5. **State Preservation**: All preserve original input data in output
|
| 314 |
+
|
| 315 |
+
## How Agents Work
|
| 316 |
+
|
| 317 |
+
### Workflow Execution Flow
|
| 318 |
+
|
| 319 |
+
1. **Initialization** (`CommentProcessingWorkflow.__init__`):
|
| 320 |
+
```python
|
| 321 |
+
# Load configurations
|
| 322 |
+
lang_detect_config = config["agents"]["language_detection"]
|
| 323 |
+
translation_config = config["agents"]["translation"]
|
| 324 |
+
sentiment_config = config["agents"]["sentiment_analysis"]
|
| 325 |
+
|
| 326 |
+
# Initialize agents
|
| 327 |
+
self.language_agent = LanguageDetectionAgent(lang_detect_config, api_key)
|
| 328 |
+
self.translation_agent = TranslationAgent(translation_config, api_key)
|
| 329 |
+
self.sentiment_agent = SentimentAnalysisAgent(sentiment_config, api_key, sentiment_categories)
|
| 330 |
+
|
| 331 |
+
# Build workflow graph
|
| 332 |
+
self.workflow = self._build_workflow()
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
2. **Workflow Graph** (`_build_workflow()`):
|
| 336 |
+
```python
|
| 337 |
+
workflow = StateGraph(CommentState)
|
| 338 |
+
|
| 339 |
+
# Add nodes (agents)
|
| 340 |
+
workflow.add_node("language_detection", self._language_detection_node)
|
| 341 |
+
workflow.add_node("translation", self._translation_node)
|
| 342 |
+
workflow.add_node("sentiment_analysis", self._sentiment_analysis_node)
|
| 343 |
+
|
| 344 |
+
# Define edges (control flow)
|
| 345 |
+
workflow.set_entry_point("language_detection")
|
| 346 |
+
workflow.add_conditional_edges(
|
| 347 |
+
"language_detection",
|
| 348 |
+
self._should_translate,
|
| 349 |
+
{"translate": "translation", "skip_translation": "sentiment_analysis"}
|
| 350 |
+
)
|
| 351 |
+
workflow.add_edge("translation", "sentiment_analysis")
|
| 352 |
+
workflow.add_edge("sentiment_analysis", END)
|
| 353 |
+
|
| 354 |
+
return workflow.compile()
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
3. **Node Execution** (Example: `_language_detection_node`):
|
| 358 |
+
```python
|
| 359 |
+
def _language_detection_node(self, state: CommentState) -> CommentState:
|
| 360 |
+
try:
|
| 361 |
+
# Prepare input
|
| 362 |
+
input_data = {"comment_text": state["comment_text"]}
|
| 363 |
+
|
| 364 |
+
# Process with agent
|
| 365 |
+
result = self.language_agent.process(input_data)
|
| 366 |
+
|
| 367 |
+
# Update state
|
| 368 |
+
if result.get("success", False):
|
| 369 |
+
state["language"] = result.get("language", "English")
|
| 370 |
+
state["language_code"] = result.get("language_code", "en")
|
| 371 |
+
state["is_english"] = result.get("is_english", True)
|
| 372 |
+
# ... more fields
|
| 373 |
+
else:
|
| 374 |
+
# Handle error, set defaults
|
| 375 |
+
state["processing_errors"].append(result.get("error"))
|
| 376 |
+
|
| 377 |
+
return state
|
| 378 |
+
except Exception as e:
|
| 379 |
+
# Error handling
|
| 380 |
+
state["processing_errors"].append(str(e))
|
| 381 |
+
return state
|
| 382 |
+
```
|
| 383 |
+
|
| 384 |
+
4. **Decision Points** (Example: `_should_translate`):
|
| 385 |
+
```python
|
| 386 |
+
def _should_translate(self, state: CommentState) -> str:
|
| 387 |
+
if state.get("is_english", True) or not state.get("has_text", True):
|
| 388 |
+
# Set defaults for skipped translation
|
| 389 |
+
state["translated_text"] = state["comment_text"]
|
| 390 |
+
state["translation_performed"] = False
|
| 391 |
+
return "skip_translation"
|
| 392 |
+
else:
|
| 393 |
+
return "translate"
|
| 394 |
+
```
|
| 395 |
+
|
| 396 |
+
5. **Comment Processing** (`process_comment()`):
|
| 397 |
+
```python
|
| 398 |
+
def process_comment(self, comment_data: Dict) -> Dict:
|
| 399 |
+
# Initialize state
|
| 400 |
+
initial_state = {
|
| 401 |
+
"comment_sk": comment_data.get("comment_sk"),
|
| 402 |
+
"comment_text": comment_data.get("comment_text"),
|
| 403 |
+
# ... all fields
|
| 404 |
+
"processing_errors": [],
|
| 405 |
+
"success": True
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
# Run workflow
|
| 409 |
+
final_state = self.workflow.invoke(initial_state)
|
| 410 |
+
|
| 411 |
+
# Merge and return
|
| 412 |
+
return dict(final_state)
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
### State Management
|
| 416 |
+
|
| 417 |
+
The workflow uses a `CommentState` TypedDict to pass data between agents:
|
| 418 |
+
|
| 419 |
+
```python
|
| 420 |
+
class CommentState(TypedDict):
|
| 421 |
+
# Input fields
|
| 422 |
+
comment_sk: int
|
| 423 |
+
comment_id: str
|
| 424 |
+
comment_text: str
|
| 425 |
+
# ... more fields
|
| 426 |
+
|
| 427 |
+
# Processing fields (populated by agents)
|
| 428 |
+
language: str
|
| 429 |
+
language_code: str
|
| 430 |
+
is_english: bool
|
| 431 |
+
translated_text: str
|
| 432 |
+
sentiment_polarity: str
|
| 433 |
+
intent: str
|
| 434 |
+
# ... more fields
|
| 435 |
+
|
| 436 |
+
# Metadata
|
| 437 |
+
processing_errors: Annotated[List[str], operator.add]
|
| 438 |
+
success: bool
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
### Error Handling Strategy
|
| 442 |
+
|
| 443 |
+
1. **Agent Level**: Each agent returns `{"success": False, "error": "..."}` on failure
|
| 444 |
+
2. **Node Level**: Nodes catch exceptions, set defaults, append to `processing_errors`
|
| 445 |
+
3. **Workflow Level**: Workflow continues even if an agent fails (graceful degradation)
|
| 446 |
+
4. **Critical Failures**: The sentiment agent marks `success=False` if critical fields are missing (the comment is not stored)
|
| 447 |
+
|
| 448 |
+
## Adding New Agents
|
| 449 |
+
|
| 450 |
+
### Step-by-Step Guide
|
| 451 |
+
|
| 452 |
+
#### Step 1: Create the Agent Class
|
| 453 |
+
|
| 454 |
+
Create a new file in the `agents/` directory (e.g., `topic_classification_agent.py`):
|
| 455 |
+
|
| 456 |
+
```python
|
| 457 |
+
"""
|
| 458 |
+
Topic Classification Agent
|
| 459 |
+
Extracts topics and themes from comments
|
| 460 |
+
"""
|
| 461 |
+
|
| 462 |
+
from typing import Dict, Any
|
| 463 |
+
import json
|
| 464 |
+
from langchain_openai import ChatOpenAI
|
| 465 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 466 |
+
from agents.base_agent import BaseAgent
|
| 467 |
+
import logging
|
| 468 |
+
|
| 469 |
+
logger = logging.getLogger(__name__)
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
class TopicClassificationAgent(BaseAgent):
|
| 473 |
+
"""
|
| 474 |
+
Agent that classifies comments into predefined topics/themes.
|
| 475 |
+
"""
|
| 476 |
+
|
| 477 |
+
def __init__(self, config: Dict[str, Any], api_key: str, topic_categories: Dict[str, Any]):
|
| 478 |
+
"""
|
| 479 |
+
Initialize the Topic Classification Agent.
|
| 480 |
+
|
| 481 |
+
Args:
|
| 482 |
+
config: Configuration dictionary
|
| 483 |
+
api_key: OpenAI API key
|
| 484 |
+
topic_categories: Dictionary with topic categories
|
| 485 |
+
"""
|
| 486 |
+
super().__init__("TopicClassificationAgent", config)
|
| 487 |
+
self.api_key = api_key
|
| 488 |
+
self.topic_categories = topic_categories
|
| 489 |
+
self.llm = ChatOpenAI(
|
| 490 |
+
model=self.model,
|
| 491 |
+
temperature=self.temperature,
|
| 492 |
+
api_key=self.api_key
|
| 493 |
+
)
|
| 494 |
+
|
| 495 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 496 |
+
"""
|
| 497 |
+
Validate that input contains required fields.
|
| 498 |
+
|
| 499 |
+
Args:
|
| 500 |
+
input_data: Input dictionary
|
| 501 |
+
|
| 502 |
+
Returns:
|
| 503 |
+
True if valid, False otherwise
|
| 504 |
+
"""
|
| 505 |
+
required_fields = ["comment_text"]
|
| 506 |
+
return all(field in input_data for field in required_fields)
|
| 507 |
+
|
| 508 |
+
def classify_topics(self, comment_text: str) -> Dict[str, Any]:
|
| 509 |
+
"""
|
| 510 |
+
Classify comment into topics using LLM.
|
| 511 |
+
|
| 512 |
+
Args:
|
| 513 |
+
comment_text: The comment text to analyze
|
| 514 |
+
|
| 515 |
+
Returns:
|
| 516 |
+
Dictionary with topic classification results
|
| 517 |
+
"""
|
| 518 |
+
# Build topic options from config
|
| 519 |
+
topic_options = "\n".join([
|
| 520 |
+
f"- {cat['value']}: {cat['description']}"
|
| 521 |
+
for cat in self.topic_categories["topics"]["categories"]
|
| 522 |
+
])
|
| 523 |
+
|
| 524 |
+
system_prompt = f"""You are an expert at classifying music-related comments into topics.
|
| 525 |
+
|
| 526 |
+
Available Topics:
|
| 527 |
+
{topic_options}
|
| 528 |
+
|
| 529 |
+
Return your response in JSON format with the following fields:
|
| 530 |
+
- topics: array of topic values (multi-label, can have multiple topics)
|
| 531 |
+
- confidence: your confidence level (high, medium, low)
|
| 532 |
+
- reasoning: brief explanation of your classification
|
| 533 |
+
"""
|
| 534 |
+
|
| 535 |
+
user_prompt = f"""Classify this comment into relevant topics:
|
| 536 |
+
|
| 537 |
+
Comment: "{comment_text}"
|
| 538 |
+
|
| 539 |
+
Return JSON only."""
|
| 540 |
+
|
| 541 |
+
try:
|
| 542 |
+
messages = [
|
| 543 |
+
SystemMessage(content=system_prompt),
|
| 544 |
+
HumanMessage(content=user_prompt)
|
| 545 |
+
]
|
| 546 |
+
|
| 547 |
+
response = self.llm.invoke(messages)
|
| 548 |
+
result = self._parse_llm_json_response(response.content)
|
| 549 |
+
|
| 550 |
+
topics = result.get("topics", [])
|
| 551 |
+
if isinstance(topics, str):
|
| 552 |
+
topics = [topics]
|
| 553 |
+
|
| 554 |
+
topic_str = ", ".join(topics) if topics else None
|
| 555 |
+
|
| 556 |
+
return {
|
| 557 |
+
"success": True,
|
| 558 |
+
"topics": topic_str,
|
| 559 |
+
"topic_confidence": result.get("confidence", "medium"),
|
| 560 |
+
"topic_reasoning": result.get("reasoning", "")
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
except json.JSONDecodeError as e:
|
| 564 |
+
self.log_processing(f"JSON decode error: {str(e)}", "warning")
|
| 565 |
+
return {
|
| 566 |
+
"success": False,
|
| 567 |
+
"error": str(e)
|
| 568 |
+
}
|
| 569 |
+
except Exception as e:
|
| 570 |
+
self.log_processing(f"Topic classification failed: {str(e)}", "error")
|
| 571 |
+
return {
|
| 572 |
+
"success": False,
|
| 573 |
+
"error": str(e)
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 577 |
+
"""
|
| 578 |
+
Process comment and extract topics.
|
| 579 |
+
|
| 580 |
+
Args:
|
| 581 |
+
input_data: Dictionary containing comment data
|
| 582 |
+
|
| 583 |
+
Returns:
|
| 584 |
+
Dictionary with topic classification results
|
| 585 |
+
"""
|
| 586 |
+
try:
|
| 587 |
+
# Validate input
|
| 588 |
+
if not self.validate_input(input_data):
|
| 589 |
+
return {
|
| 590 |
+
"success": False,
|
| 591 |
+
"error": "Invalid input: missing required fields"
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
+
comment_text = input_data["comment_text"]
|
| 595 |
+
|
| 596 |
+
        self.log_processing("Classifying topics for comment", "debug")
|
| 597 |
+
|
| 598 |
+
# Perform classification
|
| 599 |
+
classification_result = self.classify_topics(comment_text)
|
| 600 |
+
|
| 601 |
+
result = {
|
| 602 |
+
"success": classification_result.get("success", False),
|
| 603 |
+
"topics": classification_result.get("topics"),
|
| 604 |
+
"topic_confidence": classification_result.get("topic_confidence"),
|
| 605 |
+
"topic_reasoning": classification_result.get("topic_reasoning", "")
|
| 606 |
+
}
|
| 607 |
+
|
| 608 |
+
if "error" in classification_result:
|
| 609 |
+
result["topic_error"] = classification_result["error"]
|
| 610 |
+
|
| 611 |
+
# Preserve all original data
|
| 612 |
+
for key, value in input_data.items():
|
| 613 |
+
if key not in result:
|
| 614 |
+
result[key] = value
|
| 615 |
+
|
| 616 |
+
return result
|
| 617 |
+
|
| 618 |
+
except Exception as e:
|
| 619 |
+
return self.handle_error(e, "topic_classification")
|
| 620 |
+
|
| 621 |
+
def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
|
| 622 |
+
"""
|
| 623 |
+
Parse LLM response that may contain JSON wrapped in markdown code blocks.
|
| 624 |
+
|
| 625 |
+
Args:
|
| 626 |
+
response_content: Raw response content from LLM
|
| 627 |
+
|
| 628 |
+
Returns:
|
| 629 |
+
Parsed JSON dictionary
|
| 630 |
+
|
| 631 |
+
Raises:
|
| 632 |
+
json.JSONDecodeError: If JSON cannot be parsed
|
| 633 |
+
"""
|
| 634 |
+
content = response_content.strip()
|
| 635 |
+
|
| 636 |
+
# Check if response is wrapped in markdown code block
|
| 637 |
+
if content.startswith("```json"):
|
| 638 |
+
content = content[7:]
|
| 639 |
+
if content.endswith("```"):
|
| 640 |
+
content = content[:-3]
|
| 641 |
+
content = content.strip()
|
| 642 |
+
elif content.startswith("```"):
|
| 643 |
+
content = content[3:]
|
| 644 |
+
if content.endswith("```"):
|
| 645 |
+
content = content[:-3]
|
| 646 |
+
content = content.strip()
|
| 647 |
+
|
| 648 |
+
return json.loads(content)
|
| 649 |
+
```
|
| 650 |
+
|
| 651 |
+
#### Step 2: Update `__init__.py`
|
| 652 |
+
|
| 653 |
+
Add your agent to `agents/__init__.py`:
|
| 654 |
+
|
| 655 |
+
```python
|
| 656 |
+
"""
|
| 657 |
+
Agents module for the sentiment analysis workflow.
|
| 658 |
+
Provides modular, extensible agents for various NLP tasks.
|
| 659 |
+
"""
|
| 660 |
+
|
| 661 |
+
from agents.base_agent import BaseAgent
|
| 662 |
+
from agents.language_detection_agent import LanguageDetectionAgent
|
| 663 |
+
from agents.translation_agent import TranslationAgent
|
| 664 |
+
from agents.sentiment_analysis_agent import SentimentAnalysisAgent
|
| 665 |
+
from agents.topic_classification_agent import TopicClassificationAgent # ADD THIS
|
| 666 |
+
|
| 667 |
+
__all__ = [
|
| 668 |
+
"BaseAgent",
|
| 669 |
+
"LanguageDetectionAgent",
|
| 670 |
+
"TranslationAgent",
|
| 671 |
+
"SentimentAnalysisAgent",
|
| 672 |
+
"TopicClassificationAgent" # ADD THIS
|
| 673 |
+
]
|
| 674 |
+
```
|
| 675 |
+
|
| 676 |
+
#### Step 3: Update Configuration Files
|
| 677 |
+
|
| 678 |
+
Add agent configuration to `config_files/sentiment_config.json`:
|
| 679 |
+
|
| 680 |
+
```json
|
| 681 |
+
{
|
| 682 |
+
"agents": {
|
| 683 |
+
"language_detection": { ... },
|
| 684 |
+
"translation": { ... },
|
| 685 |
+
"sentiment_analysis": { ... },
|
| 686 |
+
"topic_classification": {
|
| 687 |
+
"name": "TopicClassificationAgent",
|
| 688 |
+
"model": "gpt-5-nano",
|
| 689 |
+
"temperature": 0.2,
|
| 690 |
+
"max_retries": 3,
|
| 691 |
+
"description": "Classifies comments into topic categories"
|
| 692 |
+
}
|
| 693 |
+
}
|
| 694 |
+
}
|
| 695 |
+
```
|
| 696 |
+
|
| 697 |
+
Create a topic categories config file (or add to the existing `sentiment_analysis_config.json`):
|
| 698 |
+
|
| 699 |
+
```json
|
| 700 |
+
{
|
| 701 |
+
"topics": {
|
| 702 |
+
"categories": [
|
| 703 |
+
{
|
| 704 |
+
"value": "technique",
|
| 705 |
+
"label": "Technique",
|
| 706 |
+
"description": "Playing technique, finger positioning, hand coordination"
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"value": "theory",
|
| 710 |
+
"label": "Music Theory",
|
| 711 |
+
"description": "Scales, chords, harmony, composition theory"
|
| 712 |
+
},
|
| 713 |
+
{
|
| 714 |
+
"value": "equipment",
|
| 715 |
+
"label": "Equipment",
|
| 716 |
+
"description": "Instruments, gear, accessories, software"
|
| 717 |
+
},
|
| 718 |
+
{
|
| 719 |
+
"value": "performance",
|
| 720 |
+
"label": "Performance",
|
| 721 |
+
"description": "Stage presence, live playing, performance anxiety"
|
| 722 |
+
},
|
| 723 |
+
{
|
| 724 |
+
"value": "practice",
|
| 725 |
+
"label": "Practice",
|
| 726 |
+
"description": "Practice routines, discipline, improvement tips"
|
| 727 |
+
}
|
| 728 |
+
]
|
| 729 |
+
}
|
| 730 |
+
}
|
| 731 |
+
```
|
| 732 |
+
|
| 733 |
+
#### Step 4: Update Workflow State
|
| 734 |
+
|
| 735 |
+
Add fields to `CommentState` in `workflow/comment_processor.py`:
|
| 736 |
+
|
| 737 |
+
```python
|
| 738 |
+
class CommentState(TypedDict):
|
| 739 |
+
# ... existing fields ...
|
| 740 |
+
|
| 741 |
+
# Topic classification fields
|
| 742 |
+
topics: str
|
| 743 |
+
topic_confidence: str
|
| 744 |
+
topic_reasoning: str
|
| 745 |
+
```
|
| 746 |
+
|
| 747 |
+
#### Step 5: Add Workflow Node
|
| 748 |
+
|
| 749 |
+
Add the node method to `CommentProcessingWorkflow` class:
|
| 750 |
+
|
| 751 |
+
```python
|
| 752 |
+
def _topic_classification_node(self, state: CommentState) -> CommentState:
|
| 753 |
+
"""
|
| 754 |
+
Node for topic classification.
|
| 755 |
+
|
| 756 |
+
Args:
|
| 757 |
+
state: Current workflow state
|
| 758 |
+
|
| 759 |
+
Returns:
|
| 760 |
+
Updated state with topic classification results
|
| 761 |
+
"""
|
| 762 |
+
try:
|
| 763 |
+
# Prepare input
|
| 764 |
+
input_data = {
|
| 765 |
+
"comment_text": state.get("translated_text", state["comment_text"])
|
| 766 |
+
}
|
| 767 |
+
|
| 768 |
+
# Process with topic classification agent
|
| 769 |
+
result = self.topic_agent.process(input_data)
|
| 770 |
+
|
| 771 |
+
# Update state
|
| 772 |
+
if result.get("success", False):
|
| 773 |
+
state["topics"] = result.get("topics")
|
| 774 |
+
state["topic_confidence"] = result.get("topic_confidence")
|
| 775 |
+
state["topic_reasoning"] = result.get("topic_reasoning", "")
|
| 776 |
+
else:
|
| 777 |
+
error_msg = f"Topic classification failed: {result.get('error', 'Unknown error')}"
|
| 778 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 779 |
+
state["topics"] = None
|
| 780 |
+
state["topic_confidence"] = None
|
| 781 |
+
state["topic_reasoning"] = "Topic classification failed"
|
| 782 |
+
|
| 783 |
+
logger.debug(f"Topics: {state['topics']}")
|
| 784 |
+
return state
|
| 785 |
+
|
| 786 |
+
except Exception as e:
|
| 787 |
+
error_msg = f"Topic classification node error: {str(e)}"
|
| 788 |
+
logger.error(error_msg)
|
| 789 |
+
state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
|
| 790 |
+
state["topics"] = None
|
| 791 |
+
state["topic_confidence"] = None
|
| 792 |
+
state["topic_reasoning"] = "Error during topic classification"
|
| 793 |
+
return state
|
| 794 |
+
```
|
| 795 |
+
|
| 796 |
+
#### Step 6: Initialize Agent in Workflow
|
| 797 |
+
|
| 798 |
+
Update `__init__` method:
|
| 799 |
+
|
| 800 |
+
```python
|
| 801 |
+
def __init__(self, config: Dict[str, Any], api_key: str):
|
| 802 |
+
# ... existing initialization ...
|
| 803 |
+
|
| 804 |
+
# Load topic categories
|
| 805 |
+
topic_categories_path = config.get("topic_categories_config", "config_files/topic_categories.json")
|
| 806 |
+
with open(topic_categories_path, 'r') as f:
|
| 807 |
+
topic_categories = json.load(f)
|
| 808 |
+
|
| 809 |
+
# Initialize topic agent
|
| 810 |
+
topic_config = config["agents"]["topic_classification"]
|
| 811 |
+
self.topic_agent = TopicClassificationAgent(topic_config, api_key, topic_categories)
|
| 812 |
+
```
|
| 813 |
+
|
| 814 |
+
#### Step 7: Update Workflow Graph
|
| 815 |
+
|
| 816 |
+
Modify `_build_workflow()`:
|
| 817 |
+
|
| 818 |
+
```python
|
| 819 |
+
def _build_workflow(self) -> StateGraph:
|
| 820 |
+
workflow = StateGraph(CommentState)
|
| 821 |
+
|
| 822 |
+
# Add nodes
|
| 823 |
+
workflow.add_node("language_detection", self._language_detection_node)
|
| 824 |
+
workflow.add_node("translation", self._translation_node)
|
| 825 |
+
workflow.add_node("sentiment_analysis", self._sentiment_analysis_node)
|
| 826 |
+
workflow.add_node("topic_classification", self._topic_classification_node) # ADD THIS
|
| 827 |
+
|
| 828 |
+
# Define edges
|
| 829 |
+
workflow.set_entry_point("language_detection")
|
| 830 |
+
workflow.add_conditional_edges(
|
| 831 |
+
"language_detection",
|
| 832 |
+
self._should_translate,
|
| 833 |
+
{"translate": "translation", "skip_translation": "sentiment_analysis"}
|
| 834 |
+
)
|
| 835 |
+
workflow.add_edge("translation", "sentiment_analysis")
|
| 836 |
+
workflow.add_edge("sentiment_analysis", "topic_classification") # ADD THIS
|
| 837 |
+
workflow.add_edge("topic_classification", END) # MODIFY THIS
|
| 838 |
+
|
| 839 |
+
return workflow.compile()
|
| 840 |
+
```
|
| 841 |
+
|
| 842 |
+
#### Step 8: Update Database Schema
|
| 843 |
+
|
| 844 |
+
Add columns to your Snowflake table:
|
| 845 |
+
|
| 846 |
+
```sql
|
| 847 |
+
ALTER TABLE COMMENT_SENTIMENT_FEATURES
|
| 848 |
+
ADD COLUMN TOPICS VARCHAR(500),
|
| 849 |
+
TOPIC_CONFIDENCE VARCHAR(20),
|
| 850 |
+
TOPIC_REASONING VARCHAR(1000);
|
| 851 |
+
```
|
| 852 |
+
|
| 853 |
+
#### Step 9: Test Your Agent
|
| 854 |
+
|
| 855 |
+
Test with a small batch first:
|
| 856 |
+
|
| 857 |
+
```bash
|
| 858 |
+
python main.py --limit 10 --sequential
|
| 859 |
+
```
|
| 860 |
+
|
| 861 |
+
Check logs for any errors and verify output in Snowflake.
|
| 862 |
+
|
| 863 |
+
### Quick Checklist for Adding New Agents
|
| 864 |
+
|
| 865 |
+
- [ ] Create agent class inheriting from `BaseAgent`
|
| 866 |
+
- [ ] Implement `validate_input()` method
|
| 867 |
+
- [ ] Implement `process()` method
|
| 868 |
+
- [ ] Implement `_parse_llm_json_response()` if using LLM
|
| 869 |
+
- [ ] Add agent to `agents/__init__.py`
|
| 870 |
+
- [ ] Add configuration to `sentiment_config.json`
|
| 871 |
+
- [ ] Create/update category config file if needed
|
| 872 |
+
- [ ] Add fields to `CommentState` TypedDict
|
| 873 |
+
- [ ] Create node method in `CommentProcessingWorkflow`
|
| 874 |
+
- [ ] Initialize agent in `__init__`
|
| 875 |
+
- [ ] Add node to workflow graph
|
| 876 |
+
- [ ] Update edges in workflow
|
| 877 |
+
- [ ] Update database schema
|
| 878 |
+
- [ ] Test with small batch
|
| 879 |
+
|
| 880 |
+
## Modifying Existing Agents
|
| 881 |
+
|
| 882 |
+
### Common Modifications
|
| 883 |
+
|
| 884 |
+
#### 1. Change LLM Model
|
| 885 |
+
|
| 886 |
+
Update `config_files/sentiment_config.json`:
|
| 887 |
+
|
| 888 |
+
```json
|
| 889 |
+
{
|
| 890 |
+
"agents": {
|
| 891 |
+
"sentiment_analysis": {
|
| 892 |
+
"model": "gpt-4o", // Change from gpt-5-nano
|
| 893 |
+
"temperature": 0.2,
|
| 894 |
+
"max_retries": 3
|
| 895 |
+
}
|
| 896 |
+
}
|
| 897 |
+
}
|
| 898 |
+
```
|
| 899 |
+
|
| 900 |
+
No code changes needed! Configuration is loaded dynamically.
|
| 901 |
+
|
| 902 |
+
#### 2. Add New Sentiment Category
|
| 903 |
+
|
| 904 |
+
Update `config_files/sentiment_analysis_config.json`:
|
| 905 |
+
|
| 906 |
+
```json
|
| 907 |
+
{
|
| 908 |
+
"sentiment_polarity": {
|
| 909 |
+
"categories": [
|
| 910 |
+
// ... existing categories ...
|
| 911 |
+
{
|
| 912 |
+
"value": "mixed",
|
| 913 |
+
"label": "Mixed",
|
| 914 |
+
"description": "Contains both positive and negative elements"
|
| 915 |
+
}
|
| 916 |
+
]
|
| 917 |
+
}
|
| 918 |
+
}
|
| 919 |
+
```
|
| 920 |
+
|
| 921 |
+
The agent will automatically include this in prompts. No code changes needed.
|
| 922 |
+
|
| 923 |
+
#### 3. Add New Intent Category
|
| 924 |
+
|
| 925 |
+
Update `config_files/sentiment_analysis_config.json`:
|
| 926 |
+
|
| 927 |
+
```json
|
| 928 |
+
{
|
| 929 |
+
"intent": {
|
| 930 |
+
"categories": [
|
| 931 |
+
// ... existing categories ...
|
| 932 |
+
{
|
| 933 |
+
"value": "collaboration",
|
| 934 |
+
"label": "Collaboration",
|
| 935 |
+
"description": "Seeking or offering collaboration opportunities"
|
| 936 |
+
}
|
| 937 |
+
]
|
| 938 |
+
}
|
| 939 |
+
}
|
| 940 |
+
```
|
| 941 |
+
|
| 942 |
+
#### 4. Modify Reply Policy
|
| 943 |
+
|
| 944 |
+
Update `config_files/sentiment_analysis_config.json`:
|
| 945 |
+
|
| 946 |
+
```json
|
| 947 |
+
{
|
| 948 |
+
"reply_policy": {
|
| 949 |
+
"requires_reply_intents": ["question", "request", "feedback_negative"], // Added feedback_negative
|
| 950 |
+
"not_include": ["humor_sarcasm", "spam_selfpromo"] // Added spam_selfpromo
|
| 951 |
+
}
|
| 952 |
+
}
|
| 953 |
+
```
|
| 954 |
+
|
| 955 |
+
#### 5. Adjust Temperature for Better Results
|
| 956 |
+
|
| 957 |
+
If getting inconsistent results, adjust temperature:
|
| 958 |
+
|
| 959 |
+
```json
|
| 960 |
+
{
|
| 961 |
+
"agents": {
|
| 962 |
+
"sentiment_analysis": {
|
| 963 |
+
"model": "gpt-5-nano",
|
| 964 |
+
"temperature": 0.1, // Lower = more consistent, less creative
|
| 965 |
+
"max_retries": 3
|
| 966 |
+
}
|
| 967 |
+
}
|
| 968 |
+
}
|
| 969 |
+
```
|
| 970 |
+
|
| 971 |
+
#### 6. Add Context to Sentiment Analysis
|
| 972 |
+
|
| 973 |
+
Modify `_build_context_string()` in `sentiment_analysis_agent.py`:
|
| 974 |
+
|
| 975 |
+
```python
|
| 976 |
+
def _build_context_string(self, content_description: str, parent_comment_text: str = None,
|
| 977 |
+
platform: str = None, content_title: str = None,
|
| 978 |
+
channel_name: str = None) -> str: # ADD channel_name
|
| 979 |
+
"""Build context string for sentiment analysis."""
|
| 980 |
+
context_parts = []
|
| 981 |
+
|
| 982 |
+
# ... existing code ...
|
| 983 |
+
|
| 984 |
+
# ADD THIS
|
| 985 |
+
if channel_name:
|
| 986 |
+
context_parts.append(f"Channel: {channel_name}")
|
| 987 |
+
|
| 988 |
+
return "\n".join(context_parts)
|
| 989 |
+
```
|
| 990 |
+
|
| 991 |
+
Then update the `analyze_sentiment()` method to accept and pass `channel_name`.
|
| 992 |
+
|
| 993 |
+
#### 7. Improve Language Detection Accuracy
|
| 994 |
+
|
| 995 |
+
Modify `language_detection_agent.py` to add more languages to LINGUA_TO_ISO:
|
| 996 |
+
|
| 997 |
+
```python
|
| 998 |
+
LINGUA_TO_ISO = {
|
| 999 |
+
# ... existing mappings ...
|
| 1000 |
+
Language.VIETNAMESE: "vi",
|
| 1001 |
+
Language.THAI: "th",
|
| 1002 |
+
Language.INDONESIAN: "id",
|
| 1003 |
+
# Add more as needed
|
| 1004 |
+
}
|
| 1005 |
+
```
|
| 1006 |
+
|
| 1007 |
+
#### 8. Customize Translation Prompt
|
| 1008 |
+
|
| 1009 |
+
Modify `translate_text()` in `translation_agent.py`:
|
| 1010 |
+
|
| 1011 |
+
```python
|
| 1012 |
+
system_prompt = """You are a professional translator specializing in social media content related to music and education.
|
| 1013 |
+
Translate the given text from the source language to English. The text is a comment on a musical content.
|
| 1014 |
+
Preserve the tone, intent, and any emojis or special characters.
|
| 1015 |
+
For informal social media language, maintain the casual tone in translation.
|
| 1016 |
+
|
| 1017 |
+
// ADD THESE GUIDELINES:
|
| 1018 |
+
Special Instructions:
|
| 1019 |
+
- Preserve musical terminology (e.g., "legato", "staccato") untranslated
|
| 1020 |
+
- Translate instrument names (e.g., "guitarra" → "guitar")
|
| 1021 |
+
- Keep artist names and brand names in original language
|
| 1022 |
+
- Maintain slang and colloquialisms when possible
|
| 1023 |
+
|
| 1024 |
+
Return your response in JSON format with the following fields:
|
| 1025 |
+
- translated_text: The English translation
|
| 1026 |
+
- translation_confidence: Your confidence level (high, medium, low)
|
| 1027 |
+
- notes: Any important notes about the translation (optional)
|
| 1028 |
+
"""
|
| 1029 |
+
```
|
| 1030 |
+
|
| 1031 |
+
#### 9. Add Retry Logic for Failed Analyses
|
| 1032 |
+
|
| 1033 |
+
Modify `process()` in `sentiment_analysis_agent.py`:
|
| 1034 |
+
|
| 1035 |
+
```python
|
| 1036 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1037 |
+
try:
|
| 1038 |
+
# ... existing validation code ...
|
| 1039 |
+
|
| 1040 |
+
# ADD RETRY LOGIC
|
| 1041 |
+
max_attempts = self.max_retries
|
| 1042 |
+
for attempt in range(max_attempts):
|
| 1043 |
+
analysis_result = self.analyze_sentiment(
|
| 1044 |
+
comment_text, content_description,
|
| 1045 |
+
parent_comment_text, platform, content_title
|
| 1046 |
+
)
|
| 1047 |
+
|
| 1048 |
+
if analysis_result.get("success"):
|
| 1049 |
+
break
|
| 1050 |
+
|
| 1051 |
+
if attempt < max_attempts - 1:
|
| 1052 |
+
self.log_processing(f"Attempt {attempt + 1} failed, retrying...", "warning")
|
| 1053 |
+
|
| 1054 |
+
# ... rest of existing code ...
|
| 1055 |
+
```
|
| 1056 |
+
|
| 1057 |
+
#### 10. Add Custom Validation Rules
|
| 1058 |
+
|
| 1059 |
+
Modify `validate_input()` in any agent:
|
| 1060 |
+
|
| 1061 |
+
```python
|
| 1062 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 1063 |
+
"""Validate that input contains required fields."""
|
| 1064 |
+
required_fields = ["comment_text", "content_description"]
|
| 1065 |
+
|
| 1066 |
+
# Check required fields exist
|
| 1067 |
+
if not all(field in input_data for field in required_fields):
|
| 1068 |
+
return False
|
| 1069 |
+
|
| 1070 |
+
# ADD CUSTOM VALIDATION
|
| 1071 |
+
# Ensure comment_text is not empty or too short
|
| 1072 |
+
comment_text = input_data.get("comment_text", "")
|
| 1073 |
+
if not comment_text or len(comment_text.strip()) < 2:
|
| 1074 |
+
self.log_processing("Comment text too short or empty", "warning")
|
| 1075 |
+
return False
|
| 1076 |
+
|
| 1077 |
+
# Ensure content_description exists
|
| 1078 |
+
content_desc = input_data.get("content_description", "")
|
| 1079 |
+
if not content_desc or content_desc.strip() == "":
|
| 1080 |
+
self.log_processing("Content description missing", "warning")
|
| 1081 |
+
return False
|
| 1082 |
+
|
| 1083 |
+
return True
|
| 1084 |
+
```
|
| 1085 |
+
|
| 1086 |
+
### Testing Modified Agents
|
| 1087 |
+
|
| 1088 |
+
After making modifications, always test:
|
| 1089 |
+
|
| 1090 |
+
```bash
|
| 1091 |
+
# Test with a small batch
|
| 1092 |
+
python main.py --limit 10 --sequential
|
| 1093 |
+
|
| 1094 |
+
# Check specific data source
|
| 1095 |
+
python main.py --limit 10 --sequential --data-source social_media
|
| 1096 |
+
|
| 1097 |
+
# Review logs for errors
|
| 1098 |
+
tail -f logs/comment_processing_*.log
|
| 1099 |
+
```
|
| 1100 |
+
|
| 1101 |
+
## Configuration System
|
| 1102 |
+
|
| 1103 |
+
### Configuration Files Overview
|
| 1104 |
+
|
| 1105 |
+
```
|
| 1106 |
+
config_files/
|
| 1107 |
+
├── sentiment_config.json # Agent behavior config
|
| 1108 |
+
├── sentiment_analysis_config.json # Sentiment categories and intents
|
| 1109 |
+
└── data_sources_config.json # Data source configuration
|
| 1110 |
+
```
|
| 1111 |
+
|
| 1112 |
+
### Agent Configuration Structure
|
| 1113 |
+
|
| 1114 |
+
**File**: `sentiment_config.json`
|
| 1115 |
+
|
| 1116 |
+
```json
|
| 1117 |
+
{
|
| 1118 |
+
"agents": {
|
| 1119 |
+
"agent_name": {
|
| 1120 |
+
"name": "AgentClassName",
|
| 1121 |
+
"model": "gpt-5-nano", // LLM model to use
|
| 1122 |
+
"temperature": 0.0, // Creativity (0.0 = deterministic, 1.0 = creative)
|
| 1123 |
+
"max_retries": 3, // Max retry attempts
|
| 1124 |
+
"description": "What this agent does"
|
| 1125 |
+
}
|
| 1126 |
+
},
|
| 1127 |
+
"workflow": {
|
| 1128 |
+
"parallel_processing": {
|
| 1129 |
+
"enabled": true,
|
| 1130 |
+
"worker_calculation": "CPU count - 2, max 5 workers",
|
| 1131 |
+
"min_batch_size": 20,
|
| 1132 |
+
"max_batch_size": 1000
|
| 1133 |
+
}
|
| 1134 |
+
}
|
| 1135 |
+
}
|
| 1136 |
+
```
|
| 1137 |
+
|
| 1138 |
+
### Temperature Guidelines
|
| 1139 |
+
|
| 1140 |
+
- **0.0 - 0.1**: Deterministic, consistent (good for classification)
|
| 1141 |
+
- **0.2 - 0.4**: Slight variation, mostly consistent (good for sentiment analysis)
|
| 1142 |
+
- **0.5 - 0.7**: Balanced creativity and consistency (good for translation)
|
| 1143 |
+
- **0.8 - 1.0**: Creative, varied (good for content generation)
|
| 1144 |
+
|
| 1145 |
+
### Model Selection Guidelines
|
| 1146 |
+
|
| 1147 |
+
- **gpt-5-nano**: Fast, cheap, good for simple tasks
|
| 1148 |
+
- **gpt-4o-mini**: Balanced speed/quality, good for most tasks
|
| 1149 |
+
- **gpt-4o**: High quality, slower, good for complex analysis
|
| 1150 |
+
|
| 1151 |
+
### Category Configuration Structure
|
| 1152 |
+
|
| 1153 |
+
**File**: `sentiment_analysis_config.json`
|
| 1154 |
+
|
| 1155 |
+
```json
|
| 1156 |
+
{
|
| 1157 |
+
"category_type": {
|
| 1158 |
+
"categories": [
|
| 1159 |
+
{
|
| 1160 |
+
"value": "machine_readable_value", // Used in code/DB
|
| 1161 |
+
"label": "Human Readable Label", // Used in UI
|
| 1162 |
+
"description": "Detailed description for LLM prompt"
|
| 1163 |
+
}
|
| 1164 |
+
]
|
| 1165 |
+
}
|
| 1166 |
+
}
|
| 1167 |
+
```
|
| 1168 |
+
|
| 1169 |
+
### Loading Configuration in Code
|
| 1170 |
+
|
| 1171 |
+
```python
|
| 1172 |
+
# In workflow/__init__ or agent __init__
|
| 1173 |
+
import json
|
| 1174 |
+
import os
|
| 1175 |
+
|
| 1176 |
+
# Load agent config
|
| 1177 |
+
with open('config_files/sentiment_config.json', 'r') as f:
|
| 1178 |
+
config = json.load(f)
|
| 1179 |
+
|
| 1180 |
+
agent_config = config["agents"]["agent_name"]
|
| 1181 |
+
|
| 1182 |
+
# Load category config
|
| 1183 |
+
with open('config_files/sentiment_analysis_config.json', 'r') as f:
|
| 1184 |
+
categories = json.load(f)
|
| 1185 |
+
|
| 1186 |
+
sentiment_categories = categories["sentiment_polarity"]["categories"]
|
| 1187 |
+
```
|
| 1188 |
+
|
| 1189 |
+
## Best Practices
|
| 1190 |
+
|
| 1191 |
+
### Agent Development
|
| 1192 |
+
|
| 1193 |
+
1. **Single Responsibility**: Each agent should do one thing well
|
| 1194 |
+
2. **Fail Gracefully**: Always return structured error responses
|
| 1195 |
+
3. **Preserve Data**: Never lose original input data - pass it through
|
| 1196 |
+
4. **Log Everything**: Use `log_processing()` for debugging
|
| 1197 |
+
5. **Validate Early**: Check inputs before processing
|
| 1198 |
+
6. **Configuration Over Code**: Use config files for behavior changes
|
| 1199 |
+
7. **Test Incrementally**: Test with `--limit 10 --sequential` first
|
| 1200 |
+
|
| 1201 |
+
### Prompt Engineering
|
| 1202 |
+
|
| 1203 |
+
1. **Be Specific**: Clearly define expected output format
|
| 1204 |
+
2. **Use Examples**: Include few-shot examples in prompts
|
| 1205 |
+
3. **Request JSON**: Always request JSON format for structured data
|
| 1206 |
+
4. **Handle Edge Cases**: Document edge cases in prompts
|
| 1207 |
+
5. **Provide Context**: Give LLM all relevant context
|
| 1208 |
+
6. **Set Constraints**: Clearly define boundaries and limitations
|
| 1209 |
+
|
| 1210 |
+
Example of good prompt structure:
|
| 1211 |
+
|
| 1212 |
+
```python
|
| 1213 |
+
system_prompt = """You are an expert at [TASK].
|
| 1214 |
+
|
| 1215 |
+
Your task is to:
|
| 1216 |
+
1. [Step 1]
|
| 1217 |
+
2. [Step 2]
|
| 1218 |
+
3. [Step 3]
|
| 1219 |
+
|
| 1220 |
+
Context: [Explain the context]
|
| 1221 |
+
|
| 1222 |
+
Rules:
|
| 1223 |
+
- Rule 1
|
| 1224 |
+
- Rule 2
|
| 1225 |
+
- Rule 3
|
| 1226 |
+
|
| 1227 |
+
Examples:
|
| 1228 |
+
- Input: "..." → Output: {...}
|
| 1229 |
+
- Input: "..." → Output: {...}
|
| 1230 |
+
|
| 1231 |
+
Return your response in JSON format with the following fields:
|
| 1232 |
+
- field1: description
|
| 1233 |
+
- field2: description
|
| 1234 |
+
"""
|
| 1235 |
+
```
|
| 1236 |
+
|
| 1237 |
+
### Error Handling
|
| 1238 |
+
|
| 1239 |
+
1. **Try-Catch Everything**: Wrap all processing in try-catch
|
| 1240 |
+
2. **Specific Error Messages**: Make errors actionable
|
| 1241 |
+
3. **Graceful Degradation**: Continue workflow even if one agent fails
|
| 1242 |
+
4. **Error Accumulation**: Collect errors in `processing_errors` list
|
| 1243 |
+
5. **Critical vs Non-Critical**: Distinguish between recoverable and fatal errors
|
| 1244 |
+
|
| 1245 |
+
Example:
|
| 1246 |
+
|
| 1247 |
+
```python
|
| 1248 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1249 |
+
try:
|
| 1250 |
+
# Validate
|
| 1251 |
+
if not self.validate_input(input_data):
|
| 1252 |
+
return {
|
| 1253 |
+
"success": False,
|
| 1254 |
+
"error": "Invalid input: missing required fields",
|
| 1255 |
+
**input_data # Preserve original data
|
| 1256 |
+
}
|
| 1257 |
+
|
| 1258 |
+
# Process
|
| 1259 |
+
result = self.do_processing(input_data)
|
| 1260 |
+
|
| 1261 |
+
# Check result
|
| 1262 |
+
if not result.get("success"):
|
| 1263 |
+
return {
|
| 1264 |
+
"success": False,
|
| 1265 |
+
"error": result.get("error", "Unknown error"),
|
| 1266 |
+
**input_data
|
| 1267 |
+
}
|
| 1268 |
+
|
| 1269 |
+
# Return success
|
| 1270 |
+
return {
|
| 1271 |
+
"success": True,
|
| 1272 |
+
"output_field": result["output"],
|
| 1273 |
+
**input_data
|
| 1274 |
+
}
|
| 1275 |
+
|
| 1276 |
+
except Exception as e:
|
| 1277 |
+
return self.handle_error(e, "process")
|
| 1278 |
+
```
|
| 1279 |
+
|
| 1280 |
+
### Testing
|
| 1281 |
+
|
| 1282 |
+
1. **Unit Test Agents**: Test agents independently before integration
|
| 1283 |
+
2. **Small Batches**: Always test with `--limit 10` first
|
| 1284 |
+
3. **Sequential Mode**: Use `--sequential` for debugging
|
| 1285 |
+
4. **Check Logs**: Review logs after every test run
|
| 1286 |
+
5. **Validate Output**: Check Snowflake results
|
| 1287 |
+
6. **Test Edge Cases**: Empty text, emojis only, very long text, special characters
|
| 1288 |
+
|
| 1289 |
+
Test script example:
|
| 1290 |
+
|
| 1291 |
+
```python
|
| 1292 |
+
# test_agent.py
|
| 1293 |
+
from agents.sentiment_analysis_agent import SentimentAnalysisAgent
|
| 1294 |
+
import json
|
| 1295 |
+
|
| 1296 |
+
# Load config
|
| 1297 |
+
with open('config_files/sentiment_config.json', 'r') as f:
|
| 1298 |
+
config = json.load(f)
|
| 1299 |
+
with open('config_files/sentiment_analysis_config.json', 'r') as f:
|
| 1300 |
+
categories = json.load(f)
|
| 1301 |
+
|
| 1302 |
+
# Initialize agent
|
| 1303 |
+
agent = SentimentAnalysisAgent(
|
| 1304 |
+
config["agents"]["sentiment_analysis"],
|
| 1305 |
+
"your-api-key",
|
| 1306 |
+
categories
|
| 1307 |
+
)
|
| 1308 |
+
|
| 1309 |
+
# Test cases
|
| 1310 |
+
test_cases = [
|
| 1311 |
+
{"comment_text": "This is amazing!", "content_description": "Guitar tutorial"},
|
| 1312 |
+
{"comment_text": "😊😊😊", "content_description": "Piano cover"},
|
| 1313 |
+
{"comment_text": "What scale is this?", "content_description": "Blues solo"},
|
| 1314 |
+
]
|
| 1315 |
+
|
| 1316 |
+
for test in test_cases:
|
| 1317 |
+
result = agent.process(test)
|
| 1318 |
+
print(f"Input: {test['comment_text']}")
|
| 1319 |
+
print(f"Result: {result}")
|
| 1320 |
+
print("---")
|
| 1321 |
+
```
|
| 1322 |
+
|
| 1323 |
+
### Performance Optimization
|
| 1324 |
+
|
| 1325 |
+
1. **Batch Processing**: Process comments in batches (handled by workflow)
|
| 1326 |
+
2. **Parallel Workers**: Use multiprocessing for large batches
|
| 1327 |
+
3. **Minimize LLM Calls**: Cache results when possible
|
| 1328 |
+
4. **Optimize Prompts**: Shorter prompts = faster responses
|
| 1329 |
+
5. **Choose Right Model**: Use gpt-5-nano for simple tasks
|
| 1330 |
+
|
| 1331 |
+
### Code Organization
|
| 1332 |
+
|
| 1333 |
+
1. **One Agent Per File**: Don't combine multiple agents
|
| 1334 |
+
2. **Helper Methods**: Use private methods (\_method\_name) for internal logic
|
| 1335 |
+
3. **Type Hints**: Always use type hints for parameters and returns
|
| 1336 |
+
4. **Docstrings**: Document all public methods
|
| 1337 |
+
5. **Constants**: Define constants at class level
|
| 1338 |
+
|
| 1339 |
+
Example structure:
|
| 1340 |
+
|
| 1341 |
+
```python
|
| 1342 |
+
class MyAgent(BaseAgent):
|
| 1343 |
+
# Constants
|
| 1344 |
+
DEFAULT_VALUE = "default"
|
| 1345 |
+
MAX_LENGTH = 1000
|
| 1346 |
+
|
| 1347 |
+
def __init__(self, config, api_key):
|
| 1348 |
+
"""Initialize agent."""
|
| 1349 |
+
super().__init__("MyAgent", config)
|
| 1350 |
+
# ... initialization
|
| 1351 |
+
|
| 1352 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 1353 |
+
"""Validate input data."""
|
| 1354 |
+
# ... validation
|
| 1355 |
+
|
| 1356 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1357 |
+
"""Main processing method."""
|
| 1358 |
+
# ... processing
|
| 1359 |
+
|
| 1360 |
+
def _helper_method(self, data: str) -> str:
|
| 1361 |
+
"""Private helper method."""
|
| 1362 |
+
# ... helper logic
|
| 1363 |
+
|
| 1364 |
+
def _parse_llm_json_response(self, response: str) -> Dict[str, Any]:
|
| 1365 |
+
"""Parse LLM JSON response."""
|
| 1366 |
+
# ... parsing
|
| 1367 |
+
```
|
| 1368 |
+
|
| 1369 |
+
## Troubleshooting
|
| 1370 |
+
|
| 1371 |
+
### Common Issues
|
| 1372 |
+
|
| 1373 |
+
#### Issue 1: Agent Returns Empty Results
|
| 1374 |
+
|
| 1375 |
+
**Symptoms**: Agent succeeds but returns None or empty strings for key fields
|
| 1376 |
+
|
| 1377 |
+
**Causes**:
|
| 1378 |
+
- LLM not following JSON format
|
| 1379 |
+
- JSON parsing failing silently
|
| 1380 |
+
- Missing fields in LLM response
|
| 1381 |
+
|
| 1382 |
+
**Solutions**:
|
| 1383 |
+
1. Check logs for JSON parsing warnings
|
| 1384 |
+
2. Add validation after LLM call:
|
| 1385 |
+
```python
|
| 1386 |
+
result = self._parse_llm_json_response(response.content)
|
| 1387 |
+
|
| 1388 |
+
# Validate result
|
| 1389 |
+
if not result.get("sentiment_polarity"):
|
| 1390 |
+
return {
|
| 1391 |
+
"success": False,
|
| 1392 |
+
"error": "Missing sentiment_polarity in LLM response"
|
| 1393 |
+
}
|
| 1394 |
+
```
|
| 1395 |
+
3. Improve prompt to be more specific about required fields
|
| 1396 |
+
4. Add examples to prompt showing exact JSON structure
|
| 1397 |
+
|
| 1398 |
+
#### Issue 2: JSON Parsing Errors
|
| 1399 |
+
|
| 1400 |
+
**Symptoms**: `JSON decode error` in logs
|
| 1401 |
+
|
| 1402 |
+
**Causes**:
|
| 1403 |
+
- LLM returns markdown-wrapped JSON
|
| 1404 |
+
- LLM includes explanatory text before/after JSON
|
| 1405 |
+
- Malformed JSON from LLM
|
| 1406 |
+
|
| 1407 |
+
**Solutions**:
|
| 1408 |
+
1. Use `_parse_llm_json_response()` helper (already handles markdown)
|
| 1409 |
+
2. Add more explicit prompt:
|
| 1410 |
+
```python
|
| 1411 |
+
user_prompt = """...
|
| 1412 |
+
|
| 1413 |
+
Return ONLY valid JSON, no explanation or markdown. Just the raw JSON object.
|
| 1414 |
+
"""
|
| 1415 |
+
```
|
| 1416 |
+
3. Add fallback parsing:
|
| 1417 |
+
```python
|
| 1418 |
+
try:
|
| 1419 |
+
result = json.loads(content)
|
| 1420 |
+
except json.JSONDecodeError:
|
| 1421 |
+
# Try to extract JSON from text
|
| 1422 |
+
import re
|
| 1423 |
+
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
| 1424 |
+
if json_match:
|
| 1425 |
+
result = json.loads(json_match.group())
|
| 1426 |
+
else:
|
| 1427 |
+
raise
|
| 1428 |
+
```
|
| 1429 |
+
|
| 1430 |
+
#### Issue 3: Inconsistent Results
|
| 1431 |
+
|
| 1432 |
+
**Symptoms**: Same comment gets different classifications on reruns
|
| 1433 |
+
|
| 1434 |
+
**Causes**:
|
| 1435 |
+
- Temperature too high
|
| 1436 |
+
- Prompt too vague
|
| 1437 |
+
- Model inconsistency
|
| 1438 |
+
|
| 1439 |
+
**Solutions**:
|
| 1440 |
+
1. Lower temperature to 0.0 - 0.2 for classification tasks
|
| 1441 |
+
2. Make prompt more specific and rule-based
|
| 1442 |
+
3. Add examples to prompt
|
| 1443 |
+
4. Use a more consistent model (gpt-5-nano vs gpt-4o)
|
| 1444 |
+
|
| 1445 |
+
#### Issue 4: Agent Too Slow
|
| 1446 |
+
|
| 1447 |
+
**Symptoms**: Processing takes very long
|
| 1448 |
+
|
| 1449 |
+
**Causes**:
|
| 1450 |
+
- Large LLM model
|
| 1451 |
+
- Complex prompts
|
| 1452 |
+
- Sequential processing
|
| 1453 |
+
- API rate limits
|
| 1454 |
+
|
| 1455 |
+
**Solutions**:
|
| 1456 |
+
1. Use faster model (gpt-5-nano instead of gpt-4o)
|
| 1457 |
+
2. Simplify prompt (shorter = faster)
|
| 1458 |
+
3. Enable parallel processing (already default)
|
| 1459 |
+
4. Increase batch size (if not hitting rate limits)
|
| 1460 |
+
5. Consider caching repeated analyses
|
| 1461 |
+
|
| 1462 |
+
#### Issue 5: Agent Failing Validation
|
| 1463 |
+
|
| 1464 |
+
**Symptoms**: `validate_input()` returns False, agent skips processing
|
| 1465 |
+
|
| 1466 |
+
**Causes**:
|
| 1467 |
+
- Missing required fields in input
|
| 1468 |
+
- Empty or None values
|
| 1469 |
+
- Wrong data types
|
| 1470 |
+
|
| 1471 |
+
**Solutions**:
|
| 1472 |
+
1. Check workflow node - ensure all required fields passed:
|
| 1473 |
+
```python
|
| 1474 |
+
input_data = {
|
| 1475 |
+
"comment_text": state.get("translated_text", state["comment_text"]),
|
| 1476 |
+
"content_description": state["content_description"],
|
| 1477 |
+
# Add all required fields
|
| 1478 |
+
}
|
| 1479 |
+
```
|
| 1480 |
+
2. Add logging to validation:
|
| 1481 |
+
```python
|
| 1482 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 1483 |
+
for field in required_fields:
|
| 1484 |
+
if field not in input_data:
|
| 1485 |
+
self.log_processing(f"Missing field: {field}", "error")
|
| 1486 |
+
return False
|
| 1487 |
+
return True
|
| 1488 |
+
```
|
| 1489 |
+
|
| 1490 |
+
#### Issue 6: Workflow Not Running New Agent
|
| 1491 |
+
|
| 1492 |
+
**Symptoms**: New agent not being called, no logs from new agent
|
| 1493 |
+
|
| 1494 |
+
**Causes**:
|
| 1495 |
+
- Forgot to add node to workflow graph
|
| 1496 |
+
- Forgot to initialize agent
|
| 1497 |
+
- Workflow edges not connected
|
| 1498 |
+
|
| 1499 |
+
**Solutions**:
|
| 1500 |
+
1. Verify agent initialized in `__init__`:
|
| 1501 |
+
```python
|
| 1502 |
+
self.new_agent = NewAgent(config, api_key)
|
| 1503 |
+
```
|
| 1504 |
+
2. Verify node added:
|
| 1505 |
+
```python
|
| 1506 |
+
workflow.add_node("new_agent", self._new_agent_node)
|
| 1507 |
+
```
|
| 1508 |
+
3. Verify edges:
|
| 1509 |
+
```python
|
| 1510 |
+
workflow.add_edge("previous_agent", "new_agent")
|
| 1511 |
+
workflow.add_edge("new_agent", END)
|
| 1512 |
+
```
|
| 1513 |
+
4. Check for exceptions in workflow compilation
|
| 1514 |
+
|
| 1515 |
+
#### Issue 7: Database Insert Fails
|
| 1516 |
+
|
| 1517 |
+
**Symptoms**: Processing succeeds but data not in Snowflake
|
| 1518 |
+
|
| 1519 |
+
**Causes**:
|
| 1520 |
+
- Missing columns in database
|
| 1521 |
+
- Data type mismatch
|
| 1522 |
+
- Field name mismatch
|
| 1523 |
+
|
| 1524 |
+
**Solutions**:
|
| 1525 |
+
1. Check column exists:
|
| 1526 |
+
```sql
|
| 1527 |
+
DESC TABLE COMMENT_SENTIMENT_FEATURES;
|
| 1528 |
+
```
|
| 1529 |
+
2. Add column if missing:
|
| 1530 |
+
```sql
|
| 1531 |
+
ALTER TABLE COMMENT_SENTIMENT_FEATURES
|
| 1532 |
+
ADD COLUMN NEW_FIELD VARCHAR(500);
|
| 1533 |
+
```
|
| 1534 |
+
3. Check field names match exactly (case-sensitive)
|
| 1535 |
+
4. Check main.py result_df construction includes new fields
|
| 1536 |
+
|
| 1537 |
+
### Debugging Tips
|
| 1538 |
+
|
| 1539 |
+
1. **Enable Debug Logging**: Set log level to DEBUG in main.py
|
| 1540 |
+
2. **Print State**: Add print statements in workflow nodes to see state
|
| 1541 |
+
3. **Test Agent Directly**: Test agent outside workflow first
|
| 1542 |
+
4. **Use Sequential Mode**: `--sequential` flag for clearer debugging
|
| 1543 |
+
5. **Check API Logs**: Review OpenAI API dashboard for errors
|
| 1544 |
+
6. **Validate JSON**: Use online JSON validator for config files
|
| 1545 |
+
7. **Check Git Status**: Ensure all files saved and changes committed
|
| 1546 |
+
|
| 1547 |
+
### Getting Help
|
| 1548 |
+
|
| 1549 |
+
1. **Check Logs**: Always check `logs/` directory first
|
| 1550 |
+
2. **Review This README**: Answers to most questions are here
|
| 1551 |
+
3. **Test Incrementally**: Isolate the problem to one agent
|
| 1552 |
+
4. **Use Small Batches**: Test with `--limit 5` for faster iteration
|
| 1553 |
+
5. **Document Issues**: Keep notes on what you tried
|
| 1554 |
+
|
| 1555 |
+
## Conclusion
|
| 1556 |
+
|
| 1557 |
+
This agent architecture provides a flexible, maintainable foundation for processing social media comments. Key takeaways:
|
| 1558 |
+
|
| 1559 |
+
- **Base class pattern** ensures consistency
|
| 1560 |
+
- **LangGraph workflow** enables flexible orchestration
|
| 1561 |
+
- **Configuration-driven** design minimizes code changes
|
| 1562 |
+
- **Error resilience** at every level
|
| 1563 |
+
- **Extensible by design** - easy to add new agents
|
| 1564 |
+
|
| 1565 |
+
For questions or issues, refer to the main project README or review the existing agent implementations for patterns and examples.
|
| 1566 |
+
|
| 1567 |
+
---
|
| 1568 |
+
|
| 1569 |
+
**Last Updated**: 2026-01-15
|
| 1570 |
+
**Version**: 1.0
|
| 1571 |
+
**Maintainer**: Musora Development Team
|
processing_comments/agents/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agents module for the sentiment analysis workflow.
|
| 3 |
+
Provides modular, extensible agents for various NLP tasks.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from agents.base_agent import BaseAgent
|
| 7 |
+
from agents.language_detection_agent import LanguageDetectionAgent
|
| 8 |
+
from agents.translation_agent import TranslationAgent
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"BaseAgent",
|
| 12 |
+
"LanguageDetectionAgent",
|
| 13 |
+
"TranslationAgent"
|
| 14 |
+
]
|
processing_comments/agents/base_agent.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Agent class for all agents in the workflow
|
| 3 |
+
This provides a common interface and structure for extensibility
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseAgent(ABC):
|
| 15 |
+
"""
|
| 16 |
+
Abstract base class for all agents in the agentic workflow.
|
| 17 |
+
Provides common functionality and enforces consistent interface.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
def __init__(self, name: str, config: Dict[str, Any]):
|
| 21 |
+
"""
|
| 22 |
+
Initialize the base agent.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
name: Name of the agent
|
| 26 |
+
config: Configuration dictionary for the agent
|
| 27 |
+
"""
|
| 28 |
+
self.name = name
|
| 29 |
+
self.config = config
|
| 30 |
+
self.model = config.get("model", "gpt-4o-mini")
|
| 31 |
+
self.temperature = config.get("temperature", 0.7)
|
| 32 |
+
self.max_retries = config.get("max_retries", 3)
|
| 33 |
+
logger.info(f"Initialized {self.name} with model {self.model}")
|
| 34 |
+
|
| 35 |
+
@abstractmethod
|
| 36 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 37 |
+
"""
|
| 38 |
+
Process input data and return results.
|
| 39 |
+
This method must be implemented by all concrete agent classes.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
input_data: Dictionary containing input data for processing
|
| 43 |
+
|
| 44 |
+
Returns:
|
| 45 |
+
Dictionary containing processing results
|
| 46 |
+
"""
|
| 47 |
+
pass
|
| 48 |
+
|
| 49 |
+
@abstractmethod
|
| 50 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 51 |
+
"""
|
| 52 |
+
Validate input data before processing.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
input_data: Dictionary containing input data
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
True if input is valid, False otherwise
|
| 59 |
+
"""
|
| 60 |
+
pass
|
| 61 |
+
|
| 62 |
+
def get_name(self) -> str:
|
| 63 |
+
"""Get the agent name."""
|
| 64 |
+
return self.name
|
| 65 |
+
|
| 66 |
+
def get_config(self) -> Dict[str, Any]:
|
| 67 |
+
"""Get the agent configuration."""
|
| 68 |
+
return self.config
|
| 69 |
+
|
| 70 |
+
def log_processing(self, message: str, level: str = "info"):
|
| 71 |
+
"""
|
| 72 |
+
Log processing information.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
message: Log message
|
| 76 |
+
level: Log level (info, warning, error, debug)
|
| 77 |
+
"""
|
| 78 |
+
log_method = getattr(logger, level, logger.info)
|
| 79 |
+
log_method(f"[{self.name}] {message}")
|
| 80 |
+
|
| 81 |
+
def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
|
| 82 |
+
"""
|
| 83 |
+
Handle errors consistently across all agents.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
error: The exception that occurred
|
| 87 |
+
context: Additional context about the error
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Error dictionary with details
|
| 91 |
+
"""
|
| 92 |
+
error_msg = f"Error in {self.name}"
|
| 93 |
+
if context:
|
| 94 |
+
error_msg += f" ({context})"
|
| 95 |
+
error_msg += f": {str(error)}"
|
| 96 |
+
|
| 97 |
+
logger.error(error_msg)
|
| 98 |
+
|
| 99 |
+
return {
|
| 100 |
+
"success": False,
|
| 101 |
+
"error": str(error),
|
| 102 |
+
"agent": self.name,
|
| 103 |
+
"context": context
|
| 104 |
+
}
|
processing_comments/agents/language_detection_agent.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Language Detection Agent
|
| 3 |
+
Detects the language of social media comments using lingua library and LLM fallback
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
import json
|
| 8 |
+
from lingua import Language, LanguageDetectorBuilder
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
from agents.base_agent import BaseAgent
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LanguageDetectionAgent(BaseAgent):
    """
    Agent that detects the language of text comments.

    Uses the lingua library for fast English detection, then an LLM fallback
    for non-English text. Empty, very short, or undetectable text defaults to
    English so downstream processing always receives a language verdict.
    """

    # Lingua to ISO 639-1 language code mapping.
    # Languages lingua can detect but that are absent here map to "unknown".
    LINGUA_TO_ISO = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }

    def __init__(self, config: Dict[str, Any], api_key: str):
        """
        Initialize the Language Detection Agent.

        Args:
            config: Configuration dictionary (model/temperature resolved by BaseAgent)
            api_key: OpenAI API key
        """
        super().__init__("LanguageDetectionAgent", config)
        self.api_key = api_key
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Initialize lingua detector with all languages
        self.detector = LanguageDetectorBuilder.from_all_languages().build()

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains a non-empty comment_text field.

        Args:
            input_data: Input dictionary

        Returns:
            True if comment_text is present and truthy, False otherwise
        """
        # Fix: the previous expression ("comment_text" in input_data and
        # input_data["comment_text"]) returned the comment string itself when
        # truthy, violating the declared -> bool return type.
        return bool(input_data.get("comment_text"))

    def detect_with_lingua(self, text: str) -> tuple[str, str, bool]:
        """
        Detect language using the lingua library.

        Args:
            text: Text to analyze

        Returns:
            Tuple of (language_code, language_name, is_english).
            Defaults to ("en", "English", True) for text shorter than 3 chars
            or when lingua cannot produce a verdict.
        """
        try:
            # Clean text
            cleaned_text = text.strip()
            if not cleaned_text or len(cleaned_text) < 3:
                return "en", "English", True  # Default for very short text

            # Detect language with lingua
            detected_language = self.detector.detect_language_of(cleaned_text)

            if detected_language is None:
                # If detection fails, default to English
                return "en", "English", True

            # Check if it's English
            if detected_language == Language.ENGLISH:
                return "en", "English", True

            # Map lingua language to ISO code; unmapped languages → "unknown"
            lang_code = self.LINGUA_TO_ISO.get(detected_language, "unknown")
            lang_name = detected_language.name.capitalize()

            return lang_code, lang_name, False

        except Exception as e:
            logger.warning(f"Lingua detection failed: {str(e)}")
            # If detection fails, default to English
            return "en", "English", True

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse an LLM response that may contain JSON wrapped in markdown code blocks.

        Args:
            response_content: Raw response content from the LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        content = response_content.strip()

        # Check if response is wrapped in a markdown code block
        if content.startswith("```json"):
            # Remove ```json prefix and ``` suffix
            content = content[7:]  # Remove ```json
            if content.endswith("```"):
                content = content[:-3]  # Remove trailing ```
            content = content.strip()
        elif content.startswith("```"):
            # Remove generic ``` code block
            content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

        # Parse the cleaned JSON
        return json.loads(content)

    def detect_with_llm(self, text: str) -> Dict[str, Any]:
        """
        Detect language using the LLM for more nuanced detection.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with language, language_code, confidence, has_text.
            On any failure, falls back to an English/low-confidence result.
        """
        system_prompt = """You are a language detection expert. Analyze the given text and detect its language.
For text with only emojis, special characters, or minimal content, classify as "English". Comment is about a music content, so having links or using musician name is normal and still be english.
Return your response in JSON format with the following fields:
- language: The detected language name (e.g., "English", "Spanish", "French")
- language_code: ISO 639-1 language code (e.g., "en", "es", "fr")
- confidence: Your confidence level (high, medium, low)
- has_text: boolean indicating if there is actual textual content (not just emojis/symbols)
"""

        user_prompt = f"""Detect the language of this comment related to a musical content:

"{text}"

Return JSON only."""

        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)

            # Parse the response using helper function
            result = self._parse_llm_json_response(response.content)

            # If no text content, default to English
            if not result.get("has_text", True):
                result["language"] = "English"
                result["language_code"] = "en"

            return result

        except json.JSONDecodeError as e:
            # `response` is always bound here: JSONDecodeError can only be
            # raised after invoke() succeeded.
            self.log_processing(f"LLM response JSON parsing failed: {str(e)}", "warning")
            self.log_processing(f"Raw response: {response.content[:200]}", "debug")
            return {
                "language": "English",
                "language_code": "en",
                "confidence": "low",
                "has_text": True
            }
        except Exception as e:
            self.log_processing(f"LLM detection failed: {str(e)}", "warning")
            return {
                "language": "English",
                "language_code": "en",
                "confidence": "low",
                "has_text": True
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a comment and detect its language.

        Strategy: use lingua first. If English, done. If not English, use the
        LLM for better accuracy (the LLM may still reclassify as English).

        Args:
            input_data: Dictionary containing comment_text and other metadata

        Returns:
            Dictionary with language detection results; original metadata
            fields are preserved alongside the detection fields.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing comment_text",
                    "language": "English",
                    "language_code": "en",
                    "is_english": True
                }

            comment_text = input_data["comment_text"]

            # Check for empty or emoji-only content
            if not comment_text or len(comment_text.strip()) == 0:
                return {
                    "success": True,
                    "comment_text": comment_text,
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_method": "default",
                    "has_text": False
                }

            # Step 1: Use lingua for initial detection
            lingua_lang_code, lingua_lang_name, is_english = self.detect_with_lingua(comment_text)

            # Step 2: If English, we're done (lingua is good at detecting English)
            if is_english:
                result = {
                    "success": True,
                    "comment_text": comment_text,
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_method": "lingua",
                    "has_text": True
                }
            else:
                # Step 3: If not English, use LLM for more accurate detection,
                # falling back to the lingua verdict for any missing field.
                llm_result = self.detect_with_llm(comment_text)
                language = llm_result.get("language", lingua_lang_name)
                language_code = llm_result.get("language_code", lingua_lang_code)
                confidence = llm_result.get("confidence", "medium")
                has_text = llm_result.get("has_text", True)
                # The LLM may overturn lingua's non-English verdict.
                if language_code == "en" or language == "English":
                    is_english = True

                result = {
                    "success": True,
                    "comment_text": comment_text,
                    "language": language,
                    "language_code": language_code,
                    "is_english": is_english,
                    "confidence": confidence,
                    "detection_method": "llm",
                    "has_text": has_text
                }

            # Preserve original metadata
            for key, value in input_data.items():
                if key not in result:
                    result[key] = value

            self.log_processing(
                f"Detected language: {result['language']} ({result['language_code']}) - "
                f"Method: {result['detection_method']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "language detection")
|
processing_comments/agents/sentiment_analysis_agent.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sentiment Analysis Agent
|
| 3 |
+
Extracts sentiment polarity, intent, and determines if reply is needed
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any, List, Optional
|
| 7 |
+
import json
|
| 8 |
+
import re
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
from agents.base_agent import BaseAgent
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
# Reply policy constants — must stay in sync with reply_policy in sentiment_analysis_config.json.
# Used by SentimentAnalysisAgent._compute_requires_reply: a reply is required only
# when at least one reply-required intent is present AND no no-reply intent is.
_REQUIRES_REPLY_INTENTS = {"question", "request", "subscription"}
_NO_REPLY_INTENTS = {"humor_sarcasm"}

# Compiled regexes for content description parsing (compiled once at module load)
_RE_FOLLOW_SECTION = re.compile(r"^Follow\b", re.IGNORECASE)  # start of a social "Follow ..." block; parsing stops here
_RE_ARROW_LINK = re.compile(r"^►")  # promo/hyperlink bullet lines; skipped
_RE_URL_ONLY = re.compile(r"^https?://\S+$")  # lines that are nothing but a URL; skipped
_RE_TIMESTAMP = re.compile(r"^\d+:\d+\s*[-–]\s*(.*)")  # "MM:SS - chapter" markers; group(1) keeps the label (hyphen or en dash)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SentimentAnalysisAgent(BaseAgent):
    """
    Agent that analyzes comment sentiment, intent, and reply requirements.

    Design decisions:
    - System prompt is built once at init (static across all calls)
    - requires_reply is computed deterministically in Python, not by the LLM
    - LLM output is validated against config-defined allowed value sets
    - Content descriptions are parsed to strip URLs, timestamps, and social sections
    - Parent comments are passed as read-only context; classification targets the
      TARGET comment only
    """

    def __init__(self, config: Dict[str, Any], api_key: str, sentiment_categories: Dict[str, Any]):
        """
        Initialize the Sentiment Analysis Agent.

        Args:
            config: Agent configuration dictionary
            api_key: OpenAI API key
            sentiment_categories: Loaded sentiment_analysis_config.json dict
        """
        super().__init__("SentimentAnalysisAgent", config)
        self.api_key = api_key
        self.sentiment_categories = sentiment_categories

        # Pre-compute valid value sets from config for O(1) validation
        self._valid_polarities = {
            cat["value"] for cat in sentiment_categories["sentiment_polarity"]["categories"]
        }
        self._valid_intents = {
            cat["value"] for cat in sentiment_categories["intent"]["categories"]
        }

        # response_format=json_object forces the model to emit parseable JSON
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key,
            model_kwargs={"response_format": {"type": "json_object"}}
        )

        # Build system prompt once at init — reused for every LLM call
        self._system_prompt = self._build_system_prompt()

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def _build_system_prompt(self) -> str:
        """
        Build a compact, static system prompt from the sentiment config.
        Pulls category descriptions directly from config so changes to
        sentiment_analysis_config.json are automatically reflected.
        """
        polarity_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in self.sentiment_categories["sentiment_polarity"]["categories"]
        )

        intent_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in self.sentiment_categories["intent"]["categories"]
        )

        return (
            "Classify a social media comment about musical content.\n\n"
            "RULE: Analyze ONLY the TARGET comment. "
            "The parent comment is context only — do not extract sentiment or intent from it.\n\n"
            "Return JSON only:\n"
            '{"sentiment_polarity": <value>, "intents": [<values>], '
            '"confidence": "high"|"medium"|"low", "analysis_notes": "<1-2 sentences>"}\n\n'
            f"POLARITY (pick one):\n{polarity_lines}\n\n"
            f"INTENTS (multi-label, pick all that apply):\n{intent_lines}\n\n"
            "Rhetorical/sarcasm rules:\n"
            "- Rhetorical questions → humor_sarcasm or feedback_negative, NOT question\n"
            "- Sarcastic suggestions → feedback_negative, NOT suggestion\n"
            "- Sarcastic requests → feedback_negative, NOT request\n"
            "- Only use question/request/suggestion for GENUINE expressions"
        )

    def _build_user_prompt(
        self,
        comment_text: str,
        content_description: str,
        parent_comment_text: Optional[str] = None,
        platform: Optional[str] = None,
        content_title: Optional[str] = None,
    ) -> str:
        """
        Build the compact user prompt with parsed, truncated context.

        YouTube stores the video title separately from the description, so they
        are combined here. Other platforms already embed the title in the
        description, so only the parsed description is used.
        """
        parsed_description = self._parse_content_description(content_description)

        if platform and platform.lower() == "youtube" and content_title and str(content_title).strip():
            # Fix: coerce with str() before .strip(), matching the guard above.
            # content_title may arrive as a non-string value (e.g. from a DB
            # row); calling .strip() on it directly raised AttributeError.
            content_context = f"{str(content_title).strip()} — {parsed_description}"[:500]
        else:
            content_context = parsed_description

        parts = [f"Content: {content_context}"]

        if parent_comment_text and str(parent_comment_text).strip():
            parent_snippet = str(parent_comment_text).strip()[:500]
            parts.append(f'Parent (context only): "{parent_snippet}"')

        parts.append(f'TARGET: "{comment_text}"')

        return "\n".join(parts)

    # ------------------------------------------------------------------
    # Content description parsing
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_content_description(text: str) -> str:
        """
        Extract meaningful narrative text from a raw content description.

        Strips noise common in YouTube/social descriptions:
        - "Follow [name]:" blocks and everything after them
        - Lines starting with ► (hyperlinks)
        - Lines that are a bare URL
        - Timestamp chapter markers: "01:08 - Active listening" → "Active listening"

        Returns at most 500 characters of joined clean text.
        """
        if not text or not str(text).strip():
            return ""

        cleaned = []
        for line in str(text).splitlines():
            stripped = line.strip()

            # Stop at social-media "Follow" blocks
            if _RE_FOLLOW_SECTION.match(stripped):
                break

            # Skip ► link lines
            if _RE_ARROW_LINK.match(stripped):
                continue

            # Skip bare URL lines
            if _RE_URL_ONLY.match(stripped):
                continue

            # Convert "MM:SS - Chapter label" → keep just the label
            ts_match = _RE_TIMESTAMP.match(stripped)
            if ts_match:
                label = ts_match.group(1).strip()
                if label:
                    cleaned.append(label)
                continue

            if stripped:
                cleaned.append(stripped)

        return " ".join(cleaned)[:500]

    # ------------------------------------------------------------------
    # Output validation and reply computation
    # ------------------------------------------------------------------

    def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate LLM output against config-defined allowed value sets.

        - Invalid polarity → fail (comment will not be stored)
        - Invalid intent values → filtered out; if none remain → fail
        - Invalid confidence → silently corrected to "medium"

        Returns a success dict with cleaned fields, or a failure dict with
        an explanatory error message.
        """
        sentiment_polarity = raw.get("sentiment_polarity")

        if not sentiment_polarity or sentiment_polarity not in self._valid_polarities:
            return {
                "success": False,
                "error": (
                    f"Invalid sentiment_polarity '{sentiment_polarity}'. "
                    f"Expected one of: {sorted(self._valid_polarities)}"
                ),
            }

        # Normalize intents to a list (accept legacy "intent" key and
        # comma-separated strings)
        intents = raw.get("intents", raw.get("intent", []))
        if isinstance(intents, str):
            intents = [i.strip() for i in intents.split(",")]
        if not isinstance(intents, list):
            intents = []

        valid_intents = [i for i in intents if i in self._valid_intents]
        if not valid_intents:
            return {
                "success": False,
                "error": (
                    f"No valid intents in response: {intents}. "
                    f"Expected values from: {sorted(self._valid_intents)}"
                ),
            }

        confidence = raw.get("confidence", "medium")
        if confidence not in {"high", "medium", "low"}:
            confidence = "medium"

        return {
            "success": True,
            "sentiment_polarity": sentiment_polarity,
            "intents": valid_intents,
            "confidence": confidence,
            "analysis_notes": str(raw.get("analysis_notes", "")).strip(),
        }

    @staticmethod
    def _compute_requires_reply(intents: List[str]) -> bool:
        """
        Deterministically decide if the comment requires a reply.

        True when the comment contains at least one reply-required intent
        (question, request, subscription) AND no no-reply intents (humor_sarcasm).
        This mirrors the reply_policy section of sentiment_analysis_config.json
        without delegating the decision to the LLM.
        """
        intent_set = set(intents)
        return (
            bool(intent_set & _REQUIRES_REPLY_INTENTS)
            and not bool(intent_set & _NO_REPLY_INTENTS)
        )

    # ------------------------------------------------------------------
    # Core analysis
    # ------------------------------------------------------------------

    def analyze_sentiment(
        self,
        comment_text: str,
        content_description: str,
        parent_comment_text: Optional[str] = None,
        platform: Optional[str] = None,
        content_title: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Call the LLM to classify the TARGET comment's sentiment and intents.

        Args:
            comment_text: The comment to analyze (translated to English if needed)
            content_description: Raw content description (will be parsed internally)
            parent_comment_text: Optional parent comment — context only, max 500 chars
            platform: Platform name; drives YouTube title-handling logic
            content_title: YouTube video title (YouTube only)

        Returns:
            Success dict with sentiment_polarity, intent (comma-separated str),
            requires_reply, sentiment_confidence, analysis_notes
            — or a failure dict with an error key.
        """
        user_prompt = self._build_user_prompt(
            comment_text, content_description, parent_comment_text, platform, content_title
        )

        try:
            messages = [
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=user_prompt),
            ]

            response = self.llm.invoke(messages)
            raw = json.loads(response.content)

            validated = self._validate_result(raw)
            if not validated["success"]:
                self.log_processing(f"Validation failed: {validated['error']}", "warning")
                return validated

            requires_reply = self._compute_requires_reply(validated["intents"])
            intent_str = ", ".join(validated["intents"])

            return {
                "success": True,
                "sentiment_polarity": validated["sentiment_polarity"],
                "intent": intent_str,
                "requires_reply": requires_reply,
                "sentiment_confidence": validated["confidence"],
                "analysis_notes": validated["analysis_notes"],
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {"success": False, "error": f"JSON parse error: {e}"}

        except Exception as e:
            self.log_processing(f"Sentiment analysis failed: {e}", "error")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Agent interface
    # ------------------------------------------------------------------

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Return True when both required fields are present."""
        return all(field in input_data for field in ("comment_text", "content_description"))

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a comment and return sentiment analysis results merged with
        the original input fields.

        Args:
            input_data: Must contain comment_text and content_description.
                May contain parent_comment_text, platform, content_title,
                and any additional source fields (permalink_url, etc.)

        Returns:
            Dict with sentiment fields merged on top of original input_data.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_text, content_description)",
                }

            self.log_processing("Analyzing sentiment for comment", "debug")

            analysis_result = self.analyze_sentiment(
                comment_text=input_data["comment_text"],
                content_description=input_data["content_description"],
                parent_comment_text=input_data.get("parent_comment_text"),
                platform=input_data.get("platform"),
                content_title=input_data.get("content_title"),
            )

            result = {
                "success": analysis_result.get("success", False),
                "sentiment_polarity": analysis_result.get("sentiment_polarity"),
                "intent": analysis_result.get("intent"),
                "requires_reply": analysis_result.get("requires_reply", False),
                "sentiment_confidence": analysis_result.get("sentiment_confidence"),
                "analysis_notes": analysis_result.get("analysis_notes", ""),
            }

            if "error" in analysis_result:
                result["sentiment_error"] = analysis_result["error"]

            # Preserve all original input fields (e.g. permalink_url, thumbnail_url)
            for key, value in input_data.items():
                if key not in result:
                    result[key] = value

            return result

        except Exception as e:
            return self.handle_error(e, "sentiment_analysis")
|
processing_comments/agents/translation_agent.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Translation Agent
|
| 3 |
+
Translates non-English comments to English using LLM
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
import json
|
| 8 |
+
from langchain_openai import ChatOpenAI
|
| 9 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 10 |
+
from agents.base_agent import BaseAgent
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TranslationAgent(BaseAgent):
    """
    Agent that translates text from source language to English.
    Uses LLM for high-quality, context-aware translation.

    Comments already flagged as English pass through unchanged; all other
    comments are sent to the LLM, and the translation (plus confidence and
    notes) is merged back into the input record by process().
    """

    def __init__(self, config: Dict[str, Any], api_key: str):
        """
        Initialize the Translation Agent.

        Args:
            config: Configuration dictionary. NOTE(review): self.model and
                self.temperature are read below but never assigned in this
                class -- presumably BaseAgent derives them from config;
                confirm against agents/base_agent.py.
            api_key: OpenAI API key
        """
        super().__init__("TranslationAgent", config)
        self.api_key = api_key
        # One shared LLM client, reused for every translate_text() call.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary; must contain both 'comment_text'
                and 'is_english' keys (values are not type-checked here).

        Returns:
            True if valid, False otherwise
        """
        required_fields = ["comment_text", "is_english"]
        return all(field in input_data for field in required_fields)

    def translate_text(self, text: str, source_language: str) -> Dict[str, Any]:
        """
        Translate text from source language to English using LLM.

        Never raises: on JSON-decode failure or any other LLM error it falls
        back to returning the ORIGINAL text with success=False, confidence
        "low", and an 'error' field describing the failure.

        Args:
            text: Text to translate
            source_language: Source language name (interpolated into the prompt)

        Returns:
            Dictionary with translation results: success, translated_text,
            translation_confidence, translation_notes, and (on failure) error.
        """
        # The LLM is instructed to respond with a strict JSON object so the
        # reply can be parsed mechanically by _parse_llm_json_response().
        system_prompt = """You are a professional translator specializing in social media content related to music and education.
Translate the given text from the source language to English. The text is a comment on a musical content.
Preserve the tone, intent, and any emojis or special characters.
For informal social media language, maintain the casual tone in translation.

Return your response in JSON format with the following fields:
- translated_text: The English translation
- translation_confidence: Your confidence level (high, medium, low)
- notes: Any important notes about the translation (optional)
"""

        user_prompt = f"""Translate this {source_language} comment to English:

"{text}"

Return JSON only."""

        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)

            # Fall back to the untranslated text if the LLM omitted a field.
            return {
                "success": True,
                "translated_text": result.get("translated_text", text),
                "translation_confidence": result.get("translation_confidence", "medium"),
                "translation_notes": result.get("notes", "")
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {str(e)}", "warning")
            # Try to extract text from response
            return {
                "success": False,
                "translated_text": text,
                "translation_confidence": "low",
                "translation_notes": "JSON parsing failed",
                "error": str(e)
            }

        except Exception as e:
            self.log_processing(f"Translation failed: {str(e)}", "error")
            return {
                "success": False,
                "translated_text": text,
                "translation_confidence": "low",
                "translation_notes": "Translation error",
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process comment and translate if needed.

        English comments are passed through unchanged with
        translation_performed=False; other comments are translated via
        translate_text(). All keys of input_data that are not produced by
        this agent are copied into the result unchanged.

        Args:
            input_data: Dictionary containing comment data with language info
                ('comment_text', 'is_english' required; 'language' optional).

        Returns:
            Dictionary with translation results merged over the input data,
            or an error payload from handle_error() on unexpected failure.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "translated_text": input_data.get("comment_text", ""),
                    "translation_performed": False
                }

            comment_text = input_data["comment_text"]
            is_english = input_data["is_english"]
            source_language = input_data.get("language", "Unknown")

            # If already English, no translation needed
            if is_english:
                result = {
                    "success": True,
                    "translated_text": comment_text,
                    "translation_performed": False,
                    "translation_confidence": "N/A",
                    "translation_notes": "Original text is English"
                }
                self.log_processing("Text is already English, skipping translation", "debug")
            else:
                # Perform translation
                self.log_processing(
                    f"Translating from {source_language} to English",
                    "debug"
                )

                translation_result = self.translate_text(comment_text, source_language)

                result = {
                    "success": translation_result.get("success", True),
                    "translated_text": translation_result.get("translated_text", comment_text),
                    "translation_performed": True,
                    "translation_confidence": translation_result.get("translation_confidence", "medium"),
                    "translation_notes": translation_result.get("translation_notes", "")
                }

                if "error" in translation_result:
                    result["translation_error"] = translation_result["error"]

            # Preserve all original data
            for key, value in input_data.items():
                if key not in result:
                    result[key] = value

            return result

        except Exception as e:
            return self.handle_error(e, "translation")

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse LLM response that may contain JSON wrapped in markdown code blocks.

        Handles three shapes: bare JSON, ```json fenced JSON, and a generic
        ``` fenced block. Leading text before the fence is NOT handled.

        Args:
            response_content: Raw response content from LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        content = response_content.strip()

        # Check if response is wrapped in markdown code block
        if content.startswith("```json"):
            # Remove ```json prefix and ``` suffix
            content = content[7:]  # Remove ```json
            if content.endswith("```"):
                content = content[:-3]  # Remove trailing ```
            content = content.strip()
        elif content.startswith("```"):
            # Remove generic ``` code block
            content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

        # Parse the cleaned JSON
        return json.loads(content)
|
processing_comments/config_files/data_sources_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data_sources": {
|
| 3 |
+
"social_media": {
|
| 4 |
+
"name": "Social Media Comments",
|
| 5 |
+
"description": "Comments from external social media platforms (Facebook, Instagram, YouTube, etc.)",
|
| 6 |
+
"enabled": true,
|
| 7 |
+
"sql_query_file": "sql/fetch_comments.sql",
|
| 8 |
+
"output_config": {
|
| 9 |
+
"table_name": "COMMENT_SENTIMENT_FEATURES",
|
| 10 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 11 |
+
"schema": "ML_FEATURES"
|
| 12 |
+
},
|
| 13 |
+
"source_columns": {
|
| 14 |
+
"comment_sk": "COMMENT_SK",
|
| 15 |
+
"comment_id": "COMMENT_ID",
|
| 16 |
+
"comment_text": "COMMENT_TEXT",
|
| 17 |
+
"parent_comment_id": "PARENT_COMMENT_ID",
|
| 18 |
+
"parent_comment_text": "PARENT_COMMENT_TEXT",
|
| 19 |
+
"platform": "PLATFORM",
|
| 20 |
+
"content_description": "CONTENT_DESCRIPTION"
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"musora_comments": {
|
| 24 |
+
"name": "Musora Internal Comments",
|
| 25 |
+
"description": "Comments from Musora internal applications",
|
| 26 |
+
"enabled": true,
|
| 27 |
+
"sql_query_file": "sql/fetch_musora_comments.sql",
|
| 28 |
+
"output_config": {
|
| 29 |
+
"table_name": "MUSORA_COMMENT_SENTIMENT_FEATURES",
|
| 30 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 31 |
+
"schema": "ML_FEATURES"
|
| 32 |
+
},
|
| 33 |
+
"source_columns": {
|
| 34 |
+
"comment_sk": "COMMENT_SK (generated via HASH)",
|
| 35 |
+
"comment_id": "COMMENT_ID",
|
| 36 |
+
"comment_text": "COMMENT_TEXT (aliased from MESSAGE)",
|
| 37 |
+
"parent_comment_id": "PARENT_COMMENT_ID",
|
| 38 |
+
"parent_comment_text": "PARENT_COMMENT_TEXT",
|
| 39 |
+
"platform": "PLATFORM",
|
| 40 |
+
"content_description": "CONTENT_DESCRIPTION (aliased from CONTENT_PROFILE)",
|
| 41 |
+
"author_id": "AUTHOR_ID (aliased from USER_ID)",
|
| 42 |
+
"permalink_url": "PERMALINK_URL (aliased from WEB_URL_PATH)",
|
| 43 |
+
"thumbnail_url": "THUMBNAIL_URL"
|
| 44 |
+
},
|
| 45 |
+
"additional_fields": [
|
| 46 |
+
"PERMALINK_URL",
|
| 47 |
+
"THUMBNAIL_URL"
|
| 48 |
+
]
|
| 49 |
+
}
|
| 50 |
+
},
|
| 51 |
+
"processing": {
|
| 52 |
+
"default_limit": 10000,
|
| 53 |
+
"enable_parent_context": true,
|
| 54 |
+
"parent_context_description": "When a comment is a reply, include the parent comment text for better sentiment analysis context"
|
| 55 |
+
}
|
| 56 |
+
}
|
processing_comments/config_files/sentiment_analysis_config.json
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"sentiment_polarity": {
|
| 3 |
+
"categories": [
|
| 4 |
+
{
|
| 5 |
+
"value": "very_positive",
|
| 6 |
+
"label": "Very Positive",
|
| 7 |
+
"description": "Extremely enthusiastic, excited, deeply grateful, or highly satisfied"
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"value": "positive",
|
| 11 |
+
"label": "Positive",
|
| 12 |
+
"description": "Generally positive, appreciative, supportive, or encouraging"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"value": "neutral",
|
| 16 |
+
"label": "Neutral",
|
| 17 |
+
"description": "Factual, informational, balanced, or lacking clear emotional tone"
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"value": "negative",
|
| 21 |
+
"label": "Negative",
|
| 22 |
+
"description": "Disappointed, critical, frustrated, or mildly dissatisfied"
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"value": "very_negative",
|
| 26 |
+
"label": "Very Negative",
|
| 27 |
+
"description": "Highly critical, angry, abusive, or extremely dissatisfied"
|
| 28 |
+
}
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
"intent": {
|
| 32 |
+
"categories": [
|
| 33 |
+
{
|
| 34 |
+
"value": "praise",
|
| 35 |
+
"label": "Praise",
|
| 36 |
+
"description": "Compliments, thanks, admiration, excitement, and similar positive expressions"
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"value": "question",
|
| 40 |
+
"label": "Question",
|
| 41 |
+
"description": "Information seeking (e.g., 'what scale?', 'when's it out?', How to get account?)"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"value": "request",
|
| 45 |
+
"label": "Request",
|
| 46 |
+
"description": "Asking for something actionable (tutorial, feature, sheet music, etc.)"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"value": "feedback_negative",
|
| 50 |
+
"label": "Negative Feedback",
|
| 51 |
+
"description": "Critical feedback about the content or issues (mixing, performance, composition) without abuse"
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"value": "suggestion",
|
| 55 |
+
"label": "Suggestion",
|
| 56 |
+
"description": "Constructive ideas/improvements (e.g., 'try slower tempo', 'add captions')"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"value": "humor_sarcasm",
|
| 60 |
+
"label": "Humor/Sarcasm",
|
| 61 |
+
"description": "Joking, teasing, memes, irony (non-toxic)"
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"value": "off_topic",
|
| 65 |
+
"label": "Off Topic",
|
| 66 |
+
"description": "Unrelated chatter or unclear/no discernible intent"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"value": "spam_selfpromo",
|
| 70 |
+
"label": "Spam/Self-Promotion",
|
| 71 |
+
"description": "Ads, links, promos, scams"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"value": "subscription",
|
| 75 |
+
"label": "Subscription",
|
| 76 |
+
"description": "Questions about subscribing (e.g., 'How do I subscribe?', 'What's the cost?') or requests to unsubscribe/cancel (e.g., 'I want to cancel', 'How to unsubscribe?')"
|
| 77 |
+
}
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
"reply_policy": {
|
| 81 |
+
"requires_reply_intents": ["question", "request", "subscription"],
|
| 82 |
+
"not_include": ["humor_sarcasm"],
|
| 83 |
+
"description": "Comments with these intents should be flagged for reply"
|
| 84 |
+
},
|
| 85 |
+
"intent_settings": {
|
| 86 |
+
"multi_label": true,
|
| 87 |
+
"description": "Intent can have multiple labels as a comment can express multiple intents",
|
| 88 |
+
"rhetorical_sarcasm_handling": true,
|
| 89 |
+
"rhetorical_sarcasm_description": "System differentiates between genuine questions/suggestions/requests and rhetorical/sarcastic ones"
|
| 90 |
+
},
|
| 91 |
+
"analysis_notes_policy": {
|
| 92 |
+
"max_length": "1-2 sentences",
|
| 93 |
+
"include_topics": true,
|
| 94 |
+
"description": "Concise notes including key topics/highlights not covered by other categories for future summarization"
|
| 95 |
+
}
|
| 96 |
+
}
|
processing_comments/config_files/sentiment_config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"LLM_models": ["gpt-5-nano", "gpt-4o-mini"],
|
| 3 |
+
"reasoning": ["gpt-5-nano"],
|
| 4 |
+
|
| 5 |
+
"agents": {
|
| 6 |
+
"language_detection": {
|
| 7 |
+
"name": "LanguageDetectionAgent",
|
| 8 |
+
"model": "gpt-5-nano",
|
| 9 |
+
"temperature": 0.0,
|
| 10 |
+
"max_retries": 3,
|
| 11 |
+
"description": "Detects language of comments and identifies non-English content"
|
| 12 |
+
},
|
| 13 |
+
"translation": {
|
| 14 |
+
"name": "TranslationAgent",
|
| 15 |
+
"model": "gpt-5-nano",
|
| 16 |
+
"temperature": 0.3,
|
| 17 |
+
"max_retries": 3,
|
| 18 |
+
"description": "Translates non-English comments to English"
|
| 19 |
+
},
|
| 20 |
+
"sentiment_analysis": {
|
| 21 |
+
"name": "SentimentAnalysisAgent",
|
| 22 |
+
"model": "gpt-5-nano",
|
| 23 |
+
"temperature": 0.0,
|
| 24 |
+
"max_retries": 3,
|
| 25 |
+
"description": "Analyzes sentiment polarity, intent, and determines if reply is needed"
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
|
| 29 |
+
"workflow": {
|
| 30 |
+
"description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)",
|
| 31 |
+
"parallel_processing": {
|
| 32 |
+
"enabled": true,
|
| 33 |
+
"worker_calculation": "CPU count - 2, max 5 workers",
|
| 34 |
+
"min_batch_size": 20,
|
| 35 |
+
"max_batch_size": 1000
|
| 36 |
+
}
|
| 37 |
+
},
|
| 38 |
+
|
| 39 |
+
"snowflake": {
|
| 40 |
+
"output_table": "COMMENT_SENTIMENT_FEATURES",
|
| 41 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 42 |
+
"schema": "ML_FEATURES"
|
| 43 |
+
},
|
| 44 |
+
|
| 45 |
+
"default_language": "English"
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
processing_comments/main.py
ADDED
|
@@ -0,0 +1,572 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main execution script for comment processing workflow.
|
| 3 |
+
Orchestrates data fetching, processing, and storage using agentic workflow.
|
| 4 |
+
Supports parallel processing with multiprocessing for improved performance.
|
| 5 |
+
Supports multiple data sources (social media and Musora internal comments).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import logging
|
| 11 |
+
import argparse
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
from multiprocessing import Pool, cpu_count, Manager
|
| 16 |
+
from functools import partial
|
| 17 |
+
import traceback
|
| 18 |
+
from typing import Dict, Any, List
|
| 19 |
+
|
| 20 |
+
from SnowFlakeConnection import SnowFlakeConn
|
| 21 |
+
from workflow.comment_processor import CommentProcessingWorkflow
|
| 22 |
+
|
| 23 |
+
# Get the directory where this script is located
|
| 24 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 25 |
+
|
| 26 |
+
# Load environment variables from root directory (parent of processing_comments)
|
| 27 |
+
ROOT_DIR = os.path.dirname(SCRIPT_DIR)
|
| 28 |
+
load_dotenv(os.path.join(ROOT_DIR, '.env'))
|
| 29 |
+
|
| 30 |
+
# Configure logging: write to both a timestamped file and the console.
# The logs directory must exist BEFORE basicConfig runs --
# logging.FileHandler raises FileNotFoundError when the target
# directory is missing, which would crash the script at import time.
_LOG_DIR = os.path.join(SCRIPT_DIR, 'logs')
os.makedirs(_LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(_LOG_DIR, f'comment_processing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def calculate_optimal_batch_size(total_comments: int, num_workers: int, min_batch: int = 20, max_batch: int = 100) -> int:
    """
    Calculate optimal batch size based on total comments and number of workers.

    Work is split evenly across workers, then clamped to [min_batch, max_batch]
    so batches are neither too small (per-batch overhead dominates) nor too
    large (memory pressure, coarse progress reporting).

    Args:
        total_comments: Total number of comments to process
        num_workers: Number of parallel workers (values < 1 are treated as 1)
        min_batch: Minimum batch size (default: 20)
        max_batch: Maximum batch size (default: 100)

    Returns:
        Optimal batch size; equals total_comments when the workload is no
        larger than min_batch (including 0 for an empty workload)
    """
    # Tiny workloads fit in a single batch.
    if total_comments <= min_batch:
        return total_comments

    # Guard against a zero/negative worker count to avoid ZeroDivisionError.
    workers = max(1, num_workers)

    # Distribute work evenly among workers, then clamp to the allowed range.
    batch_size = total_comments // workers
    return max(min_batch, min(max_batch, batch_size))
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def process_batch_worker(batch_data: tuple) -> dict:
    """
    Worker function to process a single batch of comments.
    This function runs in a separate process, so it creates its own
    Snowflake connection and workflow instance (neither can be shared
    across process boundaries).

    Args:
        batch_data: Tuple containing (batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config)

    Returns:
        Dictionary with batch statistics and results; on any exception the
        whole batch is reported as failed with 'error' set.
    """
    batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config = batch_data

    # Configure logging for this worker
    worker_logger = logging.getLogger(f"Worker-{batch_num}")

    try:
        worker_logger.info(f"Batch {batch_num}: Starting processing of {len(batch_comments)} comments")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = CommentProcessingWorkflow(config, api_key)

        # Process comments through workflow
        results = workflow.process_batch(batch_comments)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results. Guard against empty/malformed results:
        # an empty DataFrame has no 'success' column, and indexing it would
        # raise KeyError and wrongly report the whole batch as failed.
        initial_count = len(results_df)
        if 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.iloc[0:0].copy()
        filtered_count = initial_count - len(df_successful)

        worker_logger.info(f"Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output data with base columns
        # (lowercase workflow keys -> uppercase Snowflake column names).
        output_columns = {
            'comment_sk': 'COMMENT_SK',
            'comment_id': 'COMMENT_ID',
            'comment_text': 'ORIGINAL_TEXT',
            'platform': 'PLATFORM',
            'comment_timestamp': 'COMMENT_TIMESTAMP',
            'author_name': 'AUTHOR_NAME',
            'author_id': 'AUTHOR_ID',
            'parent_comment_id': 'PARENT_COMMENT_ID',
            'parent_comment_text': 'PARENT_COMMENT_TEXT',
            'content_sk': 'CONTENT_SK',
            'content_id': 'CONTENT_ID',
            'content_description': 'CONTENT_DESCRIPTION',
            'channel_sk': 'CHANNEL_SK',
            'channel_name': 'CHANNEL_NAME',
            'channel_display_name': 'CHANNEL_DISPLAY_NAME',
            'language': 'DETECTED_LANGUAGE',
            'language_code': 'LANGUAGE_CODE',
            'is_english': 'IS_ENGLISH',
            'language_confidence': 'LANGUAGE_CONFIDENCE',
            'detection_method': 'DETECTION_METHOD',
            'has_text': 'HAS_TEXT',
            'translated_text': 'TRANSLATED_TEXT',
            'translation_performed': 'TRANSLATION_PERFORMED',
            'translation_confidence': 'TRANSLATION_CONFIDENCE',
            'translation_notes': 'TRANSLATION_NOTES',
            'sentiment_polarity': 'SENTIMENT_POLARITY',
            'intent': 'INTENT',
            'requires_reply': 'REQUIRES_REPLY',
            'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
            'analysis_notes': 'ANALYSIS_NOTES',
            'success': 'PROCESSING_SUCCESS'
        }

        # Add data source-specific columns if present
        if 'additional_fields' in data_source_config:
            for field in data_source_config['additional_fields']:
                field_lower = field.lower()
                output_columns[field_lower] = field
            worker_logger.debug(f"Batch {batch_num}: Added {len(data_source_config['additional_fields'])} additional fields")

        # Build the output frame column by column; missing source columns
        # become NULLs so the target table schema stays stable.
        output_df = pd.DataFrame()
        for source_col, target_col in output_columns.items():
            if source_col in df_successful.columns:
                output_df[target_col] = df_successful[source_col]
            else:
                output_df[target_col] = None
                # Log missing columns for debugging
                if source_col in ['permalink_url', 'thumbnail_url']:
                    worker_logger.warning(f"Batch {batch_num}: Column '{source_col}' not found in DataFrame. Available columns: {list(df_successful.columns)}")

        # Add processing metadata
        output_df['PROCESSED_AT'] = datetime.now()
        output_df['WORKFLOW_VERSION'] = '1.0'

        # Store results to Snowflake
        if len(output_df) > 0:
            # Use data source-specific output configuration
            table_name = data_source_config['output_config']['table_name']
            database = data_source_config['output_config']['database']
            schema = data_source_config['output_config']['schema']

            # Only the first batch should overwrite if requested
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=table_name,
                dataframe=output_df,
                database=database,
                schema=schema,
                overwrite=overwrite
            )

            worker_logger.info(f"Batch {batch_num}: Stored {len(output_df)} records to Snowflake ({table_name})")
        else:
            worker_logger.warning(f"Batch {batch_num}: No successful records to store")

        # Close Snowflake connection
        snowflake.close_connection()

        # Calculate statistics. Use '== False' rather than '~' so an
        # object-dtype IS_ENGLISH column containing None does not raise
        # TypeError after results were already stored.
        translations = output_df['TRANSLATION_PERFORMED'].sum() if 'TRANSLATION_PERFORMED' in output_df.columns else 0
        non_english = (output_df['IS_ENGLISH'] == False).sum() if 'IS_ENGLISH' in output_df.columns else 0
        requires_reply = output_df['REQUIRES_REPLY'].sum() if 'REQUIRES_REPLY' in output_df.columns else 0

        return {
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': filtered_count,
            'translations': int(translations),
            'non_english': int(non_english),
            'requires_reply': int(requires_reply),
            'error': None
        }

    except Exception as e:
        error_msg = f"Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(batch_comments),
            'total_stored': 0,
            'failed_count': len(batch_comments),
            'translations': 0,
            'non_english': 0,
            'requires_reply': 0,
            'error': error_msg
        }
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
class CommentProcessor:
|
| 222 |
+
"""
|
| 223 |
+
Main processor class that orchestrates the entire workflow.
|
| 224 |
+
Supports multiple data sources (social media and Musora internal comments).
|
| 225 |
+
"""
|
| 226 |
+
|
| 227 |
+
def __init__(self, config_path: str = None, data_sources_config_path: str = None):
    """
    Initialize the comment processor.

    Loads both configuration files, opens a Snowflake connection, reads the
    OpenAI API key from the environment (failing fast if absent), and builds
    the processing workflow.

    Args:
        config_path: Path to configuration file (default: config_files/sentiment_config.json relative to script)
        data_sources_config_path: Path to data sources config (default: config_files/data_sources_config.json)
    """
    # Fall back to the default config locations next to this script.
    default_dir = os.path.join(SCRIPT_DIR, 'config_files')
    if config_path is None:
        config_path = os.path.join(default_dir, 'sentiment_config.json')
    if data_sources_config_path is None:
        data_sources_config_path = os.path.join(default_dir, 'data_sources_config.json')

    # Load the main and data-sources configuration files.
    with open(config_path, 'r') as f:
        self.config = json.load(f)
    with open(data_sources_config_path, 'r') as f:
        self.data_sources_config = json.load(f)

    # Connection used for fetching comments (workers open their own).
    self.snowflake = SnowFlakeConn()

    # The OpenAI key is mandatory -- fail fast when it is missing.
    self.api_key = os.getenv("OPENAI_API_KEY")
    if not self.api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables")

    # Build the agentic processing workflow.
    self.workflow = CommentProcessingWorkflow(self.config, self.api_key)

    logger.info("CommentProcessor initialized successfully")
|
| 262 |
+
|
| 263 |
+
def get_enabled_data_sources(self) -> List[Dict[str, Any]]:
|
| 264 |
+
"""
|
| 265 |
+
Get list of enabled data sources from configuration.
|
| 266 |
+
|
| 267 |
+
Returns:
|
| 268 |
+
List of enabled data source configurations
|
| 269 |
+
"""
|
| 270 |
+
enabled_sources = []
|
| 271 |
+
for source_key, source_config in self.data_sources_config['data_sources'].items():
|
| 272 |
+
if source_config.get('enabled', True):
|
| 273 |
+
enabled_sources.append({
|
| 274 |
+
'key': source_key,
|
| 275 |
+
'config': source_config
|
| 276 |
+
})
|
| 277 |
+
return enabled_sources
|
| 278 |
+
|
| 279 |
+
def fetch_comments(self, data_source_key: str, limit: int = None) -> pd.DataFrame:
    """
    Fetch comments from Snowflake using the SQL query for a specific data source.

    Args:
        data_source_key: Key identifying the data source (e.g., 'social_media', 'musora_comments')
        limit: Optional limit on number of comments to fetch

    Returns:
        DataFrame containing comment data, with column names lowercased and
        (when a 'comment_text' column is present) empty-text rows removed.
    """
    data_source_config = self.data_sources_config['data_sources'][data_source_key]
    source_name = data_source_config['name']

    logger.info(f"Fetching comments from {source_name}...")

    # Read SQL query; the configured path is resolved relative to the script directory
    sql_file = data_source_config['sql_query_file']
    sql_path = os.path.join(SCRIPT_DIR, sql_file)
    with open(sql_path, 'r') as f:
        query = f.read()

    # Add limit if specified. The trailing ';' (if any) is stripped first so the
    # LIMIT clause attaches to the statement. NOTE(review): limit=0 is treated
    # the same as "no limit" by this truthiness check — assumed intentional.
    if limit:
        query = query.rstrip(';') + f"\nLIMIT {limit};"

    # Execute query (second argument is a human-readable label for logging)
    df = self.snowflake.run_read_query(query, f"{source_name} comments")

    logger.info(f"Fetched {len(df)} comments from {source_name}")

    # Normalize column names to lowercase for consistent processing
    # (Snowflake returns uppercase column names by default)
    df.columns = df.columns.str.lower()

    # Additional validation: filter out any empty comments that might have slipped through
    # the SQL-side filters (NULL or whitespace-only text)
    if 'comment_text' in df.columns:
        initial_count = len(df)
        df = df[df['comment_text'].notna() & (df['comment_text'].str.strip() != '')]
        filtered_count = initial_count - len(df)
        if filtered_count > 0:
            logger.info(f"Filtered out {filtered_count} empty comments in post-processing")

    logger.info(f"Final count: {len(df)} non-empty comments")
    return df
|
| 323 |
+
|
| 324 |
+
def calculate_num_workers(self) -> int:
    """
    Determine how many parallel worker processes to use.

    The count is CPU count minus 2 (leaving headroom for the main process
    and the OS), clamped to the range [1, 5].

    Returns:
        Number of workers (1-5)
    """
    available_cpus = cpu_count()
    workers = min(5, available_cpus - 2)
    if workers < 1:
        workers = 1
    logger.info(f"Using {workers} parallel workers (CPU count: {available_cpus})")
    return workers
|
| 336 |
+
|
| 337 |
+
def process_comments_parallel(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
    """
    Process comments through the agentic workflow using parallel processing.

    Splits the comments into batches and fans them out to a multiprocessing
    Pool running process_batch_worker; per-batch statistics are then summed
    into a single result dict.

    Args:
        df: DataFrame containing raw comment data
        data_source_config: Configuration for the data source being processed
        overwrite: Whether to overwrite existing Snowflake table

    Returns:
        Dictionary with aggregated statistics (total_processed, total_stored,
        failed_count, translations, non_english, requires_reply, failed_batches)
    """
    # Convert DataFrame to list of dictionaries
    comments = df.to_dict('records')
    total_comments = len(comments)

    logger.info(f"Processing {total_comments} comments using parallel processing...")

    # Calculate number of workers
    num_workers = self.calculate_num_workers()

    # Calculate optimal batch size
    batch_size = calculate_optimal_batch_size(total_comments, num_workers)
    logger.info(f"Batch size: {batch_size} (min: 20, max: 100)")

    # Create batches. Each tuple carries everything a worker needs (config,
    # API key, flags) because worker processes do not share this object's
    # state — the tuple layout must match process_batch_worker's expectations.
    batches = []
    for i in range(0, total_comments, batch_size):
        batch = comments[i:i + batch_size]
        batch_num = (i // batch_size) + 1
        batches.append((batch_num, batch, self.config, self.api_key, overwrite, data_source_config))

    total_batches = len(batches)
    logger.info(f"Split into {total_batches} batches")

    # Process batches in parallel; pool.map blocks until all batches complete
    with Pool(processes=num_workers) as pool:
        results = pool.map(process_batch_worker, batches)

    # Aggregate statistics across all batch results
    total_processed = sum(r['total_processed'] for r in results)
    total_stored = sum(r['total_stored'] for r in results)
    failed_count = sum(r['failed_count'] for r in results)
    translations = sum(r['translations'] for r in results)
    non_english = sum(r['non_english'] for r in results)
    requires_reply = sum(r['requires_reply'] for r in results)

    # Count failed batches (a batch-level failure, as opposed to individual
    # comment failures counted in failed_count)
    failed_batches = [r for r in results if not r['success']]
    if failed_batches:
        logger.error(f"{len(failed_batches)} batch(es) failed:")
        for fb in failed_batches:
            logger.error(f" Batch {fb['batch_num']}: {fb['error']}")

    return {
        'total_processed': total_processed,
        'total_stored': total_stored,
        'failed_count': failed_count,
        'translations': translations,
        'non_english': non_english,
        'requires_reply': requires_reply,
        'failed_batches': len(failed_batches)
    }
|
| 400 |
+
|
| 401 |
+
def process_comments_sequential(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
    """
    Process comments through the agentic workflow sequentially (for debugging).

    Runs all comments as a single batch through the same worker function used
    by the parallel path, so behavior matches while remaining easy to step
    through in a debugger.

    Args:
        df: DataFrame containing raw comment data
        data_source_config: Configuration for the data source being processed
        overwrite: Whether to overwrite existing Snowflake table

    Returns:
        Dictionary with aggregated statistics
    """
    logger.info(f"Processing {len(df)} comments using sequential processing (debug mode)...")

    # Everything goes through one batch (batch number 1)
    records = df.to_dict('records')
    outcome = process_batch_worker((1, records, self.config, self.api_key, overwrite, data_source_config))

    stats = {
        key: outcome[key]
        for key in ('total_processed', 'total_stored', 'failed_count',
                    'translations', 'non_english', 'requires_reply')
    }
    stats['failed_batches'] = 0 if outcome['success'] else 1
    return stats
|
| 431 |
+
|
| 432 |
+
def run(self, limit: int = None, overwrite: bool = False, sequential: bool = False, data_source_filter: str = None):
    """
    Run the complete processing pipeline for all enabled data sources.

    For each enabled source: fetch comments, process them (parallel or
    sequential), and log a per-source summary. The main Snowflake connection
    is always closed on exit.

    Args:
        limit: Optional limit on number of comments to process per data source
        overwrite: Whether to overwrite existing Snowflake table
        sequential: If True, use sequential processing instead of parallel (for debugging)
        data_source_filter: Optional filter to process only a specific data source
    """
    try:
        logger.info("=" * 80)
        logger.info("Starting Comment Processing Workflow")
        if sequential:
            logger.info("Mode: SEQUENTIAL (Debug Mode)")
        else:
            logger.info("Mode: PARALLEL")
        logger.info("=" * 80)

        # Get enabled data sources
        enabled_sources = self.get_enabled_data_sources()

        # Optionally narrow down to a single requested source
        if data_source_filter:
            enabled_sources = [s for s in enabled_sources if s['key'] == data_source_filter]
            if not enabled_sources:
                logger.error(f"Data source '{data_source_filter}' not found or not enabled")
                return

        logger.info(f"Processing {len(enabled_sources)} data source(s)")

        # Process each data source independently
        for source_info in enabled_sources:
            source_key = source_info['key']
            source_config = source_info['config']
            source_name = source_config['name']

            logger.info("=" * 80)
            logger.info(f"Processing Data Source: {source_name}")
            logger.info("=" * 80)

            # Step 1: Fetch comments
            df_comments = self.fetch_comments(data_source_key=source_key, limit=limit)

            if df_comments.empty:
                logger.warning(f"No comments to process from {source_name}")
                continue

            # Step 2: Process comments through workflow (parallel or sequential)
            start_time = datetime.now()

            if sequential:
                stats = self.process_comments_sequential(df_comments, source_config, overwrite=overwrite)
            else:
                stats = self.process_comments_parallel(df_comments, source_config, overwrite=overwrite)

            end_time = datetime.now()
            processing_time = (end_time - start_time).total_seconds()

            # Summary statistics
            logger.info("=" * 80)
            logger.info(f"Processing Summary for {source_name}:")
            logger.info(f" Processing Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f" Output Table: {source_config['output_config']['table_name']}")
            logger.info(f" Total comments processed: {stats['total_processed']}")
            logger.info(f" Successfully stored: {stats['total_stored']}")
            logger.info(f" Failed sentiment analysis (not stored): {stats['failed_count']}")
            if stats.get('failed_batches', 0) > 0:
                logger.info(f" Failed batches: {stats['failed_batches']}")
            logger.info(f" Non-English comments: {stats['non_english']}")
            logger.info(f" Translations performed: {stats['translations']}")
            logger.info(f" Comments requiring reply: {stats['requires_reply']}")
            logger.info(f" Processing time: {processing_time:.2f} seconds")
            # Guard against ZeroDivisionError: total_processed can be 0 when
            # every batch failed (failed batches contribute nothing).
            if stats['total_processed'] > 0:
                logger.info(f" Average time per comment: {processing_time / stats['total_processed']:.2f} seconds")
            logger.info("=" * 80)

    except Exception as e:
        logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
        raise

    finally:
        # Close main Snowflake connection (workers have their own connections)
        self.snowflake.close_connection()
        logger.info("Snowflake connection closed")
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
def main():
    """
    Main entry point for the script.

    Parses CLI arguments, ensures the logs directory exists, and runs the
    CommentProcessor over all (or one filtered) enabled data source(s).
    """
    parser = argparse.ArgumentParser(
        description="Process comments with language detection, translation, and sentiment analysis from multiple data sources"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=5000,
        # Fixed: help text previously claimed the default was 10000,
        # contradicting the actual default of 5000.
        help='Limit number of comments to process per data source (default: 5000)'
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        default=False,
        help='Overwrite existing Snowflake table (default: False, appends new records)'
    )
    parser.add_argument(
        '--config',
        type=str,
        default=None,
        help='Path to configuration file (default: config_files/sentiment_config.json relative to script)'
    )
    parser.add_argument(
        '--sequential',
        action='store_true',
        default=False,
        help='Use sequential processing instead of parallel (for debugging)'
    )
    parser.add_argument(
        '--data-source',
        type=str,
        default=None,
        help='Process only a specific data source (e.g., social_media, musora_comments). If not specified, all enabled sources are processed.'
    )

    args = parser.parse_args()

    # Create logs directory if it doesn't exist
    logs_dir = os.path.join(SCRIPT_DIR, 'logs')
    os.makedirs(logs_dir, exist_ok=True)

    # Initialize and run processor
    processor = CommentProcessor(config_path=args.config)
    processor.run(
        limit=args.limit,
        overwrite=args.overwrite,
        sequential=args.sequential,
        data_source_filter=args.data_source
    )
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
if __name__ == "__main__":
    # Script entry point: delegate to main() so the module is importable
    # without side effects.
    main()
|
processing_comments/requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
snowflake-snowpark-python>=1.0.0
|
| 2 |
+
pandas>=1.3.0
|
| 3 |
+
python-dotenv>=0.19.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
# argparse is part of the Python standard library; do not pip-install it
|
| 6 |
+
langchain>=0.1.0
|
| 7 |
+
langchain-openai>=0.0.5
|
| 8 |
+
langgraph>=0.0.20
|
| 9 |
+
lingua-language-detector>=2.0.0
|
| 10 |
+
pydantic>=2.0.0
|
processing_comments/sql/create_ml_features_table.sql
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Create table in ML_FEATURES schema to store comment sentiment analysis results.
-- This table stores the output from the language detection, translation, and
-- sentiment analysis workflow (one row per processed comment).

USE DATABASE SOCIAL_MEDIA_DB;
USE SCHEMA ML_FEATURES;

CREATE TABLE IF NOT EXISTS COMMENT_SENTIMENT_FEATURES (
    -- Primary identifiers
    COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Surrogate key from FACT_COMMENTS',
    COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Platform comment ID',
    ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text',
    PLATFORM VARCHAR(16777216) COMMENT 'Social platform',
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted',
    AUTHOR_NAME VARCHAR(16777216) COMMENT 'Commenter name',
    AUTHOR_ID VARCHAR(16777216) COMMENT 'Platform user ID',

    -- Parent comment information
    PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply',
    PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context',

    -- Content references
    CONTENT_SK NUMBER(38,0) COMMENT 'Foreign key to content',
    CONTENT_ID VARCHAR(16777216) COMMENT 'Platform content ID',
    CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content description/message',

    -- Channel references
    CHANNEL_SK NUMBER(38,0) COMMENT 'Foreign key to channel',
    CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name',
    CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name',

    -- Language detection features
    DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)',
    LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)',
    IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English',
    LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low',
    DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default',
    HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)',

    -- Translation features
    TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)',
    TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed',
    TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level',
    TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation',

    -- Sentiment analysis features
    SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative',
    INTENT VARCHAR(500) COMMENT 'Multi-label intents (comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo',
    REQUIRES_REPLY BOOLEAN COMMENT 'True if comment requires a response (genuine questions/requests only)',
    SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low',
    ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully',
    PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing',
    PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed',
    WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow',

    -- Audit fields
    CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time',
    UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time'
)
COMMENT='ML Features table for language detection, translation, and sentiment analysis results from social media comments';

-- Snowflake has no user-defined secondary indexes; a clustering key on the
-- most common filter columns (time window + channel) serves that purpose.
ALTER TABLE COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME);

-- Create view for comments requiring reply (successfully processed only)
CREATE OR REPLACE VIEW VW_COMMENTS_REQUIRING_REPLY AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    ORIGINAL_TEXT,
    TRANSLATED_TEXT,
    PARENT_COMMENT_ID,
    PARENT_COMMENT_TEXT,
    INTENT,
    SENTIMENT_POLARITY,
    SENTIMENT_CONFIDENCE,
    CHANNEL_NAME,
    AUTHOR_NAME,
    COMMENT_TIMESTAMP,
    PLATFORM,
    CONTENT_DESCRIPTION
FROM COMMENT_SENTIMENT_FEATURES
WHERE REQUIRES_REPLY = TRUE
AND PROCESSING_SUCCESS = TRUE
ORDER BY COMMENT_TIMESTAMP DESC;

-- Create view for sentiment distribution per channel/polarity/intent.
-- AVG_CONFIDENCE_SCORE maps high/medium/low to 3/2/1 and averages.
CREATE OR REPLACE VIEW VW_SENTIMENT_DISTRIBUTION AS
SELECT
    CHANNEL_NAME,
    SENTIMENT_POLARITY,
    INTENT,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED,
    COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS,
    AVG(CASE WHEN SENTIMENT_CONFIDENCE = 'high' THEN 3
             WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2
             WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1
             ELSE 0 END) AS AVG_CONFIDENCE_SCORE,
    MAX(PROCESSED_AT) AS LAST_PROCESSED
FROM COMMENT_SENTIMENT_FEATURES
WHERE PROCESSING_SUCCESS = TRUE
GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT
ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC;

-- Create view for non-English comments (successfully processed only)
CREATE OR REPLACE VIEW VW_NON_ENGLISH_COMMENTS AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    ORIGINAL_TEXT,
    DETECTED_LANGUAGE,
    LANGUAGE_CODE,
    TRANSLATED_TEXT,
    TRANSLATION_CONFIDENCE,
    SENTIMENT_POLARITY,
    INTENT,
    CHANNEL_NAME,
    COMMENT_TIMESTAMP,
    PLATFORM
FROM COMMENT_SENTIMENT_FEATURES
WHERE IS_ENGLISH = FALSE
AND PROCESSING_SUCCESS = TRUE
ORDER BY COMMENT_TIMESTAMP DESC;
|
processing_comments/sql/create_musora_ml_features_table.sql
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Create table in ML_FEATURES schema to store Musora comment sentiment analysis results.
-- This table stores the output from the language detection, translation, and
-- sentiment analysis workflow. Schema matches COMMENT_SENTIMENT_FEATURES with
-- additional Musora-specific fields (PERMALINK_URL, THUMBNAIL_URL).

USE DATABASE SOCIAL_MEDIA_DB;
USE SCHEMA ML_FEATURES;

CREATE TABLE IF NOT EXISTS MUSORA_COMMENT_SENTIMENT_FEATURES (
    -- Primary identifiers
    COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Generated surrogate key (hash of COMMENT_ID and PLATFORM)',
    COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Musora comment ID',
    ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text',
    PLATFORM VARCHAR(16777216) COMMENT 'Musora platform/brand',
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted',
    AUTHOR_NAME VARCHAR(16777216) COMMENT 'Commenter name',
    AUTHOR_ID VARCHAR(16777216) COMMENT 'User ID',

    -- Parent comment information
    PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply',
    PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context',

    -- Content references
    CONTENT_SK NUMBER(38,0) COMMENT 'Generated surrogate key for content',
    CONTENT_ID VARCHAR(16777216) COMMENT 'Content ID',
    CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content profile/description',

    -- Channel references
    CHANNEL_SK NUMBER(38,0) COMMENT 'Generated surrogate key for channel',
    CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name',
    CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name',

    -- Musora-specific fields
    PERMALINK_URL VARCHAR(16777216) COMMENT 'Web URL path of the content',
    THUMBNAIL_URL VARCHAR(16777216) COMMENT 'Thumbnail URL of the content',

    -- Language detection features
    DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)',
    LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)',
    IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English',
    LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low',
    DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default',
    HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)',

    -- Translation features
    TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)',
    TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed',
    TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level',
    TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation',

    -- Sentiment analysis features
    SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative',
    INTENT VARCHAR(500) COMMENT 'Multi-label intents (comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo',
    REQUIRES_REPLY BOOLEAN COMMENT 'True if comment requires a response (genuine questions/requests only)',
    SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low',
    ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully',
    PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing',
    PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed',
    WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow',

    -- Audit fields
    CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time',
    UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time'
)
COMMENT='ML Features table for language detection, translation, and sentiment analysis results from Musora internal app comments';

-- Snowflake has no user-defined secondary indexes; a clustering key on the
-- most common filter columns (time window + channel) serves that purpose.
ALTER TABLE MUSORA_COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME);

-- Create view for Musora comments requiring reply (successfully processed only)
CREATE OR REPLACE VIEW VW_MUSORA_COMMENTS_REQUIRING_REPLY AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    ORIGINAL_TEXT,
    TRANSLATED_TEXT,
    PARENT_COMMENT_ID,
    PARENT_COMMENT_TEXT,
    INTENT,
    SENTIMENT_POLARITY,
    SENTIMENT_CONFIDENCE,
    CHANNEL_NAME,
    AUTHOR_ID,
    COMMENT_TIMESTAMP,
    PLATFORM,
    CONTENT_DESCRIPTION,
    PERMALINK_URL,
    THUMBNAIL_URL
FROM MUSORA_COMMENT_SENTIMENT_FEATURES
WHERE REQUIRES_REPLY = TRUE
AND PROCESSING_SUCCESS = TRUE
ORDER BY COMMENT_TIMESTAMP DESC;

-- Create view for Musora sentiment distribution per channel/polarity/intent.
-- AVG_CONFIDENCE_SCORE maps high/medium/low to 3/2/1 and averages.
CREATE OR REPLACE VIEW VW_MUSORA_SENTIMENT_DISTRIBUTION AS
SELECT
    CHANNEL_NAME,
    SENTIMENT_POLARITY,
    INTENT,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED,
    COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS,
    AVG(CASE WHEN SENTIMENT_CONFIDENCE = 'high' THEN 3
             WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2
             WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1
             ELSE 0 END) AS AVG_CONFIDENCE_SCORE,
    MAX(PROCESSED_AT) AS LAST_PROCESSED
FROM MUSORA_COMMENT_SENTIMENT_FEATURES
WHERE PROCESSING_SUCCESS = TRUE
GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT
ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC;

-- Create view for non-English Musora comments (successfully processed only)
CREATE OR REPLACE VIEW VW_MUSORA_NON_ENGLISH_COMMENTS AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    ORIGINAL_TEXT,
    DETECTED_LANGUAGE,
    LANGUAGE_CODE,
    TRANSLATED_TEXT,
    TRANSLATION_CONFIDENCE,
    SENTIMENT_POLARITY,
    INTENT,
    CHANNEL_NAME,
    COMMENT_TIMESTAMP,
    PLATFORM,
    PERMALINK_URL
FROM MUSORA_COMMENT_SENTIMENT_FEATURES
WHERE IS_ENGLISH = FALSE
AND PROCESSING_SUCCESS = TRUE
ORDER BY COMMENT_TIMESTAMP DESC;
|