Danialebrat commited on
Commit
9858829
·
1 Parent(s): f89e3ef

Deploying sentiment analysis project

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -35
  2. .idea/vcs.xml +4 -0
  3. Dockerfile +0 -20
  4. README.md +304 -15
  5. processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md +437 -0
  6. processing_brand_sentiment/README.md +402 -0
  7. processing_brand_sentiment/config_files/analysis_categories.json +123 -0
  8. processing_brand_sentiment/config_files/brand_config.json +111 -0
  9. processing_brand_sentiment/config_files/workflow_config.json +60 -0
  10. processing_brand_sentiment/database/__init__.py +8 -0
  11. processing_brand_sentiment/database/snowflake_connection.py +240 -0
  12. processing_brand_sentiment/database/sql/create_comments_output_table.sql +161 -0
  13. processing_brand_sentiment/database/sql/create_output_table.sql +250 -0
  14. processing_brand_sentiment/database/sql/fetch_comments.sql +82 -0
  15. processing_brand_sentiment/database/sql/fetch_forum_posts.sql +106 -0
  16. processing_brand_sentiment/database/sql/init_comments_output_table.sql +78 -0
  17. processing_brand_sentiment/database/sql/init_output_table.sql +89 -0
  18. processing_brand_sentiment/main.py +1088 -0
  19. processing_brand_sentiment/utils/__init__.py +8 -0
  20. processing_brand_sentiment/utils/html_parser.py +253 -0
  21. processing_brand_sentiment/workflow/__init__.py +10 -0
  22. processing_brand_sentiment/workflow/agents/__init__.py +39 -0
  23. processing_brand_sentiment/workflow/agents/base_agent.py +169 -0
  24. processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py +211 -0
  25. processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py +570 -0
  26. processing_brand_sentiment/workflow/agents/output_validator_agent.py +408 -0
  27. processing_brand_sentiment/workflow/agents/preprocessor_agent.py +408 -0
  28. processing_brand_sentiment/workflow/agents/relevance_validator_agent.py +289 -0
  29. processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py +388 -0
  30. processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py +431 -0
  31. processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py +434 -0
  32. processing_brand_sentiment/workflow/comment_orchestrator.py +558 -0
  33. processing_brand_sentiment/workflow/orchestrator.py +551 -0
  34. processing_comments/.dockerignore +8 -0
  35. processing_comments/LICENSE +201 -0
  36. processing_comments/README.md +726 -0
  37. processing_comments/SnowFlakeConnection.py +121 -0
  38. processing_comments/agents/README.md +1571 -0
  39. processing_comments/agents/__init__.py +14 -0
  40. processing_comments/agents/base_agent.py +104 -0
  41. processing_comments/agents/language_detection_agent.py +292 -0
  42. processing_comments/agents/sentiment_analysis_agent.py +381 -0
  43. processing_comments/agents/translation_agent.py +210 -0
  44. processing_comments/config_files/data_sources_config.json +56 -0
  45. processing_comments/config_files/sentiment_analysis_config.json +96 -0
  46. processing_comments/config_files/sentiment_config.json +49 -0
  47. processing_comments/main.py +572 -0
  48. processing_comments/requirements.txt +10 -0
  49. processing_comments/sql/create_ml_features_table.sql +127 -0
  50. processing_comments/sql/create_musora_ml_features_table.sql +135 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.idea/vcs.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings" defaultProject="true" />
4
+ </project>
Dockerfile DELETED
@@ -1,20 +0,0 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,20 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: Sentiment Analysis
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Sentiment Analysis Dashboard for Musora
12
- license: cc-by-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  ---
14
 
15
- # Welcome to Streamlit!
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
 
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Musora Sentiment Analysis Dashboard
2
+
3
+ A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter) and the **Musora internal app** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora).
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ 1. [Project Structure](#project-structure)
10
+ 2. [How Data Flows](#how-data-flows)
11
+ 3. [Data Loading Strategy](#data-loading-strategy)
12
+ 4. [Pages](#pages)
13
+ 5. [Global Filters & Session State](#global-filters--session-state)
14
+ 6. [Snowflake Queries](#snowflake-queries)
15
+ 7. [Adding or Changing Things](#adding-or-changing-things)
16
+ 8. [Running the App](#running-the-app)
17
+ 9. [Configuration Reference](#configuration-reference)
18
+
19
+ ---
20
+
21
+ ## Project Structure
22
+
23
+ ```
24
+ visualization/
25
+ ├── app.py # Entry point — routing, sidebar, session state
26
+ ├── config/
27
+ │ └── viz_config.json # Colors, query strings, dashboard settings
28
+ ├── data/
29
+ │ └── data_loader.py # All Snowflake queries and caching logic
30
+ ├── utils/
31
+ │ ├── data_processor.py # Pandas aggregations (intent dist, content summary, etc.)
32
+ │ └── metrics.py # KPI calculations (sentiment score, urgency, etc.)
33
+ ├── components/
34
+ │ ├── dashboard.py # Dashboard page renderer
35
+ │ ├── sentiment_analysis.py # Sentiment Analysis page renderer
36
+ │ └── reply_required.py # Reply Required page renderer
37
+ ├── visualizations/
38
+ │ ├── sentiment_charts.py # Plotly sentiment chart functions
39
+ │ ├── distribution_charts.py # Plotly distribution / heatmap / scatter functions
40
+ │ ├── demographic_charts.py # Plotly demographic chart functions
41
+ │ └── content_cards.py # Streamlit card components (comment cards, content cards)
42
+ ├── agents/
43
+ │ └── content_summary_agent.py # AI analysis agent (OpenAI) for comment summarisation
44
+ ├── img/
45
+ │ └── musora.png # Sidebar logo
46
+ └── SnowFlakeConnection.py # Snowflake connection wrapper (Snowpark session)
47
+ ```
48
+
49
+ ---
50
+
51
+ ## How Data Flows
52
+
53
+ ```
54
+ Snowflake
55
+
56
+
57
+ data_loader.py ← Three separate loading modes (see below)
58
+
59
+ ├── load_dashboard_data() ──► st.session_state['dashboard_df']
60
+ │ └─► app.py sidebar (filter options, counts)
61
+ │ └─► dashboard.py (all charts)
62
+
63
+ ├── load_sa_data() ──► st.session_state['sa_contents']
64
+ │ (on-demand, button) st.session_state['sa_comments']
65
+ │ └─► sentiment_analysis.py
66
+
67
+ └── load_reply_required_data() ► st.session_state['rr_df']
68
+ (on-demand, button) └─► reply_required.py
69
+ ```
70
+
71
+ **Key principle:** Data is loaded as little as possible, as late as possible.
72
+
73
+ - The **Dashboard** uses a lightweight query (no text columns, no content join) cached for 24 hours.
74
+ - The **Sentiment Analysis** and **Reply Required** pages never load data automatically — they wait for the user to click **Fetch Data**.
75
+ - All data is stored in `st.session_state` so page navigation and widget interactions do not re-trigger Snowflake queries.
76
+
77
+ ---
78
+
79
+ ## Data Loading Strategy
80
+
81
+ All loading logic lives in **`data/data_loader.py`** (`SentimentDataLoader` class).
82
+
83
+ ### `load_dashboard_data()`
84
+ - Uses `dashboard_query` from `viz_config.json`.
85
+ - Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`.
86
+ - No text columns, no `DIM_CONTENT` join — significantly faster than the full query.
87
+ - Also merges demographics data if `demographics_query` is configured.
88
+ - Cached for **24 hours** (`@st.cache_data(ttl=86400)`).
89
+ - Called once by `app.py` at startup; result stored in `st.session_state['dashboard_df']`.
90
+
91
+ ### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, date_range)`
92
+ - Runs **two** sequential Snowflake queries:
93
+ 1. **Content aggregation** — groups by `content_sk`, counts per sentiment, computes severity score, returns top N.
94
+ 2. **Sampled comments** — for the top N `content_sk`s only, fetches up to 50 comments per sentiment group per content (negative, positive, other), using Snowflake `QUALIFY ROW_NUMBER()`. `display_text` is computed in SQL (`CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END`).
95
+ - Returns a tuple `(contents_df, comments_df)`.
96
+ - Cached for **24 hours**.
97
+ - Called only when the user clicks **Fetch Data** on the Sentiment Analysis page.
98
+
99
+ ### `load_reply_required_data(platforms, brands, date_range)`
100
+ - Runs a single query filtering `REQUIRES_REPLY = TRUE`.
101
+ - Dynamically includes/excludes the social media table and musora table based on selected platforms.
102
+ - `display_text` computed in SQL.
103
+ - Cached for **24 hours**.
104
+ - Called only when the user clicks **Fetch Data** on the Reply Required page.
105
+
106
+ ### Important: SQL Column Qualification
107
+ Both the social media table (`COMMENT_SENTIMENT_FEATURES`) and the content dimension table (`DIM_CONTENT`) share column names. Any `WHERE` clause inside a query that joins these two tables **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`, `s.CHANNEL_NAME`) to avoid Snowflake `ambiguous column name` errors. The musora table (`MUSORA_COMMENT_SENTIMENT_FEATURES`) has no joins so unqualified column names are fine there.
108
+
109
+ ---
110
+
111
+ ## Pages
112
+
113
+ ### Dashboard (`components/dashboard.py`)
114
+
115
+ **Receives:** `filtered_df` — the lightweight dashboard dataframe (after optional global filter applied by `app.py`).
116
+
117
+ **Does not need:** text, translations, content URLs. All charts work purely on aggregated columns (sentiment_polarity, brand, platform, intent, requires_reply, comment_timestamp).
118
+
119
+ **Key sections:**
120
+ - Summary stats + health indicator
121
+ - Sentiment distribution (pie + gauge)
122
+ - Sentiment by brand and platform (stacked + percentage bar charts)
123
+ - Intent analysis
124
+ - Brand-Platform heatmap
125
+ - Reply requirements + urgency breakdown
126
+ - Demographics (age, timezone, experience level) — only rendered if `author_id` is present and demographics were merged
127
+
128
+ **To add a new chart:** create the chart function in `visualizations/` and call it from `render_dashboard()`. The function receives `filtered_df`.
129
+
130
+ ---
131
+
132
+ ### Sentiment Analysis (`components/sentiment_analysis.py`)
133
+
134
+ **Receives:** `data_loader` instance only (no dataframe).
135
+
136
+ **Flow:**
137
+ 1. Reads `st.session_state['dashboard_df']` for filter option lists (platforms, brands, sentiments, intents).
138
+ 2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`.
139
+ 3. Shows filter controls (platform, brand, sentiment, intent, top_n, min_comments, sort_by).
140
+ 4. On **Fetch Data** click: calls `data_loader.load_sa_data(...)` and stores results in `st.session_state['sa_contents']` and `['sa_comments']`.
141
+ 5. Renders content cards, per-content sentiment + intent charts, AI analysis buttons, and sampled comment expanders.
142
+
143
+ **Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch.
144
+
145
+ **Comments:** Sampled (up to 50 negative + 50 positive + 50 other per content, matching the SQL sentiment groups). These are already in memory after the fetch — no extra query is needed when the user expands a comment section.
146
+
147
+ **AI Analysis:** Uses `ContentSummaryAgent` (see `agents/`). Results cached in `st.session_state['content_summaries']`.
148
+
149
  ---
150
+
151
+ ### Reply Required (`components/reply_required.py`)
152
+
153
+ **Receives:** `data_loader` instance only.
154
+
155
+ **Flow:**
156
+ 1. Reads `st.session_state['dashboard_df']` for filter option lists.
157
+ 2. Pre-populates platform, brand, and date from `st.session_state['global_filters']`.
158
+ 3. On **Fetch Data** click: calls `data_loader.load_reply_required_data(...)` and stores result in `st.session_state['rr_df']`.
159
+ 4. Shows urgency breakdown, in-page view filters (priority, platform, brand, intent — applied in Python, no new query), paginated comment cards, and a "Reply by Content" summary.
160
+
161
+ **Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch.
162
+
163
+ ---
164
+
165
+ ## Global Filters & Session State
166
+
167
+ Global filters live in the sidebar (`app.py`) and are stored in `st.session_state['global_filters']` as a dict:
168
+
169
+ ```python
170
+ {
171
+ 'platforms': ['facebook', 'instagram'], # list or []
172
+ 'brands': ['drumeo'],
173
+ 'sentiments': [],
174
+ 'date_range': (date(2025, 1, 1), date(2025, 12, 31)), # or None
175
+ }
176
+ ```
177
+
178
+ - **Dashboard:** `app.py` applies global filters to `dashboard_df` using `data_loader.apply_filters()` and passes the result to `render_dashboard()`.
179
+ - **Sentiment Analysis / Reply Required:** global filters are used to pre-populate their own filter widgets. The actual Snowflake query uses those values when the user clicks Fetch. The pages do **not** receive a pre-filtered dataframe.
180
+
181
+ ### Full session state key reference
182
+
183
+ | Key | Set by | Used by |
184
+ |-----|--------|---------|
185
+ | `dashboard_df` | `app.py` on startup | sidebar (filter options), dashboard, SA + RR (filter option lists) |
186
+ | `global_filters` | sidebar "Apply Filters" button | app.py (dashboard filter), SA + RR (pre-populate widgets) |
187
+ | `filters_applied` | sidebar buttons | app.py (whether to apply filters) |
188
+ | `sa_contents` | SA fetch button | SA page rendering |
189
+ | `sa_comments` | SA fetch button | SA page rendering |
190
+ | `sa_fetch_key` | SA fetch button | SA page (detect stale data) |
191
+ | `rr_df` | RR fetch button | RR page rendering |
192
+ | `rr_fetch_key` | RR fetch button | RR page (detect stale data) |
193
+ | `sentiment_page` | SA page / fetch | SA pagination |
194
+ | `reply_page` | RR page / fetch | RR pagination |
195
+ | `content_summaries` | AI analysis buttons | SA AI analysis display |
196
+
197
+ ---
198
+
199
+ ## Snowflake Queries
200
+
201
+ All query strings are either stored in `config/viz_config.json` (static queries) or built dynamically in `data/data_loader.py` (page-specific queries).
202
+
203
+ ### Static queries (in `viz_config.json`)
204
+
205
+ | Key | Purpose |
206
+ |-----|---------|
207
+ | `query` | Full query with all columns (legacy, kept for compatibility) |
208
+ | `dashboard_query` | Lightweight query — no text, no DIM_CONTENT join |
209
+ | `demographics_query` | Joins `musora_users` with `preprocessed.users` to get age/timezone/experience |
210
+
211
+ ### Dynamic queries (built in `data_loader.py`)
212
+
213
+ | Method | Description |
214
+ |--------|-------------|
215
+ | `_build_sa_content_query()` | Content aggregation for SA page; filters by platform + brand + date |
216
+ | `_build_sa_comments_query()` | Sampled comments for SA page; uses `QUALIFY ROW_NUMBER() <= 50` |
217
+ | `_build_rr_query()` | Reply-required comments; filters by platform/brand/date; conditionally includes social media and/or musora table |
218
+
219
+ ### Data source tables
220
+
221
+ | Table | Platform | Notes |
222
+ |-------|----------|-------|
223
+ | `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT` for `PERMALINK_URL` |
224
+ | `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively; platform stored as `'musora'`, mapped to `'musora_app'` in queries |
225
+
226
+ ---
227
+
228
+ ## Adding or Changing Things
229
+
230
+ ### Add a new chart to the Dashboard
231
+ 1. Write the chart function in the appropriate `visualizations/` file.
232
+ 2. Call it from `render_dashboard()` in `components/dashboard.py`, passing `filtered_df`.
233
+ 3. The chart function receives a lightweight df — it has no text columns but has all the columns listed in `dashboard_query`.
234
+
235
+ ### Add a new filter to the Dashboard sidebar
236
+ 1. Add the widget in `app.py` under the "Global Filters" section.
237
+ 2. Store the selected value in the `global_filters` dict under `st.session_state`.
238
+ 3. Pass it to `data_loader.apply_filters()`.
239
+
240
+ ### Change what the Sentiment Analysis page queries
241
+ - Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`.
242
+ - If you add new columns to the content aggregation result, also update `_process_sa_content_stats()` so they are available in `contents_df`.
243
+ - If you add new columns to the comments result, update `_process_sa_comments()`.
244
+
245
+ ### Change what the Reply Required page queries
246
+ - Edit `_build_rr_query()` in `data_loader.py`.
247
+ - Remember: all column references inside the social media block (which has a `JOIN`) must be prefixed with `s.` to avoid Snowflake ambiguity errors.
248
+
249
+ ### Change the cache duration
250
+ - `@st.cache_data(ttl=86400)` is set on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, and `load_demographics_data`.
251
+ - Change `86400` (seconds) to the desired TTL, or set `ttl=None` for no expiry.
252
+ - Users can always force a refresh with the "Reload Data" button in the sidebar (which calls `st.cache_data.clear()` and deletes `st.session_state['dashboard_df']`).
253
+
254
+ ### Add a new page
255
+ 1. Create `components/new_page.py` with a `render_new_page(data_loader)` function.
256
+ 2. Import and add a radio option in `app.py`.
257
+ 3. If the page needs its own Snowflake data, add a `load_new_page_data()` method to `SentimentDataLoader` following the same pattern as `load_sa_data`.
258
+
259
+ ### Add a new column to the Dashboard query
260
+ - Edit `dashboard_query` in `config/viz_config.json`.
261
+ - Both UNION branches must select the same columns in the same order.
262
+ - `_process_dashboard_dataframe()` in `data_loader.py` handles basic type casting — add processing there if needed.
263
+
264
+ ---
265
+
266
+ ## Running the App
267
+
268
+ ```bash
269
+ # From the project root
270
+ streamlit run visualization/app.py
271
+ ```
272
+
273
+ **Required environment variables** (in `.env` at project root):
274
+
275
+ ```
276
+ SNOWFLAKE_USER
277
+ SNOWFLAKE_PASSWORD
278
+ SNOWFLAKE_ACCOUNT
279
+ SNOWFLAKE_ROLE
280
+ SNOWFLAKE_DATABASE
281
+ SNOWFLAKE_WAREHOUSE
282
+ SNOWFLAKE_SCHEMA
283
+ ```
284
+
285
  ---
286
 
287
+ ## Configuration Reference
288
 
289
+ `config/viz_config.json` controls:
290
 
291
+ | Section | What it configures |
292
+ |---------|-------------------|
293
+ | `color_schemes.sentiment_polarity` | Hex colors for each sentiment level |
294
+ | `color_schemes.intent` | Hex colors for each intent label |
295
+ | `color_schemes.platform` | Hex colors for each platform |
296
+ | `color_schemes.brand` | Hex colors for each brand |
297
+ | `sentiment_order` | Display order for sentiment categories in charts |
298
+ | `intent_order` | Display order for intent categories |
299
+ | `negative_sentiments` | Which sentiment values count as "negative" |
300
+ | `dashboard.default_date_range_days` | Default date filter window (days) |
301
+ | `dashboard.max_comments_display` | Max comments shown per pagination page |
302
+ | `dashboard.chart_height` | Default Plotly chart height |
303
+ | `dashboard.top_n_contents` | Default top-N for content ranking |
304
+ | `snowflake.query` | Full query (legacy, all columns) |
305
+ | `snowflake.dashboard_query` | Lightweight dashboard query (no text columns) |
306
+ | `snowflake.demographics_query` | Demographics join query |
307
+ | `demographics.age_groups` | Age bucket definitions (label → [min, max]) |
308
+ | `demographics.experience_groups` | Experience bucket definitions |
309
+ | `demographics.top_timezones_count` | How many timezones to show in the geographic chart |
processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Brand Sentiment Analysis - Architecture Redesign Proposal
2
+
3
+ ## Executive Summary
4
+
5
+ This document proposes a redesigned multi-agent architecture to address accuracy issues identified during manual evaluation. The new design separates **fact extraction** from **analysis**, adds strict validation, and improves content preprocessing.
6
+
7
+ ---
8
+
9
+ ## Current Issues Analysis
10
+
11
+ | Issue | Root Cause | Impact |
12
+ |-------|------------|--------|
13
+ | **B8X/B8 variation** | Word-boundary matching misses aliases | Missing relevant posts |
14
+ | **Competitor products attributed to Sabian** | LLM lacks competitor awareness, no strict list enforcement | False positives, wrong product attribution |
15
+ | **Short text language detection** | Lingua fails on short brand-heavy text | Skipping valid English posts |
16
+ | **False positive relevance** | Single-pass relevance + no verification | Pizza oven marked as Sabian discussion |
17
+ | **Long posts with overlapping content** | Poor quote separation, raw thread context | Confusing LLM, extraction from wrong content |
18
+
19
+ ---
20
+
21
+ ## Proposed Architecture
22
+
23
+ ### Design Principles
24
+
25
+ 1. **Separation of Concerns**: Fact extraction vs. interpretation/analysis
26
+ 2. **Strict Validation**: Enforce predefined value lists at every step
27
+ 3. **Structured Data Flow**: Each agent receives clean, relevant input
28
+ 4. **Fail-Safe Defaults**: Conservative approach - when uncertain, mark as not relevant
29
+
30
+ ### New Workflow
31
+
32
+ ```
33
+ ┌─────────────────────────────────────────────────────────────────┐
34
+ │ 1. CONTENT PREPROCESSOR │
35
+ │ (No LLM) │
36
+ │ • Enhanced HTML parsing (better quote separation) │
37
+ │ • Text cleaning and normalization │
38
+ │ • Language detection (skip for short texts < 50 chars) │
39
+ │ • Keyword screening with aliases (B8 → B8X) │
40
+ │ • Extract: cleaned_content, quoted_content, raw_thread_context │
41
+ └─────────────────────────────┬───────────────────────────────────┘
42
+
43
+
44
+ ┌───────────────────────────────┐
45
+ │ Has any Sabian-related │
46
+ │ keywords (primary/contextual)?│
47
+ └───────────────┬───────────────┘
48
+ │ │
49
+ YES NO
50
+ │ │
51
+ ▼ ▼
52
+ ┌─────────────────────────────────┐ ┌──────────────────┐
53
+ │ 2. RELEVANCE & EXTRACTION │ │ Mark as │
54
+ │ AGENT (LLM #1) │ │ NOT RELEVANT │
55
+ │ │ │ (0 LLM calls) │
56
+ │ INPUT: │ └──────────────────┘
57
+ │ • cleaned_content │
58
+ │ • quoted_content │
59
+ │ • raw_thread_context │
60
+ │ • keywords_found │
61
+ │ │
62
+ │ OUTPUT: │
63
+ │ • IS_RELEVANT: boolean │
64
+ │ • RELEVANCE_CONFIDENCE: h/m/l │
65
+ │ • RELEVANCE_REASON: string │
66
+ │ • PRODUCTS_MENTIONED: [] │ ← STRICT: only from predefined list
67
+ │ • SABIAN_MENTION_CONTEXT │
68
+ │ • AUTHOR_ROLE │
69
+ │ • COMPETITORS_MENTIONED: [] │ ← Brand names only, no products
70
+ │ • THREAD_CONTEXT_SUMMARY │ ← 1-2 sentence summary
71
+ └─────────────────┬───────────────┘
72
+
73
+
74
+ ┌─────────────────┐
75
+ │ IS_RELEVANT? │
76
+ └────────┬────────┘
77
+ │ │
78
+ YES NO
79
+ │ │
80
+ ▼ ▼
81
+ ┌─────────────────────────────────┐ ┌──────────────────┐
82
+ │ 3. SENTIMENT & INTENT │ │ Store with │
83
+ │ ANALYZER (LLM #2) │ │ is_relevant=F │
84
+ │ │ │ (1 LLM call) │
85
+ │ INPUT (structured): │ └──────────────────┘
86
+ │ • cleaned_content │
87
+ │ • PRODUCTS_MENTIONED │ ← Pre-validated list
88
+ │ • SABIAN_MENTION_CONTEXT │
89
+ │ • AUTHOR_ROLE │
90
+ │ • COMPETITORS_MENTIONED │
91
+ │ • THREAD_CONTEXT_SUMMARY │ ← Clean, concise context
92
+ │ │
93
+ │ OUTPUT: │
94
+ │ • SENTIMENT_LEVEL │
95
+ │ • EMOTION_TYPE │
96
+ │ • SENTIMENT_CONFIDENCE │
97
+ │ • SARCASM_DETECTED │
98
+ │ • PRODUCT_ATTRIBUTES: [] │
99
+ │ • COMPETITOR_PRODUCTS_OWNED: []│
100
+ │ • COMPARISON_TYPE │
101
+ │ • INTENTS: [] │
102
+ │ • PURCHASE_STAGE │
103
+ │ • DECISION_DRIVERS: [] │
104
+ │ • PAIN_POINTS: [] │
105
+ │ • DELIGHT_FACTORS: [] │
106
+ │ • ANALYSIS_NOTES │
107
+ └─────────────────┬───────────────┘
108
+
109
+
110
+ ┌─────────────────────────────────┐
111
+ │ 4. OUTPUT VALIDATOR │
112
+ │ (No LLM - Rule-based) │
113
+ │ │
114
+ │ • Verify all values from lists │
115
+ │ • Check logical consistency │
116
+ │ • Flag anomalies for review │
117
+ │ • Set processing_status │
118
+ └─────────────────────────────────┘
119
+ ```
120
+
121
+ ---
122
+
123
+ ## API Call Summary
124
+
125
+ | Scenario | Current Calls | New Calls | Notes |
126
+ |----------|--------------|-----------|-------|
127
+ | No keywords found | 0 | 0 | Same |
128
+ | Primary keywords, relevant | 1 | 2 | +1 for better extraction |
129
+ | Primary keywords, not relevant | 1 | 1 | Extraction determines not relevant |
130
+ | Ambiguous keywords, relevant | 2 | 2 | Same |
131
+ | Ambiguous keywords, not relevant | 2 | 1 | Early exit after extraction |
132
+
133
+ **Net Impact**: Slight increase for some cases, but significantly better accuracy.
134
+
135
+ ---
136
+
137
+ ## Agent Specifications
138
+
139
+ ### Agent 1: Content Preprocessor (No LLM)
140
+
141
+ **File**: `workflow/agents/content_preprocessor_agent.py`
142
+
143
+ **Improvements over current**:
144
+ 1. Enhanced HTML parsing with better quote/reply separation
145
+ 2. Product alias mapping (B8 → B8X, etc.)
146
+ 3. Skip language detection for texts < 50 characters
147
+ 4. Always process if primary Sabian keywords found (regardless of language detection)
148
+
149
+ **Product Aliases** (add to brand_config.json):
150
+ ```json
151
+ "product_aliases": {
152
+ "B8": "B8X",
153
+ "sbrs": "SBR",
154
+ "hand hammered": "HH",
155
+ "hand-hammered": "HH"
156
+ }
157
+ ```
158
+
159
+ ---
160
+
161
+ ### Agent 2: Relevance & Extraction Agent (LLM #1)
162
+
163
+ **File**: `workflow/agents/relevance_extraction_agent.py`
164
+
165
+ **Purpose**: Determine relevance with HIGH confidence and extract verifiable facts.
166
+
167
+ **Key Design Decisions**:
168
+
169
+ 1. **Strict Product Matching**:
170
+ - Provide explicit product list in prompt
171
+ - Instruction: "ONLY return products that EXACTLY match items in this list"
172
+ - Return empty list if no exact matches (not hallucinated guesses)
173
+
174
+ 2. **Competitor Awareness**:
175
+ - List competitor BRAND names (not products)
176
+ - Instruction: "Products like '2002', 'Signature', 'K Custom' belong to competitors, NOT Sabian"
177
+ - Prevent cross-brand attribution
178
+
179
+ 3. **Thread Context Summarization**:
180
+ - Summarize in 1-2 sentences maximum
181
+ - Focus only on information relevant to understanding the post's context
182
+
183
+ 4. **Conservative Relevance**:
184
+ - When uncertain, mark as NOT relevant
185
+ - Require explicit Sabian product/brand mention IN THE POST CONTENT
186
+ - Quoted content mentioning Sabian does NOT make post relevant
187
+
188
+ **System Prompt Structure**:
189
+ ```
190
+ You are a brand mention extractor for Sabian cymbals. Your job is to:
191
+ 1. Determine if the POST CONTENT discusses Sabian products
192
+ 2. Extract ONLY facts, not interpretations
193
+
194
+ ## CRITICAL RULES
195
+
196
+ ### Rule 1: Relevance Based on POST CONTENT Only
197
+ - The post is relevant ONLY if the POST CONTENT itself mentions Sabian products
198
+ - Quoted/parent content mentioning Sabian does NOT make the post relevant
199
+ - Generic replies ("Thanks!", "Got it!") are NEVER relevant
200
+
201
+ ### Rule 2: Strict Product Matching
202
+ SABIAN PRODUCTS (use ONLY these exact values):
203
+ [HHX, HH, AAX, AA, Artisan, FRX, Omni, Chopper, Stratus, XSR, B8X, SBR]
204
+
205
+ - Return ONLY products from this list
206
+ - If you see a product not in this list, do NOT include it
207
+ - "2002", "Signature", "Sound Edge", "Formula 602" are PAISTE products, NOT Sabian
208
+ - "K Custom", "A Custom", "K Zildjian" are ZILDJIAN products, NOT Sabian
209
+ - When uncertain, return empty list []
210
+
211
+ ### Rule 3: Competitor Brand Awareness
212
+ COMPETITOR BRANDS: [Zildjian, Paiste, Meinl, Dream Cymbals, Istanbul Agop, Bosphorus]
213
+
214
+ - Only return competitor BRAND names in competitors_mentioned
215
+ - Do NOT guess competitor products
216
+
217
+ ### Rule 4: Thread Context Summary
218
+ - Summarize thread context in 1-2 sentences maximum
219
+ - Focus on what helps understand the post's topic
220
+ - If thread is about pizza ovens, say "Thread discusses pizza ovens and cooking"
221
+
222
+ ## OUTPUT FORMAT
223
+ Return ONLY valid JSON:
224
+ {
225
+ "is_relevant": boolean,
226
+ "relevance_confidence": "high" | "medium" | "low",
227
+ "relevance_reason": "1-2 sentences explaining decision",
228
+ "products_mentioned": [], // ONLY from Sabian list above
229
+ "sabian_mention_context": "primary_focus" | "significant_mention" | "casual_mention" | "comparison_context" | null,
230
+ "author_role": "current_owner" | "past_owner" | "potential_buyer" | "never_owned" | "unknown",
231
+ "competitors_mentioned": [], // Brand names only
232
+ "thread_context_summary": "1-2 sentence summary"
233
+ }
234
+ ```
235
+
236
+ ---
237
+
238
+ ### Agent 3: Sentiment & Intent Analyzer (LLM #2)
239
+
240
+ **File**: `workflow/agents/sentiment_analyzer_agent.py`
241
+
242
+ **Purpose**: Deep analysis on VERIFIED relevant posts with STRUCTURED input.
243
+
244
+ **Key Design Decisions**:
245
+
246
+ 1. **Receives Pre-Validated Input**:
247
+ - Products already extracted and validated
248
+ - Thread context already summarized
249
+ - Author role already determined
250
+
251
+ 2. **Focused Analysis**:
252
+ - Sentiment TOWARDS SABIAN ONLY
253
+ - Intent classification
254
+ - Pain points / Delights (author's own experience only)
255
+ - Purchase journey (author's own journey only)
256
+
257
+ 3. **No Hallucination on Products**:
258
+ - Products are GIVEN in input, not re-extracted
259
+ - Can only discuss attributes of provided products
260
+
261
+ **System Prompt Structure**:
262
+ ```
263
+ You are a sentiment analyst for Sabian cymbal discussions.
264
+
265
+ ## INPUT CONTEXT (Pre-validated, trust these values)
266
+ - Products mentioned: {products_mentioned}
267
+ - Sabian mention context: {sabian_mention_context}
268
+ - Author role: {author_role}
269
+ - Thread summary: {thread_context_summary}
270
+ - Competitors mentioned: {competitors_mentioned}
271
+
272
+ ## YOUR TASK
273
+ Analyze the sentiment, emotions, and intents in this post about Sabian.
274
+
275
+ ## CRITICAL RULES
276
+
277
+ ### Rule 1: Sabian-Specific Sentiment
278
+ - Sentiment MUST be about Sabian, NOT overall post tone
279
+ - Example: "Love my new kit! The SBR cymbals sound terrible."
280
+ - Overall: positive | Sabian sentiment: NEGATIVE
281
+
282
+ ### Rule 2: Author Perspective Only
283
+ These fields are ONLY for author's OWN experience:
284
+ - purchase_stage, decision_drivers, pain_points, delight_factors
285
+ - If author is giving ADVICE to others, these should be null/empty
286
+
287
+ ### Rule 3: Use Only Valid Values
288
+ [List all valid values for each field]
289
+
290
+ ## OUTPUT FORMAT
291
+ {
292
+ "sentiment_level": "...",
293
+ "emotion_type": "..." or null,
294
+ "sentiment_confidence": "high" | "medium" | "low",
295
+ "sarcasm_detected": boolean,
296
+ "product_attributes": [],
297
+ "competitor_products_owned": [],
298
+ "comparison_type": "..." or null,
299
+ "intents": [],
300
+ "purchase_stage": "..." or null,
301
+ "decision_drivers": [],
302
+ "pain_points": [],
303
+ "delight_factors": [],
304
+ "analysis_notes": "1-2 sentences"
305
+ }
306
+ ```
307
+
308
+ ---
309
+
310
+ ### Agent 4: Output Validator (No LLM)
311
+
312
+ **File**: `workflow/agents/output_validator_agent.py`
313
+
314
+ **Purpose**: Final validation and anomaly detection.
315
+
316
+ **Validation Rules**:
317
+
318
+ 1. **List Validation**:
319
+ - All products_mentioned are in Sabian product list
320
+ - All competitors_mentioned are in competitor list
321
+ - All categorical values are from predefined lists
322
+
323
+ 2. **Logical Consistency**:
324
+ - If is_relevant=True, products_mentioned should not be empty (flag if empty)
325
+ - If sabian_mention_context="primary_focus", products_mentioned should have items
326
+ - If sentiment_level="very_negative", pain_points should not be empty (warn)
327
+
328
+ 3. **Anomaly Flagging**:
329
+ - Flag for manual review if inconsistencies detected
330
+ - Add `validation_flags` field to output
331
+
332
+ ---
333
+
334
+ ## Configuration Changes
335
+
336
+ ### brand_config.json Updates
337
+
338
+ ```json
339
+ {
340
+ "brand": {
341
+ "name": "Sabian",
342
+ "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
343
+ "product_aliases": {
344
+ "B8": "B8X",
345
+ "sbrs": "SBR",
346
+ "hhx's": "HHX",
347
+ "aax's": "AAX"
348
+ },
349
+ "competitor_products_warning": [
350
+ "2002", "Signature", "Sound Edge", "Formula 602", "Giant Beat",
351
+ "K Custom", "A Custom", "K Zildjian", "A Zildjian", "S Family",
352
+ "Byzance", "Pure Alloy", "HCS",
353
+ "Bliss", "Contact", "Energy"
354
+ ],
355
+ "competitors": [...]
356
+ },
357
+ "preprocessing": {
358
+ "min_length_for_language_detection": 50,
359
+ "always_process_if_primary_keyword": true
360
+ }
361
+ }
362
+ ```
363
+
364
+ ---
365
+
366
+ ## File Structure
367
+
368
+ ```
369
+ processing_brand_sentiment/
370
+ ├── config_files/
371
+ │ ├── brand_config.json # Updated with aliases, warnings
372
+ │ ├── workflow_config.json # Agent configurations
373
+ │ └── analysis_categories.json # Category definitions (unchanged)
374
+ ├── workflow/
375
+ │ ├── orchestrator.py # Updated workflow graph
376
+ │ └── agents/
377
+ │ ├── base_agent.py # Base class (unchanged)
378
+ │ ├── content_preprocessor_agent.py # Enhanced preprocessing
379
+ │ ├── relevance_extraction_agent.py # NEW: Extraction + relevance
380
+ │ ├── sentiment_analyzer_agent.py # NEW: Focused analysis
381
+ │ └── output_validator_agent.py # NEW: Validation
382
+ ```
383
+
384
+ ---
385
+
386
+ ## Migration Path
387
+
388
+ ### Phase 1: Configuration Updates
389
+ 1. Update brand_config.json with product aliases
390
+ 2. Add competitor product warnings
391
+ 3. Update preprocessing settings
392
+
393
+ ### Phase 2: New Agents
394
+ 1. Create relevance_extraction_agent.py
395
+ 2. Create sentiment_analyzer_agent.py
396
+ 3. Create output_validator_agent.py
397
+ 4. Update content_preprocessor_agent.py
398
+
399
+ ### Phase 3: Orchestrator Update
400
+ 1. Update workflow graph with new flow
401
+ 2. Update state definition
402
+ 3. Add new routing logic
403
+
404
+ ### Phase 4: Testing & Validation
405
+ 1. Run on test batch with known issues
406
+ 2. Compare accuracy metrics
407
+ 3. Fine-tune prompts based on results
408
+
409
+ ---
410
+
411
+ ## Expected Improvements
412
+
413
+ | Issue | Current Behavior | Expected After |
414
+ |-------|------------------|----------------|
415
+ | B8/B8X | Missed | Caught via alias mapping |
416
+ | Paiste products as Sabian | Attributed to Sabian | Correctly identified as competitor |
417
+ | Short text language | Marked as Latin | Processed as English |
418
+ | False positive (pizza) | Marked relevant | Marked not relevant |
419
+ | Long confusing context | Raw text confuses LLM | Summarized 1-2 sentences |
420
+
421
+ ---
422
+
423
+ ## Success Metrics
424
+
425
+ 1. **Relevance Accuracy**: >99% (currently ~90%)
426
+ 2. **Product Attribution Accuracy**: >99% (currently ~85%)
427
+ 3. **Sentiment Accuracy**: >95% (currently unknown)
428
+ 4. **False Positive Rate**: <1%
429
+ 5. **False Negative Rate**: <1%
430
+
431
+ ---
432
+
433
+ ## Questions for Review
434
+
435
+ 1. Should we add a manual review queue for flagged posts?
436
+ 2. Should thread_context_summary be stored in output for debugging?
437
+ 3. Preferred batch size for re-processing existing data?
processing_brand_sentiment/README.md ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Brand Sentiment Analysis Pipeline
2
+
3
+ A modular, scalable system for analyzing forum discussions and social media comments about specific brands using an agentic workflow with LLMs. The initial implementation focuses on **Sabian** (a cymbal manufacturer), but the architecture supports easy addition of new brands through configuration.
4
+
5
+ ## Overview
6
+
7
+ The pipeline fetches data from Snowflake (forum posts and/or social media comments), preprocesses them (parsing HTML for forums or cleaning plain text for comments), detects language, validates brand relevance, performs comprehensive sentiment and intelligence extraction using OpenAI's API, and stores enriched results back to Snowflake.
8
+
9
+ ## Data Sources
10
+
11
+ | Source | Table | Output Table | Description |
12
+ |--------|-------|--------------|-------------|
13
+ | **Forums** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS` | `SABIAN_BRAND_ANALYSIS` | Forum posts with thread context |
14
+ | **Comments** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` | `SABIAN_BRAND_ANALYSIS_COMMENTS` | Social media comments with content context |
15
+
16
+ ## Architecture v4.0
17
+
18
+ The system uses a 4-agent pipeline that separates **fact extraction** from **analysis** for improved accuracy. Both data sources share the same extraction, analysis, and validation agents - only the preprocessor differs.
19
+
20
+ ```
21
+ ┌─────────────────────────────────────────────────────────────────┐
22
+ │ 1a. CONTENT PREPROCESSOR (Forums) │
23
+ │ (No LLM) │
24
+ │ - HTML parsing with quote/reply separation │
25
+ │ - Product alias mapping (B8 → B8X) │
26
+ │ - Smart language detection │
27
+ │ - Keyword-based relevance screening │
28
+ ├─────────────────────────────────────────────────────────────────┤
29
+ │ 1b. COMMENT PREPROCESSOR (Comments) │
30
+ │ (No LLM) │
31
+ │ - Plain text cleaning (no HTML) │
32
+ │ - Product alias mapping (B8 → B8X) │
33
+ │ - Smart language detection │
34
+ │ - Keyword-based relevance screening │
35
+ │ - Context: content title + description + parent comment │
36
+ └─────────────────────────────┬───────────────────────────────────┘
37
+
38
+
39
+ ┌───────────────────────────────┐
40
+ │ Has Sabian-related keywords? │
41
+ └───────────────┬───────────────┘
42
+ │ │
43
+ YES NO
44
+ │ │
45
+ ▼ ▼
46
+ ┌─────────────────────────────────┐ ┌──────────────────┐
47
+ │ 2. RELEVANCE & EXTRACTION │ │ Mark as │
48
+ │ AGENT (LLM #1) │ │ NOT RELEVANT │
49
+ │ [SHARED] │ │ (0 LLM calls) │
50
+ │ - Validates relevance │ └──────────────────┘
51
+ │ - Extracts products (strict) │
52
+ │ - Identifies author role │
53
+ │ - Summarizes context │
54
+ │ - Detects competitors │
55
+ └─────────────────┬───────────────┘
56
+
57
+
58
+ ┌─────────────────┐
59
+ │ IS_RELEVANT? │
60
+ └────────┬────────┘
61
+ │ │
62
+ YES NO
63
+ │ │
64
+ ▼ ▼
65
+ ┌─────────────────────────────────┐ ┌──────────────────┐
66
+ │ 3. SENTIMENT & INTENT │ │ Store with │
67
+ │ ANALYZER (LLM #2) │ │ is_relevant=F │
68
+ │ [SHARED] │ │ (1 LLM call) │
69
+ │ - Sabian-specific sentiment │ └──────────────────┘
70
+ │ - Intent classification │
71
+ │ - Pain points / Delights │
72
+ │ - Purchase journey (author) │
73
+ │ - Competitor products owned │
74
+ └─────────────────┬───────────────┘
75
+
76
+
77
+ ┌─────────────────────────────────┐
78
+ │ 4. OUTPUT VALIDATOR │
79
+ │ (No LLM - Rule-based) │
80
+ │ [SHARED] │
81
+ │ - Validates all values │
82
+ │ - Checks logical consistency │
83
+ │ - Flags anomalies for review │
84
+ └─────────────────────────────────┘
85
+ ```
86
+
87
+ ## Features
88
+
89
+ - **Multi-Source Support**: Process forums, social media comments, or both
90
+ - **4-Agent Pipeline**: Separation of extraction and analysis for improved accuracy
91
+ - **Strict Product Matching**: Only returns products from predefined list, preventing hallucination
92
+ - **Competitor Awareness**: Knows which products belong to competitors
93
+ - **Smart Language Detection**: Skips detection for short texts, always processes if primary keywords found
94
+ - **Product Alias Mapping**: Handles variations (B8 → B8X, "hand hammered" → HH)
95
+ - **Thread/Comment Context**: LLM summarizes context for clarity
96
+ - **Validation & Anomaly Detection**: Rule-based validator catches errors and flags edge cases
97
+ - **Author Perspective Tracking**: Distinguishes author's own experience from advice to others
98
+ - **Platform Tracking**: Records source platform for each processed item
99
+
100
+ ## Project Structure
101
+
102
+ ```
103
+ processing_brand_sentiment/
104
+ ├── config_files/
105
+ │ ├── brand_config.json # Brand products, aliases, competitors, keywords, data sources
106
+ │ ├── workflow_config.json # LLM settings, batch sizes, output config (forums + comments)
107
+ │ └── analysis_categories.json # Sentiment, intent, pain point categories
108
+ ├── database/
109
+ │ ├── __init__.py
110
+ │ ├── snowflake_connection.py # Snowflake connection handler
111
+ │ └── sql/
112
+ │ ├── fetch_forum_posts.sql # Query for forum posts with thread context
113
+ │ ├── fetch_comments.sql # Query for social media comments with content context
114
+ │ ├── create_output_table.sql # Forum output schema with views
115
+ │ ├── init_output_table.sql # Forum table initialization
116
+ │ ├── create_comments_output_table.sql # Comment output schema with views
117
+ │ └── init_comments_output_table.sql # Comment table initialization
118
+ ├── workflow/
119
+ │ ├── __init__.py
120
+ │ ├── orchestrator.py # Forum LangGraph workflow coordinator
121
+ │ ├── comment_orchestrator.py # Comment LangGraph workflow coordinator
122
+ │ └── agents/
123
+ │ ├── __init__.py
124
+ │ ├── base_agent.py # Abstract base class
125
+ │ ├── content_preprocessor_agent.py # Forum: HTML parsing, alias mapping
126
+ │ ├── comment_preprocessor_agent.py # Comments: plain text, comment context
127
+ │ ├── sabian_relevance_extraction_agent.py # Shared: relevance + extraction
128
+ │ ├── sabian_sentiment_analyzer_agent.py # Shared: sentiment analysis
129
+ │ └── output_validator_agent.py # Shared: rule-based validation
130
+ ├── utils/
131
+ │ ├── __init__.py
132
+ │ └── html_parser.py # HTML content extraction (forums only)
133
+ ├── logs/ # Processing logs (auto-created)
134
+ ├── main.py # Main execution script (multi-source)
135
+ ├── .env # Environment variables
136
+ └── README.md # This file
137
+ ```
138
+
139
+ ## Setup
140
+
141
+ ### 1. Install Dependencies
142
+
143
+ ```bash
144
+ pip install langchain-openai langgraph snowflake-snowpark-python python-dotenv pandas beautifulsoup4 lingua-language-detector
145
+ ```
146
+
147
+ ### 2. Configure Environment Variables
148
+
149
+ Ensure `.env` file contains:
150
+
151
+ ```env
152
+ # Snowflake
153
+ SNOWFLAKE_USER=your_user
154
+ SNOWFLAKE_PASSWORD=your_password
155
+ SNOWFLAKE_ACCOUNT=your_account
156
+ SNOWFLAKE_ROLE=your_role
157
+ SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
158
+ SNOWFLAKE_WAREHOUSE=your_warehouse
159
+ SNOWFLAKE_SCHEMA=ML_FEATURES
160
+
161
+ # OpenAI
162
+ OPENAI_API_KEY=your_openai_key
163
+ ```
164
+
165
+ ### 3. Initialize Snowflake Tables
166
+
167
+ Run the initialization scripts before first processing:
168
+
169
+ ```sql
170
+ -- For forums
171
+ database/sql/init_output_table.sql
172
+
173
+ -- For social media comments
174
+ database/sql/init_comments_output_table.sql
175
+ ```
176
+
177
+ ## Usage
178
+
179
+ ### Process All Sources (Default)
180
+
181
+ ```bash
182
+ python main.py
183
+ ```
184
+
185
+ ### Process Forums Only
186
+
187
+ ```bash
188
+ python main.py --data-source forums
189
+ ```
190
+
191
+ ### Process Social Media Comments Only
192
+
193
+ ```bash
194
+ python main.py --data-source comments
195
+ ```
196
+
197
+ ### Process Limited Number
198
+
199
+ ```bash
200
+ python main.py --limit 100
201
+ python main.py --data-source comments --limit 50
202
+ ```
203
+
204
+ ### Sequential Processing (Debug Mode)
205
+
206
+ ```bash
207
+ python main.py --limit 50 --sequential
208
+ ```
209
+
210
+ ### First Run (Overwrite Mode)
211
+
212
+ ```bash
213
+ python main.py --overwrite --limit 100
214
+ ```
215
+
216
+ ### Command-Line Arguments
217
+
218
+ | Argument | Description | Default |
219
+ |----------|-------------|---------|
220
+ | `--limit N` | Process only N items per source | All unprocessed |
221
+ | `--overwrite` | Overwrite existing table | Append mode |
222
+ | `--sequential` | Single-threaded processing | Parallel |
223
+ | `--config-dir PATH` | Custom config directory | config_files/ |
224
+ | `--data-source SOURCE` | Source to process: `forums`, `comments`, `all` | `all` |
225
+
226
+ ## Configuration
227
+
228
+ ### brand_config.json
229
+
230
+ Key sections:
231
+
232
+ ```json
233
+ {
234
+ "brand": {
235
+ "name": "Sabian",
236
+ "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"],
237
+ "product_aliases": {
238
+ "b8": "B8X",
239
+ "hand hammered": "HH"
240
+ },
241
+ "competitor_products_warning": {
242
+ "paiste_products": ["2002", "signature", "sound edge", "formula 602"],
243
+ "zildjian_products": ["k custom", "a custom", "k zildjian"]
244
+ },
245
+ "competitors": [...]
246
+ },
247
+ "data_sources": {
248
+ "forums": {
249
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
250
+ "platform": "musora_forums"
251
+ },
252
+ "comments": {
253
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
254
+ "platform_column": "PLATFORM"
255
+ }
256
+ }
257
+ }
258
+ ```
259
+
260
+ ### analysis_categories.json
261
+
262
+ Defines valid values for all categorical fields:
263
+
264
+ - `author_role`: current_owner, past_owner, potential_buyer, never_owned, unknown
265
+ - `sabian_mention_context`: primary_focus, significant_mention, casual_mention, comparison_context
266
+ - `sentiment_level`: very_negative, negative, neutral, positive, very_positive
267
+ - `intents`: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
268
+ - `feedback_aspects`: sound_quality, price_value, durability, playability, versatility, customer_service, availability, aesthetics
269
+
270
+ ## Output Tables
271
+
272
+ ### Forum Output: `SABIAN_BRAND_ANALYSIS`
273
+
274
+ | Category | Key Columns |
275
+ |----------|-------------|
276
+ | **Identifiers** | POST_ID, THREAD_ID, POST_AUTHOR_ID, PLATFORM |
277
+ | **Content** | ORIGINAL_CONTENT, CLEANED_CONTENT, QUOTED_CONTENT, THREAD_CONTEXT_SUMMARY |
278
+ | **Thread** | THREAD_TITLE, THREAD_FIRST_POST, POST_CREATED_AT, THREAD_STARTED_AT |
279
+ | **Category** | CATEGORY_TITLE, CATEGORY_TOPIC |
280
+
281
+ ### Comment Output: `SABIAN_BRAND_ANALYSIS_COMMENTS`
282
+
283
+ | Category | Key Columns |
284
+ |----------|-------------|
285
+ | **Identifiers** | COMMENT_SK, COMMENT_ID, PLATFORM, AUTHOR_NAME, AUTHOR_ID |
286
+ | **Content** | ORIGINAL_TEXT, COMMENT_TIMESTAMP |
287
+ | **Context** | CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT |
288
+ | **Channel** | CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME |
289
+
290
+ ### Shared Analysis Columns (Both Tables)
291
+
292
+ | Category | Fields | Notes |
293
+ |----------|--------|-------|
294
+ | **Language** | DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH | Language detection |
295
+ | **Relevance** | IS_RELEVANT, RELEVANCE_CONFIDENCE, RELEVANCE_REASON | Brand relevance |
296
+ | **Extraction** | PRODUCTS_MENTIONED, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT | From Agent 1 |
297
+ | **Sentiment** | SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_CONFIDENCE | Sabian-specific |
298
+ | **Intents** | INTENTS (multi-label) | What author is trying to accomplish |
299
+ | **Journey** | PURCHASE_STAGE, DECISION_DRIVERS | Author perspective only |
300
+ | **Feedback** | PAIN_POINTS, DELIGHT_FACTORS | Author's own experience |
301
+ | **Competitive** | COMPETITORS_MENTIONED, COMPETITOR_PRODUCTS_OWNED, COMPARISON_TYPE | Competitive intel |
302
+ | **Validation** | VALIDATION_FLAGS, PROCESSING_STATUS | Anomaly detection |
303
+
304
+ ### Processing Status Values
305
+
306
+ | Status | Description |
307
+ |--------|-------------|
308
+ | `completed` | Successfully processed, no issues |
309
+ | `completed_with_flags` | Processed but has anomalies to review |
310
+ | `validation_failed` | Validation errors detected |
311
+ | `workflow_error` | Unexpected error during processing |
312
+
313
+ ### Available Views
314
+
315
+ #### Forum Views
316
+
317
+ | View | Description |
318
+ |------|-------------|
319
+ | `VW_SABIAN_RELEVANT_ANALYSIS` | Only relevant, successfully processed posts |
320
+ | `VW_SABIAN_FLAGGED_POSTS` | Posts with validation flags for review |
321
+ | `VW_SABIAN_SENTIMENT_DISTRIBUTION` | Sentiment breakdown statistics |
322
+ | `VW_SABIAN_PRODUCT_MENTIONS` | Product mention summary |
323
+ | `VW_SABIAN_COMPETITOR_ANALYSIS` | Competitor comparison analysis |
324
+ | `VW_SABIAN_PAIN_POINTS` | Pain point frequency analysis |
325
+ | `VW_SABIAN_AUTHOR_ROLES` | Author role distribution |
326
+ | `VW_SABIAN_COMPETITOR_OWNERSHIP` | Competitor brands owned by authors |
327
+ | `VW_SABIAN_VALIDATION_SUMMARY` | Processing status breakdown |
328
+
329
+ #### Comment Views
330
+
331
+ | View | Description |
332
+ |------|-------------|
333
+ | `VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS` | Relevant, successful comments |
334
+ | `VW_SABIAN_COMMENTS_FLAGGED` | Comments with validation flags |
335
+ | `VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION` | Sentiment by platform |
336
+ | `VW_SABIAN_COMMENTS_PRODUCT_MENTIONS` | Product mentions by platform |
337
+ | `VW_SABIAN_COMMENTS_VALIDATION_SUMMARY` | Processing status by platform |
338
+
339
+ ## API Call Efficiency
340
+
341
+ | Scenario | LLM Calls | Notes |
342
+ |----------|-----------|-------|
343
+ | No keywords found | 0 | Early exit in preprocessor |
344
+ | Primary keywords, relevant | 2 | Extraction + Analysis |
345
+ | Primary keywords, not relevant | 1 | Only Extraction |
346
+ | Non-English content | 0 | Skipped |
347
+
348
+ ## Key Design Decisions
349
+
350
+ ### Why Separate Forum and Comment Preprocessors?
351
+
352
+ 1. **Different input formats**: Forums use HTML (quotes, blockquotes), comments are plain text
353
+ 2. **Different context sources**: Forums have thread title + first post + category; comments have content title + description + parent comment
354
+ 3. **Shared analysis**: Both feed into the same extraction and analysis agents
355
+
356
+ ### Why Separate Output Tables?
357
+
358
+ 1. **Different identifiers**: Forums use POST_ID/THREAD_ID; comments use COMMENT_SK/COMMENT_ID/PLATFORM
359
+ 2. **Different metadata**: Forums have thread context; comments have content/channel metadata
360
+ 3. **Clean separation**: Avoids NULL columns and schema confusion
361
+ 4. **Shared analysis columns**: All extracted intelligence fields are identical
362
+
363
+ ### Why Platform Column for Forums?
364
+
365
+ The `PLATFORM` column was added to `SABIAN_BRAND_ANALYSIS` (defaulting to `musora_forums`) to enable cross-source analysis and maintain consistency with the comments table which uses the dynamic platform value from the source data.
366
+
367
+ ## Troubleshooting
368
+
369
+ ### "Table does not exist" on First Run
370
+
371
+ Run the appropriate init SQL in Snowflake first:
372
+ - Forums: `database/sql/init_output_table.sql`
373
+ - Comments: `database/sql/init_comments_output_table.sql`
374
+
375
+ ### No Comments Being Processed
376
+
377
+ Check that `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` table exists and contains data. The query joins with `DIM_CONTENT` and `DIM_CHANNEL` - verify these dimension tables have matching records.
378
+
379
+ ### Competitor Products Attributed to Sabian
380
+
381
+ Check `brand_config.json` for `competitor_products_warning` section. Add any missing competitor products.
382
+
383
+ ### API Rate Limits
384
+
385
+ Use `--sequential` mode or reduce `--limit`:
386
+ ```bash
387
+ python main.py --sequential --limit 50
388
+ ```
389
+
390
+ ## Schema Version History
391
+
392
+ | Version | Changes |
393
+ |---------|---------|
394
+ | 1.0 | Initial release |
395
+ | 2.0 | Added author_role, post_type, sabian_mention_context |
396
+ | 3.0 | Removed post_type (merged into intents), unified feedback_aspects |
397
+ | 4.0 | 4-agent pipeline, thread_context_summary, validation flags, product aliases |
398
+ | 4.0+ | Added social media comments support, PLATFORM column, separate comment output table |
399
+
400
+ ## License
401
+
402
+ Internal use only - Brand sentiment analysis project.
processing_brand_sentiment/config_files/analysis_categories.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "author_role": {
3
+ "description": "Author's relationship to Sabian products",
4
+ "categories": [
5
+ {"value": "current_owner", "description": "Currently owns/uses Sabian"},
6
+ {"value": "past_owner", "description": "Previously owned, sold/replaced"},
7
+ {"value": "potential_buyer", "description": "Considering purchasing Sabian"},
8
+ {"value": "never_owned", "description": "Explicitly doesn't own Sabian"},
9
+ {"value": "unknown", "description": "Cannot determine from post"}
10
+ ]
11
+ },
12
+ "sabian_mention_context": {
13
+ "description": "How prominently Sabian is discussed",
14
+ "categories": [
15
+ {"value": "primary_focus", "description": "Sabian is the main topic"},
16
+ {"value": "significant_mention", "description": "Discussed with detail, not main focus"},
17
+ {"value": "casual_mention", "description": "Brief mention among other topics"},
18
+ {"value": "comparison_context", "description": "Mentioned while comparing to competitors"}
19
+ ]
20
+ },
21
+ "sentiment": {
22
+ "brand_specific": true,
23
+ "description": "Sentiment TOWARDS SABIAN ONLY (not overall post tone)",
24
+ "levels": [
25
+ {"value": "very_negative", "description": "Strong criticism, anger, severe disappointment"},
26
+ {"value": "negative", "description": "Complaints, dissatisfaction, mild criticism"},
27
+ {"value": "neutral", "description": "Factual mention, balanced, no clear sentiment"},
28
+ {"value": "positive", "description": "Satisfaction, appreciation, mild praise"},
29
+ {"value": "very_positive", "description": "Enthusiasm, strong praise, highly recommend"}
30
+ ]
31
+ },
32
+ "emotions": {
33
+ "brand_specific": true,
34
+ "description": "Emotion towards SABIAN specifically",
35
+ "categories": [
36
+ {"value": "frustration", "description": "Annoyance with product issues"},
37
+ {"value": "disappointment", "description": "Unmet expectations"},
38
+ {"value": "anger", "description": "Strong negative emotion"},
39
+ {"value": "satisfaction", "description": "Expectations met, content"},
40
+ {"value": "excitement", "description": "Eagerness, anticipation"},
41
+ {"value": "curiosity", "description": "Interest, wanting to know more"},
42
+ {"value": "indifference", "description": "No strong feelings"}
43
+ ]
44
+ },
45
+ "intents": {
46
+ "multi_label": true,
47
+ "description": "What the author is trying to accomplish (can select multiple)",
48
+ "categories": [
49
+ {"value": "seeking_information", "description": "Asking questions, seeking advice/recommendations"},
50
+ {"value": "providing_information", "description": "Answering questions, giving advice, helping others"},
51
+ {"value": "sharing_experience", "description": "Personal experience, review, testimonial, purchase announcement"},
52
+ {"value": "comparing", "description": "Comparing brands/products against each other"},
53
+ {"value": "praising", "description": "Actively endorsing, recommending, advocating for Sabian"},
54
+ {"value": "criticizing", "description": "Actively complaining, warning others, reporting issues"},
55
+ {"value": "buying_selling", "description": "Listing gear for sale, looking to buy/trade"},
56
+ {"value": "general_discussion", "description": "General conversation not fitting above"}
57
+ ]
58
+ },
59
+ "purchase_stage": {
60
+ "author_perspective_only": true,
61
+ "description": "Author's own purchase journey stage (null if giving advice to others)",
62
+ "categories": [
63
+ {"value": "researching", "description": "Gathering info before buying"},
64
+ {"value": "deciding", "description": "Actively comparing, about to decide"},
65
+ {"value": "recently_purchased", "description": "Just bought the product"},
66
+ {"value": "long_term_owner", "description": "Owned for extended period"},
67
+ {"value": "selling_replacing", "description": "Selling or replacing gear"}
68
+ ]
69
+ },
70
+ "comparison_type": {
71
+ "description": "Type of competitive comparison (if comparing)",
72
+ "categories": [
73
+ {"value": "direct_comparison", "description": "Side-by-side evaluation"},
74
+ {"value": "preference_statement", "description": "Stating brand preference"},
75
+ {"value": "switching_to_sabian", "description": "Moving or Moved from competitor to Sabian"},
76
+ {"value": "switching_from_sabian", "description": "Moving or Moved from Sabian to competitor"}
77
+ ]
78
+ },
79
+ "feedback_aspects": {
80
+ "description": "Product/brand aspects discussed. Used for BOTH pain_points (negative) and delight_factors (positive)",
81
+ "categories": [
82
+ {"value": "sound_quality", "description": "Sound, tone, character, audio qualities"},
83
+ {"value": "price_value", "description": "Cost, value for money, deals"},
84
+ {"value": "durability", "description": "Build quality, longevity, cracking/wear"},
85
+ {"value": "playability", "description": "Feel, response, ease of playing"},
86
+ {"value": "versatility", "description": "Range of genres/applications, flexibility"},
87
+ {"value": "customer_service", "description": "Support, warranty, brand interaction"},
88
+ {"value": "availability", "description": "Stock, ease of finding/purchasing"},
89
+ {"value": "aesthetics", "description": "Appearance, finish, visual appeal"}
90
+ ]
91
+ },
92
+ "decision_drivers": {
93
+ "author_perspective_only": true,
94
+ "description": "What influenced AUTHOR's own purchase decision (empty if giving advice)",
95
+ "categories": [
96
+ {"value": "sound_quality", "description": "Sound characteristics"},
97
+ {"value": "price", "description": "Cost/budget considerations"},
98
+ {"value": "durability", "description": "Build quality, longevity"},
99
+ {"value": "artist_endorsement", "description": "Influenced by endorsed artists"},
100
+ {"value": "peer_recommendation", "description": "Friends/community recommended"},
101
+ {"value": "hands_on_testing", "description": "Tried before buying"},
102
+ {"value": "brand_loyalty", "description": "Previous positive experience"},
103
+ {"value": "versatility", "description": "Multi-genre/application use"},
104
+ {"value": "online_reviews", "description": "Read reviews that influenced"}
105
+ ]
106
+ },
107
+ "product_attributes": {
108
+ "description": "Attributes being discussed about Sabian products",
109
+ "categories": [
110
+ {"value": "sound_quality", "description": "Tone, character, audio qualities"},
111
+ {"value": "durability", "description": "Build quality, longevity"},
112
+ {"value": "price", "description": "Cost and value"},
113
+ {"value": "playability", "description": "Feel, response"},
114
+ {"value": "aesthetics", "description": "Appearance, finish"},
115
+ {"value": "volume", "description": "Loudness, projection"},
116
+ {"value": "sustain", "description": "How long sound lasts"},
117
+ {"value": "versatility", "description": "Range of applications"}
118
+ ]
119
+ },
120
+ "analysis_notes_guidelines": {
121
+ "description": "Keep to 1-2 sentences. Focus on Sabian-specific insights not captured by other fields."
122
+ }
123
+ }
processing_brand_sentiment/config_files/brand_config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "brand": {
3
+ "name": "Sabian",
4
+ "description": "Sabian is a Canadian manufacturer of cymbals founded in 1981",
5
+ "products": [
6
+ "HHX",
7
+ "AAX",
8
+ "Artisan",
9
+ "FRX",
10
+ "Omni",
11
+ "Chopper",
12
+ "Stratus",
13
+ "XSR",
14
+ "B8X",
15
+ "SBR"
16
+ ],
17
+ "product_aliases": {
18
+ "b8": "B8X",
19
+ "sbrs": "SBR",
20
+ "hhxs": "HHX",
21
+ "aaxs": "AAX",
22
+ "hhx's": "HHX",
23
+ "aax's": "AAX"
24
+ },
25
+ "product_descriptions": {
26
+ "HHX": "Hand Hammered Xtreme - Professional series with dark, complex tones",
27
+ "AAX": "Bright, cutting cymbals for modern music",
28
+ "Artisan": "Premium hand-crafted cymbals with unique character",
29
+ "FRX": "Frequency Reduced Xtreme - Lower volume cymbals",
30
+ "Omni": "Multi-purpose cymbals for various playing styles",
31
+ "Chopper": "Effect cymbals with unique sound",
32
+ "Stratus": "Dark, complex sounds for jazz and fusion",
33
+ "XSR": "Entry-level professional cymbals",
34
+ "B8X": "Bronze entry-level cymbals",
35
+ "SBR": "Entry-level brass cymbals"
36
+ },
37
+ "competitor_products_warning": {
38
+ "description": "Products that belong to competitors - DO NOT attribute to Sabian",
39
+ "paiste_products": ["2002", "signature", "sound edge", "formula 602", "giant beat", "pst", "rude", "masters", "traditionals", "twenty", "dark energy"],
40
+ "zildjian_products": ["k custom", "a custom", "k zildjian", "a zildjian", "s family", "i family", "l80", "kerope", "constantinople", "k sweet"],
41
+ "meinl_products": ["byzance", "pure alloy", "hcs", "classics custom", "mb20", "mb10", "soundcaster"],
42
+ "dream_products": ["bliss", "contact", "energy", "dark matter", "vintage bliss", "eclipse"],
43
+ "istanbul_products": ["agop", "xist", "traditional", "sultan", "mehmet"]
44
+ },
45
+ "competitors": [
46
+ {
47
+ "name": "Zildjian",
48
+ "aliases": ["zildjian", "zil", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"]
49
+ },
50
+ {
51
+ "name": "Meinl",
52
+ "aliases": ["meinl", "byzance", "classics"]
53
+ },
54
+ {
55
+ "name": "Paiste",
56
+ "aliases": ["paiste", "2002", "signature", "formula 602", "sound edge"]
57
+ },
58
+ {
59
+ "name": "Dream Cymbals",
60
+ "aliases": ["dream", "dream cymbals", "bliss"]
61
+ },
62
+ {
63
+ "name": "Istanbul Agop",
64
+ "aliases": ["istanbul", "agop", "istanbul agop", "istanbul mehmet"]
65
+ },
66
+ {
67
+ "name": "Bosphorus",
68
+ "aliases": ["bosphorus"]
69
+ }
70
+ ]
71
+ },
72
+ "relevance_keywords": {
73
+ "primary": {
74
+ "description": "Keywords that definitively indicate Sabian content",
75
+ "keywords": ["sabian", "hhx", "aax", "artisan", "frx", "omni", "chopper", "stratus", "xsr", "b8x", "sbr"]
76
+ },
77
+ "contextual": {
78
+ "description": "Ambiguous keywords that need context verification",
79
+ "keywords": ["b8"]
80
+ },
81
+ "cymbal_context": {
82
+ "description": "Keywords that provide cymbal-related context for disambiguation",
83
+ "keywords": ["cymbal", "cymbals", "crash", "ride", "hi-hat", "hihat", "hi hat", "splash", "china", "bell", "stack", "effects"]
84
+ }
85
+ },
86
+ "preprocessing": {
87
+ "min_length_for_language_detection": 50,
88
+ "default_language_for_short_text": "English",
89
+ "always_process_if_primary_keyword": true,
90
+ "min_content_length": 3
91
+ },
92
+ "filter_conditions": {
93
+ "exclude_access_levels": ["team", "house-coach"],
94
+ "exclude_post_states": ["deleted", "spam"],
95
+ "require_content_length_min": 3
96
+ },
97
+ "data_sources": {
98
+ "forums": {
99
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS",
100
+ "description": "Forum posts mentioning Sabian and their products",
101
+ "sql_query_file": "database/sql/fetch_forum_posts.sql",
102
+ "platform": "musora_forums"
103
+ },
104
+ "comments": {
105
+ "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS",
106
+ "description": "Social media comments potentially related to Sabian brand",
107
+ "sql_query_file": "database/sql/fetch_comments.sql",
108
+ "platform_column": "PLATFORM"
109
+ }
110
+ }
111
+ }
processing_brand_sentiment/config_files/workflow_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "llm": {
3
+ "default_model": "gpt-5-nano",
4
+ "default_temperature": 0.2,
5
+ "max_retries": 3,
6
+ "timeout": 60
7
+ },
8
+ "agents": {
9
+ "preprocessor": {
10
+ "name": "PreprocessorAgent",
11
+ "description": "Deterministic agent for HTML parsing, text cleaning, language detection",
12
+ "model": "gpt-5-nano",
13
+ "temperature": 0.0,
14
+ "uses_llm": false
15
+ },
16
+ "relevance_validator": {
17
+ "name": "RelevanceValidatorAgent",
18
+ "description": "Lightweight LLM for disambiguation of ambiguous terms (HH, AA)",
19
+ "model": "gpt-5-nano",
20
+ "temperature": 0.0,
21
+ "max_retries": 2
22
+ },
23
+ "brand_analyzer": {
24
+ "name": "SabianAnalyzerAgent",
25
+ "description": "Comprehensive brand analysis for Sabian products",
26
+ "model": "gpt-5-nano",
27
+ "temperature": 0.2,
28
+ "max_retries": 3
29
+ }
30
+ },
31
+ "workflow": {
32
+ "parallel_processing": {
33
+ "enabled": true,
34
+ "worker_calculation": "CPU count - 2, max 5 workers",
35
+ "max_workers": 5,
36
+ "min_batch_size": 20,
37
+ "max_batch_size": 500
38
+ },
39
+ "thread_context": {
40
+ "enabled": true,
41
+ "include_thread_title": true,
42
+ "include_first_post": true
43
+ }
44
+ },
45
+ "output": {
46
+ "table_name": "SABIAN_BRAND_ANALYSIS",
47
+ "database": "SOCIAL_MEDIA_DB",
48
+ "schema": "ML_FEATURES"
49
+ },
50
+ "comments_output": {
51
+ "table_name": "SABIAN_BRAND_ANALYSIS_COMMENTS",
52
+ "database": "SOCIAL_MEDIA_DB",
53
+ "schema": "ML_FEATURES"
54
+ },
55
+ "logging": {
56
+ "level": "INFO",
57
+ "log_directory": "logs",
58
+ "log_file_prefix": "brand_sentiment_processing"
59
+ }
60
+ }
processing_brand_sentiment/database/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Database module for brand sentiment analysis.
3
+ Contains Snowflake connection handler and SQL query utilities.
4
+ """
5
+
6
+ from .snowflake_connection import SnowFlakeConn
7
+
8
+ __all__ = ['SnowFlakeConn']
processing_brand_sentiment/database/snowflake_connection.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Snowflake connection handler for brand sentiment analysis.
3
+ Provides methods for reading data, executing queries, and storing results.
4
+ """
5
+
6
+ import os
7
+ from snowflake.snowpark import Session
8
+ from dotenv import load_dotenv
9
+ import logging
10
+ import pandas as pd
11
+ from typing import Optional, List, Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Load environment variables
16
+ load_dotenv()
17
+
18
+
19
class SnowFlakeConn:
    """
    Handles Snowflake database connections and operations for brand sentiment analysis.

    Wraps a Snowpark ``Session`` and provides helpers for:
      * reading query results into pandas DataFrames,
      * writing DataFrames back to Snowflake tables,
      * executing ad-hoc SQL statements and SQL files,
      * fetching forum posts / social media comments from query files.
    """

    def __init__(self):
        """Initialize and open a Snowflake session from environment credentials."""
        self.session = self.connect_to_snowflake()

    def connect_to_snowflake(self) -> Session:
        """
        Create a connection to Snowflake using environment variables.

        Returns:
            Snowflake Session object

        Raises:
            Exception: propagated from the Snowpark session builder when
                credentials are missing or invalid.
        """
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        logger.info("Successfully connected to Snowflake")
        return session

    def get_credential(self, key: str) -> Optional[str]:
        """
        Get credential from environment variables.

        Args:
            key: Environment variable name

        Returns:
            Credential value, or None if the variable is not set
        """
        value = os.getenv(key)
        if value is None:
            # Surface missing configuration early: without this, the absence of
            # a credential only shows up later as an opaque connection error
            # raised by the Snowpark session builder.
            logger.warning(f"Environment variable {key} is not set")
        return value

    def run_read_query(self, query: str, description: str = "data") -> pd.DataFrame:
        """
        Execute a SQL query that fetches data.

        Args:
            query: SQL query string
            description: Description of what data is being fetched

        Returns:
            Pandas DataFrame containing query results (column names lowercased)
        """
        try:
            dataframe = self.session.sql(query).to_pandas()
            # Snowflake returns upper-case identifiers; downstream code expects
            # lower-case column names.
            dataframe.columns = dataframe.columns.str.lower()
            logger.info(f"Successfully read {len(dataframe)} rows for {description}")
            return dataframe
        except Exception as e:
            logger.error(f"Error reading {description}: {e}")
            raise

    def store_df_to_snowflake(
        self,
        table_name: str,
        dataframe: pd.DataFrame,
        database: str = "SOCIAL_MEDIA_DB",
        schema: str = "ML_FEATURES",
        overwrite: bool = False
    ) -> None:
        """
        Store a DataFrame to Snowflake.

        Args:
            table_name: Target table name
            dataframe: DataFrame to store
            database: Target database
            schema: Target schema
            overwrite: If True, overwrite existing data; if False, append
        """
        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            # Reset the index and upper-case the columns so write_pandas maps
            # them onto Snowflake's upper-case identifiers without quoting issues.
            dataframe = dataframe.reset_index(drop=True)
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(
                df=dataframe,
                table_name=table_name.strip().upper(),
                auto_create_table=True,
                overwrite=overwrite,
                use_logical_type=True
            )
            logger.info(f"Successfully stored {len(dataframe)} rows to {table_name}")

        except Exception as e:
            logger.error(f"Error storing data to {table_name}: {e}")
            raise

    def execute_sql_file(self, file_path: str) -> Optional[List[Any]]:
        """
        Execute SQL queries from a file.

        NOTE(review): ``session.sql`` executes a single statement; a file
        containing multiple ';'-separated statements will likely fail —
        confirm callers only pass single-statement files.

        Args:
            file_path: Path to SQL file

        Returns:
            Query result rows, or None if execution failed
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            logger.info(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            # Best-effort by design: callers treat None as "nothing executed".
            logger.error(f"Error executing SQL file {file_path}: {e}")
            return None

    def execute_query(self, query: str, description: str = "query") -> Optional[List[Any]]:
        """
        Execute a SQL query and return results.

        Args:
            query: SQL query string
            description: Description of the query for logging

        Returns:
            Query result rows, or None if execution failed
        """
        try:
            result = self.session.sql(query).collect()
            logger.info(f"Successfully executed {description}")
            return result
        except Exception as e:
            logger.error(f"Error executing {description}: {e}")
            return None

    def _load_sql_file(self, sql_file_path: str, limit: Optional[int] = None) -> str:
        """
        Read a SQL query from a file, optionally appending a LIMIT clause.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional row limit to append to the query

        Returns:
            The query text, with a trailing LIMIT clause when requested
        """
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            query = f.read()

        if limit:
            # Strip whitespace first, then the semicolon, so the clause is
            # appended correctly even with Windows line endings.
            query = query.strip().rstrip(';') + f"\nLIMIT {limit};"
        return query

    def _fetch_from_sql_file(
        self,
        sql_file_path: str,
        limit: Optional[int],
        description: str,
        required_cols: List[str]
    ) -> pd.DataFrame:
        """
        Shared fetch path: load query file, apply limit, read, validate columns.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of rows to fetch
            description: Human-readable description for logging
            required_cols: Lower-case column names expected in the result

        Returns:
            DataFrame containing the query results
        """
        query = self._load_sql_file(sql_file_path, limit)
        df = self.run_read_query(query, description)

        # Warn (don't fail) on schema drift so a partial result is still usable.
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.warning(f"Missing expected columns: {missing_cols}")

        return df

    def fetch_forum_posts_with_context(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch forum posts with thread context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of posts to fetch

        Returns:
            DataFrame containing forum posts with context
        """
        try:
            return self._fetch_from_sql_file(
                sql_file_path,
                limit,
                description="forum posts with context",
                required_cols=['post_id', 'post_content', 'thread_id'],
            )
        except Exception as e:
            logger.error(f"Error fetching forum posts: {e}")
            raise

    def fetch_comments(
        self,
        sql_file_path: str,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fetch social media comments with context from SQL file.

        Args:
            sql_file_path: Path to the SQL query file
            limit: Optional limit on number of comments to fetch

        Returns:
            DataFrame containing comments with context
        """
        try:
            return self._fetch_from_sql_file(
                sql_file_path,
                limit,
                description="social media comments with context",
                required_cols=['comment_sk', 'comment_id', 'comment_text', 'platform'],
            )
        except Exception as e:
            logger.error(f"Error fetching comments: {e}")
            raise

    def close_connection(self) -> None:
        """Close the Snowflake session, logging (not raising) any error."""
        try:
            self.session.close()
            logger.info("Snowflake connection closed")
        except Exception as e:
            logger.error(f"Error closing connection: {e}")
processing_brand_sentiment/database/sql/create_comments_output_table.sql ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Create the output table for Sabian brand sentiment analysis on social media comments
-- Stores processed comments with extracted brand intelligence
-- Schema Version 4.0: Same analysis fields as forum table, different source identifiers

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';

-- FIX: Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes are available only on hybrid tables), so the CREATE INDEX statements
-- that previously followed here would fail at runtime.  Snowflake prunes
-- queries automatically via micro-partition metadata; if pruning becomes
-- insufficient at scale, define a clustering key instead, for example:
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
--       CLUSTER BY (PLATFORM, TO_DATE(PROCESSED_AT));

-- Create view for relevant comments only
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;

-- Create view for comments needing review (flagged by validator)
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_FLAGGED AS
SELECT
    COMMENT_SK,
    COMMENT_ID,
    PLATFORM,
    ORIGINAL_TEXT,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE PROCESSING_STATUS = 'completed_with_flags'
   OR VALIDATION_PASSED = FALSE
ORDER BY PROCESSED_AT DESC;

-- Create view for sentiment distribution
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION AS
SELECT
    PLATFORM,
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY PLATFORM, SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY COMMENT_COUNT DESC;

-- Create view for product mentions summary
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_PRODUCT_MENTIONS AS
SELECT
    PLATFORM,
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY PLATFORM, TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;

-- Create view for validation summary
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_VALIDATION_SUMMARY AS
SELECT
    PLATFORM,
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS COMMENT_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = TRUE THEN 1 END) AS RELEVANT_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = FALSE THEN 1 END) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
GROUP BY PLATFORM, PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY COMMENT_COUNT DESC;
processing_brand_sentiment/database/sql/create_output_table.sql ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Create the output table for Sabian brand sentiment analysis
-- Stores processed forum posts with extracted brand intelligence
-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    -- NOTE: Snowflake treats PRIMARY KEY as informational only (not enforced);
    -- dedup must be handled by the load process.
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context for analysis

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Category information
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results (NEW v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags (e.g., "sarcasm_detected", "low_confidence_relevant")

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: 4-agent pipeline with extraction/analysis separation, thread context summarization, and validation.';

-- FIX: Snowflake does not support CREATE INDEX on standard tables (secondary
-- indexes are available only on hybrid tables), so the CREATE INDEX statements
-- that previously followed here would fail at runtime.  Snowflake prunes
-- queries automatically via micro-partition metadata; if pruning becomes
-- insufficient at scale, define a clustering key instead, for example:
--   ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
--       CLUSTER BY (THREAD_ID, TO_DATE(PROCESSED_AT));

-- Create view for relevant posts only
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;

-- Create view for posts needing review (flagged by validator)
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_FLAGGED_POSTS AS
SELECT
    POST_ID,
    THREAD_ID,
    CLEANED_CONTENT,
    THREAD_CONTEXT_SUMMARY,
    IS_RELEVANT,
    RELEVANCE_CONFIDENCE,
    RELEVANCE_REASON,
    PRODUCTS_MENTIONED,
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    VALIDATION_FLAGS,
    VALIDATION_WARNINGS,
    PROCESSING_STATUS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE PROCESSING_STATUS = 'completed_with_flags'
   OR VALIDATION_PASSED = FALSE
ORDER BY PROCESSED_AT DESC;

-- Create view for sentiment distribution
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_SENTIMENT_DISTRIBUTION AS
SELECT
    SENTIMENT_LEVEL,
    EMOTION_TYPE,
    SENTIMENT_TARGET,
    COUNT(*) AS POST_COUNT,
    COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET
ORDER BY POST_COUNT DESC;

-- Create view for product mentions summary
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PRODUCT_MENTIONS AS
SELECT
    TRIM(product.VALUE::STRING) AS PRODUCT,
    SENTIMENT_LEVEL,
    COUNT(*) AS MENTION_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PRODUCTS_MENTIONED IS NOT NULL
GROUP BY TRIM(product.VALUE::STRING), SENTIMENT_LEVEL
ORDER BY MENTION_COUNT DESC;

-- Create view for competitor analysis
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_ANALYSIS AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR,
    COMPARISON_TYPE,
    BRAND_SWITCHING,
    COUNT(*) AS MENTION_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_SENTIMENT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_SENTIMENT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITORS_MENTIONED)) AS competitor
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITORS_MENTIONED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), COMPARISON_TYPE, BRAND_SWITCHING
ORDER BY MENTION_COUNT DESC;

-- Create view for pain points analysis
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PAIN_POINTS AS
SELECT
    TRIM(pain_point.VALUE::STRING) AS PAIN_POINT,
    COUNT(*) AS OCCURRENCE_COUNT,
    ARRAY_AGG(DISTINCT SENTIMENT_LEVEL) AS SENTIMENT_LEVELS
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PAIN_POINTS)) AS pain_point
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND PAIN_POINTS IS NOT NULL
GROUP BY TRIM(pain_point.VALUE::STRING)
ORDER BY OCCURRENCE_COUNT DESC;

-- Create view for author role analysis
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_AUTHOR_ROLES AS
SELECT
    AUTHOR_ROLE,
    SABIAN_MENTION_CONTEXT,
    COUNT(*) AS POST_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL = 'neutral' THEN 1 END) AS NEUTRAL_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY AUTHOR_ROLE, SABIAN_MENTION_CONTEXT
ORDER BY POST_COUNT DESC;

-- Create view for competitor ownership analysis
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_OWNERSHIP AS
SELECT
    TRIM(competitor.VALUE::STRING) AS COMPETITOR_OWNED,
    AUTHOR_ROLE,
    COUNT(*) AS AUTHOR_COUNT,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_TOWARD_SABIAN,
    COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_TOWARD_SABIAN
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS,
     LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITOR_PRODUCTS_OWNED)) AS competitor
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
  AND COMPETITOR_PRODUCTS_OWNED IS NOT NULL
GROUP BY TRIM(competitor.VALUE::STRING), AUTHOR_ROLE
ORDER BY AUTHOR_COUNT DESC;

-- Create view for mention context by sentiment
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_MENTION_DEPTH AS
SELECT
    SABIAN_MENTION_CONTEXT,
    SENTIMENT_LEVEL,
    COUNT(*) AS POST_COUNT,
    AVG(CASE
            WHEN SENTIMENT_LEVEL = 'very_positive' THEN 2
            WHEN SENTIMENT_LEVEL = 'positive' THEN 1
            WHEN SENTIMENT_LEVEL = 'neutral' THEN 0
            WHEN SENTIMENT_LEVEL = 'negative' THEN -1
            WHEN SENTIMENT_LEVEL = 'very_negative' THEN -2
            ELSE 0
        END) AS AVG_SENTIMENT_SCORE
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE
GROUP BY SABIAN_MENTION_CONTEXT, SENTIMENT_LEVEL
ORDER BY SABIAN_MENTION_CONTEXT, POST_COUNT DESC;

-- Create view for validation flags analysis (NEW v4.0)
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_VALIDATION_SUMMARY AS
SELECT
    PROCESSING_STATUS,
    VALIDATION_PASSED,
    COUNT(*) AS POST_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = TRUE THEN 1 END) AS RELEVANT_COUNT,
    COUNT(CASE WHEN IS_RELEVANT = FALSE THEN 1 END) AS NOT_RELEVANT_COUNT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
GROUP BY PROCESSING_STATUS, VALIDATION_PASSED
ORDER BY POST_COUNT DESC;
processing_brand_sentiment/database/sql/fetch_comments.sql ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Query to fetch social media comments with context for brand sentiment analysis
-- Source: SOCIAL_MEDIA_DB.brand_sentiment.SABIAN_comments (same structure as CORE.FACT_COMMENTS)
-- Includes: comment content, parent comment text, content metadata, channel info
-- Excludes: official accounts, already-processed comments, empty comments

SELECT
    -- Comment identifiers
    fc.COMMENT_SK,
    fc.COMMENT_ID,
    fc.PLATFORM,
    fc.MESSAGE AS COMMENT_TEXT,
    fc.CREATED_TIME AS COMMENT_TIMESTAMP,
    fc.AUTHOR_NAME,
    fc.AUTHOR_ID,
    fc.LIKE_COUNT,
    fc.PARENT_COMMENT_ID,
    fc.REPLIES_COUNT,
    fc.COMMENT_LENGTH,
    fc.IS_ACTIVE AS COMMENT_IS_ACTIVE,

    -- Parent comment information (self-join to get parent comment text)
    parent_fc.MESSAGE AS PARENT_COMMENT_TEXT,

    -- Content information
    dc.CONTENT_SK,
    dc.CONTENT_ID,
    dc.CONTENT_TYPE,
    dc.MESSAGE AS CONTENT_DESCRIPTION,
    dc.TITLE AS CONTENT_TITLE,
    dc.PERMALINK_URL,
    dc.CREATED_TIME AS CONTENT_TIMESTAMP,

    -- Channel information
    dch.CHANNEL_SK,
    dch.CHANNEL_NAME,
    dch.CHANNEL_DISPLAY_NAME

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS fc

-- Left join to get parent comment text if it exists
-- (LEFT so top-level comments with no parent are still returned; PLATFORM is
-- part of the join key, presumably because COMMENT_IDs repeat across
-- platforms — confirm)
LEFT JOIN
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS parent_fc
    ON fc.PARENT_COMMENT_ID = parent_fc.COMMENT_ID
    AND fc.PLATFORM = parent_fc.PLATFORM

-- INNER joins: comments without a matching content/channel row are dropped
INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CONTENT dc
    ON fc.CONTENT_SK = dc.CONTENT_SK

INNER JOIN
    SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL dch
    ON dc.CHANNEL_NAME = dch.CHANNEL_NAME
    AND dc.PLATFORM = dch.PLATFORM

-- Left join with output table to exclude already-processed comments
-- (anti-join pattern: the WHERE clause keeps only rows with no match)
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS sba
    ON fc.COMMENT_SK = sba.COMMENT_SK

WHERE
    -- Active records only
    fc.IS_ACTIVE = TRUE
    AND dc.IS_ACTIVE = TRUE
    AND dch.IS_ACTIVE = TRUE

    -- Exclude official accounts
    -- NOTE(review): the list holds Musora/Drumeo/Pianote handles, not Sabian
    -- accounts — confirm this is the intended official-account list for this
    -- Sabian pipeline.
    AND (fc.AUTHOR_NAME IS NULL OR fc.AUTHOR_NAME NOT IN (
        'Musora', 'Drumeo', 'Pianote',
        '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial'
    ))

    -- Exclude already-processed comments
    AND sba.COMMENT_SK IS NULL

    -- Ensure comment has content
    AND fc.MESSAGE IS NOT NULL
    AND TRIM(fc.MESSAGE) != ''
    AND LENGTH(TRIM(fc.MESSAGE)) > 0

ORDER BY
    fc.CREATED_TIME DESC;
processing_brand_sentiment/database/sql/fetch_forum_posts.sql ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Query to fetch forum posts with thread context for brand sentiment analysis
-- Includes: post content, thread context (title, first post), parent relationships
-- Excludes: team/house-coach posts, already-processed posts, deleted posts

WITH thread_first_posts AS (
    -- Get the first post (by creation date) for each thread to use as context
    -- Using ROW_NUMBER for reliable first post identification
    SELECT
        THREAD_ID,
        POST_CONTENT AS FIRST_POST_CONTENT,
        POST_AUTHOR_ID AS FIRST_POST_AUTHOR_ID,
        POST_CREATED_AT AS FIRST_POST_CREATED_AT
    FROM (
        SELECT
            THREAD_ID,
            POST_CONTENT,
            POST_AUTHOR_ID,
            POST_CREATED_AT,
            -- rn = 1 marks the chronologically first non-empty post per thread
            ROW_NUMBER() OVER (PARTITION BY THREAD_ID ORDER BY POST_CREATED_AT ASC) AS rn
        FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS
        WHERE POST_CONTENT IS NOT NULL
          AND TRIM(POST_CONTENT) != ''
    ) ranked
    WHERE rn = 1
)

SELECT
    -- Post identifiers
    fp.POST_ID,
    fp.POST_AUTHOR_ID,
    fp.THREAD_ID,

    -- Post content (may contain HTML with quoted parent)
    fp.POST_CONTENT,

    -- Post timestamps
    fp.POST_CREATED_AT,
    fp.POST_EDITED_ON,
    fp.POST_PUBLISHED_ON,
    fp.POST_STATE,

    -- Parent/Child relationships (for context)
    fp.PROMPTING_POST_ID,
    fp.PARENT_ID,
    fp.PARENT_CONTENT,
    fp.PARENT_AUTHOR_ID,
    fp.PARENT_CREATED_AT,
    fp.CHILD_ID,
    fp.CHILD_CONTENT,

    -- Thread context
    fp.THREAD_TITLE,
    fp.THREAD_SLUG,
    fp.THREAD_STATE,
    fp.THREAD_LOCKED,
    fp.THREAD_PINNED,
    fp.THREAD_POST_COUNT,
    fp.THREAD_PUBLISHED_ON,

    -- First post of the thread (for context)
    tfp.FIRST_POST_CONTENT AS THREAD_FIRST_POST,
    tfp.FIRST_POST_CREATED_AT AS THREAD_STARTED_AT,

    -- Category information
    fp.CATEGORY_ID,
    fp.CATEGORY_BRAND,
    fp.CATEGORY_DESCRIPTION,
    fp.CATEGORY_TITLE,
    fp.CATEGORY_TOPIC,
    fp.CATEGORY_SLUG,

    -- Access levels (for filtering)
    -- NOTE(review): only POST_AUTHOR_ACCESS_LEVEL is filtered below; parent/
    -- child levels are exposed for downstream filtering — confirm intended.
    fp.POST_AUTHOR_ACCESS_LEVEL,
    fp.PARENT_AUTHOR_ACCESS_LEVEL,
    fp.CHILD_AUTHOR_ACCESS_LEVEL

FROM
    SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS fp

-- Join to get thread's first post for context
-- (LEFT so posts whose thread has no qualifying first post still appear)
LEFT JOIN
    thread_first_posts tfp ON fp.THREAD_ID = tfp.THREAD_ID

-- Left join with output table to exclude already-processed posts
-- (anti-join pattern: the WHERE clause keeps only rows with no match)
LEFT JOIN
    SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS sba
    ON fp.POST_ID = sba.POST_ID

WHERE
    -- Exclude team and house-coach posts (internal comments)
    (fp.POST_AUTHOR_ACCESS_LEVEL IS NULL OR fp.POST_AUTHOR_ACCESS_LEVEL NOT IN ('team', 'house-coach'))

    -- Exclude deleted posts
    AND (fp.POST_STATE IS NULL OR fp.POST_STATE != 'deleted')
    AND fp.POST_DELETED_AT IS NULL

    -- Exclude already-processed posts
    AND sba.POST_ID IS NULL

    -- Ensure post has content
    AND fp.POST_CONTENT IS NOT NULL
    AND TRIM(fp.POST_CONTENT) != ''
    AND LENGTH(TRIM(fp.POST_CONTENT)) > 0

ORDER BY
    fp.POST_CREATED_AT DESC;
processing_brand_sentiment/database/sql/init_comments_output_table.sql ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Initialize empty output table for Sabian brand sentiment analysis on social media comments
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed comments
-- NOTE(review): unlike SABIAN_BRAND_ANALYSIS (which declares POST_ID PRIMARY KEY),
-- no key is declared on COMMENT_SK here — dedup appears to rely solely on the
-- fetch query's anti-join; confirm that is intended.

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS (
    -- Source identifiers (comment-specific)
    COMMENT_SK NUMBER(38,0),
    COMMENT_ID VARCHAR(16777216),
    ORIGINAL_TEXT VARCHAR(16777216),
    PLATFORM VARCHAR(16777216),
    COMMENT_TIMESTAMP TIMESTAMP_NTZ(9),
    AUTHOR_NAME VARCHAR(16777216),
    AUTHOR_ID VARCHAR(16777216),
    CONTENT_SK NUMBER(38,0),
    CONTENT_ID VARCHAR(16777216),
    CONTENT_DESCRIPTION VARCHAR(16777216),
    CHANNEL_SK NUMBER(38,0),
    CHANNEL_NAME VARCHAR(16777216),
    CHANNEL_DISPLAY_NAME VARCHAR(16777216),
    PARENT_COMMENT_ID VARCHAR(16777216),
    PARENT_COMMENT_TEXT VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays)
    PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"]
    PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]
    PURCHASE_STAGE VARCHAR(50),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"]
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"]
    DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"]
    PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"]
    DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"]

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';
processing_brand_sentiment/database/sql/init_output_table.sql ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Initialize empty output table for Sabian brand sentiment analysis
-- Run this script BEFORE the first processing run to create the table structure
-- This prevents "table not found" errors when the fetch query tries to check for already-processed posts
-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status

CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS (
    -- Source identifiers
    -- POST_ID is the dedup key used by the fetch query's anti-join
    POST_ID NUMBER(38,0) PRIMARY KEY,
    THREAD_ID NUMBER(38,0),
    POST_AUTHOR_ID NUMBER(38,0),

    -- Original and processed content
    ORIGINAL_CONTENT VARCHAR(16777216),
    CLEANED_CONTENT VARCHAR(16777216),
    QUOTED_CONTENT VARCHAR(16777216),
    THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy)
    THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context

    -- Thread metadata
    THREAD_TITLE VARCHAR(16777216),
    THREAD_FIRST_POST VARCHAR(16777216),

    -- Timestamps
    POST_CREATED_AT TIMESTAMP_LTZ(9),
    THREAD_STARTED_AT TIMESTAMP_LTZ(9),

    -- Category information
    CATEGORY_TITLE VARCHAR(16777216),
    CATEGORY_TOPIC VARCHAR(16777216),

    -- Language detection
    DETECTED_LANGUAGE VARCHAR(100),
    LANGUAGE_CODE VARCHAR(10),
    IS_ENGLISH BOOLEAN,

    -- Relevance assessment
    IS_RELEVANT BOOLEAN,
    RELEVANCE_CONFIDENCE VARCHAR(20),
    RELEVANCE_REASON VARCHAR(500),

    -- Author classification
    AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown
    SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context

    -- Sentiment analysis
    SENTIMENT_LEVEL VARCHAR(20),
    EMOTION_TYPE VARCHAR(50),
    SENTIMENT_TARGET VARCHAR(50),
    SENTIMENT_CONFIDENCE VARCHAR(20),

    -- Product information (stored as JSON arrays)
    PRODUCTS_MENTIONED VARCHAR(16777216),
    PRODUCT_ATTRIBUTES VARCHAR(16777216),

    -- Competitive intelligence
    COMPETITORS_MENTIONED VARCHAR(16777216),
    COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns
    COMPARISON_TYPE VARCHAR(50),
    COMPETITIVE_POSITIONING VARCHAR(500),
    BRAND_SWITCHING VARCHAR(100),

    -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others)
    INTENTS VARCHAR(16777216), -- Multi-label: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion
    PURCHASE_STAGE VARCHAR(50), -- AUTHOR's own stage only
    DECISION_DRIVERS VARCHAR(16777216), -- AUTHOR's own decision drivers only
    PAIN_POINTS VARCHAR(16777216), -- AUTHOR's negative feedback aspects (uses feedback_aspects categories)
    DELIGHT_FACTORS VARCHAR(16777216), -- AUTHOR's positive feedback aspects (uses feedback_aspects categories)

    -- Analysis notes
    ANALYSIS_NOTES VARCHAR(16777216),
    SARCASM_DETECTED BOOLEAN,

    -- Validation results (NEW v4.0)
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: Added thread_context_summary, validation fields, and processing status.';
processing_brand_sentiment/main.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main execution script for brand sentiment analysis workflow.
3
+ Orchestrates data fetching, processing, and storage using an agentic workflow.
4
+ Supports parallel processing with multiprocessing for improved performance.
5
+ Supports multiple data sources: forums, social media comments, or both.
6
+ """
7
+
8
# Standard library
import argparse
import json
import logging
import os
import traceback
from datetime import datetime
from multiprocessing import Pool, cpu_count
from typing import Any, Dict, List, Optional

# Third-party
import pandas as pd
from dotenv import load_dotenv

# Local
from database.snowflake_connection import SnowFlakeConn
from workflow.orchestrator import BrandAnalysisWorkflow
from workflow.comment_orchestrator import CommentAnalysisWorkflow
22
+
23
# Get the directory where this script is located
# (used so config/log/.env paths resolve regardless of the CWD)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Load environment variables from a .env file next to this script
load_dotenv(os.path.join(SCRIPT_DIR, '.env'))

# Ensure logs directory exists
LOGS_DIR = os.path.join(SCRIPT_DIR, 'logs')
os.makedirs(LOGS_DIR, exist_ok=True)

# Configure logging: each run writes a timestamped file under logs/ and
# mirrors output to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            os.path.join(LOGS_DIR, f'brand_sentiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
45
+
46
+
47
+ # ============================================================
48
+ # Configuration Loading
49
+ # ============================================================
50
+
51
def load_configs(config_dir: Optional[str] = None) -> Dict[str, Dict]:
    """
    Load all configuration files for the sentiment workflow.

    Args:
        config_dir: Directory containing config files. Defaults to the
            ``config_files`` directory next to this script.

    Returns:
        Dictionary with keys 'workflow', 'brand', and 'categories' holding
        the parsed JSON of each config file.

    Raises:
        FileNotFoundError: If a required config file is missing.
        json.JSONDecodeError: If a config file contains invalid JSON.
    """
    if config_dir is None:
        config_dir = os.path.join(SCRIPT_DIR, 'config_files')

    # Map output keys to their backing JSON files; looping replaces the
    # original copy-pasted open/load block per file.
    config_files = {
        'workflow': 'workflow_config.json',
        'brand': 'brand_config.json',
        'categories': 'analysis_categories.json',
    }

    configs: Dict[str, Dict] = {}
    for key, filename in config_files.items():
        # Explicit encoding so configs parse identically on all platforms.
        with open(os.path.join(config_dir, filename), 'r', encoding='utf-8') as f:
            configs[key] = json.load(f)

    return configs
79
+
80
+
81
+ # ============================================================
82
+ # Batch Processing Utilities
83
+ # ============================================================
84
+
85
def calculate_optimal_batch_size(
    total_posts: int,
    num_workers: int,
    min_batch: int = 20,
    max_batch: int = 500
) -> int:
    """
    Pick a batch size for splitting *total_posts* across *num_workers*.

    Args:
        total_posts: Total number of posts to process.
        num_workers: Number of parallel workers.
        min_batch: Minimum batch size.
        max_batch: Maximum batch size.

    Returns:
        Optimal batch size.
    """
    # Tiny workloads are handled as a single batch covering everything.
    if total_posts <= min_batch:
        return total_posts

    # Even split per worker, clamped into [min_batch, max_batch].
    per_worker = total_posts // num_workers
    if per_worker < min_batch:
        return min_batch
    if per_worker > max_batch:
        return max_batch
    return per_worker
110
+
111
+
112
def safe_to_json(value: Any) -> Any:
    """
    Safely convert a value to a JSON string.

    Lists are serialized to JSON text, empty values become None, and
    anything else passes through untouched.

    Args:
        value: Value to convert.

    Returns:
        JSON string if list, None if null/empty, original value otherwise.
    """
    # Nulls (None or float NaN) normalize to None.
    if value is None:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    # Lists serialize to JSON; empty lists collapse to None.
    if isinstance(value, list):
        return json.dumps(value) if value else None
    # Strings pass through; empty strings collapse to None.
    if isinstance(value, str):
        return value or None
    # Any other type is returned unchanged.
    return value
134
+
135
+
136
def safe_json_list_length(value: Any) -> int:
    """
    Safely get the length of a JSON array string.

    Handles None, NaN, empty strings, and invalid JSON by returning 0.

    Args:
        value: Value to parse (expected JSON string of array).

    Returns:
        Length of the array, or 0 if invalid/empty.
    """
    # Only non-empty strings can hold a JSON array; this rejects None,
    # NaN floats, and any other non-string value in one check.
    if not isinstance(value, str) or value in ('', '[]', 'null'):
        return 0

    try:
        parsed = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return 0

    # Valid JSON that isn't a list (object, scalar) still counts as 0.
    return len(parsed) if isinstance(parsed, list) else 0
162
+
163
+
164
def calculate_batch_stats(df: pd.DataFrame) -> Dict[str, int]:
    """
    Calculate statistics from batch results.

    Every counter is optional: missing columns and null values simply
    contribute zero.

    Args:
        df: DataFrame with processed results.

    Returns:
        Dictionary with statistics.
    """
    counters = dict.fromkeys(
        (
            'relevant_count',
            'not_relevant_count',
            'products_mentioned_count',
            'competitors_mentioned_count',
            'positive_sentiment_count',
            'negative_sentiment_count',
            # Author role stats
            'current_owner_count',
            'potential_buyer_count',
            'primary_focus_count',
        ),
        0,
    )

    # Nothing to tally for an empty batch.
    if df.empty:
        return counters

    # Relevance split (nulls excluded from both buckets).
    if 'IS_RELEVANT' in df.columns:
        flags = df['IS_RELEVANT'].dropna().astype(bool)
        if not flags.empty:
            counters['relevant_count'] = int(flags.sum())
            counters['not_relevant_count'] = int((~flags).sum())

    # Total product / competitor mentions (sum of JSON array lengths).
    for column, key in (
        ('PRODUCTS_MENTIONED', 'products_mentioned_count'),
        ('COMPETITORS_MENTIONED', 'competitors_mentioned_count'),
    ):
        if column in df.columns:
            counters[key] = int(df[column].apply(safe_json_list_length).sum())

    # Sentiment polarity distribution.
    if 'SENTIMENT_LEVEL' in df.columns:
        sentiments = df['SENTIMENT_LEVEL'].dropna()
        if not sentiments.empty:
            counters['positive_sentiment_count'] = int(
                sentiments.isin(['positive', 'very_positive']).sum()
            )
            counters['negative_sentiment_count'] = int(
                sentiments.isin(['negative', 'very_negative']).sum()
            )

    # Author role breakdown.
    if 'AUTHOR_ROLE' in df.columns:
        roles = df['AUTHOR_ROLE'].dropna()
        if not roles.empty:
            counters['current_owner_count'] = int((roles == 'current_owner').sum())
            counters['potential_buyer_count'] = int((roles == 'potential_buyer').sum())

    # How often Sabian was the primary subject.
    if 'SABIAN_MENTION_CONTEXT' in df.columns:
        contexts = df['SABIAN_MENTION_CONTEXT'].dropna()
        if not contexts.empty:
            counters['primary_focus_count'] = int((contexts == 'primary_focus').sum())

    return counters
237
+
238
+
239
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate results from multiple batches.

    Sums the per-batch counters, tallies how many batches failed, and logs
    each failed batch's error message.

    Args:
        results: List of batch result dictionaries.

    Returns:
        Aggregated statistics dictionary.
    """
    summed_keys = (
        'total_processed',
        'total_stored',
        'failed_count',
        'relevant_count',
        'not_relevant_count',
        'products_mentioned_count',
        'competitors_mentioned_count',
        'positive_sentiment_count',
        'negative_sentiment_count',
        'current_owner_count',
        'potential_buyer_count',
        'primary_focus_count',
    )
    aggregated: Dict[str, Any] = {
        key: sum(batch.get(key, 0) for batch in results) for key in summed_keys
    }

    failed_batches = [batch for batch in results if not batch.get('success', False)]
    aggregated['failed_batches'] = len(failed_batches)

    # Surface each failed batch in the log for troubleshooting.
    if failed_batches:
        logger.error(f"{len(failed_batches)} batch(es) failed:")
        for fb in failed_batches:
            logger.error(f"  Batch {fb.get('batch_num')}: {fb.get('error')}")

    return aggregated
273
+
274
+
275
+ # ============================================================
276
+ # Forum Processing (existing functionality)
277
+ # ============================================================
278
+
279
# Columns that should be converted from lists to JSON strings
# (serialized via safe_to_json before storage; the Snowflake DDL stores these
# as VARCHAR columns holding JSON text)
FORUM_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from forum workflow state to output table
# (lowercase workflow-state key -> UPPERCASE Snowflake column; state keys
# missing from a batch are emitted as NULL columns by
# prepare_forum_output_dataframe so the table schema stays stable)
FORUM_COLUMN_MAPPING = {
    'post_id': 'POST_ID',
    'thread_id': 'THREAD_ID',
    'post_author_id': 'POST_AUTHOR_ID',
    'original_content': 'ORIGINAL_CONTENT',
    'cleaned_content': 'CLEANED_CONTENT',
    'quoted_content': 'QUOTED_CONTENT',
    'raw_thread_context': 'THREAD_CONTEXT',
    'thread_context_summary': 'THREAD_CONTEXT_SUMMARY',
    'thread_title': 'THREAD_TITLE',
    'thread_first_post': 'THREAD_FIRST_POST',
    'post_created_at': 'POST_CREATED_AT',
    'thread_started_at': 'THREAD_STARTED_AT',
    'category_title': 'CATEGORY_TITLE',
    'category_topic': 'CATEGORY_TOPIC',
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'purchase_stage': 'PURCHASE_STAGE',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
337
+
338
+
339
def prepare_forum_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare forum output DataFrame with proper column mapping.

    Renames workflow-state columns to their Snowflake names, serializes
    list-valued fields to JSON strings, and stamps batch-level metadata.

    Args:
        df: DataFrame with processing results.

    Returns:
        DataFrame ready for Snowflake storage.
    """
    prepared = pd.DataFrame()

    for state_col, table_col in FORUM_COLUMN_MAPPING.items():
        if state_col not in df.columns:
            # Missing fields still get a column so the table schema is stable.
            prepared[table_col] = None
            continue
        series = df[state_col].copy()
        # List-valued fields are stored as JSON text in Snowflake.
        if state_col in FORUM_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        prepared[table_col] = series

    # Batch-level metadata shared by every row.
    prepared['PLATFORM'] = 'musora_forums'
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
366
+
367
+
368
def process_forum_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of forum posts.
    Runs in a separate process.

    Each worker creates its own Snowflake connection and workflow instance.
    Failures are reported in the returned dict rather than raised, so one
    bad batch does not kill the whole pool.

    Args:
        batch_data: Tuple containing (batch_num, posts, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics
    """
    batch_num, posts, configs, api_key, overwrite_first_batch, output_config = batch_data

    # Per-batch logger name so interleaved worker output is distinguishable.
    worker_logger = logging.getLogger(f"ForumWorker-{batch_num}")

    try:
        worker_logger.info(f"Forum Batch {batch_num}: Starting processing of {len(posts)} posts")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = BrandAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process posts
        results = workflow.process_batch(posts)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results; failures are counted but not stored.
        initial_count = len(results_df)
        df_successful = results_df[results_df['success'] == True].copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Forum Batch {batch_num}: Processed {initial_count} posts, {len(df_successful)} successful")

        # Prepare output DataFrame (rename columns, JSON-serialize lists)
        output_df = prepare_forum_output_dataframe(df_successful)

        # Store results
        if len(output_df) > 0:
            # Only the very first batch may replace the table; all others append.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Forum Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Forum Batch {batch_num}: No successful records to store")

        # Close connection
        # NOTE(review): not in a finally block — the connection is not closed
        # if an exception is raised above; presumably process exit cleans up.
        snowflake.close_connection()

        # Calculate statistics and attach batch-level metadata for aggregation.
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        # Catch-all boundary: the error is returned in the result dict so
        # aggregate_results can log it alongside the batch number.
        error_msg = f"Forum Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(posts),
            'total_stored': 0,
            'failed_count': len(posts),
            'error': error_msg
        }
458
+
459
+
460
+ # ============================================================
461
+ # Comment Processing (new functionality)
462
+ # ============================================================
463
+
464
# Columns that should be converted from lists to JSON strings (same analysis fields)
# NOTE(review): these names must match the list-valued fields emitted by the
# comment workflow state and the JSON/VARIANT columns of the output table —
# verify against create_comments_output_table.sql when either side changes.
COMMENT_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from comment workflow state to output table.
# Keys are the lowercase workflow-state field names; values are the
# uppercase Snowflake column names. Any source field missing from a
# result row is stored as NULL by prepare_comment_output_dataframe.
COMMENT_COLUMN_MAPPING = {
    # Comment-specific identifiers
    'comment_sk': 'COMMENT_SK',
    'comment_id': 'COMMENT_ID',
    'original_text': 'ORIGINAL_TEXT',
    'platform': 'PLATFORM',
    'comment_timestamp': 'COMMENT_TIMESTAMP',
    'author_name': 'AUTHOR_NAME',
    'author_id': 'AUTHOR_ID',
    'content_sk': 'CONTENT_SK',
    'content_id': 'CONTENT_ID',
    'content_description': 'CONTENT_DESCRIPTION',
    'channel_sk': 'CHANNEL_SK',
    'channel_name': 'CHANNEL_NAME',
    'channel_display_name': 'CHANNEL_DISPLAY_NAME',
    'parent_comment_id': 'PARENT_COMMENT_ID',
    'parent_comment_text': 'PARENT_COMMENT_TEXT',
    # Analysis fields (same as forums)
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'purchase_stage': 'PURCHASE_STAGE',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    # Note the rename: the workflow's 'success' flag is persisted as
    # PROCESSING_SUCCESS in the output table.
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}
525
+
526
+
527
def prepare_comment_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare comment output DataFrame with proper column mapping.

    Maps workflow-state field names to Snowflake column names, serializes
    list-valued analysis fields to JSON strings, fills missing source
    columns with NULL, and stamps processing metadata.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    json_columns = set(COMMENT_JSON_ARRAY_COLUMNS)
    prepared = pd.DataFrame()

    for source_col, target_col in COMMENT_COLUMN_MAPPING.items():
        if source_col not in df.columns:
            # Source field absent from the results: store NULL.
            prepared[target_col] = None
            continue

        series = df[source_col].copy()
        if source_col in json_columns:
            series = series.apply(safe_to_json)
        prepared[target_col] = series

    # Processing metadata
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
553
+
554
+
555
def process_comment_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of social media comments.
    Runs in a separate process.

    Opens its own Snowflake connection, runs the comment workflow over the
    batch, stores the successful rows, and always closes the connection —
    including when processing or storage raises (the previous version only
    closed it on the success path, leaking a connection per failed batch).

    Args:
        batch_data: Tuple containing (batch_num, comments, configs, api_key, overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics
    """
    batch_num, comments, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"CommentWorker-{batch_num}")

    snowflake = None  # created inside try so the finally block can always clean up
    try:
        worker_logger.info(f"Comment Batch {batch_num}: Starting processing of {len(comments)} comments")

        # Initialize Snowflake connection for this worker
        snowflake = SnowFlakeConn()

        # Initialize comment workflow for this worker
        workflow = CommentAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process comments
        results = workflow.process_batch(comments)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results. An empty/malformed result set has no
        # 'success' column; treat that as zero successes instead of raising.
        initial_count = len(results_df)
        if 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.iloc[0:0].copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Comment Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output DataFrame
        output_df = prepare_comment_output_dataframe(df_successful)

        # Store results (only batch 1 is allowed to overwrite the table)
        if len(output_df) > 0:
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Comment Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Comment Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Comment Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(comments),
            'total_stored': 0,
            'failed_count': len(comments),
            'error': error_msg
        }

    finally:
        # Always release this worker's connection, even on failure.
        if snowflake is not None:
            try:
                snowflake.close_connection()
            except Exception:
                worker_logger.warning(f"Comment Batch {batch_num}: Failed to close Snowflake connection")
645
+
646
+
647
+ # ============================================================
648
+ # Main Processor Class
649
+ # ============================================================
650
+
651
class BrandSentimentProcessor:
    """
    Main processor class that orchestrates the entire workflow.
    Supports processing forums, social media comments, or both.

    Fetching uses this instance's Snowflake connection; batch workers open
    (and close) their own connections when storing results. The duplicated
    forum/comment parallel and sequential paths are factored into the
    shared private helpers ``_process_parallel`` / ``_process_sequential``.
    """

    # Stat keys that a sequential run copies from the single batch-worker result.
    _SEQUENTIAL_STAT_KEYS = (
        'total_processed', 'total_stored', 'failed_count',
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        'current_owner_count', 'potential_buyer_count',
        'primary_focus_count'
    )

    def __init__(self, config_dir: str = None):
        """
        Initialize the processor.

        Args:
            config_dir: Directory containing configuration files

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment.
        """
        # Load configurations
        self.configs = load_configs(config_dir)

        # Connection used for fetching input data
        self.snowflake = SnowFlakeConn()

        # OpenAI API key is required by the analysis workflows
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Output table configurations, with defaults when not configured
        self.forum_output_config = self.configs['workflow'].get('output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        self.comment_output_config = self.configs['workflow'].get('comments_output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS_COMMENTS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        logger.info("BrandSentimentProcessor initialized successfully")

    def fetch_forum_posts(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch forum posts from Snowflake.

        Args:
            limit: Optional limit on number of posts

        Returns:
            DataFrame containing post data
        """
        logger.info("Fetching forum posts...")

        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_forum_posts.sql')
        df = self.snowflake.fetch_forum_posts_with_context(sql_path, limit)

        logger.info(f"Fetched {len(df)} forum posts")
        return df

    def fetch_comments(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch social media comments from Snowflake.

        Args:
            limit: Optional limit on number of comments

        Returns:
            DataFrame containing comment data
        """
        logger.info("Fetching social media comments...")

        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_comments.sql')
        df = self.snowflake.fetch_comments(sql_path, limit)

        logger.info(f"Fetched {len(df)} social media comments")
        return df

    def calculate_num_workers(self) -> int:
        """
        Calculate number of parallel workers.

        Capped by the configured max_workers and by (CPU count - 2) to leave
        the host some headroom; always at least 1.

        Returns:
            Number of workers
        """
        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        max_workers = parallel_config.get('max_workers', 5)

        num_cpus = cpu_count()
        num_workers = max(1, min(max_workers, num_cpus - 2))

        logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
        return num_workers

    # ---- Shared processing helpers ----

    def _process_parallel(
        self,
        records: list,
        worker_fn,
        output_config: Dict[str, Any],
        overwrite: bool,
        noun: str,
        label: str
    ) -> Dict[str, Any]:
        """
        Shared parallel-processing driver for both data sources.

        Args:
            records: List of record dicts to process
            worker_fn: Module-level batch worker function (must be picklable for Pool)
            output_config: Output table configuration passed to workers
            overwrite: Whether the first batch may overwrite the output table
            noun: Plural noun for log messages (e.g. 'forum posts')
            label: Capitalized label for log messages (e.g. 'Forum')

        Returns:
            Aggregated statistics across all batches
        """
        total = len(records)
        logger.info(f"Processing {total} {noun} using parallel processing...")

        num_workers = self.calculate_num_workers()

        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        min_batch = parallel_config.get('min_batch_size', 20)
        max_batch = parallel_config.get('max_batch_size', 400)

        batch_size = calculate_optimal_batch_size(total, num_workers, min_batch, max_batch)
        logger.info(f"{label} batch size: {batch_size}")

        # Split into numbered batches; batch 1 carries the overwrite flag.
        batches = []
        for start in range(0, total, batch_size):
            batch_num = (start // batch_size) + 1
            batches.append(
                (batch_num, records[start:start + batch_size], self.configs,
                 self.api_key, overwrite, output_config)
            )

        logger.info(f"Split into {len(batches)} {label.lower()} batches")

        with Pool(processes=num_workers) as pool:
            batch_results = pool.map(worker_fn, batches)

        return aggregate_results(batch_results)

    def _process_sequential(
        self,
        records: list,
        worker_fn,
        output_config: Dict[str, Any],
        overwrite: bool,
        noun: str
    ) -> Dict[str, Any]:
        """
        Shared sequential-processing driver (single batch, for debugging).

        Args:
            records: List of record dicts to process
            worker_fn: Batch worker function to run in-process
            output_config: Output table configuration passed to the worker
            overwrite: Whether the batch may overwrite the output table
            noun: Plural noun for log messages (e.g. 'comments')

        Returns:
            Dictionary with statistics in the same shape as a parallel run
        """
        logger.info(f"Processing {len(records)} {noun} using sequential processing...")

        batch_data = (1, records, self.configs, self.api_key, overwrite, output_config)
        result = worker_fn(batch_data)

        stats = {key: result.get(key, 0) for key in self._SEQUENTIAL_STAT_KEYS}
        stats['failed_batches'] = 0 if result.get('success', False) else 1
        return stats

    # ---- Forum Processing ----

    def process_forums_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts using parallel workers.

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        return self._process_parallel(
            df.to_dict('records'), process_forum_batch_worker,
            self.forum_output_config, overwrite, 'forum posts', 'Forum'
        )

    def process_forums_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts sequentially (for debugging).

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        return self._process_sequential(
            df.to_dict('records'), process_forum_batch_worker,
            self.forum_output_config, overwrite, 'forum posts'
        )

    # ---- Comment Processing ----

    def process_comments_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments using parallel workers.

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        return self._process_parallel(
            df.to_dict('records'), process_comment_batch_worker,
            self.comment_output_config, overwrite, 'comments', 'Comment'
        )

    def process_comments_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments sequentially (for debugging).

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        return self._process_sequential(
            df.to_dict('records'), process_comment_batch_worker,
            self.comment_output_config, overwrite, 'comments'
        )

    # ---- Unified Processing ----

    def _log_source_summary(self, source_name: str, stats: Dict[str, Any], processing_time: float) -> None:
        """
        Log processing summary for a data source.

        Args:
            source_name: Name of the data source
            stats: Processing statistics
            processing_time: Time taken in seconds
        """
        logger.info(f" --- {source_name} ---")
        logger.info(f" Total processed: {stats.get('total_processed', 0)}")
        logger.info(f" Successfully stored: {stats.get('total_stored', 0)}")
        logger.info(f" Failed: {stats.get('failed_count', 0)}")
        logger.info(f" Relevant: {stats.get('relevant_count', 0)}")
        logger.info(f" Not relevant: {stats.get('not_relevant_count', 0)}")
        logger.info(f" Product mentions: {stats.get('products_mentioned_count', 0)}")
        logger.info(f" Competitor mentions: {stats.get('competitors_mentioned_count', 0)}")
        logger.info(f" Positive sentiment: {stats.get('positive_sentiment_count', 0)}")
        logger.info(f" Negative sentiment: {stats.get('negative_sentiment_count', 0)}")
        logger.info(f" Current owners: {stats.get('current_owner_count', 0)}")
        logger.info(f" Potential buyers: {stats.get('potential_buyer_count', 0)}")
        logger.info(f" Primary focus: {stats.get('primary_focus_count', 0)}")
        if stats.get('failed_batches', 0) > 0:
            logger.info(f" Failed batches: {stats['failed_batches']}")
        logger.info(f" Processing time: {processing_time:.2f} seconds")
        if stats.get('total_processed', 0) > 0:
            logger.info(f" Average per item: {processing_time / stats['total_processed']:.2f} seconds")

    def run(
        self,
        limit: int = None,
        overwrite: bool = False,
        sequential: bool = False,
        data_source: str = 'all'
    ):
        """
        Run the complete processing pipeline.

        Fetches each requested data source, processes it (parallel by
        default), and logs a per-source summary. The instance's Snowflake
        connection is closed in all cases.

        Args:
            limit: Optional limit on items to process per source
            overwrite: Whether to overwrite existing table
            sequential: Use sequential processing instead of parallel
            data_source: Which data source to process ('forums', 'comments', 'all')
        """
        try:
            logger.info("=" * 80)
            logger.info("Starting Brand Sentiment Analysis Workflow")
            logger.info(f"Brand: {self.configs['brand'].get('brand', {}).get('name', 'Unknown')}")
            logger.info(f"Mode: {'SEQUENTIAL' if sequential else 'PARALLEL'}")
            logger.info(f"Data source: {data_source}")
            logger.info("=" * 80)

            process_forums = data_source in ('forums', 'all')
            process_comments = data_source in ('comments', 'all')

            # Track results for summary
            forum_stats = None
            forum_time = 0.0
            comment_stats = None
            comment_time = 0.0

            # ---- Process Forums ----
            if process_forums:
                logger.info("-" * 40)
                logger.info("Processing FORUMS")
                logger.info("-" * 40)

                df_posts = self.fetch_forum_posts(limit)

                if df_posts.empty:
                    logger.warning("No forum posts to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        forum_stats = self.process_forums_sequential(df_posts, overwrite)
                    else:
                        forum_stats = self.process_forums_parallel(df_posts, overwrite)

                    forum_time = (datetime.now() - start_time).total_seconds()

            # ---- Process Comments ----
            if process_comments:
                logger.info("-" * 40)
                logger.info("Processing SOCIAL MEDIA COMMENTS")
                logger.info("-" * 40)

                df_comments = self.fetch_comments(limit)

                if df_comments.empty:
                    logger.warning("No social media comments to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        comment_stats = self.process_comments_sequential(df_comments, overwrite)
                    else:
                        comment_stats = self.process_comments_parallel(df_comments, overwrite)

                    comment_time = (datetime.now() - start_time).total_seconds()

            # ---- Summary ----
            logger.info("=" * 80)
            logger.info("Processing Summary:")
            logger.info(f"  Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f"  Data source: {data_source}")

            if forum_stats is not None:
                self._log_source_summary("Forums", forum_stats, forum_time)

            if comment_stats is not None:
                self._log_source_summary("Social Media Comments", comment_stats, comment_time)

            logger.info("=" * 80)

        except Exception as e:
            logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
            raise

        finally:
            self.snowflake.close_connection()
            logger.info("Snowflake connection closed")
1018
+
1019
+
1020
+ # ============================================================
1021
+ # Legacy compatibility - keep old function names working
1022
+ # ============================================================
1023
+
1024
def prepare_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Legacy wrapper kept for backward compatibility.

    Delegates to prepare_forum_output_dataframe, which is the current name
    for forum output preparation.
    """
    prepared = prepare_forum_output_dataframe(df)
    return prepared
1027
+
1028
+
1029
def process_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """Legacy wrapper kept for backward compatibility.

    Delegates to process_forum_batch_worker, which is the current name for
    the forum batch worker.
    """
    result = process_forum_batch_worker(batch_data)
    return result
1032
+
1033
+
1034
+ # ============================================================
1035
+ # Main Entry Point
1036
+ # ============================================================
1037
+
1038
def main():
    """Main entry point: parse CLI arguments and run the processor.

    Flags use argparse's implicit defaults where possible (``store_true``
    already defaults to False, so the redundant ``default=False`` was
    dropped).
    """
    parser = argparse.ArgumentParser(
        description="Brand Sentiment Analysis - Analyze forum posts and social media comments for brand intelligence"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of items to process per source (default: all unprocessed)'
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Overwrite existing Snowflake table (default: append)'
    )
    parser.add_argument(
        '--sequential',
        action='store_true',
        help='Use sequential processing instead of parallel (for debugging)'
    )
    parser.add_argument(
        '--config-dir',
        type=str,
        default=None,
        help='Path to configuration directory (default: config_files/)'
    )
    parser.add_argument(
        '--data-source',
        type=str,
        choices=['forums', 'comments', 'all'],
        default='all',
        help='Data source to process: forums, comments, or all (default: all)'
    )

    args = parser.parse_args()

    # Initialize and run the full pipeline
    processor = BrandSentimentProcessor(config_dir=args.config_dir)
    processor.run(
        limit=args.limit,
        overwrite=args.overwrite,
        sequential=args.sequential,
        data_source=args.data_source
    )
1085
+
1086
+
1087
# Script entry point: run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()
processing_brand_sentiment/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities module for brand sentiment analysis.
3
+ Contains HTML parsing and other helper functions.
4
+ """
5
+
6
+ from .html_parser import HTMLParser
7
+
8
+ __all__ = ['HTMLParser']
processing_brand_sentiment/utils/html_parser.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML Parser utility for extracting content from forum posts.
3
+ Handles the complex HTML structure where replies contain quoted parent content.
4
+ """
5
+
6
+ import re
7
+ import html
8
+ from typing import Dict, Optional, Tuple
9
+ from bs4 import BeautifulSoup
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class HTMLParser:
    """
    Parses HTML content from forum posts to extract actual reply content
    and quoted parent content separately.

    Forum posts wrap the quoted parent post in a <blockquote>; everything
    outside the blockquote is the poster's own reply.
    """

    def __init__(self):
        """Initialize the HTML parser (stateless)."""
        pass

    def parse_post_content(self, html_content: str) -> Dict[str, Optional[str]]:
        """
        Parse HTML post content to extract reply and quoted content.

        The forum posts have a structure where:
        - <blockquote> contains the quoted parent post
        - Content outside blockquote is the actual reply

        Example input:
        <blockquote><span class="post-id">125015</span>
        <p class="quote-heading"><strong>JackO</strong><em> - Feb 3, 2015</em></p>
        <br /><p>Parent content here...</p></blockquote>
        <br /><p>Actual reply content here...</p>

        Args:
            html_content: Raw HTML content from POST_CONTENT field

        Returns:
            Dictionary with:
            - reply_content: The actual reply text (cleaned)
            - quoted_content: The quoted parent text (cleaned), if any
            - quoted_author: Author of the quoted post, if any
            - quoted_date: Date of the quoted post, if any
            - has_quote: Boolean indicating if post contains a quote

        Note:
            When a post contains several blockquotes, quoted_content joins
            all of them, but quoted_author/quoted_date keep the values from
            the last blockquote that carried a heading.
        """
        if not html_content or not html_content.strip():
            return {
                "reply_content": "",
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            quoted_content = None
            quoted_author = None
            quoted_date = None
            has_quote = False

            blockquotes = soup.find_all('blockquote')

            if blockquotes:
                has_quote = True
                quote_parts = []

                for blockquote in blockquotes:
                    # Extract quote heading info (author and date), then drop
                    # the heading so only the quoted body text remains.
                    quote_heading = blockquote.find('p', class_='quote-heading')
                    if quote_heading:
                        author_tag = quote_heading.find('strong')
                        if author_tag:
                            quoted_author = author_tag.get_text(strip=True)

                        date_tag = quote_heading.find('em')
                        if date_tag:
                            # The <em> renders as "- Feb 3, 2015": remove the
                            # literal "-" separator. (The previous
                            # .lstrip(' - ') stripped a *character set*, not a
                            # prefix — flake8-bugbear B005.)
                            raw_date = date_tag.get_text(strip=True)
                            quoted_date = raw_date.removeprefix('-').lstrip()

                        quote_heading.decompose()

                    # Remove post-id spans (metadata, not quote text)
                    for post_id_span in blockquote.find_all('span', class_='post-id'):
                        post_id_span.decompose()

                    quote_text = self._clean_text(blockquote.get_text())
                    if quote_text:
                        quote_parts.append(quote_text)

                    # Remove the blockquote so the remaining document text is
                    # just the author's own reply.
                    blockquote.decompose()

                quoted_content = " ".join(quote_parts) if quote_parts else None

            # Whatever text is left after removing blockquotes is the reply.
            reply_content = self._clean_text(soup.get_text())

            return {
                "reply_content": reply_content,
                "quoted_content": quoted_content,
                "quoted_author": quoted_author,
                "quoted_date": quoted_date,
                "has_quote": has_quote
            }

        except Exception as e:
            logger.warning(f"Error parsing HTML content: {e}")
            # Fallback: regex-strip tags so we still return usable text.
            return {
                "reply_content": self._clean_text(self._strip_html_tags(html_content)),
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

    def _clean_text(self, text: str) -> str:
        """
        Clean extracted text by removing extra whitespace and normalizing.

        Decodes HTML entities and collapses all runs of whitespace
        (including newlines) to single spaces.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Decode HTML entities
        text = html.unescape(text)

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

    def _strip_html_tags(self, html_content: str) -> str:
        """
        Fallback method to strip HTML tags if BeautifulSoup fails.

        Args:
            html_content: HTML content

        Returns:
            Text without HTML tags
        """
        # Remove HTML tags
        clean = re.sub(r'<[^>]+>', ' ', html_content)
        # Decode entities
        clean = html.unescape(clean)
        # Clean whitespace
        clean = re.sub(r'\s+', ' ', clean)
        return clean.strip()

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extract plain text from HTML content, preserving readability.

        Args:
            html_content: HTML content

        Returns:
            Plain text version

        Note:
            _clean_text collapses all whitespace, so the <br>/<p> newlines
            inserted here end up as single spaces (word boundaries), not
            line breaks.
        """
        if not html_content:
            return ""

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Add newlines for block elements so adjacent blocks don't fuse
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for p in soup.find_all('p'):
                p.append('\n')

            text = soup.get_text()
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Error extracting plain text: {e}")
            return self._clean_text(self._strip_html_tags(html_content))

    def build_thread_context(
        self,
        thread_title: Optional[str],
        first_post_content: Optional[str],
        category_title: Optional[str] = None,
        category_topic: Optional[str] = None
    ) -> str:
        """
        Build a context string from thread information.

        Args:
            thread_title: Title of the discussion thread
            first_post_content: Content of the first post in the thread
            category_title: Category title
            category_topic: Category topic

        Returns:
            Formatted context string ("" when no parts are available)
        """
        context_parts = []

        if category_title:
            context_parts.append(f"Category: {category_title}")

        if category_topic:
            context_parts.append(f"Topic: {category_topic}")

        if thread_title:
            context_parts.append(f"Thread: {thread_title}")

        if first_post_content:
            # Parse and clean the first post content
            parsed = self.parse_post_content(first_post_content)
            first_post_text = parsed.get("reply_content", "")
            if first_post_text:
                # Truncate if too long (keeps LLM prompts bounded)
                if len(first_post_text) > 500:
                    first_post_text = first_post_text[:500] + "..."
                context_parts.append(f"Original discussion: {first_post_text}")

        return " | ".join(context_parts) if context_parts else ""

    def is_empty_content(self, html_content: str) -> bool:
        """
        Check if HTML content is effectively empty.

        Args:
            html_content: HTML content to check

        Returns:
            True if content is empty or contains no meaningful text
        """
        if not html_content:
            return True

        text = self.extract_plain_text(html_content)
        return len(text.strip()) == 0
processing_brand_sentiment/workflow/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Workflow module for brand sentiment analysis.
3
+ Contains the LangGraph orchestrators and agent implementations.
4
+ Supports both forum posts and social media comments.
5
+ """
6
+
7
+ from .orchestrator import BrandAnalysisWorkflow
8
+ from .comment_orchestrator import CommentAnalysisWorkflow
9
+
10
+ __all__ = ['BrandAnalysisWorkflow', 'CommentAnalysisWorkflow']
processing_brand_sentiment/workflow/agents/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agents module for brand sentiment analysis v4.0.
3
+
4
+ Contains specialized agents for the 4-stage pipeline:
5
+ 1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (forums)
6
+ CommentPreprocessorAgent - Plain text cleaning, keyword detection (comments)
7
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction
8
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis
9
+ 4. OutputValidatorAgent - Rule-based validation
10
+ """
11
+
12
+ from .base_agent import BaseAgent
13
+ from .content_preprocessor_agent import ContentPreprocessorAgent
14
+ from .comment_preprocessor_agent import CommentPreprocessorAgent
15
+ from .sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
16
+ from .sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
17
+ from .output_validator_agent import OutputValidatorAgent
18
+
19
+ # Legacy imports for backward compatibility
20
+ from .preprocessor_agent import PreprocessorAgent
21
+ from .relevance_validator_agent import RelevanceValidatorAgent
22
+ from .sabian_analyzer_agent import SabianAnalyzerAgent
23
+
24
+ __all__ = [
25
+ # Base
26
+ 'BaseAgent',
27
+
28
+ # New agents (v4.0)
29
+ 'ContentPreprocessorAgent',
30
+ 'CommentPreprocessorAgent',
31
+ 'SabianRelevanceExtractionAgent',
32
+ 'SabianSentimentAnalyzerAgent',
33
+ 'OutputValidatorAgent',
34
+
35
+ # Legacy agents (for backward compatibility)
36
+ 'PreprocessorAgent',
37
+ 'RelevanceValidatorAgent',
38
+ 'SabianAnalyzerAgent'
39
+ ]
processing_brand_sentiment/workflow/agents/base_agent.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Agent class for all agents in the brand sentiment analysis workflow.
3
+ Provides a common interface and structure for extensibility.
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Any, Optional
8
+ import json
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BaseAgent(ABC):
    """
    Abstract base class for all agents in the brand sentiment analysis workflow.

    Provides common functionality (config access, logging, error handling and
    LLM JSON-response parsing) and enforces a consistent interface through the
    abstract process()/validate_input() methods.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        """
        Initialize the base agent.

        Args:
            name: Name of the agent
            config: Configuration dictionary for the agent. Recognized keys:
                "model" (default "gpt-5-nano"), "temperature" (default 0.2),
                "max_retries" (default 3).
        """
        self.name = name
        self.config = config
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.
        This method must be implemented by all concrete agent classes.

        Args:
            input_data: Dictionary containing input data for processing

        Returns:
            Dictionary containing processing results
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.

        Args:
            input_data: Dictionary containing input data

        Returns:
            True if input is valid, False otherwise
        """
        pass

    def get_name(self) -> str:
        """Get the agent name."""
        return self.name

    def get_config(self) -> Dict[str, Any]:
        """Get the agent configuration."""
        return self.config

    def log_processing(self, message: str, level: str = "info"):
        """
        Log processing information, prefixed with the agent name.

        Args:
            message: Log message
            level: Log level (info, warning, error, debug); unknown levels
                fall back to info.
        """
        log_method = getattr(logger, level, logger.info)
        log_method(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Handle errors consistently across all agents.

        Args:
            error: The exception that occurred
            context: Additional context about the error

        Returns:
            Error dictionary with keys: success (False), error, agent, context
        """
        error_msg = f"Error in {self.name}"
        if context:
            error_msg += f" ({context})"
        error_msg += f": {str(error)}"

        logger.error(error_msg)

        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context
        }

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse an LLM response that may contain JSON wrapped in markdown code
        blocks, possibly surrounded by explanatory prose.

        Strategy: try to parse the raw (stripped) content first; if that fails,
        look for a ``` fenced block (with an optional, case-insensitive "json"
        language tag) anywhere in the response and parse its body. This is more
        robust than only stripping a fence at the very start of the string,
        since LLMs often prepend text such as "Here is the JSON:".

        Args:
            response_content: Raw response content from LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If no JSON can be parsed
        """
        content = response_content.strip()

        # Fast path: the response is already plain JSON.
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            pass

        # Fallback: extract the body of the first fenced code block.
        lower = content.lower()
        start = lower.find("```")
        if start != -1:
            body_start = start + 3
            # Skip an optional "json" language tag after the opening fence.
            if lower[body_start:body_start + 4] == "json":
                body_start += 4
            end = content.find("```", body_start)
            body = content[body_start:end] if end != -1 else content[body_start:]
            content = body.strip()

        # Parse the cleaned JSON (raises JSONDecodeError on failure).
        return json.loads(content)

    def _safe_get(self, data: Dict[str, Any], key: str, default: Any = None) -> Any:
        """
        Safely get a value from a dictionary with a default.

        Args:
            data: Dictionary to get value from
            key: Key to look up
            default: Default value if key not found

        Returns:
            Value from dictionary or default
        """
        return data.get(key, default)

    def _ensure_list(self, value: Any) -> list:
        """
        Ensure a value is a list.

        None becomes []; lists pass through unchanged; strings are split on
        commas with empty items dropped; anything else is wrapped in a
        single-item list.

        Args:
            value: Value to convert

        Returns:
            List version of value
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            # Try to parse as comma-separated
            return [v.strip() for v in value.split(",") if v.strip()]
        return [value]
processing_brand_sentiment/workflow/agents/comment_preprocessor_agent.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comment Preprocessor Agent for brand sentiment analysis on social media comments.
3
+
4
+ Extends ContentPreprocessorAgent but handles plain text (no HTML parsing).
5
+ Builds context from content title, content description, and parent comment text
6
+ instead of thread title and first post.
7
+
8
+ Reuses: keyword sets, product alias mapping, language detection, relevance screening.
9
+ Overrides: process() method for plain text handling and comment-specific context building.
10
+ """
11
+
12
+ from typing import Dict, Any, Optional
13
+ import logging
14
+
15
+ from .content_preprocessor_agent import ContentPreprocessorAgent
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class CommentPreprocessorAgent(ContentPreprocessorAgent):
    """
    Agent that preprocesses social media comments for brand sentiment analysis.

    Everything keyword-related (keyword sets, product alias mapping, language
    detection, relevance screening) is inherited from ContentPreprocessorAgent.
    Differences from the forum preprocessor:
    - comments are plain text, so no HTML parsing is performed
    - context is assembled from the post title/description and parent comment
    - input fields are comment-specific (comment_sk / comment_text)
    """

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Comment Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords, products, and aliases
        """
        super().__init__(config, brand_config)
        # The parent constructor names the agent after itself; rename here.
        self.name = "CommentPreprocessorAgent"

        logger.info(
            f"CommentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields for comment processing.

        Args:
            input_data: Input dictionary

        Returns:
            True if both comment_sk and comment_text are present, False otherwise
        """
        return "comment_sk" in input_data and "comment_text" in input_data

    def _build_comment_context(
        self,
        content_title: Optional[str] = None,
        content_description: Optional[str] = None,
        parent_comment_text: Optional[str] = None
    ) -> str:
        """
        Build context string from social media content and parent comment information.

        Args:
            content_title: Title of the social media post/content
            content_description: Description/message of the social media post
            parent_comment_text: Text of the parent comment (if this is a reply)

        Returns:
            " | "-separated context string; "" when nothing is available
        """
        def _clip(value: str) -> str:
            # Keep long fields from dominating the context.
            return value[:500] + "..." if len(value) > 500 else value

        pieces = []
        if content_title:
            pieces.append(f"Post title: {content_title}")
        if content_description:
            pieces.append(f"Post description: {_clip(content_description)}")
        if parent_comment_text:
            pieces.append(f"Parent comment: {_clip(parent_comment_text)}")

        return " | ".join(pieces) if pieces else ""

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a social media comment through the preprocessing pipeline.

        Unlike forum posts, comments are plain text (no HTML parsing needed).
        Context is built from content title, description, and parent comment.

        Args:
            input_data: Dictionary containing comment data with at least:
                - comment_sk: Comment surrogate key
                - comment_text: Raw comment text (plain text)
                - content_title: Title of the post (optional)
                - content_description: Description of the post (optional)
                - parent_comment_text: Parent comment text if reply (optional)

        Returns:
            Dictionary with preprocessing results; a handle_error dict on failure
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_sk, comment_text)",
                    **input_data
                }

            raw_text = input_data.get("comment_text", "")

            # Step 1: Clean text (plain text - no HTML parsing needed).
            cleaned = raw_text.strip() if raw_text else ""

            # Empty or too-short comments short-circuit the pipeline.
            if not cleaned or len(cleaned) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": cleaned,
                    "quoted_content": None,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "comment_text"}
                }

            # Step 2: Keyword-based relevance screening (inherited).
            relevance = self._check_relevance(cleaned)
            primary_hit = relevance.get("has_primary_keywords", False)

            # Step 3: Assemble surrounding context for downstream agents.
            context = self._build_comment_context(
                content_title=input_data.get("content_title"),
                content_description=input_data.get("content_description"),
                parent_comment_text=input_data.get("parent_comment_text")
            )

            # Step 4: Language detection (inherited).
            language = self._detect_language(cleaned, primary_hit)

            # Step 5: Product and competitor mention extraction (inherited).
            products = self._extract_mentioned_products(cleaned)
            competitors = self._extract_mentioned_competitors(cleaned)

            # A parent comment plays the role a quoted post does on forums.
            parent_text = input_data.get("parent_comment_text")
            has_parent = parent_text is not None and str(parent_text).strip() != ""

            # NOTE: the input_data spread stays last so caller-provided keys
            # win on collision, matching the forum preprocessor's behavior.
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": cleaned,
                "quoted_content": parent_text if has_parent else None,
                "has_quote": has_parent,
                "quoted_author": None,
                "raw_thread_context": context,

                # Language detection
                "detected_language": language["language"],
                "language_code": language["language_code"],
                "is_english": language["is_english"],
                "language_confidence": language["confidence"],
                "language_detection_skipped": language.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance["preliminary_relevant"],
                "needs_relevance_validation": relevance["needs_relevance_validation"],
                "relevance_keywords_found": relevance["found_keywords"],
                "relevance_type": relevance["relevance_type"],
                "relevance_confidence": relevance["relevance_confidence"],
                "has_primary_keywords": primary_hit,

                # Initial extractions
                "products_detected": products,
                "competitors_detected": competitors,

                # Preserve original data (exclude raw text to avoid duplication)
                **{k: v for k, v in input_data.items() if k not in ["comment_text"]}
            }

            # Keep original content for reference
            result["original_text"] = raw_text

            self.log_processing(
                f"Processed comment {input_data.get('comment_sk')}: "
                f"lang={language['language']}, "
                f"relevant={relevance['preliminary_relevant']}, "
                f"needs_validation={relevance['needs_relevance_validation']}, "
                f"products={products}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing comment {input_data.get('comment_sk')}")
processing_brand_sentiment/workflow/agents/content_preprocessor_agent.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Preprocessor Agent for brand sentiment analysis.
3
+ Handles HTML parsing, text cleaning, language detection, product alias mapping,
4
+ and initial relevance screening. This is a deterministic agent (no LLM calls).
5
+
6
+ Enhanced version with:
7
+ - Product alias mapping (B8 -> B8X)
8
+ - Smart language detection (skip for short texts)
9
+ - Always process if primary keywords found
10
+ - Better content separation
11
+ """
12
+
13
+ import re
14
+ from typing import Dict, Any, List, Optional, Set
15
+ from lingua import Language, LanguageDetectorBuilder
16
+ import logging
17
+
18
+ from .base_agent import BaseAgent
19
+ from utils.html_parser import HTMLParser
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class ContentPreprocessorAgent(BaseAgent):
25
+ """
26
+ Agent that preprocesses forum posts:
27
+ - Parses HTML to extract reply and quoted content
28
+ - Cleans and normalizes text
29
+ - Maps product aliases to canonical names
30
+ - Detects language (with smart handling for short texts)
31
+ - Performs initial keyword-based relevance screening
32
+ """
33
+
34
+ # Lingua to ISO 639-1 language code mapping
35
+ LINGUA_TO_ISO = {
36
+ Language.ENGLISH: "en",
37
+ Language.SPANISH: "es",
38
+ Language.FRENCH: "fr",
39
+ Language.GERMAN: "de",
40
+ Language.ITALIAN: "it",
41
+ Language.PORTUGUESE: "pt",
42
+ Language.RUSSIAN: "ru",
43
+ Language.JAPANESE: "ja",
44
+ Language.KOREAN: "ko",
45
+ Language.CHINESE: "zh",
46
+ Language.ARABIC: "ar",
47
+ Language.HINDI: "hi",
48
+ Language.DUTCH: "nl",
49
+ Language.SWEDISH: "sv",
50
+ Language.POLISH: "pl",
51
+ Language.TURKISH: "tr"
52
+ }
53
+
54
+ def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
55
+ """
56
+ Initialize the Content Preprocessor Agent.
57
+
58
+ Args:
59
+ config: Agent configuration
60
+ brand_config: Brand-specific configuration with keywords, products, and aliases
61
+ """
62
+ super().__init__("ContentPreprocessorAgent", config)
63
+ self.brand_config = brand_config
64
+ self.html_parser = HTMLParser()
65
+
66
+ # Get preprocessing settings
67
+ preprocessing_config = brand_config.get("preprocessing", {})
68
+ self.min_length_for_lang_detection = preprocessing_config.get(
69
+ "min_length_for_language_detection", 50
70
+ )
71
+ self.default_language = preprocessing_config.get(
72
+ "default_language_for_short_text", "English"
73
+ )
74
+ self.always_process_primary = preprocessing_config.get(
75
+ "always_process_if_primary_keyword", True
76
+ )
77
+ self.min_content_length = preprocessing_config.get("min_content_length", 3)
78
+
79
+ # Initialize lingua detector
80
+ self.language_detector = LanguageDetectorBuilder.from_all_languages().build()
81
+
82
+ # Build keyword sets and alias mappings
83
+ self._build_keyword_sets()
84
+ self._build_alias_mappings()
85
+
86
+ logger.info(
87
+ f"ContentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
88
+ f"{len(self.product_aliases)} product aliases"
89
+ )
90
+
91
+ def _build_keyword_sets(self) -> None:
92
+ """Build keyword sets from brand configuration for efficient relevance checking."""
93
+ relevance_config = self.brand_config.get("relevance_keywords", {})
94
+
95
+ # Primary keywords - definitive Sabian mentions
96
+ primary = relevance_config.get("primary", {}).get("keywords", [])
97
+ self.primary_keywords: Set[str] = set(k.lower() for k in primary)
98
+
99
+ # Contextual keywords - need disambiguation (HH, AA)
100
+ contextual = relevance_config.get("contextual", {}).get("keywords", [])
101
+ self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)
102
+
103
+ # Cymbal context keywords - help disambiguate contextual terms
104
+ cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
105
+ self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)
106
+
107
+ # Competitor names and aliases for detection
108
+ competitors = self.brand_config.get("brand", {}).get("competitors", [])
109
+ self.competitor_keywords: Set[str] = set()
110
+ self.competitor_name_map: Dict[str, str] = {} # alias -> canonical name
111
+
112
+ for comp in competitors:
113
+ if isinstance(comp, dict):
114
+ name = comp.get("name", "")
115
+ self.competitor_keywords.add(name.lower())
116
+ self.competitor_name_map[name.lower()] = name
117
+ for alias in comp.get("aliases", []):
118
+ alias_lower = alias.lower()
119
+ self.competitor_keywords.add(alias_lower)
120
+ self.competitor_name_map[alias_lower] = name
121
+ else:
122
+ comp_str = str(comp).lower()
123
+ self.competitor_keywords.add(comp_str)
124
+ self.competitor_name_map[comp_str] = str(comp)
125
+
126
+ # Product names
127
+ products = self.brand_config.get("brand", {}).get("products", [])
128
+ self.product_keywords: Set[str] = set(p.lower() for p in products)
129
+ self.products_list = products # Keep original case
130
+
131
+ logger.debug(
132
+ f"Built keyword sets: {len(self.primary_keywords)} primary, "
133
+ f"{len(self.contextual_keywords)} contextual, "
134
+ f"{len(self.product_keywords)} products, "
135
+ f"{len(self.competitor_keywords)} competitor terms"
136
+ )
137
+
138
+ def _build_alias_mappings(self) -> None:
139
+ """Build product alias mappings from brand configuration."""
140
+ aliases = self.brand_config.get("brand", {}).get("product_aliases", {})
141
+
142
+ # Build alias -> canonical product mapping
143
+ self.product_aliases: Dict[str, str] = {}
144
+ for alias, canonical in aliases.items():
145
+ self.product_aliases[alias.lower()] = canonical
146
+
147
+ # Also add primary keywords that are aliases to contextual keywords
148
+ # e.g., "b8" should trigger contextual check since it maps to "B8X"
149
+ for alias in self.product_aliases.keys():
150
+ if alias not in self.primary_keywords:
151
+ self.contextual_keywords.add(alias)
152
+
153
+ logger.debug(f"Built {len(self.product_aliases)} product alias mappings")
154
+
155
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
156
+ """
157
+ Validate that input contains required fields.
158
+
159
+ Args:
160
+ input_data: Input dictionary
161
+
162
+ Returns:
163
+ True if valid, False otherwise
164
+ """
165
+ required_fields = ["post_id", "post_content"]
166
+ return all(field in input_data for field in required_fields)
167
+
168
    def _detect_language(self, text: str, has_primary_keywords: bool = False) -> Dict[str, Any]:
        """
        Detect the language of text using lingua library.

        Enhanced logic:
        - Skip detection for short texts (< min_length_for_lang_detection chars)
        - Always return English if primary Sabian keywords are found

        Args:
            text: Text to analyze
            has_primary_keywords: Whether primary Sabian keywords were found

        Returns:
            Dictionary with language detection results; always contains
            "language", "language_code", "is_english", "confidence",
            "detection_skipped" and "skip_reason" (some paths add
            "original_detected_language"/"override_reason"/"detection_error").
        """
        try:
            cleaned_text = text.strip()

            # If text is too short, default to English
            # NOTE(review): language_code is hard-coded to "en" on this path even
            # if default_language is configured to something else — confirm intended.
            if len(cleaned_text) < self.min_length_for_lang_detection:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": True,
                    "skip_reason": f"Text too short ({len(cleaned_text)} < {self.min_length_for_lang_detection} chars)"
                }

            # If primary keywords found and always_process_primary is True, treat as English
            if has_primary_keywords and self.always_process_primary:
                # Still try to detect, but override if non-English
                detected = self.language_detector.detect_language_of(cleaned_text)

                if detected == Language.ENGLISH:
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "high",
                        "detection_skipped": False,
                        "skip_reason": None
                    }
                else:
                    # Primary keyword found but detected as non-English
                    # Force to English since Sabian is explicitly mentioned
                    lang_name = detected.name.capitalize() if detected else "Unknown"
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "medium",
                        "detection_skipped": False,
                        "skip_reason": None,
                        "original_detected_language": lang_name,
                        "override_reason": "Primary Sabian keyword found, treating as English"
                    }

            # Standard detection
            detected = self.language_detector.detect_language_of(cleaned_text)

            if detected is None:
                # Detector could not decide; fall back to the configured default.
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            if detected == Language.ENGLISH:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            # Non-English: map the lingua Language constant to an ISO 639-1 code.
            lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
            lang_name = detected.name.capitalize()

            return {
                "language": lang_name,
                "language_code": lang_code,
                "is_english": False,
                "confidence": "high",
                "detection_skipped": False,
                "skip_reason": None
            }

        except Exception as e:
            # Best-effort: never fail preprocessing because detection errored.
            logger.warning(f"Language detection failed: {e}")
            return {
                "language": self.default_language,
                "language_code": "en",
                "is_english": True,
                "confidence": "low",
                "detection_skipped": False,
                "skip_reason": None,
                "detection_error": str(e)
            }
272
+
273
+ def _normalize_product_mentions(self, found_products: List[str]) -> List[str]:
274
+ """
275
+ Normalize product mentions using alias mappings.
276
+
277
+ Args:
278
+ found_products: List of product terms found
279
+
280
+ Returns:
281
+ List of canonical product names
282
+ """
283
+ normalized = []
284
+ for product in found_products:
285
+ product_lower = product.lower()
286
+
287
+ # Check if it's an alias
288
+ if product_lower in self.product_aliases:
289
+ canonical = self.product_aliases[product_lower]
290
+ if canonical not in normalized:
291
+ normalized.append(canonical)
292
+ # Check if it's a direct product match
293
+ elif product_lower in self.product_keywords:
294
+ # Find the original case version
295
+ for p in self.products_list:
296
+ if p.lower() == product_lower:
297
+ if p not in normalized:
298
+ normalized.append(p)
299
+ break
300
+
301
+ return normalized
302
+
303
    def _check_relevance(self, text: str) -> Dict[str, Any]:
        """
        Check if text is relevant to the brand using keyword matching.

        Enhanced to handle product aliases.

        Matching priority: primary keywords / product aliases (definitive) >
        contextual keywords (need LLM validation) > competitor-only mentions
        (possibly comparative) > nothing found.

        Returns:
            Dictionary with relevance assessment
        """
        text_lower = text.lower()

        # Tokenize for word boundary matching
        # NOTE(review): primary/contextual/competitor keywords are matched via
        # set intersection on single tokens below, so a multi-word keyword in
        # those sets can never match — confirm configured keywords are single
        # words (only product aliases get a multi-word substring check).
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        # Also check for multi-word phrases (for aliases like "hand hammered")
        all_aliases = set(self.product_aliases.keys())

        # Check for primary keywords (definitive matches)
        found_primary = self.primary_keywords.intersection(words)

        # Check for product aliases in text
        found_aliases = []
        for alias in all_aliases:
            if ' ' in alias:
                # Multi-word alias - check in full text
                if alias in text_lower:
                    found_aliases.append(alias)
            elif alias in words:
                found_aliases.append(alias)

        # Map aliases to canonical products
        alias_products = []
        for alias in found_aliases:
            if alias in self.product_aliases:
                canonical = self.product_aliases[alias]
                if canonical not in alias_products:
                    alias_products.append(canonical)

        # Primary keyword or alias hit: definitively relevant, no LLM check.
        if found_primary or alias_products:
            all_found = list(found_primary) + found_aliases
            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": False,
                "found_keywords": all_found,
                "mapped_products": alias_products,
                "relevance_type": "primary",
                "relevance_confidence": "high",
                "has_primary_keywords": True
            }

        # Check for contextual keywords (need validation)
        found_contextual = self.contextual_keywords.intersection(words)
        if found_contextual:
            # Check if there's cymbal context
            found_cymbal_context = self.cymbal_context_keywords.intersection(words)
            has_cymbal_context = len(found_cymbal_context) > 0

            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": True,
                "found_keywords": list(found_contextual),
                "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
                "has_cymbal_context": has_cymbal_context,
                "mapped_products": [],
                "relevance_type": "contextual",
                # Cymbal-related vocabulary nearby raises confidence a notch.
                "relevance_confidence": "medium" if has_cymbal_context else "low",
                "has_primary_keywords": False
            }

        # Check for competitor mentions (might be comparative discussion)
        found_competitors = self.competitor_keywords.intersection(words)
        if found_competitors:
            return {
                "preliminary_relevant": False,
                "needs_relevance_validation": True,
                "found_keywords": list(found_competitors),
                "mapped_products": [],
                "relevance_type": "competitor_only",
                "relevance_confidence": "low",
                "has_primary_keywords": False
            }

        # No relevant keywords found
        return {
            "preliminary_relevant": False,
            "needs_relevance_validation": False,
            "found_keywords": [],
            "mapped_products": [],
            "relevance_type": "none",
            "relevance_confidence": "high",
            "has_primary_keywords": False
        }
395
+
396
+ def _extract_mentioned_products(self, text: str) -> List[str]:
397
+ """
398
+ Extract product names mentioned in the text, including aliases.
399
+
400
+ Args:
401
+ text: Text to search
402
+
403
+ Returns:
404
+ List of canonical product names found
405
+ """
406
+ text_lower = text.lower()
407
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
408
+
409
+ found_products = []
410
+
411
+ # Check direct product mentions
412
+ for product in self.products_list:
413
+ if product.lower() in words:
414
+ if product not in found_products:
415
+ found_products.append(product)
416
+
417
+ # Check aliases
418
+ for alias, canonical in self.product_aliases.items():
419
+ if ' ' in alias:
420
+ # Multi-word alias
421
+ if alias in text_lower:
422
+ if canonical not in found_products:
423
+ found_products.append(canonical)
424
+ elif alias in words:
425
+ if canonical not in found_products:
426
+ found_products.append(canonical)
427
+
428
+ return found_products
429
+
430
+ def _extract_mentioned_competitors(self, text: str) -> List[str]:
431
+ """
432
+ Extract competitor brand names mentioned in the text.
433
+
434
+ Args:
435
+ text: Text to search
436
+
437
+ Returns:
438
+ List of canonical competitor names found
439
+ """
440
+ text_lower = text.lower()
441
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
442
+
443
+ found_competitors = set()
444
+
445
+ for alias in self.competitor_keywords:
446
+ if ' ' in alias:
447
+ # Multi-word check
448
+ if alias in text_lower:
449
+ canonical = self.competitor_name_map.get(alias, alias)
450
+ found_competitors.add(canonical)
451
+ elif alias in words:
452
+ canonical = self.competitor_name_map.get(alias, alias)
453
+ found_competitors.add(canonical)
454
+
455
+ return list(found_competitors)
456
+
457
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline (order matters): parse HTML -> keyword relevance check ->
        build raw thread context -> language detection (which may be skipped
        when primary keywords are present) -> product/competitor extraction.
        Relevance and extraction run ONLY on the author's own reply text,
        not on quoted content or thread context.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results. On success it carries all
            input keys except post_content (re-attached as original_content
            for non-empty posts) plus the derived fields below.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content; such posts are short-circuited as
            # non-relevant without any further analysis.
            if not reply_content or len(reply_content.strip()) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Check relevance FIRST (needed for language detection logic)
            relevance_result = self._check_relevance(reply_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build thread context (raw - will be summarized by extraction agent)
            raw_thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 4: Detect language (with smart handling - the primary
            # keyword signal lets the detector skip work; see _detect_language)
            lang_result = self._detect_language(reply_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions from actual post content
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data (input keys override nothing above
                # unless the caller supplied same-named keys)
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
processing_brand_sentiment/workflow/agents/output_validator_agent.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Output Validator Agent for brand sentiment analysis.
3
+
4
+ This agent performs rule-based validation on the final output to ensure:
5
+ 1. All values are from predefined lists
6
+ 2. Logical consistency between fields
7
+ 3. Anomaly detection for manual review flagging
8
+
9
+ This is a deterministic agent (no LLM calls) that acts as a quality gate.
10
+ """
11
+
12
+ from typing import Dict, Any, List, Set
13
+ import logging
14
+
15
+ from .base_agent import BaseAgent
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class OutputValidatorAgent(BaseAgent):
21
+ """
22
+ Agent that validates the final output for consistency and quality.
23
+
24
+ Performs rule-based checks without LLM calls to ensure data quality
25
+ and flag posts that may need manual review.
26
+ """
27
+
28
    def __init__(
        self,
        config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Output Validator Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration
            analysis_categories: Category definitions for validation
        """
        super().__init__("OutputValidatorAgent", config)
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # Build valid value sets for validation, precomputed once so that
        # per-post validation is pure set lookups (no LLM calls).
        self._build_valid_value_sets()

        logger.info("OutputValidatorAgent initialized")
51
+ def _build_valid_value_sets(self) -> None:
52
+ """Build sets of valid values for efficient validation."""
53
+ brand = self.brand_config.get("brand", {})
54
+
55
+ # Products
56
+ self.valid_products: Set[str] = set(
57
+ p.lower() for p in brand.get("products", [])
58
+ )
59
+ self.products_canonical = {p.lower(): p for p in brand.get("products", [])}
60
+
61
+ # Competitors
62
+ self.valid_competitors: Set[str] = set()
63
+ self.competitors_canonical = {}
64
+ for comp in brand.get("competitors", []):
65
+ if isinstance(comp, dict):
66
+ name = comp.get("name", "")
67
+ self.valid_competitors.add(name.lower())
68
+ self.competitors_canonical[name.lower()] = name
69
+
70
+ # Extract all category values
71
+ self.valid_values = {}
72
+
73
+ category_configs = {
74
+ "author_role": self.analysis_categories.get("author_role", {}),
75
+ "sabian_mention_context": self.analysis_categories.get("sabian_mention_context", {}),
76
+ "sentiment_level": self.analysis_categories.get("sentiment", {}),
77
+ "emotion_type": self.analysis_categories.get("emotions", {}),
78
+ "intents": self.analysis_categories.get("intents", {}),
79
+ "purchase_stage": self.analysis_categories.get("purchase_stage", {}),
80
+ "comparison_type": self.analysis_categories.get("comparison_type", {}),
81
+ "feedback_aspects": self.analysis_categories.get("feedback_aspects", {}),
82
+ "decision_drivers": self.analysis_categories.get("decision_drivers", {}),
83
+ "product_attributes": self.analysis_categories.get("product_attributes", {}),
84
+ }
85
+
86
+ for key, config in category_configs.items():
87
+ if "categories" in config:
88
+ self.valid_values[key] = set(
89
+ c["value"].lower() for c in config["categories"]
90
+ )
91
+ elif "levels" in config:
92
+ self.valid_values[key] = set(
93
+ c["value"].lower() for c in config["levels"]
94
+ )
95
+ else:
96
+ self.valid_values[key] = set()
97
+
98
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Validate that input contains required fields.

        The validator deliberately accepts any input; process() inspects and
        validates whichever fields are actually present.
        """
        return True
103
+ def _validate_list_values(
104
+ self,
105
+ values: List[Any],
106
+ valid_set: Set[str],
107
+ field_name: str
108
+ ) -> Dict[str, Any]:
109
+ """
110
+ Validate list values against a set of valid values.
111
+
112
+ Returns:
113
+ Dictionary with validation results
114
+ """
115
+ if not values:
116
+ return {"valid": True, "invalid_values": [], "field": field_name}
117
+
118
+ invalid = []
119
+ for v in values:
120
+ if isinstance(v, str) and v.lower() not in valid_set:
121
+ invalid.append(v)
122
+
123
+ return {
124
+ "valid": len(invalid) == 0,
125
+ "invalid_values": invalid,
126
+ "field": field_name
127
+ }
128
+
129
+ def _validate_single_value(
130
+ self,
131
+ value: Any,
132
+ valid_set: Set[str],
133
+ field_name: str,
134
+ allow_none: bool = True
135
+ ) -> Dict[str, Any]:
136
+ """
137
+ Validate a single value against a set of valid values.
138
+
139
+ Returns:
140
+ Dictionary with validation results
141
+ """
142
+ if value is None:
143
+ return {"valid": allow_none, "invalid_value": None if allow_none else value, "field": field_name}
144
+
145
+ if isinstance(value, str) and value.lower() in valid_set:
146
+ return {"valid": True, "invalid_value": None, "field": field_name}
147
+
148
+ return {"valid": False, "invalid_value": value, "field": field_name}
149
+
150
+ def _check_logical_consistency(self, data: Dict[str, Any]) -> List[str]:
151
+ """
152
+ Check for logical consistency between fields.
153
+
154
+ Note: Empty products_mentioned is OK even when relevant - users may
155
+ discuss the Sabian brand generally without specific products.
156
+
157
+ Returns:
158
+ List of inconsistency warnings
159
+ """
160
+ warnings = []
161
+ is_relevant = data.get("is_relevant", False)
162
+
163
+ # Check 1: If not relevant, certain fields should be empty/null
164
+ if not is_relevant:
165
+ if data.get("sabian_mention_context"):
166
+ warnings.append(
167
+ "sabian_mention_context should be null when is_relevant=False"
168
+ )
169
+ if data.get("sentiment_level") and data.get("sentiment_level") != "neutral":
170
+ warnings.append(
171
+ "sentiment_level should be null/neutral when is_relevant=False"
172
+ )
173
+
174
+ # Check 2: Comparison type should only be set if comparing intent exists
175
+ if data.get("comparison_type"):
176
+ intents = data.get("intents", [])
177
+ if "comparing" not in intents:
178
+ warnings.append(
179
+ "comparison_type is set but 'comparing' not in intents"
180
+ )
181
+
182
+ # Check 3: Author perspective fields consistency
183
+ # If author is giving advice (providing_information) without sharing experience,
184
+ # pain_points and delight_factors should typically be empty
185
+ intents = data.get("intents", [])
186
+ if "providing_information" in intents and "sharing_experience" not in intents:
187
+ if data.get("pain_points") or data.get("delight_factors"):
188
+ warnings.append(
189
+ "pain_points/delight_factors set for advice-giving post without sharing_experience intent"
190
+ )
191
+
192
+ return warnings
193
+
194
+ def _fix_overlapping_feedback(self, data: Dict[str, Any]) -> Dict[str, Any]:
195
+ """
196
+ Fix overlapping values between pain_points and delight_factors.
197
+
198
+ Rule: The same aspect cannot be both a pain point and a delight factor.
199
+ Resolution: Use sentiment to determine which to keep, or clear both if neutral.
200
+
201
+ Args:
202
+ data: Dictionary with analysis results
203
+
204
+ Returns:
205
+ Updated dictionary with fixed pain_points and delight_factors
206
+ """
207
+ pain_points = data.get("pain_points", []) or []
208
+ delight_factors = data.get("delight_factors", []) or []
209
+
210
+ if not pain_points or not delight_factors:
211
+ return data
212
+
213
+ # Find overlapping values
214
+ pain_set = set(p.lower() if isinstance(p, str) else p for p in pain_points)
215
+ delight_set = set(d.lower() if isinstance(d, str) else d for d in delight_factors)
216
+ overlap = pain_set.intersection(delight_set)
217
+
218
+ if not overlap:
219
+ return data
220
+
221
+ # Get sentiment to determine which to keep
222
+ sentiment = data.get("sentiment_level", "neutral")
223
+
224
+ # Create new lists without overlapping values
225
+ if sentiment in ["positive", "very_positive"]:
226
+ # Keep in delight_factors, remove from pain_points
227
+ new_pain_points = [p for p in pain_points if p.lower() not in overlap]
228
+ new_delight_factors = delight_factors
229
+ elif sentiment in ["negative", "very_negative"]:
230
+ # Keep in pain_points, remove from delight_factors
231
+ new_pain_points = pain_points
232
+ new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
233
+ else:
234
+ # Neutral sentiment - clear both (can't determine intent)
235
+ new_pain_points = [p for p in pain_points if p.lower() not in overlap]
236
+ new_delight_factors = [d for d in delight_factors if d.lower() not in overlap]
237
+
238
+ # Update data
239
+ data["pain_points"] = new_pain_points
240
+ data["delight_factors"] = new_delight_factors
241
+
242
+ logger.debug(
243
+ f"Fixed overlapping feedback: removed {overlap} from "
244
+ f"{'pain_points' if sentiment in ['positive', 'very_positive'] else 'delight_factors' if sentiment in ['negative', 'very_negative'] else 'both'}"
245
+ )
246
+
247
+ return data
248
+
249
+ def _detect_anomalies(self, data: Dict[str, Any]) -> List[str]:
250
+ """
251
+ Detect anomalies that might need manual review.
252
+
253
+ Returns:
254
+ List of anomaly flags
255
+ """
256
+ anomalies = []
257
+
258
+ # Anomaly 1: Low confidence relevance
259
+ if data.get("is_relevant") and data.get("relevance_confidence") == "low":
260
+ anomalies.append("low_confidence_relevant")
261
+
262
+ # Anomaly 2: Sarcasm detected - sentiment might be inverted
263
+ if data.get("sarcasm_detected"):
264
+ anomalies.append("sarcasm_detected")
265
+
266
+ # Anomaly 3: Very short content marked as relevant
267
+ content = data.get("cleaned_content", "")
268
+ if data.get("is_relevant") and len(content) < 20:
269
+ anomalies.append("short_relevant_content")
270
+
271
+ # Anomaly 4: Switching behavior detected
272
+ comparison_type = data.get("comparison_type", "")
273
+ if comparison_type in ["switching_to_sabian", "switching_from_sabian"]:
274
+ anomalies.append(f"brand_switching_{comparison_type}")
275
+
276
+ return anomalies
277
+
278
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process and validate the analysis output.

        Severity model: invalid products/competitors and invalid categorical
        values are errors (fail validation); invalid list-field entries and
        logical inconsistencies are warnings; anomalies become review flags.
        Non-relevant or skipped posts pass through unvalidated.

        Args:
            input_data: Dictionary with all analysis results

        Returns:
            Dictionary with validation results added (validation_passed,
            validation_errors, validation_warnings, validation_flags,
            processing_status)
        """
        try:
            validation_errors = []
            validation_warnings = []

            # Skip detailed validation for non-relevant or skipped posts
            if not input_data.get("is_relevant", False) or input_data.get("analysis_skipped", False):
                return {
                    **input_data,
                    "validation_passed": True,
                    "validation_errors": [],
                    "validation_warnings": [],
                    "validation_flags": [],
                    "processing_status": "completed"
                }

            # Fix overlapping pain_points and delight_factors (safety net)
            input_data = self._fix_overlapping_feedback(input_data)

            # Validate products_mentioned
            products_result = self._validate_list_values(
                input_data.get("products_mentioned", []),
                self.valid_products,
                "products_mentioned"
            )
            if not products_result["valid"]:
                validation_errors.append(
                    f"Invalid products: {products_result['invalid_values']}"
                )

            # Validate competitors_mentioned
            competitors_result = self._validate_list_values(
                input_data.get("competitors_mentioned", []),
                self.valid_competitors,
                "competitors_mentioned"
            )
            if not competitors_result["valid"]:
                validation_errors.append(
                    f"Invalid competitors: {competitors_result['invalid_values']}"
                )

            # Validate categorical fields:
            # (output field, key into self.valid_values, allow_none)
            categorical_validations = [
                ("author_role", "author_role", True),
                ("sabian_mention_context", "sabian_mention_context", True),
                ("sentiment_level", "sentiment_level", True),
                ("emotion_type", "emotion_type", True),
                ("purchase_stage", "purchase_stage", True),
                ("comparison_type", "comparison_type", True),
            ]

            for field, valid_key, allow_none in categorical_validations:
                result = self._validate_single_value(
                    input_data.get(field),
                    self.valid_values.get(valid_key, set()),
                    field,
                    allow_none
                )
                if not result["valid"]:
                    validation_errors.append(
                        f"Invalid {field}: {result['invalid_value']}"
                    )

            # Validate list fields (invalid entries are only warnings).
            # pain_points and delight_factors share the feedback_aspects vocab.
            list_validations = [
                ("intents", "intents"),
                ("product_attributes", "product_attributes"),
                ("pain_points", "feedback_aspects"),
                ("delight_factors", "feedback_aspects"),
                ("decision_drivers", "decision_drivers"),
            ]

            for field, valid_key in list_validations:
                result = self._validate_list_values(
                    input_data.get(field, []),
                    self.valid_values.get(valid_key, set()),
                    field
                )
                if not result["valid"]:
                    validation_warnings.append(
                        f"Invalid values in {field}: {result['invalid_values']}"
                    )

            # Check logical consistency
            consistency_warnings = self._check_logical_consistency(input_data)
            validation_warnings.extend(consistency_warnings)

            # Detect anomalies
            anomalies = self._detect_anomalies(input_data)

            # Determine overall validation status (warnings never fail it)
            validation_passed = len(validation_errors) == 0

            # Set processing status
            if validation_errors:
                processing_status = "validation_failed"
            elif anomalies:
                processing_status = "completed_with_flags"
            else:
                processing_status = "completed"

            result = {
                **input_data,
                "validation_passed": validation_passed,
                "validation_errors": validation_errors,
                "validation_warnings": validation_warnings,
                "validation_flags": anomalies,
                "processing_status": processing_status
            }

            # Only log when there is something noteworthy to report.
            if validation_errors or validation_warnings or anomalies:
                self.log_processing(
                    f"Validation complete: passed={validation_passed}, "
                    f"errors={len(validation_errors)}, warnings={len(validation_warnings)}, "
                    f"flags={anomalies}",
                    "debug"
                )

            return result

        except Exception as e:
            return self.handle_error(e, "output validation")
processing_brand_sentiment/workflow/agents/preprocessor_agent.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocessor Agent for brand sentiment analysis.
3
+ Handles HTML parsing, text cleaning, language detection, and initial relevance screening.
4
+ This is a deterministic agent (no LLM calls except for language detection fallback).
5
+ """
6
+
7
+ import re
8
+ from typing import Dict, Any, List, Optional, Set
9
+ from lingua import Language, LanguageDetectorBuilder
10
+ import logging
11
+
12
+ from .base_agent import BaseAgent
13
+ from utils.html_parser import HTMLParser
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class PreprocessorAgent(BaseAgent):
19
+ """
20
+ Agent that preprocesses forum posts:
21
+ - Parses HTML to extract reply and quoted content
22
+ - Cleans and normalizes text
23
+ - Detects language
24
+ - Performs initial keyword-based relevance screening
25
+ """
26
+
27
+ # Lingua to ISO 639-1 language code mapping
28
+ LINGUA_TO_ISO = {
29
+ Language.ENGLISH: "en",
30
+ Language.SPANISH: "es",
31
+ Language.FRENCH: "fr",
32
+ Language.GERMAN: "de",
33
+ Language.ITALIAN: "it",
34
+ Language.PORTUGUESE: "pt",
35
+ Language.RUSSIAN: "ru",
36
+ Language.JAPANESE: "ja",
37
+ Language.KOREAN: "ko",
38
+ Language.CHINESE: "zh",
39
+ Language.ARABIC: "ar",
40
+ Language.HINDI: "hi",
41
+ Language.DUTCH: "nl",
42
+ Language.SWEDISH: "sv",
43
+ Language.POLISH: "pl",
44
+ Language.TURKISH: "tr"
45
+ }
46
+
47
    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords and products
        """
        super().__init__("PreprocessorAgent", config)
        self.brand_config = brand_config
        self.html_parser = HTMLParser()

        # Initialize lingua detector.
        # NOTE(review): from_all_languages() loads every language lingua
        # supports, which is startup/memory heavy; restricting to the
        # languages in LINGUA_TO_ISO may be sufficient - confirm.
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

        # Build keyword sets for efficient lookup
        self._build_keyword_sets()

        logger.info("PreprocessorAgent initialized")
67
    def _build_keyword_sets(self) -> None:
        """Build keyword sets from brand configuration for efficient relevance checking.

        All keywords are lowercased once here so relevance checks can use
        O(1) set lookups/intersections against lowercased token sets.
        """
        relevance_config = self.brand_config.get("relevance_keywords", {})

        # Primary keywords - definitive Sabian mentions
        primary = relevance_config.get("primary", {}).get("keywords", [])
        self.primary_keywords: Set[str] = set(k.lower() for k in primary)

        # Contextual keywords - need disambiguation (HH, AA)
        contextual = relevance_config.get("contextual", {}).get("keywords", [])
        self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)

        # Cymbal context keywords - help disambiguate contextual terms
        cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
        self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)

        # Competitor names for detection. Config entries may be dicts with
        # aliases or plain strings; names and aliases go into one flat set.
        competitors = self.brand_config.get("brand", {}).get("competitors", [])
        self.competitor_keywords: Set[str] = set()
        for comp in competitors:
            if isinstance(comp, dict):
                self.competitor_keywords.add(comp.get("name", "").lower())
                for alias in comp.get("aliases", []):
                    self.competitor_keywords.add(alias.lower())
            else:
                self.competitor_keywords.add(str(comp).lower())

        # Product names
        products = self.brand_config.get("brand", {}).get("products", [])
        self.product_keywords: Set[str] = set(p.lower() for p in products)

        logger.info(f"Built keyword sets: {len(self.primary_keywords)} primary, "
                    f"{len(self.contextual_keywords)} contextual, "
                    f"{len(self.product_keywords)} products")
102
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
103
+ """
104
+ Validate that input contains required fields.
105
+
106
+ Args:
107
+ input_data: Input dictionary
108
+
109
+ Returns:
110
+ True if valid, False otherwise
111
+ """
112
+ required_fields = ["post_id", "post_content"]
113
+ return all(field in input_data for field in required_fields)
114
+
115
+ def _detect_language(self, text: str) -> Dict[str, Any]:
116
+ """
117
+ Detect the language of text using lingua library.
118
+
119
+ Args:
120
+ text: Text to analyze
121
+
122
+ Returns:
123
+ Dictionary with language detection results
124
+ """
125
+ try:
126
+ cleaned_text = text.strip()
127
+ if not cleaned_text or len(cleaned_text) < 3:
128
+ return {
129
+ "language": "English",
130
+ "language_code": "en",
131
+ "is_english": True,
132
+ "confidence": "low"
133
+ }
134
+
135
+ detected = self.language_detector.detect_language_of(cleaned_text)
136
+
137
+ if detected is None:
138
+ return {
139
+ "language": "English",
140
+ "language_code": "en",
141
+ "is_english": True,
142
+ "confidence": "low"
143
+ }
144
+
145
+ if detected == Language.ENGLISH:
146
+ return {
147
+ "language": "English",
148
+ "language_code": "en",
149
+ "is_english": True,
150
+ "confidence": "high"
151
+ }
152
+
153
+ lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
154
+ lang_name = detected.name.capitalize()
155
+
156
+ return {
157
+ "language": lang_name,
158
+ "language_code": lang_code,
159
+ "is_english": False,
160
+ "confidence": "high"
161
+ }
162
+
163
+ except Exception as e:
164
+ logger.warning(f"Language detection failed: {e}")
165
+ return {
166
+ "language": "English",
167
+ "language_code": "en",
168
+ "is_english": True,
169
+ "confidence": "low"
170
+ }
171
+
172
    def _check_relevance(self, text: str) -> Dict[str, Any]:
        """
        Check if text is relevant to the brand using keyword matching.

        Priority cascade (order matters): primary keywords win outright;
        otherwise contextual keywords mark the post as potentially relevant
        pending LLM validation; otherwise competitor-only mentions are
        flagged for LLM context checking; otherwise the post is irrelevant.

        Returns:
            Dictionary with relevance assessment:
            - preliminary_relevant: Initial relevance assessment
            - needs_relevance_validation: True if contains ambiguous terms needing LLM check
            - found_keywords: Keywords found in the text
            - relevance_type: 'primary', 'contextual', 'competitor_only', or 'none'
            - relevance_confidence: 'high', 'medium', or 'low'
        """
        text_lower = text.lower()

        # Tokenize for word boundary matching (avoids substring false hits)
        words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))

        # Check for primary keywords (definitive matches)
        found_primary = self.primary_keywords.intersection(words)
        if found_primary:
            return {
                "preliminary_relevant": True,
                "needs_relevance_validation": False,
                "found_keywords": list(found_primary),
                "relevance_type": "primary",
                "relevance_confidence": "high"
            }

        # Check for contextual keywords (need validation)
        found_contextual = self.contextual_keywords.intersection(words)
        if found_contextual:
            # Check if there's cymbal context; its presence raises confidence
            # from low to medium but does not remove the validation step.
            found_cymbal_context = self.cymbal_context_keywords.intersection(words)
            has_cymbal_context = len(found_cymbal_context) > 0

            return {
                "preliminary_relevant": True,  # Potentially relevant
                "needs_relevance_validation": True,  # Needs LLM confirmation
                "found_keywords": list(found_contextual),
                "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [],
                "has_cymbal_context": has_cymbal_context,
                "relevance_type": "contextual",
                "relevance_confidence": "medium" if has_cymbal_context else "low"
            }

        # Check for competitor mentions (might be comparative discussion)
        found_competitors = self.competitor_keywords.intersection(words)
        if found_competitors:
            # Has competitor mention but no Sabian mention
            # Could still be relevant in a comparison context
            return {
                "preliminary_relevant": False,
                "needs_relevance_validation": True,  # LLM should check context
                "found_keywords": list(found_competitors),
                "relevance_type": "competitor_only",
                "relevance_confidence": "low"
            }

        # No relevant keywords found
        return {
            "preliminary_relevant": False,
            "needs_relevance_validation": False,
            "found_keywords": [],
            "relevance_type": "none",
            "relevance_confidence": "high"
        }
238
+ def _extract_mentioned_products(self, text: str) -> List[str]:
239
+ """
240
+ Extract product names mentioned in the text.
241
+
242
+ Args:
243
+ text: Text to search
244
+
245
+ Returns:
246
+ List of product names found
247
+ """
248
+ text_lower = text.lower()
249
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
250
+
251
+ found_products = []
252
+ products = self.brand_config.get("brand", {}).get("products", [])
253
+
254
+ for product in products:
255
+ if product.lower() in words:
256
+ found_products.append(product)
257
+
258
+ return found_products
259
+
260
+ def _extract_mentioned_competitors(self, text: str) -> List[str]:
261
+ """
262
+ Extract competitor names mentioned in the text.
263
+
264
+ Args:
265
+ text: Text to search
266
+
267
+ Returns:
268
+ List of competitor names found
269
+ """
270
+ text_lower = text.lower()
271
+ words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower))
272
+
273
+ found_competitors = []
274
+ competitors = self.brand_config.get("brand", {}).get("competitors", [])
275
+
276
+ for comp in competitors:
277
+ if isinstance(comp, dict):
278
+ name = comp.get("name", "")
279
+ aliases = comp.get("aliases", [])
280
+
281
+ # Check name and aliases
282
+ if name.lower() in words:
283
+ if name not in found_competitors:
284
+ found_competitors.append(name)
285
+ else:
286
+ for alias in aliases:
287
+ if alias.lower() in words:
288
+ if name not in found_competitors:
289
+ found_competitors.append(name)
290
+ break
291
+ else:
292
+ if str(comp).lower() in words:
293
+ found_competitors.append(str(comp))
294
+
295
+ return found_competitors
296
+
297
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline: parse HTML -> build thread context -> detect language ->
        keyword relevance check -> product/competitor extraction. Relevance
        and extraction run ONLY on the author's own reply text, never on
        quoted content or thread context.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results. On success it carries all
            input keys except post_content (re-attached as original_content
            for non-empty posts) plus the derived fields below.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content; such posts are short-circuited as
            # non-relevant without further analysis.
            if not reply_content or len(reply_content.strip()) < 3:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Build thread context
            thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 3: Detect language
            lang_result = self._detect_language(reply_content)

            # Step 4: Check relevance - ONLY on the actual post content, NOT quoted/context
            # The quoted content and thread context are for understanding, not for relevance determination
            relevance_result = self._check_relevance(reply_content)

            # Step 5: Extract product and competitor mentions - ONLY from actual post content
            # We don't want to extract from quoted content as that will be processed separately
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "thread_context": thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
processing_brand_sentiment/workflow/agents/relevance_validator_agent.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Relevance Validator Agent for brand sentiment analysis.
3
+ Lightweight LLM-based agent that confirms whether ambiguous terms (HH, AA)
4
+ refer to Sabian products or generic terms.
5
+ """
6
+
7
+ from typing import Dict, Any
8
+ import json
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ import logging
12
+
13
+ from .base_agent import BaseAgent
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class RelevanceValidatorAgent(BaseAgent):
    """
    Agent that validates whether posts with ambiguous terms (like HH, AA)
    are actually referring to Sabian products or generic terms.

    This is a lightweight LLM call specifically for disambiguation; it only
    does real work when the upstream preprocessor has flagged the post with
    ``needs_relevance_validation``.
    """

    def __init__(self, config: Dict[str, Any], api_key: str, brand_config: Dict[str, Any]):
        """
        Initialize the Relevance Validator Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with product info
        """
        super().__init__("RelevanceValidatorAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config

        # NOTE(review): self.model / self.temperature are presumably populated
        # by BaseAgent from `config` -- confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Build disambiguation context from brand config
        self._build_disambiguation_context()

        logger.info("RelevanceValidatorAgent initialized")

    def _build_disambiguation_context(self) -> None:
        """Build context strings for disambiguation from brand config."""
        brand = self.brand_config.get("brand", {})
        ambiguous = brand.get("ambiguous_terms", {})

        self.disambiguation_info = {}
        for term, info in ambiguous.items():
            if isinstance(info, dict):
                self.disambiguation_info[term] = {
                    "description": info.get("description", ""),
                    "context_clues": info.get("disambiguation_context", [])
                }
            else:
                # Config entries may be bare strings: treat them as the
                # description with no extra context clues.
                self.disambiguation_info[term] = {
                    "description": str(info),
                    "context_clues": []
                }

        # Product descriptions for context
        self.product_descriptions = brand.get("product_descriptions", {})

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required = ["cleaned_content", "relevance_keywords_found"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance validation."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        products = self.brand_config.get("brand", {}).get("products", [])

        # Build disambiguation rules
        disambiguation_rules = []
        for term, info in self.disambiguation_info.items():
            desc = info.get("description", "")
            clues = info.get("context_clues", [])
            rule = f"- '{term}': {desc}"
            if clues:
                rule += f" Context clues for {brand_name}: {', '.join(clues)}"
            disambiguation_rules.append(rule)

        disambiguation_text = "\n".join(disambiguation_rules) if disambiguation_rules else "No specific disambiguation rules."

        system_prompt = f"""You are an expert at identifying brand mentions in drum/cymbal forum discussions.

Your task is to determine if the POST CONTENT itself discusses {brand_name} products.

**CRITICAL RULE:**
- You must determine relevance based ONLY on the POST CONTENT
- The context (thread info, quoted/parent content) is provided to help you understand ambiguous terms
- But if the POST CONTENT itself does not mention or discuss {brand_name}, it is NOT relevant
- Example: If quoted content mentions Sabian but the post just says "Got it! Thanks!" → NOT relevant

**{brand_name} Product Lines:**
{', '.join(products)}

**Ambiguous Terms to Watch For:**
{disambiguation_text}

**Key Disambiguation Rules:**
- "HH" alone usually means "Hi-Hat" (a type of cymbal), NOT Sabian HH series
- "HH" WITH Sabian context IN THE POST (e.g., "Sabian HH", "HH crashes", "my HH ride") likely refers to Sabian
- "AA" alone might be a general abbreviation, NOT Sabian AA series
- "AA" WITH Sabian context IN THE POST (e.g., "Sabian AA", "AA cymbals", "AA medium ride") likely refers to Sabian
- Generic replies like "Thanks!", "Got it!", "Good point!" are NOT relevant even if context mentions {brand_name}

**Return JSON with:**
- is_relevant: boolean - true ONLY if the POST CONTENT itself discusses {brand_name} products
- confidence: "high", "medium", or "low"
- reason: brief explanation (1-2 sentences) - explain what IN THE POST made you decide
- detected_products: list of {brand_name} products mentioned IN THE POST (empty if none)

Return only valid JSON."""

        return system_prompt

    def validate_relevance(
        self,
        content: str,
        keywords_found: list,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """
        Validate whether content is relevant to the brand.

        Args:
            content: The cleaned post content
            keywords_found: Keywords that triggered validation
            thread_context: Thread context for additional context
            quoted_content: Quoted content if any

        Returns:
            Dictionary with validation results
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build context for the LLM
        context_parts = []
        if thread_context:
            context_parts.append(f"Thread context: {thread_context}")
        if quoted_content:
            context_parts.append(f"Replying to: {quoted_content[:300]}...")

        context_str = "\n".join(context_parts) if context_parts else "No additional context."

        user_prompt = f"""Determine if this POST CONTENT discusses {brand_name} cymbal products.

**Keywords found in post:** {', '.join(keywords_found)}

**CONTEXT (for understanding ambiguous terms only - do NOT base relevance on this):**
{context_str}

**POST CONTENT TO EVALUATE (base your relevance decision ONLY on this):**
"{content}"

Does the POST CONTENT itself discuss {brand_name} products? Remember: generic replies are NOT relevant even if context mentions {brand_name}. Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                "is_relevant": result.get("is_relevant", False),
                "relevance_confidence": result.get("confidence", "low"),
                "relevance_reason": result.get("reason", ""),
                "detected_products": result.get("detected_products", [])
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error in relevance validation: {e}", "warning")
            # Default to relevant if we can't determine
            return {
                "success": True,
                "is_relevant": True,
                "relevance_confidence": "low",
                "relevance_reason": "Could not parse LLM response, defaulting to relevant",
                "detected_products": []
            }

        except Exception as e:
            self.log_processing(f"Relevance validation error: {e}", "error")
            return {
                "success": False,
                "is_relevant": True,  # Default to relevant on error
                "relevance_confidence": "low",
                "relevance_reason": f"Error during validation: {str(e)}",
                "detected_products": [],
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post to validate its relevance to the brand.

        Args:
            input_data: Dictionary containing:
                - cleaned_content: Cleaned post text
                - relevance_keywords_found: Keywords that triggered validation
                - thread_context: Optional thread context
                - quoted_content: Optional quoted content

        Returns:
            Dictionary with validation results and original data
        """
        try:
            if not self.validate_input(input_data):
                # FIX: spread input_data FIRST so the explicit status fields
                # below cannot be clobbered by stale upstream values (the LLM
                # success path already follows this convention).
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": True,  # Default to relevant
                    "relevance_confidence": "low",
                }

            # Check if validation is actually needed
            if not input_data.get("needs_relevance_validation", False):
                # No validation needed, use preliminary assessment.
                # Input spread first for the same clobbering reason as above.
                return {
                    **input_data,
                    "success": True,
                    "is_relevant": input_data.get("preliminary_relevant", False),
                    "relevance_confidence": input_data.get("relevance_confidence", "high"),
                    "relevance_reason": "No validation needed - preliminary assessment used",
                    "validation_performed": False,
                }

            # Perform LLM validation
            validation_result = self.validate_relevance(
                content=input_data.get("cleaned_content", ""),
                keywords_found=input_data.get("relevance_keywords_found", []),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Merge results: explicit validation fields win over input copies.
            result = {
                **input_data,
                "is_relevant": validation_result["is_relevant"],
                "relevance_confidence": validation_result["relevance_confidence"],
                "relevance_reason": validation_result["relevance_reason"],
                "validation_performed": True,
                "success": validation_result["success"]
            }

            # Update products detected if LLM found any
            if validation_result.get("detected_products"):
                existing_products = input_data.get("products_detected", [])
                llm_products = validation_result["detected_products"]
                # Merge without duplicates
                all_products = list(set(existing_products + llm_products))
                result["products_detected"] = all_products

            if "error" in validation_result:
                result["validation_error"] = validation_result["error"]

            self.log_processing(
                f"Validated relevance for post: is_relevant={result['is_relevant']}, "
                f"confidence={result['relevance_confidence']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "relevance validation")
processing_brand_sentiment/workflow/agents/sabian_analyzer_agent.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Analyzer Agent for comprehensive brand sentiment analysis.
3
+ LLM-based agent that extracts products, competitors, sentiment, intents,
4
+ pain points, and other brand intelligence from forum posts.
5
+ """
6
+
7
+ from typing import Dict, Any, List
8
+ import json
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ import logging
12
+
13
+ from .base_agent import BaseAgent
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class SabianAnalyzerAgent(BaseAgent):
    """
    Comprehensive brand analysis agent for Sabian cymbal discussions.

    Extracts sentiment, intents, products, competitors, and feedback signals
    from posts that upstream agents have already marked as relevant, and
    normalizes every extracted value against the configured category lists.
    """

    # Output schema for the analysis fields: single-valued fields default to
    # None, list-valued fields default to []. Both "skip" paths in process()
    # use these so downstream consumers always see the same keys.
    _SKIP_NULL_FIELDS = (
        "author_role", "sabian_mention_context", "sentiment_level",
        "emotion_type", "comparison_type", "purchase_stage",
    )
    _SKIP_LIST_FIELDS = (
        "products_mentioned", "product_attributes", "competitors_mentioned",
        "competitor_products_owned", "intents", "decision_drivers",
        "pain_points", "delight_factors",
    )

    def __init__(
        self,
        config: Dict[str, Any],
        api_key: str,
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the analyzer.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration (products, competitors)
            analysis_categories: Category definitions used for validation
        """
        super().__init__("SabianAnalyzerAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # NOTE(review): self.model / self.temperature presumably come from
        # BaseAgent reading `config` -- confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Pre-compute valid values for validation
        self._valid_values = self._compute_valid_values()
        logger.info("SabianAnalyzerAgent initialized")

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Return True when the minimum fields for analysis are present."""
        required = ["cleaned_content", "is_relevant"]
        return all(field in input_data for field in required)

    def _compute_valid_values(self) -> Dict[str, List[str]]:
        """Pre-compute all valid values from config for validation."""
        valid = {}

        # Products from brand config
        valid["products"] = self.brand_config.get("brand", {}).get("products", [])

        # Competitors
        competitor_names = []
        for comp in self.brand_config.get("brand", {}).get("competitors", []):
            if isinstance(comp, dict):
                competitor_names.append(comp.get("name", ""))
        valid["competitors"] = competitor_names

        # Extract category values from analysis_categories
        category_map = {
            "author_role": "author_role",
            "sabian_mention_context": "sabian_mention_context",
            "sentiment_level": "sentiment",
            "emotion_type": "emotions",
            "intents": "intents",
            "purchase_stage": "purchase_stage",
            "comparison_type": "comparison_type",
            "feedback_aspects": "feedback_aspects",
            "decision_drivers": "decision_drivers",
            "product_attributes": "product_attributes",
        }

        for key, config_key in category_map.items():
            config_section = self.analysis_categories.get(config_key, {})
            # Config sections use either "categories" or "levels" as the key
            # holding the value list (e.g. sentiment uses "levels").
            if "categories" in config_section:
                valid[key] = [c["value"] for c in config_section["categories"]]
            elif "levels" in config_section:
                valid[key] = [c["value"] for c in config_section["levels"]]
            else:
                valid[key] = []

        return valid

    def _get_category_list(self, key: str) -> List[str]:
        """Get list of valid values for a category."""
        return self._valid_values.get(key, [])

    def _build_system_prompt(self) -> str:
        """Build optimized system prompt for brand analysis."""
        brand = self.brand_config.get("brand", {})
        brand_name = brand.get("name", "Sabian")
        products = brand.get("products", [])

        competitors = [c.get("name", "") for c in brand.get("competitors", []) if isinstance(c, dict)]

        # Get all valid values
        v = self._valid_values

        return f"""You are a brand analyst extracting insights from forum posts about {brand_name} cymbals.

## STRICT RULES
1. Extract ONLY from POST CONTENT, never from quoted/context text
2. Use ONLY values from the lists below - return null/[] if no match
3. Sentiment must be about {brand_name} specifically, NOT overall post tone
4. pain_points/delight_factors use SAME value list (feedback_aspects) - classification determines positive vs negative

## VALID VALUES

**{brand_name} Products:** {products}
**Competitors:** {competitors}

| Field | Valid Values |
|-------|--------------|
| author_role | {v.get('author_role', [])} |
| sabian_mention_context | {v.get('sabian_mention_context', [])} |
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |

## KEY DISTINCTIONS

**Sentiment vs Intent:**
- sentiment_level = How author FEELS about {brand_name} (positive/negative/neutral)
- praising/criticizing intent = Author is actively ENDORSING or WARNING others

**Author-only fields (null if giving advice to others):**
- purchase_stage, decision_drivers, pain_points, delight_factors

**Example - Sabian-specific sentiment:**
Post: "Love my new drum kit! The SBR cymbals sound terrible though."
- Overall post: positive (happy about kit)
- {brand_name} sentiment: NEGATIVE (dislikes SBR sound)
- pain_points: ["sound_quality"]

## OUTPUT JSON
```json
{{
  "author_role": "value from list",
  "sabian_mention_context": "value from list",
  "sentiment_level": "value from list",
  "emotion_type": "value or null",
  "sentiment_confidence": "high|medium|low",
  "sarcasm_detected": false,
  "products_mentioned": [],
  "product_attributes": [],
  "competitors_mentioned": [],
  "competitor_products_owned": [],
  "comparison_type": "value or null",
  "intents": [],
  "purchase_stage": "value or null",
  "decision_drivers": [],
  "pain_points": [],
  "delight_factors": [],
  "analysis_notes": "1-2 sentences on key {brand_name}-specific insights"
}}
```

Return ONLY valid JSON."""

    def analyze_post(
        self,
        content: str,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """Perform brand analysis on a post."""
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Context is truncated and clearly labelled so the LLM does not
        # extract values from it (STRICT RULE 1 in the system prompt).
        context_str = ""
        if thread_context:
            context_str += f"[Thread: {thread_context[:200]}] "
        if quoted_content:
            context_str += f"[Replying to: {quoted_content[:200]}...]"

        user_prompt = f"""Analyze this post about {brand_name}.

CONTEXT (for understanding only, DO NOT extract from): {context_str or "None"}

POST CONTENT (extract from THIS only):
"{content}"

Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            result = self._parse_llm_json_response(response.content)
            validated = self._validate_and_normalize(result)

            return {"success": True, **validated}

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {
                "success": False,
                "error": f"JSON parse error: {str(e)}",
                "sentiment_level": "neutral",
                "intents": ["general_discussion"]
            }
        except Exception as e:
            self.log_processing(f"Analysis error: {e}", "error")
            return {"success": False, "error": str(e)}

    def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
        """Validate single value against list, return canonical form or default."""
        if value is None:
            return default
        if isinstance(value, str):
            val_lower = value.lower()
            for v in valid_list:
                if v.lower() == val_lower:
                    return v
        # Non-string or unmatched values fall back to the default.
        return default

    def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
        """Validate list values, return only valid items in canonical form."""
        if not values:
            return []
        if not isinstance(values, list):
            # Tolerate a scalar where the LLM should have returned a list.
            values = [values]

        validated = []
        valid_lower = {v.lower(): v for v in valid_list}
        for val in values:
            if isinstance(val, str) and val.lower() in valid_lower:
                validated.append(valid_lower[val.lower()])
        return validated

    def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate all fields against predefined values and normalize."""
        v = self._valid_values

        normalized = {
            # Classification
            "author_role": self._validate_single(
                result.get("author_role"), v["author_role"], "unknown"
            ),
            "sabian_mention_context": self._validate_single(
                result.get("sabian_mention_context"), v["sabian_mention_context"], "casual_mention"
            ),

            # Sentiment
            "sentiment_level": self._validate_single(
                result.get("sentiment_level"), v["sentiment_level"], "neutral"
            ),
            "emotion_type": self._validate_single(
                result.get("emotion_type"), v["emotion_type"], None
            ),
            "sentiment_confidence": result.get("sentiment_confidence", "medium"),
            "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

            # Products
            "products_mentioned": self._validate_list(
                result.get("products_mentioned"), v["products"]
            ),
            "product_attributes": self._validate_list(
                result.get("product_attributes"), v["product_attributes"]
            ),

            # Competitors
            "competitors_mentioned": self._validate_list(
                result.get("competitors_mentioned"), v["competitors"]
            ),
            "competitor_products_owned": self._validate_list(
                result.get("competitor_products_owned"), v["competitors"]
            ),
            "comparison_type": self._validate_single(
                result.get("comparison_type"), v["comparison_type"], None
            ),

            # Intents - never empty; fall back to general_discussion
            "intents": self._validate_list(
                result.get("intents"), v["intents"]
            ) or ["general_discussion"],

            # Author journey (null if advising others)
            "purchase_stage": self._validate_single(
                result.get("purchase_stage"), v["purchase_stage"], None
            ),
            "decision_drivers": self._validate_list(
                result.get("decision_drivers"), v["decision_drivers"]
            ),

            # Feedback - both use feedback_aspects
            "pain_points": self._validate_list(
                result.get("pain_points"), v["feedback_aspects"]
            ),
            "delight_factors": self._validate_list(
                result.get("delight_factors"), v["feedback_aspects"]
            ),

            # Notes
            "analysis_notes": result.get("analysis_notes", ""),
        }

        # Log filtered values for debugging
        for field in ["products_mentioned", "product_attributes", "pain_points", "delight_factors"]:
            original = result.get(field, [])
            if isinstance(original, list) and len(original) > len(normalized[field]):
                filtered = set(str(x) for x in original) - set(normalized[field])
                if filtered:
                    logger.debug(f"Filtered invalid {field}: {filtered}")

        return normalized

    def _skipped_result(self, reason: str, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build a uniform result for a post whose analysis is skipped.

        FIX: the original code duplicated two hand-written skip payloads with
        different key sets (the non-English branch omitted products_mentioned,
        pain_points, etc.), so downstream consumers saw inconsistent schemas.
        This helper emits every analysis field exactly once, and spreads
        input_data FIRST so the explicit skip fields cannot be clobbered.
        """
        result = {
            **input_data,
            "success": True,
            "analysis_skipped": True,
            "analysis_skip_reason": reason,
        }
        for field in self._SKIP_NULL_FIELDS:
            result[field] = None
        for field in self._SKIP_LIST_FIELDS:
            result[field] = []
        return result

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a post through brand analysis."""
        try:
            if not self.validate_input(input_data):
                # Input spread first so the error status wins over any stale
                # upstream "success"/"error" keys.
                return {
                    **input_data,
                    "success": False,
                    "error": "Invalid input: missing required fields",
                }

            # Skip non-relevant posts
            if not input_data.get("is_relevant", False):
                return self._skipped_result("Post marked as not relevant", input_data)

            # Skip non-English posts
            if not input_data.get("is_english", True):
                return self._skipped_result(
                    f"Non-English: {input_data.get('detected_language')}",
                    input_data
                )

            # Perform analysis
            analysis_result = self.analyze_post(
                content=input_data.get("cleaned_content", ""),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            result = {
                **input_data,
                **analysis_result,
                "analysis_skipped": False
            }

            self.log_processing(
                f"Analyzed: sentiment={result.get('sentiment_level')}, "
                f"products={len(result.get('products_mentioned', []))}, "
                f"intents={result.get('intents', [])}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "brand analysis")
processing_brand_sentiment/workflow/agents/sabian_relevance_extraction_agent.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Relevance & Extraction Agent for brand sentiment analysis.
3
+
4
+ This agent performs two critical functions:
5
+ 1. Determines relevance with HIGH confidence using strict rules
6
+ 2. Extracts verifiable facts (products, author role, context summary)
7
+
8
+ Key Design Principles:
9
+ - Strict product matching: ONLY return products from predefined list
10
+ - Competitor awareness: Know what products belong to competitors
11
+ - Conservative relevance: When uncertain, mark as NOT relevant
12
+ - Thread context summarization: Provide clean, concise context for next agent
13
+ """
14
+
15
+ from typing import Dict, Any, List
16
+ import json
17
+ from langchain_openai import ChatOpenAI
18
+ from langchain.schema import HumanMessage, SystemMessage
19
+ import logging
20
+
21
+ from .base_agent import BaseAgent
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class SabianRelevanceExtractionAgent(BaseAgent):
27
+ """
28
+ Agent that validates relevance and extracts key facts from posts.
29
+
30
+ This agent is the first LLM call in the pipeline and serves as the
31
+ gatekeeper for relevance while also extracting structured information
32
+ for downstream analysis.
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ config: Dict[str, Any],
38
+ api_key: str,
39
+ brand_config: Dict[str, Any],
40
+ analysis_categories: Dict[str, Any]
41
+ ):
42
+ """
43
+ Initialize the Relevance & Extraction Agent.
44
+
45
+ Args:
46
+ config: Agent configuration
47
+ api_key: OpenAI API key
48
+ brand_config: Brand-specific configuration with products and competitors
49
+ analysis_categories: Category definitions for validation
50
+ """
51
+ super().__init__("SabianRelevanceExtractionAgent", config)
52
+ self.api_key = api_key
53
+ self.brand_config = brand_config
54
+ self.analysis_categories = analysis_categories
55
+
56
+ self.llm = ChatOpenAI(
57
+ model=self.model,
58
+ temperature=self.temperature,
59
+ api_key=self.api_key
60
+ )
61
+
62
+ # Pre-compute valid values
63
+ self._build_valid_values()
64
+ self._build_competitor_product_warnings()
65
+
66
+ logger.info("SabianRelevanceExtractionAgent initialized")
67
+
68
+ def _build_valid_values(self) -> None:
69
+ """Build valid value lists for validation."""
70
+ brand = self.brand_config.get("brand", {})
71
+
72
+ # Products
73
+ self.valid_products = brand.get("products", [])
74
+
75
+ # Competitors (brand names only)
76
+ self.valid_competitors = []
77
+ for comp in brand.get("competitors", []):
78
+ if isinstance(comp, dict):
79
+ self.valid_competitors.append(comp.get("name", ""))
80
+ else:
81
+ self.valid_competitors.append(str(comp))
82
+
83
+ # Author roles from categories
84
+ author_role_config = self.analysis_categories.get("author_role", {})
85
+ self.valid_author_roles = [
86
+ c["value"] for c in author_role_config.get("categories", [])
87
+ ]
88
+
89
+ # Sabian mention context from categories
90
+ mention_context_config = self.analysis_categories.get("sabian_mention_context", {})
91
+ self.valid_mention_contexts = [
92
+ c["value"] for c in mention_context_config.get("categories", [])
93
+ ]
94
+
95
+ def _build_competitor_product_warnings(self) -> None:
96
+ """Build list of competitor products to warn about in prompts."""
97
+ warnings = self.brand_config.get("brand", {}).get("competitor_products_warning", {})
98
+
99
+ self.competitor_products_by_brand = {}
100
+ for key, products in warnings.items():
101
+ if key == "description":
102
+ continue
103
+ # Extract brand name from key (e.g., "paiste_products" -> "Paiste")
104
+ brand_name = key.replace("_products", "").capitalize()
105
+ self.competitor_products_by_brand[brand_name] = products
106
+
107
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
108
+ """Validate input contains required fields."""
109
+ required = ["cleaned_content"]
110
+ return all(field in input_data for field in required)
111
+
112
def _build_system_prompt(self) -> str:
    """
    Build the system prompt for relevance checking and fact extraction.

    The prompt embeds the configured brand name, the whitelist of valid
    products (self.valid_products), competitor brand names
    (self.valid_competitors), and per-competitor product examples derived
    from self.competitor_products_by_brand, then instructs the model to
    return a strict JSON object with fixed fields.

    Returns:
        The complete system prompt string for the extraction LLM call.
    """
    brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

    # Build competitor product warnings: one "- Brand: "p1", "p2", ..." line
    # per competitor so the model does not misattribute competitor products.
    competitor_warnings = []
    for brand, products in self.competitor_products_by_brand.items():
        products_str = ", ".join(f'"{p}"' for p in products[:5])  # Limit to 5 examples
        if len(products) > 5:
            # Keep the prompt short while signalling the list is truncated.
            products_str += f" (and {len(products)-5} more)"
        competitor_warnings.append(f"- {brand}: {products_str}")

    competitor_warnings_text = "\n".join(competitor_warnings) if competitor_warnings else "None specified"

    # NOTE: the doubled braces {{ }} below are f-string escapes for the
    # literal JSON braces shown to the model.
    return f"""You are a brand mention extractor for {brand_name} cymbals. Your job is to:
1. Determine if the POST CONTENT discusses {brand_name} products or brand
2. Extract ONLY verifiable facts, not interpretations

## CRITICAL RULES

### Rule 1: Relevance Based on POST CONTENT Only
- The post is relevant ONLY if the POST CONTENT itself mentions {brand_name} brand or products
- Quoted/parent content mentioning {brand_name} does NOT make the post relevant
- Generic replies ("Thanks!", "Got it!", "Good point!") are NEVER relevant
- Posts can be relevant even without specific product mentions if they discuss the {brand_name} brand

### Rule 2: Strict Product Matching
{brand_name.upper()} PRODUCTS (use ONLY these exact values):
{self.valid_products}

CRITICAL:
- Return ONLY products from this exact list above
- If you see a product not in this list, do NOT include it
- Return empty list [] if no products from the list are mentioned
- It's OK to have empty products_mentioned if the post discusses {brand_name} brand generally

### Rule 3: Competitor Product Awareness
These products belong to COMPETITORS, NOT {brand_name}:
{competitor_warnings_text}

COMPETITOR BRANDS: {self.valid_competitors}
- Only return competitor BRAND names in competitors_mentioned (not their products)
- If you see "2002", "Signature", "Sound Edge", "Formula 602" - these are PAISTE, not {brand_name}
- If you see "K Custom", "A Custom" - these are ZILDJIAN, not {brand_name}

### Rule 4: Thread Context Summary
- Summarize thread context in 1-2 sentences MAXIMUM
- Focus only on what helps understand what the post is responding to
- If thread is about unrelated topics (pizza, general life), say so briefly
- Keep it factual and concise

### Rule 5: Author Role Classification
Determine the author's relationship to {brand_name}:
- current_owner: Currently owns/uses {brand_name} products
- past_owner: Previously owned but sold/replaced
- potential_buyer: Considering purchasing {brand_name}
- never_owned: Explicitly states they don't own {brand_name}
- unknown: Cannot determine from post content

### Rule 6: Mention Context Classification
How prominently is {brand_name} discussed IN THE POST CONTENT:
- primary_focus: {brand_name} is the main topic of the post
- significant_mention: {brand_name} discussed with some detail, but not main focus
- casual_mention: Brief mention among other topics
- comparison_context: Mentioned while comparing to competitors
- null: Not relevant (use when is_relevant=false)

## OUTPUT FORMAT
Return ONLY valid JSON with these exact fields:
```json
{{
    "is_relevant": true/false,
    "relevance_confidence": "high" | "medium" | "low",
    "relevance_reason": "1-2 sentences explaining your decision",
    "products_mentioned": [],
    "sabian_mention_context": "value from list" | null,
    "author_role": "value from list",
    "competitors_mentioned": [],
    "thread_context_summary": "1-2 sentence summary of thread context"
}}
```

IMPORTANT: Return ONLY the JSON object, no additional text."""
195
+
196
+ def _build_user_prompt(
197
+ self,
198
+ content: str,
199
+ quoted_content: str,
200
+ raw_thread_context: str,
201
+ keywords_found: List[str]
202
+ ) -> str:
203
+ """Build the user prompt with post content and context."""
204
+ brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
205
+
206
+ context_section = ""
207
+ if raw_thread_context:
208
+ # Truncate if too long
209
+ truncated_context = raw_thread_context[:1000] if len(raw_thread_context) > 1000 else raw_thread_context
210
+ context_section += f"THREAD CONTEXT (for understanding only):\n{truncated_context}\n\n"
211
+
212
+ if quoted_content:
213
+ truncated_quote = quoted_content[:500] if len(quoted_content) > 500 else quoted_content
214
+ context_section += f"QUOTED/PARENT CONTENT (for understanding only):\n{truncated_quote}\n\n"
215
+
216
+ keywords_info = ""
217
+ if keywords_found:
218
+ keywords_info = f"Keywords detected by preprocessor: {', '.join(keywords_found)}\n\n"
219
+
220
+ return f"""Analyze this post for {brand_name} relevance and extract facts.
221
+
222
+ {keywords_info}{context_section}POST CONTENT TO EVALUATE (base your decision ONLY on this):
223
+ \"\"\"{content}\"\"\"
224
+
225
+ Remember:
226
+ - is_relevant=true ONLY if POST CONTENT discusses {brand_name}
227
+ - products_mentioned must be from the exact product list provided
228
+ - competitors_mentioned should be brand names only (Zildjian, Paiste, etc.)
229
+ - thread_context_summary should be 1-2 sentences max
230
+
231
+ Return JSON only."""
232
+
233
def extract_and_validate(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run the relevance/extraction LLM call for one preprocessed post.

    Args:
        input_data: Preprocessed post data

    Returns:
        Dictionary with extraction results; on any failure, a
        not-relevant fallback payload with success=False and an error.
    """
    post_text = input_data.get("cleaned_content", "")
    quote_text = input_data.get("quoted_content", "")
    thread_ctx = input_data.get("raw_thread_context", "")
    found_keywords = input_data.get("relevance_keywords_found", [])

    try:
        prompt_messages = [
            SystemMessage(content=self._build_system_prompt()),
            HumanMessage(content=self._build_user_prompt(
                post_text, quote_text, thread_ctx, found_keywords
            )),
        ]
        llm_reply = self.llm.invoke(prompt_messages)
        parsed = self._parse_llm_json_response(llm_reply.content)

        # Clamp whatever the model returned to the configured vocabulary.
        normalized = self._validate_response(parsed)
        return {"success": True, **normalized}

    except json.JSONDecodeError as exc:
        # Malformed model output: degrade to a low-confidence "not relevant".
        self.log_processing(f"JSON decode error: {exc}", "warning")
        return {
            "success": False,
            "error": f"JSON parse error: {str(exc)}",
            "is_relevant": False,
            "relevance_confidence": "low",
            "relevance_reason": "Failed to parse LLM response",
        }

    except Exception as exc:
        # Any other failure (network, API, validation) is logged and surfaced.
        self.log_processing(f"Extraction error: {exc}", "error")
        return {
            "success": False,
            "error": str(exc),
            "is_relevant": False,
            "relevance_confidence": "low",
            "relevance_reason": f"Error during extraction: {str(exc)}",
        }
286
+
287
+ def _validate_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
288
+ """Validate and normalize LLM response against allowed values."""
289
+
290
+ # Validate products
291
+ products = result.get("products_mentioned", [])
292
+ if not isinstance(products, list):
293
+ products = []
294
+ valid_products = [
295
+ p for p in products
296
+ if any(p.lower() == vp.lower() for vp in self.valid_products)
297
+ ]
298
+ # Normalize to canonical case
299
+ normalized_products = []
300
+ for p in valid_products:
301
+ for vp in self.valid_products:
302
+ if p.lower() == vp.lower():
303
+ normalized_products.append(vp)
304
+ break
305
+
306
+ # Validate competitors
307
+ competitors = result.get("competitors_mentioned", [])
308
+ if not isinstance(competitors, list):
309
+ competitors = []
310
+ valid_competitors = [
311
+ c for c in competitors
312
+ if any(c.lower() == vc.lower() for vc in self.valid_competitors)
313
+ ]
314
+ # Normalize to canonical case
315
+ normalized_competitors = []
316
+ for c in valid_competitors:
317
+ for vc in self.valid_competitors:
318
+ if c.lower() == vc.lower():
319
+ normalized_competitors.append(vc)
320
+ break
321
+
322
+ # Validate author_role
323
+ author_role = result.get("author_role", "unknown")
324
+ if author_role not in self.valid_author_roles:
325
+ author_role = "unknown"
326
+
327
+ # Validate sabian_mention_context
328
+ mention_context = result.get("sabian_mention_context")
329
+ is_relevant = result.get("is_relevant", False)
330
+
331
+ if not is_relevant:
332
+ mention_context = None
333
+ elif mention_context and mention_context not in self.valid_mention_contexts:
334
+ mention_context = "casual_mention" # Default for relevant posts
335
+
336
+ # Validate confidence
337
+ confidence = result.get("relevance_confidence", "medium")
338
+ if confidence not in ["high", "medium", "low"]:
339
+ confidence = "medium"
340
+
341
+ return {
342
+ "is_relevant": bool(is_relevant),
343
+ "relevance_confidence": confidence,
344
+ "relevance_reason": result.get("relevance_reason", ""),
345
+ "products_mentioned": normalized_products,
346
+ "sabian_mention_context": mention_context,
347
+ "author_role": author_role,
348
+ "competitors_mentioned": normalized_competitors,
349
+ "thread_context_summary": result.get("thread_context_summary", "")
350
+ }
351
+
352
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a post through relevance validation and fact extraction.

    Cheap short-circuits (keyword pre-filter said "not relevant", or the post
    is not English) skip the LLM call entirely and return a fully-populated
    not-relevant payload with extraction_performed=False.

    Args:
        input_data: Dictionary from preprocessor containing:
            - cleaned_content: Cleaned post text
            - quoted_content: Quoted content if any
            - raw_thread_context: Raw thread context
            - relevance_keywords_found: Keywords from preprocessor
            - preliminary_relevant: Preprocessor's relevance assessment
            - needs_relevance_validation: Whether LLM validation needed

    Returns:
        Dictionary with extraction results and original data
    """
    try:
        if not self.validate_input(input_data):
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                "is_relevant": False,
                **input_data
            }

        # Skip if already determined not relevant and no validation needed
        # (the keyword pre-filter said "no" and did not request an LLM check).
        # NOTE(review): **input_data is spread LAST in these skip branches, so
        # any same-named key already in input_data (e.g. "success") overrides
        # the defaults above it — confirm this ordering is intended.
        if (not input_data.get("preliminary_relevant", False) and
            not input_data.get("needs_relevance_validation", False)):
            return {
                "success": True,
                "is_relevant": False,
                "relevance_confidence": "high",
                "relevance_reason": "No Sabian-related keywords found in post",
                "products_mentioned": [],
                "sabian_mention_context": None,
                "author_role": "unknown",
                "competitors_mentioned": input_data.get("competitors_detected", []),
                "thread_context_summary": "",
                "extraction_performed": False,
                **input_data
            }

        # Skip non-English posts
        if not input_data.get("is_english", True):
            return {
                "success": True,
                "is_relevant": False,
                "relevance_confidence": "high",
                "relevance_reason": f"Non-English post: {input_data.get('detected_language')}",
                "products_mentioned": [],
                "sabian_mention_context": None,
                "author_role": "unknown",
                "competitors_mentioned": [],
                "thread_context_summary": "",
                "extraction_performed": False,
                **input_data
            }

        # Perform LLM extraction
        extraction_result = self.extract_and_validate(input_data)

        # Merge results: extraction output overrides any same-named keys
        # carried over from the preprocessor.
        result = {
            **input_data,
            **extraction_result,
            "extraction_performed": True
        }

        # Log the result
        self.log_processing(
            f"Extraction complete: is_relevant={result.get('is_relevant')}, "
            f"products={result.get('products_mentioned')}, "
            f"context={result.get('sabian_mention_context')}",
            "debug"
        )

        return result

    except Exception as e:
        # Delegate uniform error packaging to the base class.
        return self.handle_error(e, "relevance extraction")
processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sabian Sentiment & Intent Analyzer Agent for brand sentiment analysis.
3
+
4
+ This agent performs deep analysis on VERIFIED relevant posts with STRUCTURED input.
5
+ It receives pre-validated data from the Relevance Extraction Agent including:
6
+ - Products already extracted and validated
7
+ - Thread context already summarized
8
+ - Author role already determined
9
+
10
+ Key Design Principles:
11
+ - Focused analysis: Only sentiment, intents, and customer journey
12
+ - No re-extraction: Products are given, not re-detected
13
+ - Sabian-specific sentiment: How author feels about Sabian, not overall post tone
14
+ - Author perspective: Pain points/delights only from author's own experience
15
+ """
16
+
17
+ from typing import Dict, Any, List
18
+ import json
19
+ from langchain_openai import ChatOpenAI
20
+ from langchain.schema import HumanMessage, SystemMessage
21
+ import logging
22
+
23
+ from .base_agent import BaseAgent
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class SabianSentimentAnalyzerAgent(BaseAgent):
29
+ """
30
+ Agent that performs deep sentiment and intent analysis on relevant posts.
31
+
32
+ This agent is the second LLM call in the pipeline and focuses purely on
33
+ analysis, not extraction. It receives structured input from the extraction
34
+ agent and produces sentiment, intent, and customer journey insights.
35
+ """
36
+
37
def __init__(
    self,
    config: Dict[str, Any],
    api_key: str,
    brand_config: Dict[str, Any],
    analysis_categories: Dict[str, Any]
):
    """
    Initialize the Sentiment Analyzer Agent.

    Args:
        config: Agent configuration
        api_key: OpenAI API key
        brand_config: Brand-specific configuration
        analysis_categories: Category definitions for analysis
    """
    super().__init__("SabianSentimentAnalyzerAgent", config)

    self.api_key = api_key
    self.brand_config = brand_config
    self.analysis_categories = analysis_categories

    # Cache the allowed vocabulary once; the validators below reuse it on
    # every post instead of re-reading the config.
    self._valid_values = self._compute_valid_values()

    self.llm = ChatOpenAI(
        model=self.model,
        temperature=self.temperature,
        api_key=self.api_key
    )

    logger.info("SabianSentimentAnalyzerAgent initialized")
68
+
69
+ def _compute_valid_values(self) -> Dict[str, List[str]]:
70
+ """Pre-compute all valid values from config for validation."""
71
+ valid = {}
72
+
73
+ # Products from brand config
74
+ valid["products"] = self.brand_config.get("brand", {}).get("products", [])
75
+
76
+ # Competitors
77
+ competitor_names = []
78
+ for comp in self.brand_config.get("brand", {}).get("competitors", []):
79
+ if isinstance(comp, dict):
80
+ competitor_names.append(comp.get("name", ""))
81
+ valid["competitors"] = competitor_names
82
+
83
+ # Extract category values from analysis_categories
84
+ category_map = {
85
+ "sentiment_level": "sentiment",
86
+ "emotion_type": "emotions",
87
+ "intents": "intents",
88
+ "purchase_stage": "purchase_stage",
89
+ "comparison_type": "comparison_type",
90
+ "feedback_aspects": "feedback_aspects",
91
+ "decision_drivers": "decision_drivers",
92
+ "product_attributes": "product_attributes",
93
+ }
94
+
95
+ for key, config_key in category_map.items():
96
+ config_section = self.analysis_categories.get(config_key, {})
97
+ if "categories" in config_section:
98
+ valid[key] = [c["value"] for c in config_section["categories"]]
99
+ elif "levels" in config_section:
100
+ valid[key] = [c["value"] for c in config_section["levels"]]
101
+ else:
102
+ valid[key] = []
103
+
104
+ return valid
105
+
106
+ def _get_valid_list(self, key: str) -> List[str]:
107
+ """Get list of valid values for a category."""
108
+ return self._valid_values.get(key, [])
109
+
110
def validate_input(self, input_data: Dict[str, Any]) -> bool:
    """Check that the extraction-agent payload carries the fields this agent needs."""
    return "cleaned_content" in input_data and "is_relevant" in input_data
114
+
115
def _build_system_prompt(self) -> str:
    """
    Build the system prompt for the sentiment/intent analysis call.

    Embeds the configured brand name and the pre-computed valid-value lists
    (self._valid_values) into a rules-plus-table prompt, and instructs the
    model to return a strict JSON object.

    Returns:
        The complete system prompt string for the sentiment LLM call.
    """
    brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
    # Shorthand: the valid-value table below interpolates list reprs from here.
    v = self._valid_values

    # NOTE: the doubled braces {{ }} below are f-string escapes for the
    # literal JSON braces shown to the model.
    return f"""You are a sentiment analyst for {brand_name} cymbal discussions.

## YOUR TASK
Analyze the sentiment, emotions, and intents in posts about {brand_name}.
You will receive PRE-VALIDATED context (products, author role, etc.) - trust these values.

## CRITICAL RULES

### Rule 1: Neutral by Default
Sentiment defaults to NEUTRAL unless there is EXPLICIT positive or negative language toward {brand_name}.
- Factual statements = neutral
- Comparative statements ("sounds different", "not the same as") = neutral (different ≠ worse)
- Advice-giving without personal opinion = neutral

Only assign positive/negative sentiment when the author CLEARLY expresses satisfaction or dissatisfaction with {brand_name}.

### Rule 2: {brand_name}-Specific Sentiment
Sentiment MUST be about {brand_name} specifically, NOT overall post tone or other products.

EXAMPLE:
Post: "I have SBR cymbals and bought a Pearl crash. The Pearl sounds different from the SBR. Go with what feels best!"
- This is NEUTRAL toward {brand_name} - "different" is not criticism
- The author owns SBR (no complaint), is giving advice
- pain_points: [] (no negative experience expressed)
- delight_factors: [] (no positive experience expressed)

### Rule 3: Mutually Exclusive Feedback
pain_points and delight_factors CANNOT contain the same values.
- If an aspect is positive → delight_factors only
- If an aspect is negative → pain_points only
- Never both

### Rule 4: Author Perspective Only
These fields are ONLY for author's OWN experience, not advice to others:
- purchase_stage, decision_drivers, pain_points, delight_factors

If author is primarily giving ADVICE to someone else, these should be null/empty.

### Rule 5: Valid Values

| Field | Valid Values |
|-------|--------------|
| sentiment_level | {v.get('sentiment_level', [])} |
| emotion_type | {v.get('emotion_type', [])} |
| intents (multi-select) | {v.get('intents', [])} |
| purchase_stage | {v.get('purchase_stage', [])} |
| comparison_type | {v.get('comparison_type', [])} |
| feedback_aspects | {v.get('feedback_aspects', [])} |
| decision_drivers | {v.get('decision_drivers', [])} |
| product_attributes | {v.get('product_attributes', [])} |
| competitor brands | {v.get('competitors', [])} |

### Rule 6: Intent Classification
- seeking_information: Asking questions, seeking advice
- providing_information: Answering questions, giving advice
- sharing_experience: Personal experience, review, testimonial
- comparing: Comparing brands/products
- praising: Actively endorsing {brand_name}
- criticizing: Actively complaining about {brand_name}
- buying_selling: Listing gear for sale/trade
- general_discussion: General conversation

## OUTPUT FORMAT
```json
{{
    "sentiment_level": "neutral unless explicit positive/negative",
    "emotion_type": "value or null",
    "sentiment_confidence": "high" | "medium" | "low",
    "sarcasm_detected": false,
    "product_attributes": [],
    "competitor_products_owned": [],
    "comparison_type": "value or null",
    "intents": [],
    "purchase_stage": "value or null",
    "decision_drivers": [],
    "pain_points": [],
    "delight_factors": [],
    "analysis_notes": "1-2 sentences"
}}
```

Return ONLY valid JSON."""
202
+
203
+ def _build_user_prompt(self, input_data: Dict[str, Any]) -> str:
204
+ """Build user prompt with structured context."""
205
+ brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
206
+
207
+ content = input_data.get("cleaned_content", "")
208
+ products_mentioned = input_data.get("products_mentioned", [])
209
+ sabian_context = input_data.get("sabian_mention_context", "")
210
+ author_role = input_data.get("author_role", "unknown")
211
+ thread_summary = input_data.get("thread_context_summary", "")
212
+ competitors_mentioned = input_data.get("competitors_mentioned", [])
213
+
214
+ context_section = f"""## PRE-VALIDATED CONTEXT (trust these values)
215
+ - Products mentioned: {products_mentioned if products_mentioned else 'None specific'}
216
+ - {brand_name} mention context: {sabian_context}
217
+ - Author role: {author_role}
218
+ - Competitors mentioned: {competitors_mentioned if competitors_mentioned else 'None'}
219
+ - Thread summary: {thread_summary if thread_summary else 'Not available'}
220
+ """
221
+
222
+ return f"""Analyze this post about {brand_name} for sentiment and intents.
223
+
224
+ {context_section}
225
+ ## POST CONTENT TO ANALYZE:
226
+ \"\"\"{content}\"\"\"
227
+
228
+ Remember:
229
+ - Sentiment is about {brand_name} ONLY, not overall post tone
230
+ - pain_points/delight_factors only from author's OWN experience
231
+ - Use only values from the valid lists provided
232
+
233
+ Return JSON only."""
234
+
235
def analyze_post(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform sentiment and intent analysis.

    Args:
        input_data: Structured data from extraction agent

    Returns:
        Dictionary with analysis results; on failure, success=False with
        an error message (and neutral defaults for JSON parse failures).
    """
    try:
        reply = self.llm.invoke([
            SystemMessage(content=self._build_system_prompt()),
            HumanMessage(content=self._build_user_prompt(input_data)),
        ])
        parsed = self._parse_llm_json_response(reply.content)

        # Clamp everything to the configured vocabulary before returning.
        normalized = self._validate_and_normalize(parsed)
        return {"success": True, **normalized}

    except json.JSONDecodeError as exc:
        # Malformed model output: fall back to neutral/general defaults.
        self.log_processing(f"JSON decode error: {exc}", "warning")
        return {
            "success": False,
            "error": f"JSON parse error: {str(exc)}",
            "sentiment_level": "neutral",
            "intents": ["general_discussion"]
        }

    except Exception as exc:
        self.log_processing(f"Analysis error: {exc}", "error")
        return {"success": False, "error": str(exc)}
271
+
272
+ def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any:
273
+ """Validate single value against list, return canonical form or default."""
274
+ if value is None:
275
+ return default
276
+ if isinstance(value, str):
277
+ val_lower = value.lower()
278
+ for v in valid_list:
279
+ if v.lower() == val_lower:
280
+ return v
281
+ return default
282
+
283
+ def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]:
284
+ """Validate list values, return only valid items in canonical form."""
285
+ if not values:
286
+ return []
287
+ if not isinstance(values, list):
288
+ values = [values]
289
+
290
+ validated = []
291
+ valid_lower = {v.lower(): v for v in valid_list}
292
+ for val in values:
293
+ if isinstance(val, str) and val.lower() in valid_lower:
294
+ validated.append(valid_lower[val.lower()])
295
+ return validated
296
+
297
def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate all fields against predefined values and normalize.

    Each field is clamped to the vocabulary pre-computed in
    self._valid_values via _validate_single (scalars) and _validate_list
    (lists); unrecognized values are dropped or replaced with safe defaults.

    Args:
        result: Parsed JSON dict as returned by the sentiment LLM.

    Returns:
        Normalized dict containing only the whitelisted analysis fields.
    """
    v = self._valid_values

    normalized = {
        # Sentiment: unrecognized levels fall back to "neutral".
        "sentiment_level": self._validate_single(
            result.get("sentiment_level"), v["sentiment_level"], "neutral"
        ),
        "emotion_type": self._validate_single(
            result.get("emotion_type"), v["emotion_type"], None
        ),
        # Confidence is clamped to the 3-level enum after this dict is built.
        "sentiment_confidence": result.get("sentiment_confidence", "medium"),
        "sarcasm_detected": bool(result.get("sarcasm_detected", False)),

        # Product info
        "product_attributes": self._validate_list(
            result.get("product_attributes"), v["product_attributes"]
        ),

        # Competitors — validated against BRAND names, not product names.
        "competitor_products_owned": self._validate_list(
            result.get("competitor_products_owned"), v["competitors"]
        ),
        "comparison_type": self._validate_single(
            result.get("comparison_type"), v["comparison_type"], None
        ),

        # Intents: an empty/fully-invalid list degrades to general_discussion.
        "intents": self._validate_list(
            result.get("intents"), v["intents"]
        ) or ["general_discussion"],

        # Author journey (null if advising others)
        "purchase_stage": self._validate_single(
            result.get("purchase_stage"), v["purchase_stage"], None
        ),
        "decision_drivers": self._validate_list(
            result.get("decision_drivers"), v["decision_drivers"]
        ),

        # Feedback - both use feedback_aspects as their shared vocabulary.
        "pain_points": self._validate_list(
            result.get("pain_points"), v["feedback_aspects"]
        ),
        "delight_factors": self._validate_list(
            result.get("delight_factors"), v["feedback_aspects"]
        ),

        # Notes
        "analysis_notes": result.get("analysis_notes", ""),
    }

    # Validate confidence
    if normalized["sentiment_confidence"] not in ["high", "medium", "low"]:
        normalized["sentiment_confidence"] = "medium"

    return normalized
355
+
356
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a post through sentiment and intent analysis.

    Posts that are not relevant or not English bypass the LLM entirely and
    return a null/empty analysis payload with analysis_skipped=True.

    Args:
        input_data: Dictionary from extraction agent containing:
            - cleaned_content: Post text
            - is_relevant: Relevance determination
            - products_mentioned: Pre-validated products
            - sabian_mention_context: How Sabian is discussed
            - author_role: Author's relationship to Sabian
            - thread_context_summary: Summarized context
            - competitors_mentioned: Competitor brands

    Returns:
        Dictionary with analysis results and original data
    """
    try:
        if not self.validate_input(input_data):
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                **input_data
            }

        # Skip non-relevant posts
        # NOTE(review): **input_data is spread LAST in these skip branches,
        # so any same-named key already in input_data (e.g. "success")
        # overrides the null defaults above it — confirm this is intended.
        if not input_data.get("is_relevant", False):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": "Post marked as not relevant",
                "sentiment_level": None,
                "emotion_type": None,
                "sentiment_confidence": None,
                "sarcasm_detected": False,
                "product_attributes": [],
                "competitor_products_owned": [],
                "comparison_type": None,
                "intents": [],
                "purchase_stage": None,
                "decision_drivers": [],
                "pain_points": [],
                "delight_factors": [],
                "analysis_notes": "",
                **input_data
            }

        # Skip non-English posts (should already be filtered, but double-check)
        if not input_data.get("is_english", True):
            return {
                "success": True,
                "analysis_skipped": True,
                "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}",
                "sentiment_level": None,
                "emotion_type": None,
                "intents": [],
                **input_data
            }

        # Perform analysis; analysis output overrides any same-named keys
        # carried over from earlier pipeline stages.
        analysis_result = self.analyze_post(input_data)

        result = {
            **input_data,
            **analysis_result,
            "analysis_skipped": False
        }

        self.log_processing(
            f"Analyzed: sentiment={result.get('sentiment_level')}, "
            f"intents={result.get('intents')}, "
            f"pain_points={result.get('pain_points')}",
            "debug"
        )

        return result

    except Exception as e:
        # Delegate uniform error packaging to the base class.
        return self.handle_error(e, "sentiment analysis")
processing_brand_sentiment/workflow/comment_orchestrator.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comment Analysis Workflow Orchestrator using LangGraph.
3
+
4
+ Coordinates the 4-agent pipeline for social media comments:
5
+ 1. CommentPreprocessorAgent - Plain text cleaning, keyword detection (no LLM)
6
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) [shared]
7
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) [shared]
8
+ 4. OutputValidatorAgent - Rule-based validation (no LLM) [shared]
9
+
10
+ Architecture v4.0:
11
+ - Same analysis pipeline as forums, different preprocessing and state
12
+ - Plain text input (no HTML parsing)
13
+ - Context from social media content metadata and parent comments
14
+ - Comment-specific identifiers (comment_sk, comment_id, platform, etc.)
15
+ """
16
+
17
+ from typing import Dict, Any, List, TypedDict, Annotated, Optional
18
+ import operator
19
+ import json
20
+ import os
21
+ from langgraph.graph import StateGraph, END
22
+ import logging
23
+
24
+ from .agents.comment_preprocessor_agent import CommentPreprocessorAgent
25
+ from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
26
+ from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
27
+ from .agents.output_validator_agent import OutputValidatorAgent
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class CommentAnalysisState(TypedDict):
    """
    State definition for the comment analysis workflow v4.0.

    Uses comment-specific identifiers but shares the same analysis fields
    as the forum workflow for consistent output. Each pipeline stage reads
    earlier sections and fills in its own section of this state.
    """
    # ============== Source Identifiers (Comment-specific) ==============
    comment_sk: int
    comment_id: str
    platform: str
    comment_timestamp: Any  # NOTE(review): type varies by source — confirm (datetime vs string)
    author_name: str
    author_id: str
    parent_comment_id: str
    parent_comment_text: str

    # Content metadata (the video/post the comment was made on)
    content_sk: int
    content_id: str
    content_description: str
    content_title: str
    channel_sk: int
    channel_name: str
    channel_display_name: str

    # ============== Original Content ==============
    comment_text: str
    original_text: str

    # ============== Preprocessor Output ==============
    cleaned_content: str
    quoted_content: str
    has_quote: bool
    quoted_author: str
    raw_thread_context: str  # Comment context (reuses field name for agent compatibility)
    is_empty: bool

    # Language detection
    detected_language: str
    language_code: str
    is_english: bool
    language_confidence: str
    language_detection_skipped: bool

    # Preliminary relevance (keyword-based, computed without an LLM)
    preliminary_relevant: bool
    needs_relevance_validation: bool
    relevance_keywords_found: List[str]
    relevance_type: str
    has_primary_keywords: bool

    # Initial detections
    products_detected: List[str]
    competitors_detected: List[str]

    # ============== Extraction Agent Output (LLM #1) ==============
    is_relevant: bool
    relevance_confidence: str
    relevance_reason: str
    extraction_performed: bool

    # Extracted facts
    products_mentioned: List[str]
    sabian_mention_context: str
    author_role: str
    competitors_mentioned: List[str]
    thread_context_summary: str

    # ============== Sentiment Analyzer Output (LLM #2) ==============
    sentiment_level: str
    emotion_type: str
    sentiment_confidence: str
    sarcasm_detected: bool

    # Product information
    product_attributes: List[str]

    # Competitive intelligence
    competitor_products_owned: List[str]
    comparison_type: str

    # Customer journey (AUTHOR PERSPECTIVE ONLY)
    intents: List[str]
    purchase_stage: str
    decision_drivers: List[str]
    pain_points: List[str]
    delight_factors: List[str]

    # Analysis notes
    analysis_notes: str
    analysis_skipped: bool
    analysis_skip_reason: str

    # ============== Validator Output ==============
    validation_passed: bool
    validation_errors: List[str]
    validation_warnings: List[str]
    validation_flags: List[str]
    processing_status: str

    # ============== Processing Metadata ==============
    # operator.add tells LangGraph to concatenate error lists across nodes
    # instead of overwriting them.
    processing_errors: Annotated[List[str], operator.add]
    success: bool
+
137
+
138
class CommentAnalysisWorkflow:
    """
    LangGraph-based workflow for comment brand sentiment analysis v4.0.

    Pipeline:
    1. Comment Preprocessor (no LLM) - plain text, comment context
    2. Relevance & Extraction Agent (LLM #1) - shared with forums
    3. Sentiment Analyzer Agent (LLM #2) - shared with forums
    4. Output Validator (no LLM) - shared with forums

    Comments that are empty, non-English, or contain no brand keywords
    skip the two LLM stages and go straight to validation.
    """

    def __init__(
        self,
        workflow_config: Dict[str, Any],
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any],
        api_key: str
    ):
        """
        Initialize the workflow with agents and configuration.

        Args:
            workflow_config: Workflow and agent configuration
            brand_config: Brand-specific configuration
            analysis_categories: Analysis category definitions
            api_key: OpenAI API key
        """
        self.workflow_config = workflow_config
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories
        self.api_key = api_key

        # Initialize agents (must happen before graph construction, since
        # the node callbacks close over the agent instances)
        self._init_agents()

        # Build the workflow graph
        self.workflow = self._build_workflow()

        logger.info("CommentAnalysisWorkflow v4.0 initialized successfully")

    def _init_agents(self) -> None:
        """Initialize all agents with their configurations."""
        agents_config = self.workflow_config.get("agents", {})

        # 1. Comment Preprocessor Agent (no LLM) - comment-specific
        preprocessor_config = agents_config.get("preprocessor", {})
        self.preprocessor = CommentPreprocessorAgent(
            preprocessor_config,
            self.brand_config
        )

        # 2. Relevance & Extraction Agent (LLM #1) - shared with forums
        # Falls back to the pre-v4 "relevance_validator" config key.
        extraction_config = agents_config.get("relevance_extraction",
            agents_config.get("relevance_validator", {})
        )
        self.extraction_agent = SabianRelevanceExtractionAgent(
            extraction_config,
            self.api_key,
            self.brand_config,
            self.analysis_categories
        )

        # 3. Sentiment Analyzer Agent (LLM #2) - shared with forums
        # Falls back to the pre-v4 "brand_analyzer" config key.
        analyzer_config = agents_config.get("sentiment_analyzer",
            agents_config.get("brand_analyzer", {})
        )
        self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
            analyzer_config,
            self.api_key,
            self.brand_config,
            self.analysis_categories
        )

        # 4. Output Validator Agent (no LLM) - shared with forums
        validator_config = agents_config.get("output_validator", {})
        self.output_validator = OutputValidatorAgent(
            validator_config,
            self.brand_config,
            self.analysis_categories
        )

        logger.info("All 4 agents initialized for comment processing")

    def _build_workflow(self) -> StateGraph:
        """
        Build the LangGraph workflow.

        Flow:
        preprocessing -> extraction -> (analysis if relevant) -> validation -> END

        Returns:
            Compiled StateGraph workflow
        """
        workflow = StateGraph(CommentAnalysisState)

        # Add nodes
        workflow.add_node("preprocessing", self._preprocessing_node)
        workflow.add_node("extraction", self._extraction_node)
        workflow.add_node("analysis", self._analysis_node)
        workflow.add_node("validation", self._validation_node)

        # Set entry point
        workflow.set_entry_point("preprocessing")

        # Define edges
        # Empty / non-English / keyword-free comments bypass both LLM stages.
        workflow.add_conditional_edges(
            "preprocessing",
            self._route_after_preprocessing,
            {
                "extract": "extraction",
                "skip_to_validation": "validation"
            }
        )

        # Only comments judged relevant by the extraction LLM get analyzed.
        workflow.add_conditional_edges(
            "extraction",
            self._route_after_extraction,
            {
                "analyze": "analysis",
                "skip_to_validation": "validation"
            }
        )

        workflow.add_edge("analysis", "validation")
        workflow.add_edge("validation", END)

        return workflow.compile()

    def _preprocessing_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Preprocessing node: Plain text cleaning, language detection, keyword check.

        Copies the preprocessor's output into the shared state on success;
        on failure only records the error and flips ``success`` to False.
        """
        try:
            input_data = {
                "comment_sk": state.get("comment_sk"),
                "comment_text": state.get("comment_text", ""),
                "content_title": state.get("content_title"),
                "content_description": state.get("content_description"),
                "parent_comment_text": state.get("parent_comment_text")
            }

            result = self.preprocessor.process(input_data)

            if result.get("success", False):
                # Content
                state["cleaned_content"] = result.get("cleaned_content", "")
                state["quoted_content"] = result.get("quoted_content")
                state["has_quote"] = result.get("has_quote", False)
                state["quoted_author"] = result.get("quoted_author")
                state["raw_thread_context"] = result.get("raw_thread_context", "")
                state["is_empty"] = result.get("is_empty", False)
                state["original_text"] = result.get("original_text", state.get("comment_text", ""))

                # Language
                state["detected_language"] = result.get("detected_language", "English")
                state["language_code"] = result.get("language_code", "en")
                state["is_english"] = result.get("is_english", True)
                state["language_confidence"] = result.get("language_confidence", "low")
                state["language_detection_skipped"] = result.get("language_detection_skipped", False)

                # Relevance
                state["preliminary_relevant"] = result.get("preliminary_relevant", False)
                state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
                state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
                state["relevance_type"] = result.get("relevance_type", "none")
                state["has_primary_keywords"] = result.get("has_primary_keywords", False)

                # Detections
                state["products_detected"] = result.get("products_detected", [])
                state["competitors_detected"] = result.get("competitors_detected", [])

                state["success"] = True
            else:
                error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
                # NOTE(review): assigning old + new here combined with the
                # operator.add reducer on processing_errors may duplicate
                # previously recorded errors — confirm against LangGraph docs.
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
                state["success"] = False

            logger.debug(f"Preprocessing complete for comment {state.get('comment_sk')}")
            return state

        except Exception as e:
            error_msg = f"Preprocessing node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False
            return state

    def _extraction_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Extraction node: LLM-based relevance validation and fact extraction.
        Reuses the same extraction agent as forums.
        """
        try:
            input_data = {
                "cleaned_content": state.get("cleaned_content", ""),
                "quoted_content": state.get("quoted_content"),
                "raw_thread_context": state.get("raw_thread_context", ""),
                "relevance_keywords_found": state.get("relevance_keywords_found", []),
                "preliminary_relevant": state.get("preliminary_relevant", False),
                "needs_relevance_validation": state.get("needs_relevance_validation", True),
                "products_detected": state.get("products_detected", []),
                "competitors_detected": state.get("competitors_detected", []),
                "is_english": state.get("is_english", True),
                "detected_language": state.get("detected_language", "English")
            }

            result = self.extraction_agent.process(input_data)

            # Update state with extraction results.  These are written even
            # when the agent reports failure, so downstream routing always
            # has an is_relevant value to inspect.
            state["is_relevant"] = result.get("is_relevant", False)
            state["relevance_confidence"] = result.get("relevance_confidence", "low")
            state["relevance_reason"] = result.get("relevance_reason", "")
            state["extraction_performed"] = result.get("extraction_performed", True)

            # Extracted facts
            state["products_mentioned"] = result.get("products_mentioned", [])
            state["sabian_mention_context"] = result.get("sabian_mention_context")
            state["author_role"] = result.get("author_role", "unknown")
            state["competitors_mentioned"] = result.get("competitors_mentioned", [])
            state["thread_context_summary"] = result.get("thread_context_summary", "")

            if not result.get("success", False) and result.get("error"):
                state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]

            logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
            return state

        except Exception as e:
            error_msg = f"Extraction node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            # Fail closed: treat the comment as not relevant on agent errors.
            state["is_relevant"] = False
            state["relevance_confidence"] = "low"
            return state

    def _analysis_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Analysis node: Deep sentiment and intent analysis for relevant comments.
        Reuses the same sentiment analyzer as forums.
        """
        try:
            input_data = {
                "cleaned_content": state.get("cleaned_content", ""),
                "is_relevant": state.get("is_relevant", True),
                "is_english": state.get("is_english", True),
                "detected_language": state.get("detected_language", "English"),
                "products_mentioned": state.get("products_mentioned", []),
                "sabian_mention_context": state.get("sabian_mention_context"),
                "author_role": state.get("author_role", "unknown"),
                "competitors_mentioned": state.get("competitors_mentioned", []),
                "thread_context_summary": state.get("thread_context_summary", "")
            }

            result = self.sentiment_analyzer.process(input_data)

            if result.get("success", False):
                # Sentiment
                state["sentiment_level"] = result.get("sentiment_level")
                state["emotion_type"] = result.get("emotion_type")
                state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
                state["sarcasm_detected"] = result.get("sarcasm_detected", False)

                # Products
                state["product_attributes"] = result.get("product_attributes", [])

                # Competitive
                state["competitor_products_owned"] = result.get("competitor_products_owned", [])
                state["comparison_type"] = result.get("comparison_type")

                # Journey
                state["intents"] = result.get("intents", [])
                state["purchase_stage"] = result.get("purchase_stage")
                state["decision_drivers"] = result.get("decision_drivers", [])
                state["pain_points"] = result.get("pain_points", [])
                state["delight_factors"] = result.get("delight_factors", [])

                # Notes
                state["analysis_notes"] = result.get("analysis_notes", "")
                state["analysis_skipped"] = result.get("analysis_skipped", False)
                state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
            else:
                error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]

            logger.debug(f"Analysis complete for comment {state.get('comment_sk')}")
            return state

        except Exception as e:
            error_msg = f"Analysis node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            return state

    def _validation_node(self, state: CommentAnalysisState) -> CommentAnalysisState:
        """
        Validation node: Rule-based validation and anomaly detection.
        Reuses the same output validator as forums.
        """
        try:
            result = self.output_validator.process(dict(state))

            state["validation_passed"] = result.get("validation_passed", True)
            state["validation_errors"] = result.get("validation_errors", [])
            state["validation_warnings"] = result.get("validation_warnings", [])
            state["validation_flags"] = result.get("validation_flags", [])
            state["processing_status"] = result.get("processing_status", "completed")

            # Set overall success
            # NOTE(review): this marks the run successful whenever
            # is_relevant has been set, even with accumulated errors —
            # confirm that is the intended semantics.
            has_errors = len(state.get("processing_errors", [])) > 0
            state["success"] = not has_errors or state.get("is_relevant") is not None

            logger.debug(f"Validation complete: status={state['processing_status']}")
            return state

        except Exception as e:
            error_msg = f"Validation node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["validation_passed"] = False
            state["processing_status"] = "validation_failed"
            state["success"] = False
            return state

    def _route_after_preprocessing(self, state: CommentAnalysisState) -> str:
        """Determine routing after preprocessing.

        NOTE(review): this routing callback also mutates ``state`` (sets
        is_relevant / relevance_reason); whether mutations made inside a
        conditional-edge function persist into the graph state depends on
        the LangGraph version — verify.
        """
        if state.get("is_empty", False):
            state["is_relevant"] = False
            state["relevance_reason"] = "Empty content"
            return "skip_to_validation"

        if not state.get("is_english", True):
            state["is_relevant"] = False
            state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
            return "skip_to_validation"

        if (not state.get("preliminary_relevant", False) and
            not state.get("needs_relevance_validation", False)):
            state["is_relevant"] = False
            state["relevance_reason"] = "No relevant keywords found"
            return "skip_to_validation"

        return "extract"

    def _route_after_extraction(self, state: CommentAnalysisState) -> str:
        """Determine routing after extraction: analyze only relevant comments."""
        if state.get("is_relevant", False):
            return "analyze"
        return "skip_to_validation"

    def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a single social media comment through the workflow.

        Args:
            comment_data: Dictionary containing comment data

        Returns:
            Dictionary with processed results; on an unrecoverable workflow
            error the input is echoed back with success=False and
            processing_status="workflow_error".
        """
        try:
            initial_state = {
                # Comment identifiers
                "comment_sk": comment_data.get("comment_sk"),
                "comment_id": comment_data.get("comment_id"),
                "platform": comment_data.get("platform"),
                "comment_timestamp": comment_data.get("comment_timestamp"),
                "author_name": comment_data.get("author_name"),
                "author_id": comment_data.get("author_id"),
                "parent_comment_id": comment_data.get("parent_comment_id"),
                "parent_comment_text": comment_data.get("parent_comment_text"),

                # Content metadata
                "content_sk": comment_data.get("content_sk"),
                "content_id": comment_data.get("content_id"),
                "content_description": comment_data.get("content_description"),
                "content_title": comment_data.get("content_title"),
                "channel_sk": comment_data.get("channel_sk"),
                "channel_name": comment_data.get("channel_name"),
                "channel_display_name": comment_data.get("channel_display_name"),

                # Comment text
                "comment_text": comment_data.get("comment_text", ""),

                # Processing metadata
                "processing_errors": [],
                "success": True
            }

            final_state = self.workflow.invoke(initial_state)

            return dict(final_state)

        except Exception as e:
            logger.error(f"Workflow execution error: {str(e)}")
            return {
                **comment_data,
                "success": False,
                "processing_errors": [str(e)],
                "processing_status": "workflow_error"
            }

    def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process a batch of social media comments.

        Comments are processed sequentially; a failure on one comment does
        not stop the batch (process_comment catches its own errors).

        Args:
            comments: List of comment dictionaries

        Returns:
            List of processed comment dictionaries
        """
        results = []
        total = len(comments)

        for idx, comment in enumerate(comments, 1):
            logger.info(f"Processing comment {idx}/{total} (SK: {comment.get('comment_sk')})")
            result = self.process_comment(comment)
            results.append(result)

        logger.info(f"Batch processing complete: {total} comments processed")
        return results
processing_brand_sentiment/workflow/orchestrator.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Brand Analysis Workflow Orchestrator using LangGraph.
3
+
4
+ Coordinates the 4-agent pipeline:
5
+ 1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (no LLM)
6
+ 2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1)
7
+ 3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2)
8
+ 4. OutputValidatorAgent - Rule-based validation (no LLM)
9
+
10
+ Architecture v4.0:
11
+ - Separation of concerns: extraction vs analysis
12
+ - Strict validation at every step
13
+ - Structured data flow between agents
14
+ - Conservative relevance determination
15
+ """
16
+
17
+ from typing import Dict, Any, List, TypedDict, Annotated, Optional
18
+ import operator
19
+ import json
20
+ import os
21
+ from langgraph.graph import StateGraph, END
22
+ import logging
23
+
24
+ from .agents.content_preprocessor_agent import ContentPreprocessorAgent
25
+ from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent
26
+ from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent
27
+ from .agents.output_validator_agent import OutputValidatorAgent
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class BrandAnalysisState(TypedDict):
    """
    State definition for the brand analysis workflow v4.0.

    This state flows through all agents, accumulating data at each step.
    """
    # ============== Source Identifiers ==============
    post_id: int
    thread_id: int
    post_author_id: int

    # ============== Original Content ==============
    post_content: str
    original_content: str

    # ============== Thread Context ==============
    thread_title: str
    thread_first_post: str
    thread_started_at: Any  # raw timestamp from source; exact type not fixed here — TODO confirm
    category_title: str
    category_topic: str

    # ============== Timestamps ==============
    post_created_at: Any

    # ============== Preprocessor Output ==============
    cleaned_content: str
    quoted_content: str  # may be set to None by the preprocessing node despite the str annotation
    has_quote: bool
    quoted_author: str   # may likewise be None when no quote is present
    raw_thread_context: str  # Raw context for extraction agent
    is_empty: bool

    # Language detection
    detected_language: str
    language_code: str
    is_english: bool
    language_confidence: str
    language_detection_skipped: bool

    # Preliminary relevance (keyword-based)
    preliminary_relevant: bool
    needs_relevance_validation: bool
    relevance_keywords_found: List[str]
    relevance_type: str
    has_primary_keywords: bool

    # Initial detections
    products_detected: List[str]
    competitors_detected: List[str]

    # ============== Extraction Agent Output ==============
    is_relevant: bool
    relevance_confidence: str
    relevance_reason: str
    extraction_performed: bool

    # Extracted facts
    products_mentioned: List[str]
    sabian_mention_context: str  # primary_focus, significant_mention, casual_mention, comparison_context
    author_role: str  # current_owner, past_owner, potential_buyer, never_owned, unknown
    competitors_mentioned: List[str]
    thread_context_summary: str  # NEW: Summarized context for storage and analysis

    # ============== Sentiment Analyzer Output ==============
    sentiment_level: str
    emotion_type: str
    sentiment_confidence: str
    sarcasm_detected: bool

    # Product information
    product_attributes: List[str]

    # Competitive intelligence
    competitor_products_owned: List[str]
    comparison_type: str

    # Customer journey (AUTHOR PERSPECTIVE ONLY)
    intents: List[str]
    purchase_stage: str
    decision_drivers: List[str]
    pain_points: List[str]
    delight_factors: List[str]

    # Analysis notes
    analysis_notes: str
    analysis_skipped: bool
    analysis_skip_reason: str

    # ============== Validator Output ==============
    validation_passed: bool
    validation_errors: List[str]
    validation_warnings: List[str]
    validation_flags: List[str]
    processing_status: str  # completed, completed_with_flags, validation_failed

    # ============== Processing Metadata ==============
    # NOTE(review): the operator.add reducer appends whatever a node returns
    # for this key onto the existing list; nodes return the already
    # accumulated list, which may duplicate entries — verify against the
    # LangGraph version in use.
    processing_errors: Annotated[List[str], operator.add]
    success: bool
131
+
132
+
133
+ class BrandAnalysisWorkflow:
134
+ """
135
+ LangGraph-based workflow for brand sentiment analysis v4.0.
136
+
137
+ Pipeline:
138
+ 1. Content Preprocessor (no LLM)
139
+ 2. Relevance & Extraction Agent (LLM #1)
140
+ 3. Sentiment Analyzer Agent (LLM #2) - only for relevant posts
141
+ 4. Output Validator (no LLM)
142
+ """
143
+
144
+ def __init__(
145
+ self,
146
+ workflow_config: Dict[str, Any],
147
+ brand_config: Dict[str, Any],
148
+ analysis_categories: Dict[str, Any],
149
+ api_key: str
150
+ ):
151
+ """
152
+ Initialize the workflow with agents and configuration.
153
+
154
+ Args:
155
+ workflow_config: Workflow and agent configuration
156
+ brand_config: Brand-specific configuration
157
+ analysis_categories: Analysis category definitions
158
+ api_key: OpenAI API key
159
+ """
160
+ self.workflow_config = workflow_config
161
+ self.brand_config = brand_config
162
+ self.analysis_categories = analysis_categories
163
+ self.api_key = api_key
164
+
165
+ # Initialize agents
166
+ self._init_agents()
167
+
168
+ # Build the workflow graph
169
+ self.workflow = self._build_workflow()
170
+
171
+ logger.info("BrandAnalysisWorkflow v4.0 initialized successfully")
172
+
173
+ def _init_agents(self) -> None:
174
+ """Initialize all agents with their configurations."""
175
+ agents_config = self.workflow_config.get("agents", {})
176
+
177
+ # 1. Content Preprocessor Agent (no LLM)
178
+ preprocessor_config = agents_config.get("preprocessor", {})
179
+ self.preprocessor = ContentPreprocessorAgent(
180
+ preprocessor_config,
181
+ self.brand_config
182
+ )
183
+
184
+ # 2. Relevance & Extraction Agent (LLM #1)
185
+ extraction_config = agents_config.get("relevance_extraction",
186
+ agents_config.get("relevance_validator", {}) # Fallback to old config
187
+ )
188
+ self.extraction_agent = SabianRelevanceExtractionAgent(
189
+ extraction_config,
190
+ self.api_key,
191
+ self.brand_config,
192
+ self.analysis_categories
193
+ )
194
+
195
+ # 3. Sentiment Analyzer Agent (LLM #2)
196
+ analyzer_config = agents_config.get("sentiment_analyzer",
197
+ agents_config.get("brand_analyzer", {}) # Fallback to old config
198
+ )
199
+ self.sentiment_analyzer = SabianSentimentAnalyzerAgent(
200
+ analyzer_config,
201
+ self.api_key,
202
+ self.brand_config,
203
+ self.analysis_categories
204
+ )
205
+
206
+ # 4. Output Validator Agent (no LLM)
207
+ validator_config = agents_config.get("output_validator", {})
208
+ self.output_validator = OutputValidatorAgent(
209
+ validator_config,
210
+ self.brand_config,
211
+ self.analysis_categories
212
+ )
213
+
214
+ logger.info("All 4 agents initialized")
215
+
216
+ def _build_workflow(self) -> StateGraph:
217
+ """
218
+ Build the LangGraph workflow.
219
+
220
+ Flow:
221
+ preprocessing -> extraction -> (analysis if relevant) -> validation -> END
222
+
223
+ Returns:
224
+ Compiled StateGraph workflow
225
+ """
226
+ workflow = StateGraph(BrandAnalysisState)
227
+
228
+ # Add nodes
229
+ workflow.add_node("preprocessing", self._preprocessing_node)
230
+ workflow.add_node("extraction", self._extraction_node)
231
+ workflow.add_node("analysis", self._analysis_node)
232
+ workflow.add_node("validation", self._validation_node)
233
+
234
+ # Set entry point
235
+ workflow.set_entry_point("preprocessing")
236
+
237
+ # Define edges
238
+ # Preprocessing -> conditional routing
239
+ workflow.add_conditional_edges(
240
+ "preprocessing",
241
+ self._route_after_preprocessing,
242
+ {
243
+ "extract": "extraction",
244
+ "skip_to_validation": "validation"
245
+ }
246
+ )
247
+
248
+ # Extraction -> conditional routing
249
+ workflow.add_conditional_edges(
250
+ "extraction",
251
+ self._route_after_extraction,
252
+ {
253
+ "analyze": "analysis",
254
+ "skip_to_validation": "validation"
255
+ }
256
+ )
257
+
258
+ # Analysis -> validation
259
+ workflow.add_edge("analysis", "validation")
260
+
261
+ # Validation -> END
262
+ workflow.add_edge("validation", END)
263
+
264
+ return workflow.compile()
265
+
266
    def _preprocessing_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
        """
        Preprocessing node: HTML parsing, cleaning, language detection, keyword check.

        Copies the preprocessor's output into the shared state on success;
        on failure only records the error and flips ``success`` to False.
        """
        try:
            input_data = {
                "post_id": state.get("post_id"),
                "post_content": state.get("post_content", ""),
                "thread_title": state.get("thread_title"),
                "thread_first_post": state.get("thread_first_post"),
                "category_title": state.get("category_title"),
                "category_topic": state.get("category_topic")
            }

            result = self.preprocessor.process(input_data)

            if result.get("success", False):
                # Content
                state["cleaned_content"] = result.get("cleaned_content", "")
                state["quoted_content"] = result.get("quoted_content")
                state["has_quote"] = result.get("has_quote", False)
                state["quoted_author"] = result.get("quoted_author")
                state["raw_thread_context"] = result.get("raw_thread_context", "")
                state["is_empty"] = result.get("is_empty", False)
                state["original_content"] = result.get("original_content", state.get("post_content", ""))

                # Language
                state["detected_language"] = result.get("detected_language", "English")
                state["language_code"] = result.get("language_code", "en")
                state["is_english"] = result.get("is_english", True)
                state["language_confidence"] = result.get("language_confidence", "low")
                state["language_detection_skipped"] = result.get("language_detection_skipped", False)

                # Relevance
                state["preliminary_relevant"] = result.get("preliminary_relevant", False)
                state["needs_relevance_validation"] = result.get("needs_relevance_validation", False)
                state["relevance_keywords_found"] = result.get("relevance_keywords_found", [])
                state["relevance_type"] = result.get("relevance_type", "none")
                state["has_primary_keywords"] = result.get("has_primary_keywords", False)

                # Detections
                state["products_detected"] = result.get("products_detected", [])
                state["competitors_detected"] = result.get("competitors_detected", [])

                state["success"] = True
            else:
                error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}"
                # NOTE(review): assigning old + new here combined with the
                # operator.add reducer on processing_errors may duplicate
                # previously recorded errors — confirm against LangGraph docs.
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
                state["success"] = False

            logger.debug(f"Preprocessing complete for post {state.get('post_id')}")
            return state

        except Exception as e:
            error_msg = f"Preprocessing node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False
            return state
325
+
326
    def _extraction_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
        """
        Extraction node: LLM-based relevance validation and fact extraction.
        """
        try:
            input_data = {
                "cleaned_content": state.get("cleaned_content", ""),
                "quoted_content": state.get("quoted_content"),
                "raw_thread_context": state.get("raw_thread_context", ""),
                "relevance_keywords_found": state.get("relevance_keywords_found", []),
                "preliminary_relevant": state.get("preliminary_relevant", False),
                "needs_relevance_validation": state.get("needs_relevance_validation", True),
                "products_detected": state.get("products_detected", []),
                "competitors_detected": state.get("competitors_detected", []),
                "is_english": state.get("is_english", True),
                "detected_language": state.get("detected_language", "English")
            }

            result = self.extraction_agent.process(input_data)

            # Update state with extraction results.  These are written even
            # when the agent reports failure, so downstream routing always
            # has an is_relevant value to inspect.
            state["is_relevant"] = result.get("is_relevant", False)
            state["relevance_confidence"] = result.get("relevance_confidence", "low")
            state["relevance_reason"] = result.get("relevance_reason", "")
            state["extraction_performed"] = result.get("extraction_performed", True)

            # Extracted facts
            state["products_mentioned"] = result.get("products_mentioned", [])
            state["sabian_mention_context"] = result.get("sabian_mention_context")
            state["author_role"] = result.get("author_role", "unknown")
            state["competitors_mentioned"] = result.get("competitors_mentioned", [])
            state["thread_context_summary"] = result.get("thread_context_summary", "")

            if not result.get("success", False) and result.get("error"):
                state["processing_errors"] = state.get("processing_errors", []) + [result["error"]]

            logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}")
            return state

        except Exception as e:
            error_msg = f"Extraction node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            # Fail closed: treat the post as not relevant on agent errors.
            state["is_relevant"] = False
            state["relevance_confidence"] = "low"
            return state
372
+
373
+ def _analysis_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
374
+ """
375
+ Analysis node: Deep sentiment and intent analysis for relevant posts.
376
+ """
377
+ try:
378
+ input_data = {
379
+ "cleaned_content": state.get("cleaned_content", ""),
380
+ "is_relevant": state.get("is_relevant", True),
381
+ "is_english": state.get("is_english", True),
382
+ "detected_language": state.get("detected_language", "English"),
383
+ "products_mentioned": state.get("products_mentioned", []),
384
+ "sabian_mention_context": state.get("sabian_mention_context"),
385
+ "author_role": state.get("author_role", "unknown"),
386
+ "competitors_mentioned": state.get("competitors_mentioned", []),
387
+ "thread_context_summary": state.get("thread_context_summary", "")
388
+ }
389
+
390
+ result = self.sentiment_analyzer.process(input_data)
391
+
392
+ if result.get("success", False):
393
+ # Sentiment
394
+ state["sentiment_level"] = result.get("sentiment_level")
395
+ state["emotion_type"] = result.get("emotion_type")
396
+ state["sentiment_confidence"] = result.get("sentiment_confidence", "medium")
397
+ state["sarcasm_detected"] = result.get("sarcasm_detected", False)
398
+
399
+ # Products
400
+ state["product_attributes"] = result.get("product_attributes", [])
401
+
402
+ # Competitive
403
+ state["competitor_products_owned"] = result.get("competitor_products_owned", [])
404
+ state["comparison_type"] = result.get("comparison_type")
405
+
406
+ # Journey
407
+ state["intents"] = result.get("intents", [])
408
+ state["purchase_stage"] = result.get("purchase_stage")
409
+ state["decision_drivers"] = result.get("decision_drivers", [])
410
+ state["pain_points"] = result.get("pain_points", [])
411
+ state["delight_factors"] = result.get("delight_factors", [])
412
+
413
+ # Notes
414
+ state["analysis_notes"] = result.get("analysis_notes", "")
415
+ state["analysis_skipped"] = result.get("analysis_skipped", False)
416
+ state["analysis_skip_reason"] = result.get("analysis_skip_reason", "")
417
+ else:
418
+ error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}"
419
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
420
+
421
+ logger.debug(f"Analysis complete for post {state.get('post_id')}")
422
+ return state
423
+
424
+ except Exception as e:
425
+ error_msg = f"Analysis node error: {str(e)}"
426
+ logger.error(error_msg)
427
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
428
+ return state
429
+
430
+ def _validation_node(self, state: BrandAnalysisState) -> BrandAnalysisState:
431
+ """
432
+ Validation node: Rule-based validation and anomaly detection.
433
+ """
434
+ try:
435
+ result = self.output_validator.process(dict(state))
436
+
437
+ state["validation_passed"] = result.get("validation_passed", True)
438
+ state["validation_errors"] = result.get("validation_errors", [])
439
+ state["validation_warnings"] = result.get("validation_warnings", [])
440
+ state["validation_flags"] = result.get("validation_flags", [])
441
+ state["processing_status"] = result.get("processing_status", "completed")
442
+
443
+ # Set overall success
444
+ has_errors = len(state.get("processing_errors", [])) > 0
445
+ state["success"] = not has_errors or state.get("is_relevant") is not None
446
+
447
+ logger.debug(f"Validation complete: status={state['processing_status']}")
448
+ return state
449
+
450
+ except Exception as e:
451
+ error_msg = f"Validation node error: {str(e)}"
452
+ logger.error(error_msg)
453
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
454
+ state["validation_passed"] = False
455
+ state["processing_status"] = "validation_failed"
456
+ state["success"] = False
457
+ return state
458
+
459
+ def _route_after_preprocessing(self, state: BrandAnalysisState) -> str:
460
+ """
461
+ Determine routing after preprocessing.
462
+ """
463
+ # If empty content, skip to validation
464
+ if state.get("is_empty", False):
465
+ state["is_relevant"] = False
466
+ state["relevance_reason"] = "Empty content"
467
+ return "skip_to_validation"
468
+
469
+ # If not English, skip to validation
470
+ if not state.get("is_english", True):
471
+ state["is_relevant"] = False
472
+ state["relevance_reason"] = f"Non-English: {state.get('detected_language')}"
473
+ return "skip_to_validation"
474
+
475
+ # If no keywords found and no need for validation, skip
476
+ if (not state.get("preliminary_relevant", False) and
477
+ not state.get("needs_relevance_validation", False)):
478
+ state["is_relevant"] = False
479
+ state["relevance_reason"] = "No relevant keywords found"
480
+ return "skip_to_validation"
481
+
482
+ # Otherwise, go to extraction
483
+ return "extract"
484
+
485
+ def _route_after_extraction(self, state: BrandAnalysisState) -> str:
486
+ """
487
+ Determine routing after extraction.
488
+ """
489
+ if state.get("is_relevant", False):
490
+ return "analyze"
491
+ return "skip_to_validation"
492
+
493
+ def process_post(self, post_data: Dict[str, Any]) -> Dict[str, Any]:
494
+ """
495
+ Process a single forum post through the workflow.
496
+
497
+ Args:
498
+ post_data: Dictionary containing post data
499
+
500
+ Returns:
501
+ Dictionary with processed results
502
+ """
503
+ try:
504
+ initial_state = {
505
+ "post_id": post_data.get("post_id"),
506
+ "thread_id": post_data.get("thread_id"),
507
+ "post_author_id": post_data.get("post_author_id"),
508
+ "post_content": post_data.get("post_content", ""),
509
+ "thread_title": post_data.get("thread_title"),
510
+ "thread_first_post": post_data.get("thread_first_post"),
511
+ "thread_started_at": post_data.get("thread_started_at"),
512
+ "category_title": post_data.get("category_title"),
513
+ "category_topic": post_data.get("category_topic"),
514
+ "post_created_at": post_data.get("post_created_at"),
515
+ "processing_errors": [],
516
+ "success": True
517
+ }
518
+
519
+ final_state = self.workflow.invoke(initial_state)
520
+
521
+ return dict(final_state)
522
+
523
+ except Exception as e:
524
+ logger.error(f"Workflow execution error: {str(e)}")
525
+ return {
526
+ **post_data,
527
+ "success": False,
528
+ "processing_errors": [str(e)],
529
+ "processing_status": "workflow_error"
530
+ }
531
+
532
+ def process_batch(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
533
+ """
534
+ Process a batch of forum posts.
535
+
536
+ Args:
537
+ posts: List of post dictionaries
538
+
539
+ Returns:
540
+ List of processed post dictionaries
541
+ """
542
+ results = []
543
+ total = len(posts)
544
+
545
+ for idx, post in enumerate(posts, 1):
546
+ logger.info(f"Processing post {idx}/{total} (ID: {post.get('post_id')})")
547
+ result = self.process_post(post)
548
+ results.append(result)
549
+
550
+ logger.info(f"Batch processing complete: {total} posts processed")
551
+ return results
processing_comments/.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ **/__pycache__/
2
+ **/*.pyc
3
+ .git
4
+ .gitignore
5
+ .env
6
+ *.log
7
+ dist
8
+ build
processing_comments/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
processing_comments/README.md ADDED
@@ -0,0 +1,726 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comment Processing with Agentic Workflow
2
+
3
+ A scalable, modular system for processing comments from multiple data sources using OpenAI API, LangChain, and LangGraph. The system performs language detection, translation, and context-aware sentiment analysis using an agentic workflow architecture.
4
+
5
+ ## Data Sources Supported
6
+
7
+ - **Social Media Comments**: External platforms (Facebook, Instagram, YouTube, etc.)
8
+ - **Musora Internal Comments**: Comments from Musora internal applications
9
+ - **Extensible Architecture**: Easily add new data sources via configuration
10
+
11
+ ## Features
12
+
13
+ - **Multi-Source Support**: Process comments from multiple data sources with a single codebase
14
+ - **Configuration-Driven**: Add new data sources without code changes
15
+ - **Parent Comment Context**: Automatically includes parent comment text for reply analysis
16
+ - **Modular Agent Architecture**: Extensible base classes for easy addition of new agents
17
+ - **Language Detection**: Hybrid approach using lingua library for fast English detection, with LLM fallback for non-English languages
18
+ - **Translation**: High-quality translation for non-English comments using OpenAI models
19
+ - **Context-Aware Sentiment Analysis**:
20
+ - Uses content description for context
21
+ - Includes parent comment text when analyzing replies
22
+ - Multi-label intent classification
23
+ - **LangGraph Workflow**: Flexible graph-based orchestration of agent operations
24
+ - **Snowflake Integration**: Seamless data fetching and storage with source-specific tables
25
+ - **Parallel Processing**: Multiprocessing support for high-performance batch processing
26
+ - **Dynamic Batch Sizing**: Intelligent batch size calculation based on workload and available resources
27
+ - **Independent Batch Execution**: Each batch processes and stores results independently
28
+ - **Comprehensive Logging**: Detailed logging for monitoring and debugging
29
+ - **Scalable Configuration**: Easy-to-modify sentiment categories and intents via JSON config
30
+
31
+ ## Project Structure
32
+
33
+ ```
34
+ musora-sentiment-analysis/
35
+ ├── agents/
36
+ │ ├── __init__.py
37
+ │ ├── base_agent.py # Base class for all agents
38
+ │ ├── language_detection_agent.py # Language detection agent
39
+ │ ├── translation_agent.py # Translation agent
40
+ │ └── sentiment_analysis_agent.py # Sentiment analysis agent (parent context support)
41
+ ├── workflow/
42
+ │ ├── __init__.py
43
+ │ └── comment_processor.py # LangGraph workflow orchestrator
44
+ ├── sql/
45
+ │ ├── fetch_comments.sql # Query for social media comments (with parent join)
46
+ │ ├── fetch_musora_comments.sql # Query for Musora internal comments (with parent join)
47
+ │ ├── create_ml_features_table.sql # Schema for social media table (with parent fields)
48
+ │ ├── init_musora_table.sql # Initialize empty Musora table (run first!)
49
+ │ └── create_musora_ml_features_table.sql # Full Musora schema with views (optional)
50
+ ├── config_files/
51
+ │ ├── data_sources_config.json # Data source configuration (NEW)
52
+ │ ├── sentiment_config.json # Configuration for agents and workflow
53
+ │ └── sentiment_analysis_config.json # Sentiment categories and intents
54
+ ├── logs/ # Processing logs (auto-created)
55
+ ├── LLM.py # LLM utility class
56
+ ├── SnowFlakeConnection.py # Snowflake connection handler
57
+ ├── main.py # Main execution script (multi-source support)
58
+ ├── requirements.txt # Python dependencies
59
+ ├── .env # Environment variables (not in git)
60
+ ├── README.md # This file
61
+ └── CLAUDE.md # Detailed technical documentation
62
+ ```
63
+
64
+ ## Setup
65
+
66
+ ### 1. Install Dependencies
67
+
68
+ ```bash
69
+ pip install -r requirements.txt
70
+ ```
71
+
72
+ ### 2. Configure Environment Variables
73
+
74
+ Ensure your `.env` file contains the required credentials:
75
+
76
+ ```env
77
+ # Snowflake
78
+ SNOWFLAKE_USER=your_user
79
+ SNOWFLAKE_PASSWORD=your_password
80
+ SNOWFLAKE_ACCOUNT=your_account
81
+ SNOWFLAKE_ROLE=your_role
82
+ SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB
83
+ SNOWFLAKE_WAREHOUSE=your_warehouse
84
+ SNOWFLAKE_SCHEMA=ML_FEATURES
85
+
86
+ # OpenAI
87
+ OPENAI_API_KEY=your_openai_key
88
+ ```
89
+
90
+ ### 3. Create Snowflake Tables
91
+
92
+ Run the SQL scripts to create the output tables:
93
+
94
+ ```bash
95
+ # Execute the SQL files in Snowflake
96
+ # For social media comments (if not already exists)
97
+ sql/create_ml_features_table.sql
98
+
99
+ # For Musora internal comments - INITIAL SETUP (First time only)
100
+ # This creates the empty table structure
101
+ sql/init_musora_table.sql
102
+ ```
103
+
104
+ **Note**: Run `init_musora_table.sql` before the first Musora comments processing run. After that, you can optionally run `create_musora_ml_features_table.sql` to create the additional views if needed.
105
+
106
+ ## Usage
107
+
108
+ ### Basic Usage (Process All Data Sources)
109
+
110
+ Process unprocessed comments from all enabled data sources:
111
+
112
+ ```bash
113
+ python main.py
114
+ ```
115
+
116
+ This will:
117
+ - Process all enabled data sources (social media and Musora comments)
118
+ - Fetch only comments that haven't been processed yet
119
+ - Process them through the workflow using parallel workers (CPU count - 2, max 5)
120
+ - Each batch processes and stores to Snowflake independently
121
+ - Append new results to the existing tables (no overwrite)
122
+
123
+ ### Process Specific Data Source
124
+
125
+ Process only social media comments:
126
+
127
+ ```bash
128
+ python main.py --data-source social_media
129
+ ```
130
+
131
+ Process only Musora internal comments:
132
+
133
+ ```bash
134
+ python main.py --data-source musora_comments
135
+ ```
136
+
137
+ ### Process Limited Number of Comments
138
+
139
+ Limit applies per data source:
140
+
141
+ ```bash
142
+ # Process 100 comments from each enabled data source
143
+ python main.py --limit 100
144
+
145
+ # Process 100 comments from only Musora source
146
+ python main.py --limit 100 --data-source musora_comments
147
+ ```
148
+
149
+ ### Sequential Processing (Debug Mode)
150
+
151
+ For debugging purposes, use sequential processing:
152
+
153
+ ```bash
154
+ python main.py --limit 100 --sequential
155
+ ```
156
+
157
+ This processes all comments in a single batch, making it easier to debug issues.
158
+
159
+ ### First Run for New Data Source
160
+
161
+ For the first run of Musora comments:
162
+
163
+ 1. **First**: Run the initialization SQL script in Snowflake:
164
+ ```sql
165
+ -- Execute in Snowflake
166
+ sql/init_musora_table.sql
167
+ ```
168
+
169
+ 2. **Then**: Run the processing with overwrite flag:
170
+ ```bash
171
+ python main.py --overwrite --data-source musora_comments --limit 100
172
+ ```
173
+
174
+ **Why two steps?**
175
+ - The fetch query checks for already-processed comments by querying the output table
176
+ - On first run, that table doesn't exist, causing an error
177
+ - The init script creates the empty table structure first
178
+ - Then processing can run normally
179
+
180
+ **Warning**: Overwrite will replace all existing data in the output table. Only use for initial table creation or when reprocessing from scratch.
181
+
182
+ ### Custom Configuration File
183
+
184
+ ```bash
185
+ python main.py --config path/to/custom_config.json
186
+ ```
187
+
188
+ ### Command-Line Arguments
189
+
190
+ - `--limit N`: Process only N comments per data source (default: 10000)
191
+ - `--overwrite`: Overwrite existing Snowflake table (default: append mode)
192
+ - `--config PATH`: Custom configuration file path
193
+ - `--sequential`: Use sequential processing instead of parallel (for debugging)
194
+ - `--data-source SOURCE`: Process only specific data source (e.g., social_media, musora_comments)
195
+
196
+ ### Parallel Processing
197
+
198
+ The system uses multiprocessing to process comments in parallel:
199
+
200
+ **Worker Calculation**:
201
+ - Number of workers: `CPU count - 2` (max 5 workers)
202
+ - Leaves CPU cores available for system operations
203
+ - Example: 8-core system → 5 workers (capped at max)
204
+
205
+ **Dynamic Batch Sizing**:
206
+ - Batch size calculated as: `total_comments / num_workers`
207
+ - Minimum batch size: 20 comments
208
+ - Maximum batch size: 1000 comments
209
+ - Batches ≤ 20 comments are not split
210
+
211
+ **Independent Execution**:
212
+ - Each batch runs in a separate process
213
+ - Batches store to Snowflake immediately upon completion
214
+ - No waiting for all batches to complete
215
+ - Failed batches don't affect successful ones
216
+
217
+ **Performance**:
218
+ - Expected speedup: ~1.8-4.5x depending on number of workers
219
+ - Real-time progress reporting as batches complete
220
+ - Processing time and average per comment displayed in summary
221
+
222
+ ### Incremental Processing
223
+
224
+ The pipeline is designed for incremental processing:
225
+ - **Automatic deduplication**: SQL query excludes comments already in `COMMENT_SENTIMENT_FEATURES`
226
+ - **Append-only by default**: New results are added without overwriting existing data
227
+ - **Failed comment retry**: Comments with `success=False` are not stored and will be retried in future runs
228
+ - **Run regularly**: Safe to run daily/weekly to process new comments
229
+
230
+ ## Configuration
231
+
232
+ ### Data Sources Configuration
233
+
234
+ The `config_files/data_sources_config.json` file defines available data sources:
235
+
236
+ ```json
237
+ {
238
+ "data_sources": {
239
+ "social_media": {
240
+ "name": "Social Media Comments",
241
+ "enabled": true,
242
+ "sql_query_file": "sql/fetch_comments.sql",
243
+ "output_config": {
244
+ "table_name": "COMMENT_SENTIMENT_FEATURES",
245
+ "database": "SOCIAL_MEDIA_DB",
246
+ "schema": "ML_FEATURES"
247
+ }
248
+ },
249
+ "musora_comments": {
250
+ "name": "Musora Internal Comments",
251
+ "enabled": true,
252
+ "sql_query_file": "sql/fetch_musora_comments.sql",
253
+ "output_config": {
254
+ "table_name": "MUSORA_COMMENT_SENTIMENT_FEATURES",
255
+ "database": "SOCIAL_MEDIA_DB",
256
+ "schema": "ML_FEATURES"
257
+ },
258
+ "additional_fields": [
259
+ "PERMALINK_URL",
260
+ "THUMBNAIL_URL"
261
+ ]
262
+ }
263
+ }
264
+ }
265
+ ```
266
+
267
+ **To add a new data source**: Simply add a new entry to this config file and create the corresponding SQL query file.
268
+
269
+ ### Agent Configuration
270
+
271
+ The `config_files/sentiment_config.json` file controls agent behavior:
272
+
273
+ ```json
274
+ {
275
+ "agents": {
276
+ "language_detection": {
277
+ "model": "gpt-5-nano",
278
+ "temperature": 0.0,
279
+ "max_retries": 3
280
+ },
281
+ "translation": {
282
+ "model": "gpt-5-nano",
283
+ "temperature": 0.3,
284
+ "max_retries": 3
285
+ },
286
+ "sentiment_analysis": {
287
+ "model": "gpt-5-nano",
288
+ "temperature": 0.2,
289
+ "max_retries": 3
290
+ }
291
+ },
292
+ "workflow": {
293
+ "description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)",
294
+ "parallel_processing": {
295
+ "enabled": true,
296
+ "worker_calculation": "CPU count - 2, max 5 workers",
297
+ "min_batch_size": 20,
298
+ "max_batch_size": 1000
299
+ }
300
+ },
301
+ "snowflake": {
302
+ "output_table": "COMMENT_SENTIMENT_FEATURES",
303
+ "database": "SOCIAL_MEDIA_DB",
304
+ "schema": "ML_FEATURES"
305
+ }
306
+ }
307
+ ```
308
+
309
+ **Note**: Batch size is now calculated dynamically and no longer needs to be configured manually.
310
+
311
+ ### Sentiment Categories Configuration
312
+
313
+ The `config_files/sentiment_analysis_config.json` file defines sentiment categories and intents (easily extensible):
314
+
315
+ ```json
316
+ {
317
+ "sentiment_polarity": {
318
+ "categories": [
319
+ {"value": "very_positive", "label": "Very Positive", "description": "..."},
320
+ {"value": "positive", "label": "Positive", "description": "..."},
321
+ {"value": "neutral", "label": "Neutral", "description": "..."},
322
+ {"value": "negative", "label": "Negative", "description": "..."},
323
+ {"value": "very_negative", "label": "Very Negative", "description": "..."}
324
+ ]
325
+ },
326
+ "intent": {
327
+ "categories": [
328
+ {"value": "praise", "label": "Praise", "description": "..."},
329
+ {"value": "question", "label": "Question", "description": "..."},
330
+ {"value": "request", "label": "Request", "description": "..."},
331
+ {"value": "feedback_negative", "label": "Negative Feedback", "description": "..."},
332
+ {"value": "suggestion", "label": "Suggestion", "description": "..."},
333
+ {"value": "humor_sarcasm", "label": "Humor/Sarcasm", "description": "..."},
334
+ {"value": "off_topic", "label": "Off Topic", "description": "..."},
335
+ {"value": "spam_selfpromo", "label": "Spam/Self-Promotion", "description": "..."}
336
+ ]
337
+ },
338
+ "reply_policy": {
339
+ "requires_reply_intents": ["question", "request"],
340
+ "description": "Comments with these intents should be flagged for reply"
341
+ },
342
+ "intent_settings": {
343
+ "multi_label": true,
344
+ "description": "Intent can have multiple labels as a comment can express multiple intents"
345
+ }
346
+ }
347
+ ```
348
+
349
+ ## Adding New Agents
350
+
351
+ The system is designed for easy extensibility. To add a new agent:
352
+
353
+ ### 1. Create Agent Class
354
+
355
+ ```python
356
+ from agents.base_agent import BaseAgent
357
+ from typing import Dict, Any
358
+
359
+ class MyNewAgent(BaseAgent):
360
+ def __init__(self, config: Dict[str, Any], api_key: str):
361
+ super().__init__("MyNewAgent", config)
362
+ # Initialize your agent-specific components
363
+
364
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
365
+ # Validate input data
366
+ return True
367
+
368
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
369
+ # Implement your agent logic
370
+ return {"success": True, "result": "..."}
371
+ ```
372
+
373
+ ### 2. Update Workflow
374
+
375
+ Add the agent to `workflow/comment_processor.py`:
376
+
377
+ ```python
378
+ # Add to CommentState TypedDict
379
+ new_agent_result: str
380
+
381
+ # Add node
382
+ workflow.add_node("my_new_agent", self._my_new_agent_node)
383
+
384
+ # Add edges
385
+ workflow.add_edge("translation", "my_new_agent")
386
+ workflow.add_edge("my_new_agent", END)
387
+ ```
388
+
389
+ ### 3. Update Configuration
390
+
391
+ Add agent config to `sentiment_config.json`:
392
+
393
+ ```json
394
+ {
395
+ "agents": {
396
+ "my_new_agent": {
397
+ "name": "MyNewAgent",
398
+ "model": "gpt-4o-mini",
399
+ "temperature": 0.5,
400
+ "max_retries": 3
401
+ }
402
+ }
403
+ }
404
+ ```
405
+
406
+ ## Output Schema
407
+
408
+ ### Social Media Comments Table
409
+ Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES`
410
+
411
+ ### Musora Comments Table
412
+ Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES`
413
+
414
+ ### Common Columns (Both Tables)
415
+
416
+ | Column | Type | Description |
417
+ |--------|------|-------------|
418
+ | COMMENT_SK | NUMBER | Surrogate key from source |
419
+ | COMMENT_ID | VARCHAR | Platform comment ID |
420
+ | ORIGINAL_TEXT | VARCHAR | Original comment text |
421
+ | **PARENT_COMMENT_ID** | **VARCHAR** | **ID of parent comment if this is a reply** |
422
+ | **PARENT_COMMENT_TEXT** | **VARCHAR** | **Text of parent comment for context** |
423
+ | DETECTED_LANGUAGE | VARCHAR | Detected language name |
424
+ | LANGUAGE_CODE | VARCHAR | ISO 639-1 code |
425
+ | IS_ENGLISH | BOOLEAN | Is comment in English |
426
+ | TRANSLATED_TEXT | VARCHAR | English translation |
427
+ | TRANSLATION_PERFORMED | BOOLEAN | Was translation performed |
428
+ | SENTIMENT_POLARITY | VARCHAR | Sentiment (very_positive, positive, neutral, negative, very_negative) |
429
+ | INTENT | VARCHAR | Multi-label intents (comma-separated) |
430
+ | REQUIRES_REPLY | BOOLEAN | Does comment need a response |
431
+ | SENTIMENT_CONFIDENCE | VARCHAR | Analysis confidence (high, medium, low) |
432
+ | PROCESSING_SUCCESS | BOOLEAN | Processing status |
433
+ | PROCESSED_AT | TIMESTAMP | Processing timestamp |
434
+
435
+ ### Musora-Specific Additional Columns
436
+
437
+ | Column | Type | Description |
438
+ |--------|------|-------------|
439
+ | PERMALINK_URL | VARCHAR | Web URL path of the content |
440
+ | THUMBNAIL_URL | VARCHAR | Thumbnail URL of the content |
441
+
442
+ ### Available Views
443
+
444
+ **Social Media:**
445
+ - `VW_COMMENTS_REQUIRING_REPLY`: Comments that need responses (includes parent comment info)
446
+ - `VW_SENTIMENT_DISTRIBUTION`: Sentiment and intent statistics by channel (includes reply comment count)
447
+ - `VW_NON_ENGLISH_COMMENTS`: Filtered view of non-English comments
448
+
449
+ **Musora:**
450
+ - `VW_MUSORA_COMMENTS_REQUIRING_REPLY`: Musora comments needing responses
451
+ - `VW_MUSORA_SENTIMENT_DISTRIBUTION`: Musora sentiment and intent statistics
452
+ - `VW_MUSORA_NON_ENGLISH_COMMENTS`: Non-English Musora comments
453
+
454
+ ## Workflow Architecture
455
+
456
+ The system uses LangGraph to create a flexible, state-based workflow:
457
+
458
+ ```
459
+ ┌─────────────────────┐
460
+ │ Fetch Comments │
461
+ │ from Snowflake │
462
+ │ (Unprocessed Only) │
463
+ └──────────┬──────────┘
464
+
465
+
466
+ ┌─────────────────────┐
467
+ │ Language Detection │
468
+ │ Agent │
469
+ └──────────┬──────────┘
470
+
471
+
472
+ ┌────┴────┐
473
+ │ English?│
474
+ └────┬────┘
475
+
476
+ ┌─────┴─────┐
477
+ │ │
478
+ Yes No
479
+ │ │
480
+ │ ▼
481
+ │ ┌─────────────┐
482
+ │ │ Translation │
483
+ │ │ Agent │
484
+ │ └──────┬──────┘
485
+ │ │
486
+ └─────┬─────┘
487
+
488
+
489
+ ┌──────────────────┐
490
+ │ Sentiment │
491
+ │ Analysis Agent │
492
+ └─────────┬────────┘
493
+
494
+
495
+ ┌──────────────┐
496
+ │Store Results │
497
+ │to Snowflake │
498
+ │(Append Mode) │
499
+ └──────────────┘
500
+ ```
501
+
502
+ **Note**: The fetch step automatically excludes comments already present in `COMMENT_SENTIMENT_FEATURES`, enabling incremental processing.
503
+
504
+ ## Logging
505
+
506
+ Logs are automatically created in the `logs/` directory with timestamps:
507
+
508
+ ```
509
+ logs/comment_processing_20251001_143022.log
510
+ ```
511
+
512
+ ## Adding New Data Sources
513
+
514
+ The system is designed to make adding new data sources easy:
515
+
516
+ ### Steps to Add a New Source:
517
+
518
+ 1. **Update Configuration** (`config_files/data_sources_config.json`):
519
+ ```json
520
+ "your_new_source": {
521
+ "name": "Your New Source Name",
522
+ "enabled": true,
523
+ "sql_query_file": "sql/fetch_your_source.sql",
524
+ "output_config": {
525
+ "table_name": "YOUR_SOURCE_SENTIMENT_FEATURES",
526
+ "database": "SOCIAL_MEDIA_DB",
527
+ "schema": "ML_FEATURES"
528
+ },
529
+ "additional_fields": ["FIELD1", "FIELD2"] // Optional
530
+ }
531
+ ```
532
+
533
+ 2. **Create SQL Query File** (`sql/fetch_your_source.sql`):
534
+ - Fetch comments with consistent column names
535
+ - Include self-join for parent comments if available
536
+ - Exclude already-processed comments (LEFT JOIN with output table)
537
+
538
+ 3. **Create Table Initialization Script** (`sql/init_your_source_table.sql`):
539
+ - Creates empty table structure
540
+ - Base schema on `init_musora_table.sql`
541
+ - Add source-specific fields as needed
542
+ - **Run this in Snowflake FIRST before processing**
543
+
544
+ 4. **Create Full Schema** (optional):
545
+ - Base schema on `create_musora_ml_features_table.sql`
546
+ - Include views and indexes
547
+
548
+ 5. **Run First Time**:
549
+ ```bash
550
+ # Step 1: Run init script in Snowflake
551
+ sql/init_your_source_table.sql
552
+
553
+ # Step 2: Process first batch
554
+ python main.py --overwrite --data-source your_new_source --limit 100
555
+ ```
556
+
557
+ **No code changes required!**
558
+
559
+ ## Best Practices
560
+
561
+ 1. **Testing**: Always test with `--limit` flag first (e.g., `--limit 100`)
562
+ 2. **New Data Sources**: Test new sources with `--sequential --limit 100` first
563
+ 3. **Debugging**: Use `--sequential` flag for easier debugging of processing issues
564
+ 4. **Incremental Processing**: Run regularly without `--overwrite` to process only new comments
565
+ 5. **Monitoring**: Check logs for processing errors and batch completion
566
+ 6. **Performance**: Use default parallel mode for production workloads
567
+ 7. **Extensibility**: Follow the base agent pattern for consistency
568
+ 8. **Error Handling**: All agents include robust error handling
569
+ 9. **Failed Comments**: Review logs for failed comments - they'll be automatically retried in future runs
570
+ 10. **Resource Management**: System automatically adapts to available CPU resources
571
+ 11. **Parent Comments**: Ensure SQL queries include parent comment joins for best accuracy
572
+
573
+ ## Sentiment Analysis Features
574
+
575
+ ### Multi-Label Intent Classification
576
+
577
+ The system supports **multi-label intent classification**, meaning a single comment can have multiple intents:
578
+
579
+ - **Example**: "This is amazing! What scale are you using?" → `["praise", "question"]`
580
+ - **Example**: "Love this but can you make a tutorial on it?" → `["praise", "request"]`
581
+
582
+ ### Context-Aware Analysis with Parent Comment Support
583
+
584
+ The sentiment analysis agent provides rich context understanding:
585
+
586
+ 1. **Content Context**: Uses the `content_description` field to understand what the comment is about
587
+ 2. **Parent Comment Context** (NEW): When analyzing reply comments, the system:
588
+ - Automatically detects when a comment is a reply
589
+ - Fetches the parent comment text from the database
590
+ - Includes parent comment in the LLM prompt
591
+ - Explicitly instructs the LLM that this is a reply comment
592
+ - Results in more accurate sentiment and intent classification
593
+
594
+ **Example**:
595
+ - Parent Comment: "Does anyone know how to play this riff?"
596
+ - Reply Comment: "Yes!"
597
+ - Without parent context: Might be classified as unclear/off-topic
598
+ - With parent context: Correctly classified as answering a question
599
+
600
+ This dramatically improves accuracy for:
601
+ - Short reply comments ("Yes", "Thanks!", "Agreed")
602
+ - Sarcastic replies (context crucial for understanding)
603
+ - Continuation of discussions
604
+ - Agreement/disagreement comments
605
+
606
+ ### Failure Handling & Reprocessing
607
+
608
+ Comments that fail sentiment analysis (missing critical fields like sentiment_polarity or intents) are:
609
+ - Marked as `success=False` in the workflow
610
+ - **NOT stored in Snowflake**
611
+ - **Automatically available for reprocessing** in future runs
612
+
613
+ This ensures only successfully processed comments are stored, while failed comments remain available for retry.
614
+
615
+ ### Incremental Processing & Deduplication
616
+
617
+ The pipeline automatically handles incremental processing:
618
+ - **SQL-level deduplication**: Query excludes comments already in `COMMENT_SENTIMENT_FEATURES` using `LEFT JOIN`
619
+ - **Automatic retry**: Failed comments (not stored) are automatically retried on next run
620
+ - **Append-only mode**: Default behavior appends new records without overwriting
621
+ - **Production-ready**: Safe to run daily/weekly/monthly to process new comments
622
+
623
+ ### Scalable Configuration
624
+
625
+ To add or modify sentiment categories or intents:
626
+
627
+ 1. Edit `config_files/sentiment_analysis_config.json`
628
+ 2. Add/modify categories in the `sentiment_polarity` or `intent` sections
629
+ 3. Update `reply_policy.requires_reply_intents` if needed
630
+ 4. No code changes required!
631
+
632
+ ## Future Extensions
633
+
634
+ The modular architecture supports easy addition of:
635
+
636
+ - Topic classification agent
637
+ - Entity extraction agent
638
+ - Engagement score prediction agent
639
+ - Named entity recognition agent
640
+
641
+ Simply create a new agent inheriting from `BaseAgent` and add it to the workflow graph.
642
+
643
+ ## Troubleshooting
644
+
645
+ ### Issue: "Object does not exist or not authorized" on First Run
646
+
647
+ **Error**: `Object 'SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES' does not exist or not authorized`
648
+
649
+ **Cause**: The fetch query tries to check for already-processed comments, but the output table doesn't exist yet on first run.
650
+
651
+ **Solution**:
652
+ 1. Run the initialization script first:
653
+ ```sql
654
+ -- Execute in Snowflake
655
+ sql/init_musora_table.sql
656
+ ```
657
+ 2. Then run the processing:
658
+ ```bash
659
+ python main.py --overwrite --data-source musora_comments --limit 100
660
+ ```
661
+
662
+ ### Issue: API Rate Limits
663
+
664
+ If hitting API rate limits, reduce the number of parallel workers or process fewer comments:
665
+ ```bash
666
+ # Process fewer comments at a time
667
+ python main.py --limit 500
668
+
669
+ # Or use sequential mode
670
+ python main.py --sequential --limit 100
671
+ ```
672
+
673
+ ### Issue: Memory Issues
674
+
675
+ Process in smaller batches using `--limit`:
676
+ ```bash
677
+ python main.py --limit 500
678
+ ```
679
+
680
+ ### Issue: Debugging Processing Errors
681
+
682
+ Use sequential mode to debug issues more easily:
683
+ ```bash
684
+ python main.py --sequential --limit 50
685
+ ```
686
+
687
+ This processes all comments in a single batch with clearer error messages.
688
+
689
+ ### Issue: Connection Timeouts
690
+
691
+ Check Snowflake credentials in `.env` and network connectivity.
692
+
693
+ ### Issue: Parallel Processing Not Working
694
+
695
+ If multiprocessing issues occur, use sequential mode:
696
+ ```bash
697
+ python main.py --sequential
698
+ ```
699
+
700
+ ## Performance
701
+
702
+ ### Expected Speedup
703
+
704
+ Parallel processing provides significant performance improvements:
705
+
706
+ - **Sequential**: 1x (baseline)
707
+ - **2 workers**: ~1.8-1.9x faster
708
+ - **5 workers**: ~4-4.5x faster
709
+
710
+ Speedup isn't perfectly linear due to:
711
+ - Snowflake connection overhead
712
+ - LLM API rate limits (shared across workers)
713
+ - I/O operations
714
+
715
+ ### Monitoring Performance
716
+
717
+ The processing summary includes:
718
+ - Total processing time
719
+ - Average time per comment
720
+ - Number of workers used
721
+ - Batch size calculations
722
+ - Failed batches (if any)
723
+
724
+ ## License
725
+
726
+ Internal use only - Musora sentiment analysis project.
processing_comments/SnowFlakeConnection.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This class create a connection to Snowflake, run queries (read and write)
3
+ """
4
+ import json
5
+ import os
6
+ from snowflake.snowpark import Session
7
+ from dotenv import load_dotenv
8
+ import logging
9
+ logger = logging.getLogger()
10
+ load_dotenv()
11
+
12
class SnowFlakeConn:
    """Wrapper around a Snowflake Snowpark session for reading and writing data.

    Credentials are read from environment variables (loaded via dotenv at
    module import time). A session is opened eagerly in ``__init__``;
    call :meth:`close_connection` when finished.
    """

    def __init__(self):
        # Eagerly open the Snowpark session so all methods can use it.
        self.session = self.connect_to_snowflake()

    # =========================================================
    def connect_to_snowflake(self):
        """Create and return a Snowpark Session from env-var credentials.

        :return: An open ``snowflake.snowpark.Session``
        """
        # --- Snowflake connection via env vars ---
        conn = dict(
            user=self.get_credential("SNOWFLAKE_USER"),
            password=self.get_credential("SNOWFLAKE_PASSWORD"),
            account=self.get_credential("SNOWFLAKE_ACCOUNT"),
            role=self.get_credential("SNOWFLAKE_ROLE"),
            database=self.get_credential("SNOWFLAKE_DATABASE"),
            warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"),
            schema=self.get_credential("SNOWFLAKE_SCHEMA"),
        )

        session = Session.builder.configs(conn).create()
        return session

    # =========================================================
    def get_credential(self, key):
        """Return the credential stored under *key* in the environment.

        :param key: Environment variable name
        :return: The value, or None if the variable is unset
        """
        return os.getenv(key)

    # =========================================================
    def run_read_query(self, query, data):
        """Executes a SQL query on Snowflake that fetches data.

        :param query: SQL query string to execute
        :param data: Human-readable name of the table/dataset (for logging only)
        :return: Pandas dataframe containing the query results (column names
                 lower-cased), or None if the query fails
        """

        # Connect to Snowflake
        try:
            dataframe = self.session.sql(query).to_pandas()
            dataframe.columns = dataframe.columns.str.lower()
            print(f"reading {data} table successfully")
            return dataframe
        except Exception as e:
            # Fix: previous message said "creating/updating table" on a read path.
            print(f"Error reading {data} table: {e}")
            return None

    # =========================================================
    def store_df_to_snowflake(self, table_name, dataframe, database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False):
        """Writes a pandas DataFrame to a Snowflake table.

        :param table_name: Target table name (normalized to upper case)
        :param dataframe: Pandas DataFrame to store (column names upper-cased)
        :param database: Target database name
        :param schema: Target schema name
        :param overwrite: If True, replace existing table contents; otherwise append
        :return: None
        """

        try:
            self.session.use_database(database)
            self.session.use_schema(schema)

            dataframe = dataframe.reset_index(drop=True)
            # Snowflake identifiers are conventionally upper case.
            dataframe.columns = dataframe.columns.str.upper()

            self.session.write_pandas(df=dataframe,
                                      table_name=table_name.strip().upper(),
                                      auto_create_table=True,
                                      overwrite=overwrite,
                                      use_logical_type=True)
            print(f"Data inserted into {table_name} successfully.")

        except Exception as e:
            print(f"Error in creating/updating/inserting table: {e}")

    # =========================================================
    def execute_sql_file(self, file_path):
        """
        Executes SQL queries from a file
        :param file_path: Path to SQL file
        :return: Query result or None for DDL/DML
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sql_content = file.read()

            result = self.session.sql(sql_content).collect()
            print(f"Successfully executed SQL from {file_path}")
            return result
        except Exception as e:
            print(f"Error executing SQL file {file_path}: {e}")
            return None

    # =========================================================
    def execute_query(self, query, description="query"):
        """
        Executes a SQL query and returns results
        :param query: SQL query string
        :param description: Description of the query for logging
        :return: Query results, or None on error
        """
        try:
            result = self.session.sql(query).collect()
            print(f"Successfully executed {description}")
            return result
        except Exception as e:
            print(f"Error executing {description}: {e}")
            return None

    # =========================================================
    def get_data(self, data):
        # Placeholder: fetch any sort of data based on requirement
        # (comments, contents, etc.) — not implemented yet.
        pass

    # =========================================================
    def close_connection(self):
        """Close the underlying Snowpark session."""
        self.session.close()
121
+
processing_comments/agents/README.md ADDED
@@ -0,0 +1,1571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Agents Architecture Documentation
2
+
3
+ ## Table of Contents
4
+ - [Overview](#overview)
5
+ - [Agent Architecture](#agent-architecture)
6
+ - [Existing Agents](#existing-agents)
7
+ - [How Agents Work](#how-agents-work)
8
+ - [Adding New Agents](#adding-new-agents)
9
+ - [Modifying Existing Agents](#modifying-existing-agents)
10
+ - [Configuration System](#configuration-system)
11
+ - [Best Practices](#best-practices)
12
+ - [Troubleshooting](#troubleshooting)
13
+
14
+ ## Overview
15
+
16
+ The agent system in this project is built on a modular, extensible architecture that processes social media comments through a series of specialized agents. Each agent performs a specific task (language detection, translation, sentiment analysis) and is orchestrated through a LangGraph workflow.
17
+
18
+ ### Key Design Principles
19
+
20
+ 1. **Modularity**: Each agent handles a single responsibility
21
+ 2. **Extensibility**: Easy to add new agents without modifying existing code
22
+ 3. **Consistency**: All agents inherit from a common base class
23
+ 4. **Configuration-Driven**: Agent behavior controlled through JSON config files
24
+ 5. **Error Resilience**: Robust error handling at every level
25
+
26
+ ### Technology Stack
27
+
28
+ - **LangChain**: For LLM interactions and agent framework
29
+ - **LangGraph**: For workflow orchestration
30
+ - **OpenAI API**: LLM backend for NLP tasks
31
+ - **Lingua**: Fast language detection library
32
+ - **Python 3.x**: Core language
33
+
34
+ ## Agent Architecture
35
+
36
+ ### Directory Structure
37
+
38
+ ```
39
+ agents/
40
+ ├── __init__.py # Module exports
41
+ ├── base_agent.py # Abstract base class
42
+ ├── language_detection_agent.py # Language detection agent
43
+ ├── translation_agent.py # Translation agent
44
+ ├── sentiment_analysis_agent.py # Sentiment analysis agent
45
+ └── README.md # This file
46
+ ```
47
+
48
+ ### Base Agent Class
49
+
50
+ All agents inherit from `BaseAgent` (`base_agent.py`), which provides:
51
+
52
+ ```python
53
+ class BaseAgent(ABC):
54
+ """Abstract base class for all agents"""
55
+
56
+ # Common attributes
57
+ - name: str # Agent name
58
+ - config: Dict[str, Any] # Configuration dictionary
59
+ - model: str # LLM model to use
60
+ - temperature: float # LLM temperature
61
+ - max_retries: int # Maximum retry attempts
62
+
63
+ # Abstract methods (must be implemented)
64
+ @abstractmethod
65
+ def process(input_data: Dict) -> Dict
66
+ @abstractmethod
67
+ def validate_input(input_data: Dict) -> bool
68
+
69
+ # Common methods (inherited)
70
+ def get_name() -> str
71
+ def get_config() -> Dict
72
+ def log_processing(message: str, level: str)
73
+ def handle_error(error: Exception, context: str) -> Dict
74
+ ```
75
+
76
+ ### Workflow Integration
77
+
78
+ Agents are orchestrated through `workflow/comment_processor.py` using LangGraph:
79
+
80
+ ```
81
+ ┌─────────────────────┐
82
+ │ Language Detection │
83
+ │ Agent │
84
+ └──────────┬──────────┘
85
+
86
+
87
+ ┌────┴────┐
88
+ │ English?│
89
+ └────┬────┘
90
+
91
+ ┌─────┴─────┐
92
+ │ │
93
+ Yes No
94
+ │ │
95
+ │ ▼
96
+ │ ┌─────────────┐
97
+ │ │ Translation │
98
+ │ │ Agent │
99
+ │ └──────┬──────┘
100
+ │ │
101
+ └─────┬─────┘
102
+
103
+
104
+ ┌──────────────────┐
105
+ │ Sentiment │
106
+ │ Analysis Agent │
107
+ └──────────────────┘
108
+ ```
109
+
110
+ ## Existing Agents
111
+
112
+ ### 1. Language Detection Agent
113
+
114
+ **File**: `language_detection_agent.py`
115
+
116
+ **Purpose**: Detects the language of comment text using a hybrid approach.
117
+
118
+ **Strategy**:
119
+ - Uses **Lingua library** for fast English detection
120
+ - Falls back to **LLM** for non-English languages (higher accuracy)
121
+ - Returns language name, ISO code, and confidence level
122
+
123
+ **Key Methods**:
124
+ ```python
125
+ def detect_with_lingua(text: str) -> tuple[str, str, bool]
126
+ # Fast detection using lingua library
127
+ # Returns: (language_code, language_name, is_english)
128
+
129
+ def detect_with_llm(text: str) -> Dict[str, Any]
130
+ # LLM-based detection for nuanced analysis
131
+ # Returns: {language, language_code, confidence, has_text}
132
+
133
+ def process(input_data: Dict) -> Dict
134
+ # Main processing: lingua first, LLM if not English
135
+ ```
136
+
137
+ **Configuration** (`sentiment_config.json`):
138
+ ```json
139
+ {
140
+ "language_detection": {
141
+ "model": "gpt-5-nano",
142
+ "temperature": 0.0,
143
+ "max_retries": 3
144
+ }
145
+ }
146
+ ```
147
+
148
+ **Input Requirements**:
149
+ - `comment_text`: str
150
+
151
+ **Output**:
152
+ - `language`: str (e.g., "English", "Spanish")
153
+ - `language_code`: str (ISO 639-1, e.g., "en", "es")
154
+ - `is_english`: bool
155
+ - `confidence`: str ("high", "medium", "low")
156
+ - `detection_method`: str ("lingua", "llm", "default")
157
+ - `has_text`: bool
158
+
159
+ ### 2. Translation Agent
160
+
161
+ **File**: `translation_agent.py`
162
+
163
+ **Purpose**: Translates non-English comments to English using LLM.
164
+
165
+ **Strategy**:
166
+ - Skips translation if already English
167
+ - Uses LLM for context-aware, high-quality translation
168
+ - Preserves tone, intent, emojis, and special characters
169
+ - Specialized for music/education social media content
170
+
171
+ **Key Methods**:
172
+ ```python
173
+ def translate_text(text: str, source_language: str) -> Dict
174
+ # LLM-based translation with context preservation
175
+ # Returns: {translated_text, translation_confidence, notes}
176
+
177
+ def process(input_data: Dict) -> Dict
178
+ # Main processing: checks is_english, translates if needed
179
+ ```
180
+
181
+ **Configuration**:
182
+ ```json
183
+ {
184
+ "translation": {
185
+ "model": "gpt-5-nano",
186
+ "temperature": 0.3,
187
+ "max_retries": 3
188
+ }
189
+ }
190
+ ```
191
+
192
+ **Input Requirements**:
193
+ - `comment_text`: str
194
+ - `is_english`: bool
195
+ - `language`: str (optional, for context)
196
+
197
+ **Output**:
198
+ - `translated_text`: str
199
+ - `translation_performed`: bool
200
+ - `translation_confidence`: str
201
+ - `translation_notes`: str
202
+
203
+ ### 3. Sentiment Analysis Agent
204
+
205
+ **File**: `sentiment_analysis_agent.py`
206
+
207
+ **Purpose**: Analyzes sentiment polarity, intent, and determines if reply is needed.
208
+
209
+ **Strategy**:
210
+ - Uses content description for context
211
+ - Supports parent comment context for reply analysis
212
+ - Multi-label intent classification
213
+ - Differentiates genuine vs rhetorical/sarcastic questions
214
+ - Platform-aware analysis (YouTube, Facebook, Instagram)
215
+
216
+ **Key Features**:
217
+ - **Context-Aware**: Uses content description and parent comment
218
+ - **Multi-Label**: Can assign multiple intents to a single comment
219
+ - **Reply Policy**: Flags comments requiring responses
220
+ - **Rhetorical Detection**: Identifies sarcastic/rhetorical questions
221
+
222
+ **Key Methods**:
223
+ ```python
224
+ def _build_context_string(
225
+ content_description: str,
226
+ parent_comment_text: str = None,
227
+ platform: str = None,
228
+ content_title: str = None
229
+ ) -> str
230
+ # Builds context for LLM prompt
231
+ # Handles YouTube title+description vs other platforms
232
+
233
+ def analyze_sentiment(
234
+ comment_text: str,
235
+ content_description: str,
236
+ parent_comment_text: str = None,
237
+ platform: str = None,
238
+ content_title: str = None
239
+ ) -> Dict
240
+ # Performs sentiment analysis with full context
241
+ # Returns: {sentiment_polarity, intent, requires_reply, confidence, analysis_notes}
242
+
243
+ def process(input_data: Dict) -> Dict
244
+ # Main processing: validates input, analyzes sentiment
245
+ ```
246
+
247
+ **Configuration**:
248
+ Uses two config files:
249
+
250
+ 1. **Agent Config** (`sentiment_config.json`):
251
+ ```json
252
+ {
253
+ "sentiment_analysis": {
254
+ "model": "gpt-5-nano",
255
+ "temperature": 0.2,
256
+ "max_retries": 3
257
+ }
258
+ }
259
+ ```
260
+
261
+ 2. **Categories Config** (`sentiment_analysis_config.json`):
262
+ ```json
263
+ {
264
+ "sentiment_polarity": {
265
+ "categories": [
266
+ {"value": "very_positive", "label": "Very Positive", "description": "..."},
267
+ {"value": "positive", "label": "Positive", "description": "..."},
268
+ {"value": "neutral", "label": "Neutral", "description": "..."},
269
+ {"value": "negative", "label": "Negative", "description": "..."},
270
+ {"value": "very_negative", "label": "Very Negative", "description": "..."}
271
+ ]
272
+ },
273
+ "intent": {
274
+ "categories": [
275
+ {"value": "praise", "description": "..."},
276
+ {"value": "question", "description": "..."},
277
+ {"value": "request", "description": "..."},
278
+ {"value": "feedback_negative", "description": "..."},
279
+ {"value": "suggestion", "description": "..."},
280
+ {"value": "humor_sarcasm", "description": "..."},
281
+ {"value": "off_topic", "description": "..."},
282
+ {"value": "spam_selfpromo", "description": "..."}
283
+ ]
284
+ },
285
+ "reply_policy": {
286
+ "requires_reply_intents": ["question", "request"],
287
+ "not_include": ["humor_sarcasm"]
288
+ }
289
+ }
290
+ ```
291
+
292
+ **Input Requirements**:
293
+ - `comment_text`: str
294
+ - `content_description`: str
295
+ - `parent_comment_text`: str (optional)
296
+ - `platform`: str (optional, e.g., "youtube", "facebook")
297
+ - `content_title`: str (optional, mainly for YouTube)
298
+
299
+ **Output**:
300
+ - `sentiment_polarity`: str (one of: very_positive, positive, neutral, negative, very_negative)
301
+ - `intent`: str (comma-separated list, e.g., "praise, question")
302
+ - `requires_reply`: bool
303
+ - `sentiment_confidence`: str ("high", "medium", "low")
304
+ - `analysis_notes`: str (1-2 sentence summary)
305
+ - `success`: bool (False if critical fields missing)
306
+
307
+ ### Common Patterns Across All Agents
308
+
309
+ 1. **JSON Response Parsing**: All agents have a `_parse_llm_json_response()` method to handle markdown-wrapped JSON
310
+ 2. **Error Handling**: All use `handle_error()` from base class
311
+ 3. **Logging**: All use `log_processing()` for consistent logging
312
+ 4. **Validation**: All implement `validate_input()` before processing
313
+ 5. **State Preservation**: All preserve original input data in output
314
+
315
+ ## How Agents Work
316
+
317
+ ### Workflow Execution Flow
318
+
319
+ 1. **Initialization** (`CommentProcessingWorkflow.__init__`):
320
+ ```python
321
+ # Load configurations
322
+ lang_detect_config = config["agents"]["language_detection"]
323
+ translation_config = config["agents"]["translation"]
324
+ sentiment_config = config["agents"]["sentiment_analysis"]
325
+
326
+ # Initialize agents
327
+ self.language_agent = LanguageDetectionAgent(lang_detect_config, api_key)
328
+ self.translation_agent = TranslationAgent(translation_config, api_key)
329
+ self.sentiment_agent = SentimentAnalysisAgent(sentiment_config, api_key, sentiment_categories)
330
+
331
+ # Build workflow graph
332
+ self.workflow = self._build_workflow()
333
+ ```
334
+
335
+ 2. **Workflow Graph** (`_build_workflow()`):
336
+ ```python
337
+ workflow = StateGraph(CommentState)
338
+
339
+ # Add nodes (agents)
340
+ workflow.add_node("language_detection", self._language_detection_node)
341
+ workflow.add_node("translation", self._translation_node)
342
+ workflow.add_node("sentiment_analysis", self._sentiment_analysis_node)
343
+
344
+ # Define edges (control flow)
345
+ workflow.set_entry_point("language_detection")
346
+ workflow.add_conditional_edges(
347
+ "language_detection",
348
+ self._should_translate,
349
+ {"translate": "translation", "skip_translation": "sentiment_analysis"}
350
+ )
351
+ workflow.add_edge("translation", "sentiment_analysis")
352
+ workflow.add_edge("sentiment_analysis", END)
353
+
354
+ return workflow.compile()
355
+ ```
356
+
357
+ 3. **Node Execution** (Example: `_language_detection_node`):
358
+ ```python
359
+ def _language_detection_node(self, state: CommentState) -> CommentState:
360
+ try:
361
+ # Prepare input
362
+ input_data = {"comment_text": state["comment_text"]}
363
+
364
+ # Process with agent
365
+ result = self.language_agent.process(input_data)
366
+
367
+ # Update state
368
+ if result.get("success", False):
369
+ state["language"] = result.get("language", "English")
370
+ state["language_code"] = result.get("language_code", "en")
371
+ state["is_english"] = result.get("is_english", True)
372
+ # ... more fields
373
+ else:
374
+ # Handle error, set defaults
375
+ state["processing_errors"].append(result.get("error"))
376
+
377
+ return state
378
+ except Exception as e:
379
+ # Error handling
380
+ state["processing_errors"].append(str(e))
381
+ return state
382
+ ```
383
+
384
+ 4. **Decision Points** (Example: `_should_translate`):
385
+ ```python
386
+ def _should_translate(self, state: CommentState) -> str:
387
+ if state.get("is_english", True) or not state.get("has_text", True):
388
+ # Set defaults for skipped translation
389
+ state["translated_text"] = state["comment_text"]
390
+ state["translation_performed"] = False
391
+ return "skip_translation"
392
+ else:
393
+ return "translate"
394
+ ```
395
+
396
+ 5. **Comment Processing** (`process_comment()`):
397
+ ```python
398
+ def process_comment(self, comment_data: Dict) -> Dict:
399
+ # Initialize state
400
+ initial_state = {
401
+ "comment_sk": comment_data.get("comment_sk"),
402
+ "comment_text": comment_data.get("comment_text"),
403
+ # ... all fields
404
+ "processing_errors": [],
405
+ "success": True
406
+ }
407
+
408
+ # Run workflow
409
+ final_state = self.workflow.invoke(initial_state)
410
+
411
+ # Merge and return
412
+ return dict(final_state)
413
+ ```
414
+
415
+ ### State Management
416
+
417
+ The workflow uses a `CommentState` TypedDict to pass data between agents:
418
+
419
+ ```python
420
+ class CommentState(TypedDict):
421
+ # Input fields
422
+ comment_sk: int
423
+ comment_id: str
424
+ comment_text: str
425
+ # ... more fields
426
+
427
+ # Processing fields (populated by agents)
428
+ language: str
429
+ language_code: str
430
+ is_english: bool
431
+ translated_text: str
432
+ sentiment_polarity: str
433
+ intent: str
434
+ # ... more fields
435
+
436
+ # Metadata
437
+ processing_errors: Annotated[List[str], operator.add]
438
+ success: bool
439
+ ```
440
+
441
+ ### Error Handling Strategy
442
+
443
+ 1. **Agent Level**: Each agent returns `{"success": False, "error": "..."}` on failure
444
+ 2. **Node Level**: Nodes catch exceptions, set defaults, append to `processing_errors`
445
+ 3. **Workflow Level**: Workflow continues even if an agent fails (graceful degradation)
446
+ 4. **Critical Failures**: Sentiment agent marks `success=False` if critical fields missing (comment not stored)
447
+
448
+ ## Adding New Agents
449
+
450
+ ### Step-by-Step Guide
451
+
452
+ #### Step 1: Create the Agent Class
453
+
454
+ Create a new file in the `agents/` directory (e.g., `topic_classification_agent.py`):
455
+
456
+ ```python
457
+ """
458
+ Topic Classification Agent
459
+ Extracts topics and themes from comments
460
+ """
461
+
462
+ from typing import Dict, Any
463
+ import json
464
+ from langchain_openai import ChatOpenAI
465
+ from langchain.schema import HumanMessage, SystemMessage
466
+ from agents.base_agent import BaseAgent
467
+ import logging
468
+
469
+ logger = logging.getLogger(__name__)
470
+
471
+
472
+ class TopicClassificationAgent(BaseAgent):
473
+ """
474
+ Agent that classifies comments into predefined topics/themes.
475
+ """
476
+
477
+ def __init__(self, config: Dict[str, Any], api_key: str, topic_categories: Dict[str, Any]):
478
+ """
479
+ Initialize the Topic Classification Agent.
480
+
481
+ Args:
482
+ config: Configuration dictionary
483
+ api_key: OpenAI API key
484
+ topic_categories: Dictionary with topic categories
485
+ """
486
+ super().__init__("TopicClassificationAgent", config)
487
+ self.api_key = api_key
488
+ self.topic_categories = topic_categories
489
+ self.llm = ChatOpenAI(
490
+ model=self.model,
491
+ temperature=self.temperature,
492
+ api_key=self.api_key
493
+ )
494
+
495
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
496
+ """
497
+ Validate that input contains required fields.
498
+
499
+ Args:
500
+ input_data: Input dictionary
501
+
502
+ Returns:
503
+ True if valid, False otherwise
504
+ """
505
+ required_fields = ["comment_text"]
506
+ return all(field in input_data for field in required_fields)
507
+
508
+ def classify_topics(self, comment_text: str) -> Dict[str, Any]:
509
+ """
510
+ Classify comment into topics using LLM.
511
+
512
+ Args:
513
+ comment_text: The comment text to analyze
514
+
515
+ Returns:
516
+ Dictionary with topic classification results
517
+ """
518
+ # Build topic options from config
519
+ topic_options = "\n".join([
520
+ f"- {cat['value']}: {cat['description']}"
521
+ for cat in self.topic_categories["topics"]["categories"]
522
+ ])
523
+
524
+ system_prompt = f"""You are an expert at classifying music-related comments into topics.
525
+
526
+ Available Topics:
527
+ {topic_options}
528
+
529
+ Return your response in JSON format with the following fields:
530
+ - topics: array of topic values (multi-label, can have multiple topics)
531
+ - confidence: your confidence level (high, medium, low)
532
+ - reasoning: brief explanation of your classification
533
+ """
534
+
535
+ user_prompt = f"""Classify this comment into relevant topics:
536
+
537
+ Comment: "{comment_text}"
538
+
539
+ Return JSON only."""
540
+
541
+ try:
542
+ messages = [
543
+ SystemMessage(content=system_prompt),
544
+ HumanMessage(content=user_prompt)
545
+ ]
546
+
547
+ response = self.llm.invoke(messages)
548
+ result = self._parse_llm_json_response(response.content)
549
+
550
+ topics = result.get("topics", [])
551
+ if isinstance(topics, str):
552
+ topics = [topics]
553
+
554
+ topic_str = ", ".join(topics) if topics else None
555
+
556
+ return {
557
+ "success": True,
558
+ "topics": topic_str,
559
+ "topic_confidence": result.get("confidence", "medium"),
560
+ "topic_reasoning": result.get("reasoning", "")
561
+ }
562
+
563
+ except json.JSONDecodeError as e:
564
+ self.log_processing(f"JSON decode error: {str(e)}", "warning")
565
+ return {
566
+ "success": False,
567
+ "error": str(e)
568
+ }
569
+ except Exception as e:
570
+ self.log_processing(f"Topic classification failed: {str(e)}", "error")
571
+ return {
572
+ "success": False,
573
+ "error": str(e)
574
+ }
575
+
576
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
577
+ """
578
+ Process comment and extract topics.
579
+
580
+ Args:
581
+ input_data: Dictionary containing comment data
582
+
583
+ Returns:
584
+ Dictionary with topic classification results
585
+ """
586
+ try:
587
+ # Validate input
588
+ if not self.validate_input(input_data):
589
+ return {
590
+ "success": False,
591
+ "error": "Invalid input: missing required fields"
592
+ }
593
+
594
+ comment_text = input_data["comment_text"]
595
+
596
+ self.log_processing(f"Classifying topics for comment", "debug")
597
+
598
+ # Perform classification
599
+ classification_result = self.classify_topics(comment_text)
600
+
601
+ result = {
602
+ "success": classification_result.get("success", False),
603
+ "topics": classification_result.get("topics"),
604
+ "topic_confidence": classification_result.get("topic_confidence"),
605
+ "topic_reasoning": classification_result.get("topic_reasoning", "")
606
+ }
607
+
608
+ if "error" in classification_result:
609
+ result["topic_error"] = classification_result["error"]
610
+
611
+ # Preserve all original data
612
+ for key, value in input_data.items():
613
+ if key not in result:
614
+ result[key] = value
615
+
616
+ return result
617
+
618
+ except Exception as e:
619
+ return self.handle_error(e, "topic_classification")
620
+
621
+ def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
622
+ """
623
+ Parse LLM response that may contain JSON wrapped in markdown code blocks.
624
+
625
+ Args:
626
+ response_content: Raw response content from LLM
627
+
628
+ Returns:
629
+ Parsed JSON dictionary
630
+
631
+ Raises:
632
+ json.JSONDecodeError: If JSON cannot be parsed
633
+ """
634
+ content = response_content.strip()
635
+
636
+ # Check if response is wrapped in markdown code block
637
+ if content.startswith("```json"):
638
+ content = content[7:]
639
+ if content.endswith("```"):
640
+ content = content[:-3]
641
+ content = content.strip()
642
+ elif content.startswith("```"):
643
+ content = content[3:]
644
+ if content.endswith("```"):
645
+ content = content[:-3]
646
+ content = content.strip()
647
+
648
+ return json.loads(content)
649
+ ```
650
+
651
+ #### Step 2: Update `__init__.py`
652
+
653
+ Add your agent to `agents/__init__.py`:
654
+
655
+ ```python
656
+ """
657
+ Agents module for the sentiment analysis workflow.
658
+ Provides modular, extensible agents for various NLP tasks.
659
+ """
660
+
661
+ from agents.base_agent import BaseAgent
662
+ from agents.language_detection_agent import LanguageDetectionAgent
663
+ from agents.translation_agent import TranslationAgent
664
+ from agents.sentiment_analysis_agent import SentimentAnalysisAgent
665
+ from agents.topic_classification_agent import TopicClassificationAgent # ADD THIS
666
+
667
+ __all__ = [
668
+ "BaseAgent",
669
+ "LanguageDetectionAgent",
670
+ "TranslationAgent",
671
+ "SentimentAnalysisAgent",
672
+ "TopicClassificationAgent" # ADD THIS
673
+ ]
674
+ ```
675
+
676
+ #### Step 3: Update Configuration Files
677
+
678
+ Add agent configuration to `config_files/sentiment_config.json`:
679
+
680
+ ```json
681
+ {
682
+ "agents": {
683
+ "language_detection": { ... },
684
+ "translation": { ... },
685
+ "sentiment_analysis": { ... },
686
+ "topic_classification": {
687
+ "name": "TopicClassificationAgent",
688
+ "model": "gpt-5-nano",
689
+ "temperature": 0.2,
690
+ "max_retries": 3,
691
+ "description": "Classifies comments into topic categories"
692
+ }
693
+ }
694
+ }
695
+ ```
696
+
697
+ Create topic categories config (or add to existing `sentiment_analysis_config.json`):
698
+
699
+ ```json
700
+ {
701
+ "topics": {
702
+ "categories": [
703
+ {
704
+ "value": "technique",
705
+ "label": "Technique",
706
+ "description": "Playing technique, finger positioning, hand coordination"
707
+ },
708
+ {
709
+ "value": "theory",
710
+ "label": "Music Theory",
711
+ "description": "Scales, chords, harmony, composition theory"
712
+ },
713
+ {
714
+ "value": "equipment",
715
+ "label": "Equipment",
716
+ "description": "Instruments, gear, accessories, software"
717
+ },
718
+ {
719
+ "value": "performance",
720
+ "label": "Performance",
721
+ "description": "Stage presence, live playing, performance anxiety"
722
+ },
723
+ {
724
+ "value": "practice",
725
+ "label": "Practice",
726
+ "description": "Practice routines, discipline, improvement tips"
727
+ }
728
+ ]
729
+ }
730
+ }
731
+ ```
732
+
733
+ #### Step 4: Update Workflow State
734
+
735
+ Add fields to `CommentState` in `workflow/comment_processor.py`:
736
+
737
+ ```python
738
+ class CommentState(TypedDict):
739
+ # ... existing fields ...
740
+
741
+ # Topic classification fields
742
+ topics: str
743
+ topic_confidence: str
744
+ topic_reasoning: str
745
+ ```
746
+
747
+ #### Step 5: Add Workflow Node
748
+
749
+ Add the node method to `CommentProcessingWorkflow` class:
750
+
751
+ ```python
752
+ def _topic_classification_node(self, state: CommentState) -> CommentState:
753
+ """
754
+ Node for topic classification.
755
+
756
+ Args:
757
+ state: Current workflow state
758
+
759
+ Returns:
760
+ Updated state with topic classification results
761
+ """
762
+ try:
763
+ # Prepare input
764
+ input_data = {
765
+ "comment_text": state.get("translated_text", state["comment_text"])
766
+ }
767
+
768
+ # Process with topic classification agent
769
+ result = self.topic_agent.process(input_data)
770
+
771
+ # Update state
772
+ if result.get("success", False):
773
+ state["topics"] = result.get("topics")
774
+ state["topic_confidence"] = result.get("topic_confidence")
775
+ state["topic_reasoning"] = result.get("topic_reasoning", "")
776
+ else:
777
+ error_msg = f"Topic classification failed: {result.get('error', 'Unknown error')}"
778
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
779
+ state["topics"] = None
780
+ state["topic_confidence"] = None
781
+ state["topic_reasoning"] = "Topic classification failed"
782
+
783
+ logger.debug(f"Topics: {state['topics']}")
784
+ return state
785
+
786
+ except Exception as e:
787
+ error_msg = f"Topic classification node error: {str(e)}"
788
+ logger.error(error_msg)
789
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
790
+ state["topics"] = None
791
+ state["topic_confidence"] = None
792
+ state["topic_reasoning"] = "Error during topic classification"
793
+ return state
794
+ ```
795
+
796
+ #### Step 6: Initialize Agent in Workflow
797
+
798
+ Update `__init__` method:
799
+
800
+ ```python
801
+ def __init__(self, config: Dict[str, Any], api_key: str):
802
+ # ... existing initialization ...
803
+
804
+ # Load topic categories
805
+ topic_categories_path = config.get("topic_categories_config", "config_files/topic_categories.json")
806
+ with open(topic_categories_path, 'r') as f:
807
+ topic_categories = json.load(f)
808
+
809
+ # Initialize topic agent
810
+ topic_config = config["agents"]["topic_classification"]
811
+ self.topic_agent = TopicClassificationAgent(topic_config, api_key, topic_categories)
812
+ ```
813
+
814
+ #### Step 7: Update Workflow Graph
815
+
816
+ Modify `_build_workflow()`:
817
+
818
+ ```python
819
+ def _build_workflow(self) -> StateGraph:
820
+ workflow = StateGraph(CommentState)
821
+
822
+ # Add nodes
823
+ workflow.add_node("language_detection", self._language_detection_node)
824
+ workflow.add_node("translation", self._translation_node)
825
+ workflow.add_node("sentiment_analysis", self._sentiment_analysis_node)
826
+ workflow.add_node("topic_classification", self._topic_classification_node) # ADD THIS
827
+
828
+ # Define edges
829
+ workflow.set_entry_point("language_detection")
830
+ workflow.add_conditional_edges(
831
+ "language_detection",
832
+ self._should_translate,
833
+ {"translate": "translation", "skip_translation": "sentiment_analysis"}
834
+ )
835
+ workflow.add_edge("translation", "sentiment_analysis")
836
+ workflow.add_edge("sentiment_analysis", "topic_classification") # ADD THIS
837
+ workflow.add_edge("topic_classification", END) # MODIFY THIS
838
+
839
+ return workflow.compile()
840
+ ```
841
+
842
+ #### Step 8: Update Database Schema
843
+
844
+ Add columns to your Snowflake table:
845
+
846
+ ```sql
847
+ ALTER TABLE COMMENT_SENTIMENT_FEATURES
848
+ ADD COLUMN TOPICS VARCHAR(500),
849
+ TOPIC_CONFIDENCE VARCHAR(20),
850
+ TOPIC_REASONING VARCHAR(1000);
851
+ ```
852
+
853
+ #### Step 9: Test Your Agent
854
+
855
+ Test with a small batch first:
856
+
857
+ ```bash
858
+ python main.py --limit 10 --sequential
859
+ ```
860
+
861
+ Check logs for any errors and verify output in Snowflake.
862
+
863
+ ### Quick Checklist for Adding New Agents
864
+
865
+ - [ ] Create agent class inheriting from `BaseAgent`
866
+ - [ ] Implement `validate_input()` method
867
+ - [ ] Implement `process()` method
868
+ - [ ] Implement `_parse_llm_json_response()` if using LLM
869
+ - [ ] Add agent to `agents/__init__.py`
870
+ - [ ] Add configuration to `sentiment_config.json`
871
+ - [ ] Create/update category config file if needed
872
+ - [ ] Add fields to `CommentState` TypedDict
873
+ - [ ] Create node method in `CommentProcessingWorkflow`
874
+ - [ ] Initialize agent in `__init__`
875
+ - [ ] Add node to workflow graph
876
+ - [ ] Update edges in workflow
877
+ - [ ] Update database schema
878
+ - [ ] Test with small batch
879
+
880
+ ## Modifying Existing Agents
881
+
882
+ ### Common Modifications
883
+
884
+ #### 1. Change LLM Model
885
+
886
+ Update `config_files/sentiment_config.json`:
887
+
888
+ ```json
889
+ {
890
+ "agents": {
891
+ "sentiment_analysis": {
892
+ "model": "gpt-4o", // Change from gpt-5-nano
893
+ "temperature": 0.2,
894
+ "max_retries": 3
895
+ }
896
+ }
897
+ }
898
+ ```
899
+
900
+ No code changes needed! Configuration is loaded dynamically.
901
+
902
+ #### 2. Add New Sentiment Category
903
+
904
+ Update `config_files/sentiment_analysis_config.json`:
905
+
906
+ ```json
907
+ {
908
+ "sentiment_polarity": {
909
+ "categories": [
910
+ // ... existing categories ...
911
+ {
912
+ "value": "mixed",
913
+ "label": "Mixed",
914
+ "description": "Contains both positive and negative elements"
915
+ }
916
+ ]
917
+ }
918
+ }
919
+ ```
920
+
921
+ The agent will automatically include this in prompts. No code changes needed.
922
+
923
+ #### 3. Add New Intent Category
924
+
925
+ Update `config_files/sentiment_analysis_config.json`:
926
+
927
+ ```json
928
+ {
929
+ "intent": {
930
+ "categories": [
931
+ // ... existing categories ...
932
+ {
933
+ "value": "collaboration",
934
+ "label": "Collaboration",
935
+ "description": "Seeking or offering collaboration opportunities"
936
+ }
937
+ ]
938
+ }
939
+ }
940
+ ```
941
+
942
+ #### 4. Modify Reply Policy
943
+
944
+ Update `config_files/sentiment_analysis_config.json`:
945
+
946
+ ```json
947
+ {
948
+ "reply_policy": {
949
+ "requires_reply_intents": ["question", "request", "feedback_negative"], // Added feedback_negative
950
+ "not_include": ["humor_sarcasm", "spam_selfpromo"] // Added spam_selfpromo
951
+ }
952
+ }
953
+ ```
954
+
955
+ #### 5. Adjust Temperature for Better Results
956
+
957
+ If you are getting inconsistent results, adjust the temperature setting:
958
+
959
+ ```json
960
+ {
961
+ "agents": {
962
+ "sentiment_analysis": {
963
+ "model": "gpt-5-nano",
964
+ "temperature": 0.1, // Lower = more consistent, less creative
965
+ "max_retries": 3
966
+ }
967
+ }
968
+ }
969
+ ```
970
+
971
+ #### 6. Add Context to Sentiment Analysis
972
+
973
+ Modify `_build_context_string()` in `sentiment_analysis_agent.py`:
974
+
975
+ ```python
976
+ def _build_context_string(self, content_description: str, parent_comment_text: str = None,
977
+ platform: str = None, content_title: str = None,
978
+ channel_name: str = None) -> str: # ADD channel_name
979
+ """Build context string for sentiment analysis."""
980
+ context_parts = []
981
+
982
+ # ... existing code ...
983
+
984
+ # ADD THIS
985
+ if channel_name:
986
+ context_parts.append(f"Channel: {channel_name}")
987
+
988
+ return "\n".join(context_parts)
989
+ ```
990
+
991
+ Then update the `analyze_sentiment()` method to accept and pass `channel_name`.
992
+
993
+ #### 7. Improve Language Detection Accuracy
994
+
995
+ Modify `language_detection_agent.py` to add more languages to LINGUA_TO_ISO:
996
+
997
+ ```python
998
+ LINGUA_TO_ISO = {
999
+ # ... existing mappings ...
1000
+ Language.VIETNAMESE: "vi",
1001
+ Language.THAI: "th",
1002
+ Language.INDONESIAN: "id",
1003
+ # Add more as needed
1004
+ }
1005
+ ```
1006
+
1007
+ #### 8. Customize Translation Prompt
1008
+
1009
+ Modify `translate_text()` in `translation_agent.py`:
1010
+
1011
+ ```python
1012
+ system_prompt = """You are a professional translator specializing in social media content related to music and education.
1013
+ Translate the given text from the source language to English. The text is a comment on a musical content.
1014
+ Preserve the tone, intent, and any emojis or special characters.
1015
+ For informal social media language, maintain the casual tone in translation.
1016
+
1017
+ // ADD THESE GUIDELINES:
1018
+ Special Instructions:
1019
+ - Preserve musical terminology (e.g., "legato", "staccato") untranslated
1020
+ - Translate instrument names (e.g., "guitarra" → "guitar")
1021
+ - Keep artist names and brand names in original language
1022
+ - Maintain slang and colloquialisms when possible
1023
+
1024
+ Return your response in JSON format with the following fields:
1025
+ - translated_text: The English translation
1026
+ - translation_confidence: Your confidence level (high, medium, low)
1027
+ - notes: Any important notes about the translation (optional)
1028
+ """
1029
+ ```
1030
+
1031
+ #### 9. Add Retry Logic for Failed Analyses
1032
+
1033
+ Modify `process()` in `sentiment_analysis_agent.py`:
1034
+
1035
+ ```python
1036
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
1037
+ try:
1038
+ # ... existing validation code ...
1039
+
1040
+ # ADD RETRY LOGIC
1041
+ max_attempts = self.max_retries
1042
+ for attempt in range(max_attempts):
1043
+ analysis_result = self.analyze_sentiment(
1044
+ comment_text, content_description,
1045
+ parent_comment_text, platform, content_title
1046
+ )
1047
+
1048
+ if analysis_result.get("success"):
1049
+ break
1050
+
1051
+ if attempt < max_attempts - 1:
1052
+ self.log_processing(f"Attempt {attempt + 1} failed, retrying...", "warning")
1053
+
1054
+ # ... rest of existing code ...
1055
+ ```
1056
+
1057
+ #### 10. Add Custom Validation Rules
1058
+
1059
+ Modify `validate_input()` in any agent:
1060
+
1061
+ ```python
1062
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
1063
+ """Validate that input contains required fields."""
1064
+ required_fields = ["comment_text", "content_description"]
1065
+
1066
+ # Check required fields exist
1067
+ if not all(field in input_data for field in required_fields):
1068
+ return False
1069
+
1070
+ # ADD CUSTOM VALIDATION
1071
+ # Ensure comment_text is not empty or too short
1072
+ comment_text = input_data.get("comment_text", "")
1073
+ if not comment_text or len(comment_text.strip()) < 2:
1074
+ self.log_processing("Comment text too short or empty", "warning")
1075
+ return False
1076
+
1077
+ # Ensure content_description exists
1078
+ content_desc = input_data.get("content_description", "")
1079
+ if not content_desc or content_desc.strip() == "":
1080
+ self.log_processing("Content description missing", "warning")
1081
+ return False
1082
+
1083
+ return True
1084
+ ```
1085
+
1086
+ ### Testing Modified Agents
1087
+
1088
+ After making modifications, always test:
1089
+
1090
+ ```bash
1091
+ # Test with a small batch
1092
+ python main.py --limit 10 --sequential
1093
+
1094
+ # Check specific data source
1095
+ python main.py --limit 10 --sequential --data-source social_media
1096
+
1097
+ # Review logs for errors
1098
+ tail -f logs/comment_processing_*.log
1099
+ ```
1100
+
1101
+ ## Configuration System
1102
+
1103
+ ### Configuration Files Overview
1104
+
1105
+ ```
1106
+ config_files/
1107
+ ├── sentiment_config.json # Agent behavior config
1108
+ ├── sentiment_analysis_config.json # Sentiment categories and intents
1109
+ └── data_sources_config.json # Data source configuration
1110
+ ```
1111
+
1112
+ ### Agent Configuration Structure
1113
+
1114
+ **File**: `sentiment_config.json`
1115
+
1116
+ ```json
1117
+ {
1118
+ "agents": {
1119
+ "agent_name": {
1120
+ "name": "AgentClassName",
1121
+ "model": "gpt-5-nano", // LLM model to use
1122
+ "temperature": 0.0, // Creativity (0.0 = deterministic, 1.0 = creative)
1123
+ "max_retries": 3, // Max retry attempts
1124
+ "description": "What this agent does"
1125
+ }
1126
+ },
1127
+ "workflow": {
1128
+ "parallel_processing": {
1129
+ "enabled": true,
1130
+ "worker_calculation": "CPU count - 2, max 5 workers",
1131
+ "min_batch_size": 20,
1132
+ "max_batch_size": 1000
1133
+ }
1134
+ }
1135
+ }
1136
+ ```
1137
+
1138
+ ### Temperature Guidelines
1139
+
1140
+ - **0.0 - 0.1**: Deterministic, consistent (good for classification)
1141
+ - **0.2 - 0.4**: Slight variation, mostly consistent (good for sentiment analysis)
1142
+ - **0.5 - 0.7**: Balanced creativity and consistency (good for translation)
1143
+ - **0.8 - 1.0**: Creative, varied (good for content generation)
1144
+
1145
+ ### Model Selection Guidelines
1146
+
1147
+ - **gpt-5-nano**: Fast, cheap, good for simple tasks
1148
+ - **gpt-4o-mini**: Balanced speed/quality, good for most tasks
1149
+ - **gpt-4o**: High quality, slower, good for complex analysis
1150
+
1151
+ ### Category Configuration Structure
1152
+
1153
+ **File**: `sentiment_analysis_config.json`
1154
+
1155
+ ```json
1156
+ {
1157
+ "category_type": {
1158
+ "categories": [
1159
+ {
1160
+ "value": "machine_readable_value", // Used in code/DB
1161
+ "label": "Human Readable Label", // Used in UI
1162
+ "description": "Detailed description for LLM prompt"
1163
+ }
1164
+ ]
1165
+ }
1166
+ }
1167
+ ```
1168
+
1169
+ ### Loading Configuration in Code
1170
+
1171
+ ```python
1172
+ # In workflow/__init__ or agent __init__
1173
+ import json
1174
+ import os
1175
+
1176
+ # Load agent config
1177
+ with open('config_files/sentiment_config.json', 'r') as f:
1178
+ config = json.load(f)
1179
+
1180
+ agent_config = config["agents"]["agent_name"]
1181
+
1182
+ # Load category config
1183
+ with open('config_files/sentiment_analysis_config.json', 'r') as f:
1184
+ categories = json.load(f)
1185
+
1186
+ sentiment_categories = categories["sentiment_polarity"]["categories"]
1187
+ ```
1188
+
1189
+ ## Best Practices
1190
+
1191
+ ### Agent Development
1192
+
1193
+ 1. **Single Responsibility**: Each agent should do one thing well
1194
+ 2. **Fail Gracefully**: Always return structured error responses
1195
+ 3. **Preserve Data**: Never lose original input data - pass it through
1196
+ 4. **Log Everything**: Use `log_processing()` for debugging
1197
+ 5. **Validate Early**: Check inputs before processing
1198
+ 6. **Configuration Over Code**: Use config files for behavior changes
1199
+ 7. **Test Incrementally**: Test with `--limit 10 --sequential` first
1200
+
1201
+ ### Prompt Engineering
1202
+
1203
+ 1. **Be Specific**: Clearly define expected output format
1204
+ 2. **Use Examples**: Include few-shot examples in prompts
1205
+ 3. **Request JSON**: Always request JSON format for structured data
1206
+ 4. **Handle Edge Cases**: Document edge cases in prompts
1207
+ 5. **Provide Context**: Give LLM all relevant context
1208
+ 6. **Set Constraints**: Clearly define boundaries and limitations
1209
+
1210
+ Example of good prompt structure:
1211
+
1212
+ ```python
1213
+ system_prompt = """You are an expert at [TASK].
1214
+
1215
+ Your task is to:
1216
+ 1. [Step 1]
1217
+ 2. [Step 2]
1218
+ 3. [Step 3]
1219
+
1220
+ Context: [Explain the context]
1221
+
1222
+ Rules:
1223
+ - Rule 1
1224
+ - Rule 2
1225
+ - Rule 3
1226
+
1227
+ Examples:
1228
+ - Input: "..." → Output: {...}
1229
+ - Input: "..." → Output: {...}
1230
+
1231
+ Return your response in JSON format with the following fields:
1232
+ - field1: description
1233
+ - field2: description
1234
+ """
1235
+ ```
1236
+
1237
+ ### Error Handling
1238
+
1239
+ 1. **Try-Catch Everything**: Wrap all processing in try-catch
1240
+ 2. **Specific Error Messages**: Make errors actionable
1241
+ 3. **Graceful Degradation**: Continue workflow even if one agent fails
1242
+ 4. **Error Accumulation**: Collect errors in `processing_errors` list
1243
+ 5. **Critical vs Non-Critical**: Distinguish between recoverable and fatal errors
1244
+
1245
+ Example:
1246
+
1247
+ ```python
1248
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
1249
+ try:
1250
+ # Validate
1251
+ if not self.validate_input(input_data):
1252
+ return {
1253
+ "success": False,
1254
+ "error": "Invalid input: missing required fields",
1255
+ **input_data # Preserve original data
1256
+ }
1257
+
1258
+ # Process
1259
+ result = self.do_processing(input_data)
1260
+
1261
+ # Check result
1262
+ if not result.get("success"):
1263
+ return {
1264
+ "success": False,
1265
+ "error": result.get("error", "Unknown error"),
1266
+ **input_data
1267
+ }
1268
+
1269
+ # Return success
1270
+ return {
1271
+ "success": True,
1272
+ "output_field": result["output"],
1273
+ **input_data
1274
+ }
1275
+
1276
+ except Exception as e:
1277
+ return self.handle_error(e, "process")
1278
+ ```
1279
+
1280
+ ### Testing
1281
+
1282
+ 1. **Unit Test Agents**: Test agents independently before integration
1283
+ 2. **Small Batches**: Always test with `--limit 10` first
1284
+ 3. **Sequential Mode**: Use `--sequential` for debugging
1285
+ 4. **Check Logs**: Review logs after every test run
1286
+ 5. **Validate Output**: Check Snowflake results
1287
+ 6. **Test Edge Cases**: Empty text, emojis only, very long text, special characters
1288
+
1289
+ Test script example:
1290
+
1291
+ ```python
1292
+ # test_agent.py
1293
+ from agents.sentiment_analysis_agent import SentimentAnalysisAgent
1294
+ import json
1295
+
1296
+ # Load config
1297
+ with open('config_files/sentiment_config.json', 'r') as f:
1298
+ config = json.load(f)
1299
+ with open('config_files/sentiment_analysis_config.json', 'r') as f:
1300
+ categories = json.load(f)
1301
+
1302
+ # Initialize agent
1303
+ agent = SentimentAnalysisAgent(
1304
+ config["agents"]["sentiment_analysis"],
1305
+ "your-api-key",
1306
+ categories
1307
+ )
1308
+
1309
+ # Test cases
1310
+ test_cases = [
1311
+ {"comment_text": "This is amazing!", "content_description": "Guitar tutorial"},
1312
+ {"comment_text": "😊😊😊", "content_description": "Piano cover"},
1313
+ {"comment_text": "What scale is this?", "content_description": "Blues solo"},
1314
+ ]
1315
+
1316
+ for test in test_cases:
1317
+ result = agent.process(test)
1318
+ print(f"Input: {test['comment_text']}")
1319
+ print(f"Result: {result}")
1320
+ print("---")
1321
+ ```
1322
+
1323
+ ### Performance Optimization
1324
+
1325
+ 1. **Batch Processing**: Process comments in batches (handled by workflow)
1326
+ 2. **Parallel Workers**: Use multiprocessing for large batches
1327
+ 3. **Minimize LLM Calls**: Cache results when possible
1328
+ 4. **Optimize Prompts**: Shorter prompts = faster responses
1329
+ 5. **Choose Right Model**: Use gpt-5-nano for simple tasks
1330
+
1331
+ ### Code Organization
1332
+
1333
+ 1. **One Agent Per File**: Don't combine multiple agents
1334
+ 2. **Helper Methods**: Use private methods (\_method\_name) for internal logic
1335
+ 3. **Type Hints**: Always use type hints for parameters and returns
1336
+ 4. **Docstrings**: Document all public methods
1337
+ 5. **Constants**: Define constants at class level
1338
+
1339
+ Example structure:
1340
+
1341
+ ```python
1342
+ class MyAgent(BaseAgent):
1343
+ # Constants
1344
+ DEFAULT_VALUE = "default"
1345
+ MAX_LENGTH = 1000
1346
+
1347
+ def __init__(self, config, api_key):
1348
+ """Initialize agent."""
1349
+ super().__init__("MyAgent", config)
1350
+ # ... initialization
1351
+
1352
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
1353
+ """Validate input data."""
1354
+ # ... validation
1355
+
1356
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
1357
+ """Main processing method."""
1358
+ # ... processing
1359
+
1360
+ def _helper_method(self, data: str) -> str:
1361
+ """Private helper method."""
1362
+ # ... helper logic
1363
+
1364
+ def _parse_llm_json_response(self, response: str) -> Dict[str, Any]:
1365
+ """Parse LLM JSON response."""
1366
+ # ... parsing
1367
+ ```
1368
+
1369
+ ## Troubleshooting
1370
+
1371
+ ### Common Issues
1372
+
1373
+ #### Issue 1: Agent Returns Empty Results
1374
+
1375
+ **Symptoms**: Agent succeeds but returns None or empty strings for key fields
1376
+
1377
+ **Causes**:
1378
+ - LLM not following JSON format
1379
+ - JSON parsing failing silently
1380
+ - Missing fields in LLM response
1381
+
1382
+ **Solutions**:
1383
+ 1. Check logs for JSON parsing warnings
1384
+ 2. Add validation after LLM call:
1385
+ ```python
1386
+ result = self._parse_llm_json_response(response.content)
1387
+
1388
+ # Validate result
1389
+ if not result.get("sentiment_polarity"):
1390
+ return {
1391
+ "success": False,
1392
+ "error": "Missing sentiment_polarity in LLM response"
1393
+ }
1394
+ ```
1395
+ 3. Improve prompt to be more specific about required fields
1396
+ 4. Add examples to prompt showing exact JSON structure
1397
+
1398
+ #### Issue 2: JSON Parsing Errors
1399
+
1400
+ **Symptoms**: `JSON decode error` in logs
1401
+
1402
+ **Causes**:
1403
+ - LLM returns markdown-wrapped JSON
1404
+ - LLM includes explanatory text before/after JSON
1405
+ - Malformed JSON from LLM
1406
+
1407
+ **Solutions**:
1408
+ 1. Use `_parse_llm_json_response()` helper (already handles markdown)
1409
+ 2. Add more explicit prompt:
1410
+ ```python
1411
+ user_prompt = """...
1412
+
1413
+ Return ONLY valid JSON, no explanation or markdown. Just the raw JSON object.
1414
+ """
1415
+ ```
1416
+ 3. Add fallback parsing:
1417
+ ```python
1418
+ try:
1419
+ result = json.loads(content)
1420
+ except json.JSONDecodeError:
1421
+ # Try to extract JSON from text
1422
+ import re
1423
+ json_match = re.search(r'\{.*\}', content, re.DOTALL)
1424
+ if json_match:
1425
+ result = json.loads(json_match.group())
1426
+ else:
1427
+ raise
1428
+ ```
1429
+
1430
+ #### Issue 3: Inconsistent Results
1431
+
1432
+ **Symptoms**: Same comment gets different classifications on reruns
1433
+
1434
+ **Causes**:
1435
+ - Temperature too high
1436
+ - Prompt too vague
1437
+ - Model inconsistency
1438
+
1439
+ **Solutions**:
1440
+ 1. Lower temperature to 0.0 - 0.2 for classification tasks
1441
+ 2. Make prompt more specific and rule-based
1442
+ 3. Add examples to prompt
1443
+ 4. Switch to a model that gives more consistent outputs (e.g., gpt-5-nano instead of gpt-4o)
1444
+
1445
+ #### Issue 4: Agent Too Slow
1446
+
1447
+ **Symptoms**: Processing takes very long
1448
+
1449
+ **Causes**:
1450
+ - Large LLM model
1451
+ - Complex prompts
1452
+ - Sequential processing
1453
+ - API rate limits
1454
+
1455
+ **Solutions**:
1456
+ 1. Use a faster model (gpt-5-nano instead of gpt-4o)
1457
+ 2. Simplify prompt (shorter = faster)
1458
+ 3. Enable parallel processing (already default)
1459
+ 4. Increase batch size (if not hitting rate limits)
1460
+ 5. Consider caching repeated analyses
1461
+
1462
+ #### Issue 5: Agent Failing Validation
1463
+
1464
+ **Symptoms**: `validate_input()` returns False, agent skips processing
1465
+
1466
+ **Causes**:
1467
+ - Missing required fields in input
1468
+ - Empty or None values
1469
+ - Wrong data types
1470
+
1471
+ **Solutions**:
1472
+ 1. Check workflow node - ensure all required fields passed:
1473
+ ```python
1474
+ input_data = {
1475
+ "comment_text": state.get("translated_text", state["comment_text"]),
1476
+ "content_description": state["content_description"],
1477
+ # Add all required fields
1478
+ }
1479
+ ```
1480
+ 2. Add logging to validation:
1481
+ ```python
1482
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
1483
+ for field in required_fields:
1484
+ if field not in input_data:
1485
+ self.log_processing(f"Missing field: {field}", "error")
1486
+ return False
1487
+ return True
1488
+ ```
1489
+
1490
+ #### Issue 6: Workflow Not Running New Agent
1491
+
1492
+ **Symptoms**: New agent not being called, no logs from new agent
1493
+
1494
+ **Causes**:
1495
+ - Forgot to add node to workflow graph
1496
+ - Forgot to initialize agent
1497
+ - Workflow edges not connected
1498
+
1499
+ **Solutions**:
1500
+ 1. Verify agent initialized in `__init__`:
1501
+ ```python
1502
+ self.new_agent = NewAgent(config, api_key)
1503
+ ```
1504
+ 2. Verify node added:
1505
+ ```python
1506
+ workflow.add_node("new_agent", self._new_agent_node)
1507
+ ```
1508
+ 3. Verify edges:
1509
+ ```python
1510
+ workflow.add_edge("previous_agent", "new_agent")
1511
+ workflow.add_edge("new_agent", END)
1512
+ ```
1513
+ 4. Check for exceptions in workflow compilation
1514
+
1515
+ #### Issue 7: Database Insert Fails
1516
+
1517
+ **Symptoms**: Processing succeeds but data not in Snowflake
1518
+
1519
+ **Causes**:
1520
+ - Missing columns in database
1521
+ - Data type mismatch
1522
+ - Field name mismatch
1523
+
1524
+ **Solutions**:
1525
+ 1. Check column exists:
1526
+ ```sql
1527
+ DESC TABLE COMMENT_SENTIMENT_FEATURES;
1528
+ ```
1529
+ 2. Add column if missing:
1530
+ ```sql
1531
+ ALTER TABLE COMMENT_SENTIMENT_FEATURES
1532
+ ADD COLUMN NEW_FIELD VARCHAR(500);
1533
+ ```
1534
+ 3. Check field names match exactly (case-sensitive)
1535
+ 4. Check main.py result_df construction includes new fields
1536
+
1537
+ ### Debugging Tips
1538
+
1539
+ 1. **Enable Debug Logging**: Set log level to DEBUG in main.py
1540
+ 2. **Print State**: Add print statements in workflow nodes to see state
1541
+ 3. **Test Agent Directly**: Test agent outside workflow first
1542
+ 4. **Use Sequential Mode**: `--sequential` flag for clearer debugging
1543
+ 5. **Check API Logs**: Review OpenAI API dashboard for errors
1544
+ 6. **Validate JSON**: Use online JSON validator for config files
1545
+ 7. **Check Git Status**: Ensure all files saved and changes committed
1546
+
1547
+ ### Getting Help
1548
+
1549
+ 1. **Check Logs**: Always check `logs/` directory first
1550
+ 2. **Review This README**: Answers to most questions are here
1551
+ 3. **Test Incrementally**: Isolate the problem to one agent
1552
+ 4. **Use Small Batches**: Test with `--limit 5` for faster iteration
1553
+ 5. **Document Issues**: Keep notes on what you tried
1554
+
1555
+ ## Conclusion
1556
+
1557
+ This agent architecture provides a flexible, maintainable foundation for processing social media comments. Key takeaways:
1558
+
1559
+ - **Base class pattern** ensures consistency
1560
+ - **LangGraph workflow** enables flexible orchestration
1561
+ - **Configuration-driven** design minimizes code changes
1562
+ - **Error resilience** at every level
1563
+ - **Extensible by design** - easy to add new agents
1564
+
1565
+ For questions or issues, refer to the main project README or review the existing agent implementations for patterns and examples.
1566
+
1567
+ ---
1568
+
1569
+ **Last Updated**: 2026-01-15
1570
+ **Version**: 1.0
1571
+ **Maintainer**: Musora Development Team
processing_comments/agents/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agents module for the sentiment analysis workflow.
3
+ Provides modular, extensible agents for various NLP tasks.
4
+ """
5
+
6
+ from agents.base_agent import BaseAgent
7
+ from agents.language_detection_agent import LanguageDetectionAgent
8
+ from agents.translation_agent import TranslationAgent
9
+
10
+ __all__ = [
11
+ "BaseAgent",
12
+ "LanguageDetectionAgent",
13
+ "TranslationAgent"
14
+ ]
processing_comments/agents/base_agent.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Agent class for all agents in the workflow
3
+ This provides a common interface and structure for extensibility
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Any, Optional
8
+ import json
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BaseAgent(ABC):
    """
    Abstract base class for every agent in the agentic workflow.

    Centralizes configuration handling, logging, and error reporting so that
    concrete agents only need to implement ``process`` and ``validate_input``.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        """
        Set up shared agent state from the configuration dictionary.

        Args:
            name: Human-readable agent name (appears in every log line)
            config: Agent configuration; recognized keys are ``model``,
                ``temperature``, and ``max_retries`` — each falls back to a
                default when absent
        """
        self.name = name
        self.config = config
        # Defaults keep agents usable with a partial (or empty) config.
        self.model = config.get("model", "gpt-4o-mini")
        self.temperature = config.get("temperature", 0.7)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run the agent's main task on ``input_data``.

        Every concrete agent must implement this.

        Args:
            input_data: Dictionary containing input data for processing

        Returns:
            Dictionary containing processing results
        """

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Check ``input_data`` before processing.

        Args:
            input_data: Dictionary containing input data

        Returns:
            True if input is valid, False otherwise
        """

    def get_name(self) -> str:
        """Return the agent name."""
        return self.name

    def get_config(self) -> Dict[str, Any]:
        """Return the agent configuration dictionary."""
        return self.config

    def log_processing(self, message: str, level: str = "info"):
        """
        Emit a log line prefixed with the agent name.

        Args:
            message: Log message
            level: Log level name (info, warning, error, debug); an unknown
                name silently falls back to info
        """
        emit = getattr(logger, level, logger.info)
        emit(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Convert an exception into the standard error-result dictionary.

        Args:
            error: The exception that occurred
            context: Additional context about the error (optional)

        Returns:
            Dictionary with success=False plus error, agent, and context
        """
        detail = f" ({context})" if context else ""
        logger.error(f"Error in {self.name}{detail}: {error}")

        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context
        }
processing_comments/agents/language_detection_agent.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Language Detection Agent
3
+ Detects the language of social media comments using lingua library and LLM fallback
4
+ """
5
+
6
+ from typing import Dict, Any
7
+ import json
8
+ from lingua import Language, LanguageDetectorBuilder
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ from agents.base_agent import BaseAgent
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class LanguageDetectionAgent(BaseAgent):
    """
    Agent that detects the language of text comments.

    Strategy: the lingua library handles fast local detection first. English
    (or undetectable/very short) text is accepted immediately; text flagged as
    non-English is re-checked with an LLM, which copes better with short,
    emoji-heavy, or slang-filled social media comments.

    Fixes vs. previous revision:
    - the empty/whitespace-only early return in ``process()`` now preserves
      the caller's input metadata, matching the main detection path
    - ``validate_input`` returns an actual bool as its annotation promises
    """

    # Lingua to ISO 639-1 language code mapping
    LINGUA_TO_ISO = {
        Language.ENGLISH: "en",
        Language.SPANISH: "es",
        Language.FRENCH: "fr",
        Language.GERMAN: "de",
        Language.ITALIAN: "it",
        Language.PORTUGUESE: "pt",
        Language.RUSSIAN: "ru",
        Language.JAPANESE: "ja",
        Language.KOREAN: "ko",
        Language.CHINESE: "zh",
        Language.ARABIC: "ar",
        Language.HINDI: "hi",
        Language.DUTCH: "nl",
        Language.SWEDISH: "sv",
        Language.POLISH: "pl",
        Language.TURKISH: "tr"
    }

    def __init__(self, config: Dict[str, Any], api_key: str):
        """
        Initialize the Language Detection Agent.

        Args:
            config: Configuration dictionary (model, temperature, etc.)
            api_key: OpenAI API key
        """
        super().__init__("LanguageDetectionAgent", config)
        self.api_key = api_key
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Initialize lingua detector with all languages
        self.detector = LanguageDetectorBuilder.from_all_languages().build()

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains a non-empty comment_text field.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        # bool() so the declared return type holds; previously this returned
        # the raw field value (string/None) instead of a boolean.
        return bool(input_data.get("comment_text"))

    def detect_with_lingua(self, text: str) -> tuple[str, str, bool]:
        """
        Detect language using lingua library.

        Args:
            text: Text to analyze

        Returns:
            Tuple of (language_code, language_name, is_english)
        """
        try:
            # Clean text; very short text is unreliable to classify, so
            # default it to English rather than guessing.
            cleaned_text = text.strip()
            if not cleaned_text or len(cleaned_text) < 3:
                return "en", "English", True  # Default for very short text

            # Detect language with lingua
            detected_language = self.detector.detect_language_of(cleaned_text)

            if detected_language is None:
                # If detection fails, default to English
                return "en", "English", True

            # Check if it's English
            if detected_language == Language.ENGLISH:
                return "en", "English", True

            # Map lingua language to ISO code; languages outside the mapping
            # table are reported as "unknown" rather than dropped.
            lang_code = self.LINGUA_TO_ISO.get(detected_language, "unknown")
            lang_name = detected_language.name.capitalize()

            return lang_code, lang_name, False

        except Exception as e:
            logger.warning(f"Lingua detection failed: {str(e)}")
            # If detection fails, default to English
            return "en", "English", True

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse LLM response that may contain JSON wrapped in markdown code blocks.

        Args:
            response_content: Raw response content from LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        content = response_content.strip()

        # Strip a markdown fence if present: ```json ... ``` or ``` ... ```
        if content.startswith("```json"):
            content = content[7:]  # Remove ```json
            if content.endswith("```"):
                content = content[:-3]  # Remove trailing ```
            content = content.strip()
        elif content.startswith("```"):
            content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

        # Parse the cleaned JSON
        return json.loads(content)

    def detect_with_llm(self, text: str) -> Dict[str, Any]:
        """
        Detect language using LLM for more nuanced detection.

        Any failure (bad JSON, API error) degrades to a low-confidence
        English default instead of raising, so the workflow keeps moving.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with language, language_code, confidence, has_text
        """
        system_prompt = """You are a language detection expert. Analyze the given text and detect its language.
        For text with only emojis, special characters, or minimal content, classify as "English". Comment is about a music content, so having links or using musician name is normal and still be english.
        Return your response in JSON format with the following fields:
        - language: The detected language name (e.g., "English", "Spanish", "French")
        - language_code: ISO 639-1 language code (e.g., "en", "es", "fr")
        - confidence: Your confidence level (high, medium, low)
        - has_text: boolean indicating if there is actual textual content (not just emojis/symbols)
        """

        user_prompt = f"""Detect the language of this comment related to a musical content:

        "{text}"

        Return JSON only."""

        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)

            # Parse the response using helper function
            result = self._parse_llm_json_response(response.content)

            # If no text content (emoji/symbol-only), force the English default
            if not result.get("has_text", True):
                result["language"] = "English"
                result["language_code"] = "en"

            return result

        except json.JSONDecodeError as e:
            # response is always bound here: the decode error can only come
            # from parsing after a successful invoke()
            self.log_processing(f"LLM response JSON parsing failed: {str(e)}", "warning")
            self.log_processing(f"Raw response: {response.content[:200]}", "debug")
            return {
                "language": "English",
                "language_code": "en",
                "confidence": "low",
                "has_text": True
            }
        except Exception as e:
            self.log_processing(f"LLM detection failed: {str(e)}", "warning")
            return {
                "language": "English",
                "language_code": "en",
                "confidence": "low",
                "has_text": True
            }

    @staticmethod
    def _merge_metadata(result: Dict[str, Any], input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Copy input fields into result without overwriting computed keys."""
        for key, value in input_data.items():
            if key not in result:
                result[key] = value
        return result

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process comment and detect its language.
        Strategy: Use lingua first. If English, done. If not English, use LLM for better accuracy.

        Args:
            input_data: Dictionary containing comment_text and other metadata

        Returns:
            Dictionary with language detection results; original input
            metadata is preserved on every non-exception path
        """
        try:
            # Validate input — fail safe with English defaults so downstream
            # nodes still have usable language fields
            if not self.validate_input(input_data):
                return self._merge_metadata({
                    "success": False,
                    "error": "Invalid input: missing comment_text",
                    "language": "English",
                    "language_code": "en",
                    "is_english": True
                }, input_data)

            comment_text = input_data["comment_text"]

            # Whitespace-only content: nothing to detect, default to English.
            # Bug fix: this early return previously dropped the input
            # metadata (comment_id etc.) that the main path preserves.
            if not comment_text or not comment_text.strip():
                return self._merge_metadata({
                    "success": True,
                    "comment_text": comment_text,
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_method": "default",
                    "has_text": False
                }, input_data)

            # Step 1: Use lingua for initial detection
            lingua_lang_code, lingua_lang_name, is_english = self.detect_with_lingua(comment_text)

            # Step 2: If English, we're done (lingua is good at detecting English)
            if is_english:
                result = {
                    "success": True,
                    "comment_text": comment_text,
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_method": "lingua",
                    "has_text": True
                }
            else:
                # Step 3: If not English, use LLM for more accurate detection,
                # falling back to lingua's answer for any missing fields
                llm_result = self.detect_with_llm(comment_text)
                language = llm_result.get("language", lingua_lang_name)
                language_code = llm_result.get("language_code", lingua_lang_code)
                if language_code == "en" or language == "English":
                    # LLM overruled lingua (common for slang/short text)
                    is_english = True

                result = {
                    "success": True,
                    "comment_text": comment_text,
                    "language": language,
                    "language_code": language_code,
                    "is_english": is_english,
                    "confidence": llm_result.get("confidence", "medium"),
                    "detection_method": "llm",
                    "has_text": llm_result.get("has_text", True)
                }

            # Preserve original metadata
            self._merge_metadata(result, input_data)

            self.log_processing(
                f"Detected language: {result['language']} ({result['language_code']}) - "
                f"Method: {result['detection_method']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "language detection")
processing_comments/agents/sentiment_analysis_agent.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sentiment Analysis Agent
3
+ Extracts sentiment polarity, intent, and determines if reply is needed
4
+ """
5
+
6
+ from typing import Dict, Any, List, Optional
7
+ import json
8
+ import re
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ from agents.base_agent import BaseAgent
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Reply policy constants — must stay in sync with reply_policy in sentiment_analysis_config.json
# requires_reply is computed deterministically in Python (_compute_requires_reply):
# True when at least one of these intents is present...
_REQUIRES_REPLY_INTENTS = {"question", "request", "subscription"}
# ...and none of these "never reply" intents is present.
_NO_REPLY_INTENTS = {"humor_sarcasm"}

# Compiled regexes for content description parsing (compiled once at module load)
_RE_FOLLOW_SECTION = re.compile(r"^Follow\b", re.IGNORECASE)  # "Follow ..." social block — parsing stops at this line
_RE_ARROW_LINK = re.compile(r"^►")  # promo/link bullet lines — skipped
_RE_URL_ONLY = re.compile(r"^https?://\S+$")  # lines that are nothing but a URL — skipped
_RE_TIMESTAMP = re.compile(r"^\d+:\d+\s*[-–]\s*(.*)")  # "MM:SS - label" chapter marker (hyphen or en dash) — only the label is kept
25
+
26
+
27
+ class SentimentAnalysisAgent(BaseAgent):
28
+ """
29
+ Agent that analyzes comment sentiment, intent, and reply requirements.
30
+
31
+ Design decisions:
32
+ - System prompt is built once at init (static across all calls)
33
+ - requires_reply is computed deterministically in Python, not by the LLM
34
+ - LLM output is validated against config-defined allowed value sets
35
+ - Content descriptions are parsed to strip URLs, timestamps, and social sections
36
+ - Parent comments are passed as read-only context; classification targets the
37
+ TARGET comment only
38
+ """
39
+
40
+ def __init__(self, config: Dict[str, Any], api_key: str, sentiment_categories: Dict[str, Any]):
41
+ """
42
+ Initialize the Sentiment Analysis Agent.
43
+
44
+ Args:
45
+ config: Agent configuration dictionary
46
+ api_key: OpenAI API key
47
+ sentiment_categories: Loaded sentiment_analysis_config.json dict
48
+ """
49
+ super().__init__("SentimentAnalysisAgent", config)
50
+ self.api_key = api_key
51
+ self.sentiment_categories = sentiment_categories
52
+
53
+ # Pre-compute valid value sets from config for O(1) validation
54
+ self._valid_polarities = {
55
+ cat["value"] for cat in sentiment_categories["sentiment_polarity"]["categories"]
56
+ }
57
+ self._valid_intents = {
58
+ cat["value"] for cat in sentiment_categories["intent"]["categories"]
59
+ }
60
+
61
+ self.llm = ChatOpenAI(
62
+ model=self.model,
63
+ temperature=self.temperature,
64
+ api_key=self.api_key,
65
+ model_kwargs={"response_format": {"type": "json_object"}}
66
+ )
67
+
68
+ # Build system prompt once at init — reused for every LLM call
69
+ self._system_prompt = self._build_system_prompt()
70
+
71
+ # ------------------------------------------------------------------
72
+ # Prompt construction
73
+ # ------------------------------------------------------------------
74
+
75
+ def _build_system_prompt(self) -> str:
76
+ """
77
+ Build a compact, static system prompt from the sentiment config.
78
+ Pulls category descriptions directly from config so changes to
79
+ sentiment_analysis_config.json are automatically reflected.
80
+ """
81
+ polarity_lines = "\n".join(
82
+ f"- {cat['value']}: {cat['description']}"
83
+ for cat in self.sentiment_categories["sentiment_polarity"]["categories"]
84
+ )
85
+
86
+ intent_lines = "\n".join(
87
+ f"- {cat['value']}: {cat['description']}"
88
+ for cat in self.sentiment_categories["intent"]["categories"]
89
+ )
90
+
91
+ return (
92
+ "Classify a social media comment about musical content.\n\n"
93
+ "RULE: Analyze ONLY the TARGET comment. "
94
+ "The parent comment is context only — do not extract sentiment or intent from it.\n\n"
95
+ "Return JSON only:\n"
96
+ '{"sentiment_polarity": <value>, "intents": [<values>], '
97
+ '"confidence": "high"|"medium"|"low", "analysis_notes": "<1-2 sentences>"}\n\n'
98
+ f"POLARITY (pick one):\n{polarity_lines}\n\n"
99
+ f"INTENTS (multi-label, pick all that apply):\n{intent_lines}\n\n"
100
+ "Rhetorical/sarcasm rules:\n"
101
+ "- Rhetorical questions → humor_sarcasm or feedback_negative, NOT question\n"
102
+ "- Sarcastic suggestions → feedback_negative, NOT suggestion\n"
103
+ "- Sarcastic requests → feedback_negative, NOT request\n"
104
+ "- Only use question/request/suggestion for GENUINE expressions"
105
+ )
106
+
107
+ def _build_user_prompt(
108
+ self,
109
+ comment_text: str,
110
+ content_description: str,
111
+ parent_comment_text: Optional[str] = None,
112
+ platform: Optional[str] = None,
113
+ content_title: Optional[str] = None,
114
+ ) -> str:
115
+ """
116
+ Build the compact user prompt with parsed, truncated context.
117
+
118
+ YouTube stores the video title separately from the description, so they
119
+ are combined here. Other platforms already embed the title in the
120
+ description, so only the parsed description is used.
121
+ """
122
+ parsed_description = self._parse_content_description(content_description)
123
+
124
+ if platform and platform.lower() == "youtube" and content_title and str(content_title).strip():
125
+ content_context = f"{content_title.strip()} — {parsed_description}"[:500]
126
+ else:
127
+ content_context = parsed_description
128
+
129
+ parts = [f"Content: {content_context}"]
130
+
131
+ if parent_comment_text and str(parent_comment_text).strip():
132
+ parent_snippet = str(parent_comment_text).strip()[:500]
133
+ parts.append(f'Parent (context only): "{parent_snippet}"')
134
+
135
+ parts.append(f'TARGET: "{comment_text}"')
136
+
137
+ return "\n".join(parts)
138
+
139
+ # ------------------------------------------------------------------
140
+ # Content description parsing
141
+ # ------------------------------------------------------------------
142
+
143
+ @staticmethod
144
+ def _parse_content_description(text: str) -> str:
145
+ """
146
+ Extract meaningful narrative text from a raw content description.
147
+
148
+ Strips noise common in YouTube/social descriptions:
149
+ - "Follow [name]:" blocks and everything after them
150
+ - Lines starting with ► (hyperlinks)
151
+ - Lines that are a bare URL
152
+ - Timestamp chapter markers: "01:08 - Active listening" → "Active listening"
153
+
154
+ Returns at most 500 characters of joined clean text.
155
+ """
156
+ if not text or not str(text).strip():
157
+ return ""
158
+
159
+ cleaned = []
160
+ for line in str(text).splitlines():
161
+ stripped = line.strip()
162
+
163
+ # Stop at social-media "Follow" blocks
164
+ if _RE_FOLLOW_SECTION.match(stripped):
165
+ break
166
+
167
+ # Skip ► link lines
168
+ if _RE_ARROW_LINK.match(stripped):
169
+ continue
170
+
171
+ # Skip bare URL lines
172
+ if _RE_URL_ONLY.match(stripped):
173
+ continue
174
+
175
+ # Convert "MM:SS - Chapter label" → keep just the label
176
+ ts_match = _RE_TIMESTAMP.match(stripped)
177
+ if ts_match:
178
+ label = ts_match.group(1).strip()
179
+ if label:
180
+ cleaned.append(label)
181
+ continue
182
+
183
+ if stripped:
184
+ cleaned.append(stripped)
185
+
186
+ return " ".join(cleaned)[:500]
187
+
188
+ # ------------------------------------------------------------------
189
+ # Output validation and reply computation
190
+ # ------------------------------------------------------------------
191
+
192
+ def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
193
+ """
194
+ Validate LLM output against config-defined allowed value sets.
195
+
196
+ - Invalid polarity → fail (comment will not be stored)
197
+ - Invalid intent values → filtered out; if none remain → fail
198
+ - Invalid confidence → silently corrected to "medium"
199
+
200
+ Returns a success dict with cleaned fields, or a failure dict with
201
+ an explanatory error message.
202
+ """
203
+ sentiment_polarity = raw.get("sentiment_polarity")
204
+
205
+ if not sentiment_polarity or sentiment_polarity not in self._valid_polarities:
206
+ return {
207
+ "success": False,
208
+ "error": (
209
+ f"Invalid sentiment_polarity '{sentiment_polarity}'. "
210
+ f"Expected one of: {sorted(self._valid_polarities)}"
211
+ ),
212
+ }
213
+
214
+ # Normalize intents to a list
215
+ intents = raw.get("intents", raw.get("intent", []))
216
+ if isinstance(intents, str):
217
+ intents = [i.strip() for i in intents.split(",")]
218
+ if not isinstance(intents, list):
219
+ intents = []
220
+
221
+ valid_intents = [i for i in intents if i in self._valid_intents]
222
+ if not valid_intents:
223
+ return {
224
+ "success": False,
225
+ "error": (
226
+ f"No valid intents in response: {intents}. "
227
+ f"Expected values from: {sorted(self._valid_intents)}"
228
+ ),
229
+ }
230
+
231
+ confidence = raw.get("confidence", "medium")
232
+ if confidence not in {"high", "medium", "low"}:
233
+ confidence = "medium"
234
+
235
+ return {
236
+ "success": True,
237
+ "sentiment_polarity": sentiment_polarity,
238
+ "intents": valid_intents,
239
+ "confidence": confidence,
240
+ "analysis_notes": str(raw.get("analysis_notes", "")).strip(),
241
+ }
242
+
243
+ @staticmethod
244
+ def _compute_requires_reply(intents: List[str]) -> bool:
245
+ """
246
+ Deterministically decide if the comment requires a reply.
247
+
248
+ True when the comment contains at least one reply-required intent
249
+ (question, request, subscription) AND no no-reply intents (humor_sarcasm).
250
+ This mirrors the reply_policy section of sentiment_analysis_config.json
251
+ without delegating the decision to the LLM.
252
+ """
253
+ intent_set = set(intents)
254
+ return (
255
+ bool(intent_set & _REQUIRES_REPLY_INTENTS)
256
+ and not bool(intent_set & _NO_REPLY_INTENTS)
257
+ )
258
+
259
+ # ------------------------------------------------------------------
260
+ # Core analysis
261
+ # ------------------------------------------------------------------
262
+
263
+ def analyze_sentiment(
264
+ self,
265
+ comment_text: str,
266
+ content_description: str,
267
+ parent_comment_text: Optional[str] = None,
268
+ platform: Optional[str] = None,
269
+ content_title: Optional[str] = None,
270
+ ) -> Dict[str, Any]:
271
+ """
272
+ Call the LLM to classify the TARGET comment's sentiment and intents.
273
+
274
+ Args:
275
+ comment_text: The comment to analyze (translated to English if needed)
276
+ content_description: Raw content description (will be parsed internally)
277
+ parent_comment_text: Optional parent comment — context only, max 500 chars
278
+ platform: Platform name; drives YouTube title-handling logic
279
+ content_title: YouTube video title (YouTube only)
280
+
281
+ Returns:
282
+ Success dict with sentiment_polarity, intent (comma-separated str),
283
+ requires_reply, sentiment_confidence, analysis_notes
284
+ — or a failure dict with an error key.
285
+ """
286
+ user_prompt = self._build_user_prompt(
287
+ comment_text, content_description, parent_comment_text, platform, content_title
288
+ )
289
+
290
+ try:
291
+ messages = [
292
+ SystemMessage(content=self._system_prompt),
293
+ HumanMessage(content=user_prompt),
294
+ ]
295
+
296
+ response = self.llm.invoke(messages)
297
+ raw = json.loads(response.content)
298
+
299
+ validated = self._validate_result(raw)
300
+ if not validated["success"]:
301
+ self.log_processing(f"Validation failed: {validated['error']}", "warning")
302
+ return validated
303
+
304
+ requires_reply = self._compute_requires_reply(validated["intents"])
305
+ intent_str = ", ".join(validated["intents"])
306
+
307
+ return {
308
+ "success": True,
309
+ "sentiment_polarity": validated["sentiment_polarity"],
310
+ "intent": intent_str,
311
+ "requires_reply": requires_reply,
312
+ "sentiment_confidence": validated["confidence"],
313
+ "analysis_notes": validated["analysis_notes"],
314
+ }
315
+
316
+ except json.JSONDecodeError as e:
317
+ self.log_processing(f"JSON decode error: {e}", "warning")
318
+ return {"success": False, "error": f"JSON parse error: {e}"}
319
+
320
+ except Exception as e:
321
+ self.log_processing(f"Sentiment analysis failed: {e}", "error")
322
+ return {"success": False, "error": str(e)}
323
+
324
+ # ------------------------------------------------------------------
325
+ # Agent interface
326
+ # ------------------------------------------------------------------
327
+
328
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
329
+ return all(field in input_data for field in ("comment_text", "content_description"))
330
+
331
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
332
+ """
333
+ Process a comment and return sentiment analysis results merged with
334
+ the original input fields.
335
+
336
+ Args:
337
+ input_data: Must contain comment_text and content_description.
338
+ May contain parent_comment_text, platform, content_title,
339
+ and any additional source fields (permalink_url, etc.)
340
+
341
+ Returns:
342
+ Dict with sentiment fields merged on top of original input_data.
343
+ """
344
+ try:
345
+ if not self.validate_input(input_data):
346
+ return {
347
+ "success": False,
348
+ "error": "Invalid input: missing required fields (comment_text, content_description)",
349
+ }
350
+
351
+ self.log_processing("Analyzing sentiment for comment", "debug")
352
+
353
+ analysis_result = self.analyze_sentiment(
354
+ comment_text=input_data["comment_text"],
355
+ content_description=input_data["content_description"],
356
+ parent_comment_text=input_data.get("parent_comment_text"),
357
+ platform=input_data.get("platform"),
358
+ content_title=input_data.get("content_title"),
359
+ )
360
+
361
+ result = {
362
+ "success": analysis_result.get("success", False),
363
+ "sentiment_polarity": analysis_result.get("sentiment_polarity"),
364
+ "intent": analysis_result.get("intent"),
365
+ "requires_reply": analysis_result.get("requires_reply", False),
366
+ "sentiment_confidence": analysis_result.get("sentiment_confidence"),
367
+ "analysis_notes": analysis_result.get("analysis_notes", ""),
368
+ }
369
+
370
+ if "error" in analysis_result:
371
+ result["sentiment_error"] = analysis_result["error"]
372
+
373
+ # Preserve all original input fields (e.g. permalink_url, thumbnail_url)
374
+ for key, value in input_data.items():
375
+ if key not in result:
376
+ result[key] = value
377
+
378
+ return result
379
+
380
+ except Exception as e:
381
+ return self.handle_error(e, "sentiment_analysis")
processing_comments/agents/translation_agent.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Translation Agent
3
+ Translates non-English comments to English using LLM
4
+ """
5
+
6
+ from typing import Dict, Any
7
+ import json
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain.schema import HumanMessage, SystemMessage
10
+ from agents.base_agent import BaseAgent
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class TranslationAgent(BaseAgent):
17
+ """
18
+ Agent that translates text from source language to English.
19
+ Uses LLM for high-quality, context-aware translation.
20
+ """
21
+
22
+ def __init__(self, config: Dict[str, Any], api_key: str):
23
+ """
24
+ Initialize the Translation Agent.
25
+
26
+ Args:
27
+ config: Configuration dictionary
28
+ api_key: OpenAI API key
29
+ """
30
+ super().__init__("TranslationAgent", config)
31
+ self.api_key = api_key
32
+ self.llm = ChatOpenAI(
33
+ model=self.model,
34
+ temperature=self.temperature,
35
+ api_key=self.api_key
36
+ )
37
+
38
+ def validate_input(self, input_data: Dict[str, Any]) -> bool:
39
+ """
40
+ Validate that input contains required fields.
41
+
42
+ Args:
43
+ input_data: Input dictionary
44
+
45
+ Returns:
46
+ True if valid, False otherwise
47
+ """
48
+ required_fields = ["comment_text", "is_english"]
49
+ return all(field in input_data for field in required_fields)
50
+
51
+ def translate_text(self, text: str, source_language: str) -> Dict[str, Any]:
52
+ """
53
+ Translate text from source language to English using LLM.
54
+
55
+ Args:
56
+ text: Text to translate
57
+ source_language: Source language name
58
+
59
+ Returns:
60
+ Dictionary with translation results
61
+ """
62
+ system_prompt = """You are a professional translator specializing in social media content related to music and education.
63
+ Translate the given text from the source language to English. The text is a comment on a musical content.
64
+ Preserve the tone, intent, and any emojis or special characters.
65
+ For informal social media language, maintain the casual tone in translation.
66
+
67
+ Return your response in JSON format with the following fields:
68
+ - translated_text: The English translation
69
+ - translation_confidence: Your confidence level (high, medium, low)
70
+ - notes: Any important notes about the translation (optional)
71
+ """
72
+
73
+ user_prompt = f"""Translate this {source_language} comment to English:
74
+
75
+ "{text}"
76
+
77
+ Return JSON only."""
78
+
79
+ try:
80
+ messages = [
81
+ SystemMessage(content=system_prompt),
82
+ HumanMessage(content=user_prompt)
83
+ ]
84
+
85
+ response = self.llm.invoke(messages)
86
+ result = self._parse_llm_json_response(response.content)
87
+
88
+ return {
89
+ "success": True,
90
+ "translated_text": result.get("translated_text", text),
91
+ "translation_confidence": result.get("translation_confidence", "medium"),
92
+ "translation_notes": result.get("notes", "")
93
+ }
94
+
95
+ except json.JSONDecodeError as e:
96
+ self.log_processing(f"JSON decode error: {str(e)}", "warning")
97
+ # Try to extract text from response
98
+ return {
99
+ "success": False,
100
+ "translated_text": text,
101
+ "translation_confidence": "low",
102
+ "translation_notes": "JSON parsing failed",
103
+ "error": str(e)
104
+ }
105
+
106
+ except Exception as e:
107
+ self.log_processing(f"Translation failed: {str(e)}", "error")
108
+ return {
109
+ "success": False,
110
+ "translated_text": text,
111
+ "translation_confidence": "low",
112
+ "translation_notes": "Translation error",
113
+ "error": str(e)
114
+ }
115
+
116
+ def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
117
+ """
118
+ Process comment and translate if needed.
119
+
120
+ Args:
121
+ input_data: Dictionary containing comment data with language info
122
+
123
+ Returns:
124
+ Dictionary with translation results
125
+ """
126
+ try:
127
+ # Validate input
128
+ if not self.validate_input(input_data):
129
+ return {
130
+ "success": False,
131
+ "error": "Invalid input: missing required fields",
132
+ "translated_text": input_data.get("comment_text", ""),
133
+ "translation_performed": False
134
+ }
135
+
136
+ comment_text = input_data["comment_text"]
137
+ is_english = input_data["is_english"]
138
+ source_language = input_data.get("language", "Unknown")
139
+
140
+ # If already English, no translation needed
141
+ if is_english:
142
+ result = {
143
+ "success": True,
144
+ "translated_text": comment_text,
145
+ "translation_performed": False,
146
+ "translation_confidence": "N/A",
147
+ "translation_notes": "Original text is English"
148
+ }
149
+ self.log_processing("Text is already English, skipping translation", "debug")
150
+ else:
151
+ # Perform translation
152
+ self.log_processing(
153
+ f"Translating from {source_language} to English",
154
+ "debug"
155
+ )
156
+
157
+ translation_result = self.translate_text(comment_text, source_language)
158
+
159
+ result = {
160
+ "success": translation_result.get("success", True),
161
+ "translated_text": translation_result.get("translated_text", comment_text),
162
+ "translation_performed": True,
163
+ "translation_confidence": translation_result.get("translation_confidence", "medium"),
164
+ "translation_notes": translation_result.get("translation_notes", "")
165
+ }
166
+
167
+ if "error" in translation_result:
168
+ result["translation_error"] = translation_result["error"]
169
+
170
+ # Preserve all original data
171
+ for key, value in input_data.items():
172
+ if key not in result:
173
+ result[key] = value
174
+
175
+ return result
176
+
177
+ except Exception as e:
178
+ return self.handle_error(e, "translation")
179
+
180
+ def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
181
+ """
182
+ Parse LLM response that may contain JSON wrapped in markdown code blocks.
183
+
184
+ Args:
185
+ response_content: Raw response content from LLM
186
+
187
+ Returns:
188
+ Parsed JSON dictionary
189
+
190
+ Raises:
191
+ json.JSONDecodeError: If JSON cannot be parsed
192
+ """
193
+ content = response_content.strip()
194
+
195
+ # Check if response is wrapped in markdown code block
196
+ if content.startswith("```json"):
197
+ # Remove ```json prefix and ``` suffix
198
+ content = content[7:] # Remove ```json
199
+ if content.endswith("```"):
200
+ content = content[:-3] # Remove trailing ```
201
+ content = content.strip()
202
+ elif content.startswith("```"):
203
+ # Remove generic ``` code block
204
+ content = content[3:]
205
+ if content.endswith("```"):
206
+ content = content[:-3]
207
+ content = content.strip()
208
+
209
+ # Parse the cleaned JSON
210
+ return json.loads(content)
processing_comments/config_files/data_sources_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_sources": {
3
+ "social_media": {
4
+ "name": "Social Media Comments",
5
+ "description": "Comments from external social media platforms (Facebook, Instagram, YouTube, etc.)",
6
+ "enabled": true,
7
+ "sql_query_file": "sql/fetch_comments.sql",
8
+ "output_config": {
9
+ "table_name": "COMMENT_SENTIMENT_FEATURES",
10
+ "database": "SOCIAL_MEDIA_DB",
11
+ "schema": "ML_FEATURES"
12
+ },
13
+ "source_columns": {
14
+ "comment_sk": "COMMENT_SK",
15
+ "comment_id": "COMMENT_ID",
16
+ "comment_text": "COMMENT_TEXT",
17
+ "parent_comment_id": "PARENT_COMMENT_ID",
18
+ "parent_comment_text": "PARENT_COMMENT_TEXT",
19
+ "platform": "PLATFORM",
20
+ "content_description": "CONTENT_DESCRIPTION"
21
+ }
22
+ },
23
+ "musora_comments": {
24
+ "name": "Musora Internal Comments",
25
+ "description": "Comments from Musora internal applications",
26
+ "enabled": true,
27
+ "sql_query_file": "sql/fetch_musora_comments.sql",
28
+ "output_config": {
29
+ "table_name": "MUSORA_COMMENT_SENTIMENT_FEATURES",
30
+ "database": "SOCIAL_MEDIA_DB",
31
+ "schema": "ML_FEATURES"
32
+ },
33
+ "source_columns": {
34
+ "comment_sk": "COMMENT_SK (generated via HASH)",
35
+ "comment_id": "COMMENT_ID",
36
+ "comment_text": "COMMENT_TEXT (aliased from MESSAGE)",
37
+ "parent_comment_id": "PARENT_COMMENT_ID",
38
+ "parent_comment_text": "PARENT_COMMENT_TEXT",
39
+ "platform": "PLATFORM",
40
+ "content_description": "CONTENT_DESCRIPTION (aliased from CONTENT_PROFILE)",
41
+ "author_id": "AUTHOR_ID (aliased from USER_ID)",
42
+ "permalink_url": "PERMALINK_URL (aliased from WEB_URL_PATH)",
43
+ "thumbnail_url": "THUMBNAIL_URL"
44
+ },
45
+ "additional_fields": [
46
+ "PERMALINK_URL",
47
+ "THUMBNAIL_URL"
48
+ ]
49
+ }
50
+ },
51
+ "processing": {
52
+ "default_limit": 10000,
53
+ "enable_parent_context": true,
54
+ "parent_context_description": "When a comment is a reply, include the parent comment text for better sentiment analysis context"
55
+ }
56
+ }
processing_comments/config_files/sentiment_analysis_config.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sentiment_polarity": {
3
+ "categories": [
4
+ {
5
+ "value": "very_positive",
6
+ "label": "Very Positive",
7
+ "description": "Extremely enthusiastic, excited, deeply grateful, or highly satisfied"
8
+ },
9
+ {
10
+ "value": "positive",
11
+ "label": "Positive",
12
+ "description": "Generally positive, appreciative, supportive, or encouraging"
13
+ },
14
+ {
15
+ "value": "neutral",
16
+ "label": "Neutral",
17
+ "description": "Factual, informational, balanced, or lacking clear emotional tone"
18
+ },
19
+ {
20
+ "value": "negative",
21
+ "label": "Negative",
22
+ "description": "Disappointed, critical, frustrated, or mildly dissatisfied"
23
+ },
24
+ {
25
+ "value": "very_negative",
26
+ "label": "Very Negative",
27
+ "description": "Highly critical, angry, abusive, or extremely dissatisfied"
28
+ }
29
+ ]
30
+ },
31
+ "intent": {
32
+ "categories": [
33
+ {
34
+ "value": "praise",
35
+ "label": "Praise",
36
+ "description": "Compliments, thanks, admiration, excitement, and similar positive expressions"
37
+ },
38
+ {
39
+ "value": "question",
40
+ "label": "Question",
41
+ "description": "Information seeking (e.g., 'what scale?', 'when's it out?', How to get account?)"
42
+ },
43
+ {
44
+ "value": "request",
45
+ "label": "Request",
46
+ "description": "Asking for something actionable (tutorial, feature, sheet music, etc.)"
47
+ },
48
+ {
49
+ "value": "feedback_negative",
50
+ "label": "Negative Feedback",
51
+ "description": "Critical feedback about the content or issues (mixing, performance, composition) without abuse"
52
+ },
53
+ {
54
+ "value": "suggestion",
55
+ "label": "Suggestion",
56
+ "description": "Constructive ideas/improvements (e.g., 'try slower tempo', 'add captions')"
57
+ },
58
+ {
59
+ "value": "humor_sarcasm",
60
+ "label": "Humor/Sarcasm",
61
+ "description": "Joking, teasing, memes, irony (non-toxic)"
62
+ },
63
+ {
64
+ "value": "off_topic",
65
+ "label": "Off Topic",
66
+ "description": "Unrelated chatter or unclear/no discernible intent"
67
+ },
68
+ {
69
+ "value": "spam_selfpromo",
70
+ "label": "Spam/Self-Promotion",
71
+ "description": "Ads, links, promos, scams"
72
+ },
73
+ {
74
+ "value": "subscription",
75
+ "label": "Subscription",
76
+ "description": "Questions about subscribing (e.g., 'How do I subscribe?', 'What's the cost?') or requests to unsubscribe/cancel (e.g., 'I want to cancel', 'How to unsubscribe?')"
77
+ }
78
+ ]
79
+ },
80
+ "reply_policy": {
81
+ "requires_reply_intents": ["question", "request", "subscription"],
82
+ "not_include": ["humor_sarcasm"],
83
+ "description": "Comments with these intents should be flagged for reply"
84
+ },
85
+ "intent_settings": {
86
+ "multi_label": true,
87
+ "description": "Intent can have multiple labels as a comment can express multiple intents",
88
+ "rhetorical_sarcasm_handling": true,
89
+ "rhetorical_sarcasm_description": "System differentiates between genuine questions/suggestions/requests and rhetorical/sarcastic ones"
90
+ },
91
+ "analysis_notes_policy": {
92
+ "max_length": "1-2 sentences",
93
+ "include_topics": true,
94
+ "description": "Concise notes including key topics/highlights not covered by other categories for future summarization"
95
+ }
96
+ }
processing_comments/config_files/sentiment_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "LLM_models": ["gpt-5-nano", "gpt-4o-mini"],
3
+ "reasoning": ["gpt-5-nano"],
4
+
5
+ "agents": {
6
+ "language_detection": {
7
+ "name": "LanguageDetectionAgent",
8
+ "model": "gpt-5-nano",
9
+ "temperature": 0.0,
10
+ "max_retries": 3,
11
+ "description": "Detects language of comments and identifies non-English content"
12
+ },
13
+ "translation": {
14
+ "name": "TranslationAgent",
15
+ "model": "gpt-5-nano",
16
+ "temperature": 0.3,
17
+ "max_retries": 3,
18
+ "description": "Translates non-English comments to English"
19
+ },
20
+ "sentiment_analysis": {
21
+ "name": "SentimentAnalysisAgent",
22
+ "model": "gpt-5-nano",
23
+ "temperature": 0.0,
24
+ "max_retries": 3,
25
+ "description": "Analyzes sentiment polarity, intent, and determines if reply is needed"
26
+ }
27
+ },
28
+
29
+ "workflow": {
30
+ "description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)",
31
+ "parallel_processing": {
32
+ "enabled": true,
33
+ "worker_calculation": "CPU count - 2, max 5 workers",
34
+ "min_batch_size": 20,
35
+ "max_batch_size": 1000
36
+ }
37
+ },
38
+
39
+ "snowflake": {
40
+ "output_table": "COMMENT_SENTIMENT_FEATURES",
41
+ "database": "SOCIAL_MEDIA_DB",
42
+ "schema": "ML_FEATURES"
43
+ },
44
+
45
+ "default_language": "English"
46
+ }
47
+
48
+
49
+
processing_comments/main.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main execution script for comment processing workflow.
3
+ Orchestrates data fetching, processing, and storage using agentic workflow.
4
+ Supports parallel processing with multiprocessing for improved performance.
5
+ Supports multiple data sources (social media and Musora internal comments).
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import logging
11
+ import argparse
12
+ from datetime import datetime
13
+ import pandas as pd
14
+ from dotenv import load_dotenv
15
+ from multiprocessing import Pool, cpu_count, Manager
16
+ from functools import partial
17
+ import traceback
18
+ from typing import Dict, Any, List
19
+
20
+ from SnowFlakeConnection import SnowFlakeConn
21
+ from workflow.comment_processor import CommentProcessingWorkflow
22
+
23
# Get the directory where this script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Load environment variables from root directory (parent of processing_comments)
ROOT_DIR = os.path.dirname(SCRIPT_DIR)
load_dotenv(os.path.join(ROOT_DIR, '.env'))

# Configure logging.  Create the log directory up front: logging.FileHandler
# raises FileNotFoundError when 'logs/' does not exist (e.g. fresh checkout),
# which would crash the whole script at import time.
LOG_DIR = os.path.join(SCRIPT_DIR, 'logs')
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # One timestamped log file per run, plus console output.
        logging.FileHandler(os.path.join(LOG_DIR, f'comment_processing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
40
+
41
+
42
def calculate_optimal_batch_size(total_comments: int, num_workers: int, min_batch: int = 20, max_batch: int = 100) -> int:
    """
    Calculate optimal batch size based on total comments and number of workers.

    Args:
        total_comments: Total number of comments to process
        num_workers: Number of parallel workers (values < 1 are treated as 1)
        min_batch: Minimum batch size (default: 20)
        max_batch: Maximum batch size (default: 100)

    Returns:
        Batch size clamped to [min_batch, max_batch], or total_comments
        itself when the workload is no larger than min_batch.
    """
    # Note: the original docstring claimed max_batch defaults to 1000, but the
    # signature default is 100 (and sentiment_config.json says 1000) — the
    # code's 100 is what actually runs; docstring corrected to match.
    if total_comments <= min_batch:
        return total_comments

    # Guard against a zero/negative worker count, which previously raised
    # ZeroDivisionError.
    workers = max(1, num_workers)

    # Spread the work evenly across workers, then clamp to the allowed range.
    batch_size = total_comments // workers
    return max(min_batch, min(max_batch, batch_size))
65
+
66
+
67
def process_batch_worker(batch_data: tuple) -> dict:
    """
    Worker function to process a single batch of comments.
    This function runs in a separate process.

    Each worker opens its own Snowflake connection and workflow instance
    (neither is shared across processes), writes its successful rows to the
    data-source-specific output table, and returns aggregate statistics.
    All exceptions are caught and reported via the returned dict — a worker
    never raises to the pool.

    Args:
        batch_data: Tuple containing (batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config)

    Returns:
        Dictionary with batch statistics and results
    """
    batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config = batch_data

    # Configure logging for this worker
    worker_logger = logging.getLogger(f"Worker-{batch_num}")

    try:
        worker_logger.info(f"Batch {batch_num}: Starting processing of {len(batch_comments)} comments")

        # Initialize Snowflake connection for this worker
        # (connections are not picklable, so each process must open its own)
        snowflake = SnowFlakeConn()

        # Initialize workflow for this worker
        workflow = CommentProcessingWorkflow(config, api_key)

        # Process comments through workflow
        results = workflow.process_batch(batch_comments)

        # Convert to DataFrame
        results_df = pd.DataFrame(results)

        # Filter successful results; failed rows are counted but not stored
        initial_count = len(results_df)
        df_successful = results_df[results_df['success'] == True].copy()
        filtered_count = initial_count - len(df_successful)

        worker_logger.info(f"Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output data with base columns
        # (workflow result key -> Snowflake column name)
        output_columns = {
            'comment_sk': 'COMMENT_SK',
            'comment_id': 'COMMENT_ID',
            'comment_text': 'ORIGINAL_TEXT',
            'platform': 'PLATFORM',
            'comment_timestamp': 'COMMENT_TIMESTAMP',
            'author_name': 'AUTHOR_NAME',
            'author_id': 'AUTHOR_ID',
            'parent_comment_id': 'PARENT_COMMENT_ID',
            'parent_comment_text': 'PARENT_COMMENT_TEXT',
            'content_sk': 'CONTENT_SK',
            'content_id': 'CONTENT_ID',
            'content_description': 'CONTENT_DESCRIPTION',
            'channel_sk': 'CHANNEL_SK',
            'channel_name': 'CHANNEL_NAME',
            'channel_display_name': 'CHANNEL_DISPLAY_NAME',
            'language': 'DETECTED_LANGUAGE',
            'language_code': 'LANGUAGE_CODE',
            'is_english': 'IS_ENGLISH',
            'language_confidence': 'LANGUAGE_CONFIDENCE',
            'detection_method': 'DETECTION_METHOD',
            'has_text': 'HAS_TEXT',
            'translated_text': 'TRANSLATED_TEXT',
            'translation_performed': 'TRANSLATION_PERFORMED',
            'translation_confidence': 'TRANSLATION_CONFIDENCE',
            'translation_notes': 'TRANSLATION_NOTES',
            'sentiment_polarity': 'SENTIMENT_POLARITY',
            'intent': 'INTENT',
            'requires_reply': 'REQUIRES_REPLY',
            'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
            'analysis_notes': 'ANALYSIS_NOTES',
            'success': 'PROCESSING_SUCCESS'
        }

        # Add data source-specific columns if present
        # (e.g. PERMALINK_URL / THUMBNAIL_URL for musora_comments)
        if 'additional_fields' in data_source_config:
            for field in data_source_config['additional_fields']:
                field_lower = field.lower()
                output_columns[field_lower] = field
            worker_logger.debug(f"Batch {batch_num}: Added {len(data_source_config['additional_fields'])} additional fields")

        # Build the output frame column by column; missing source columns
        # become all-NULL so the target schema stays stable.
        output_df = pd.DataFrame()
        for source_col, target_col in output_columns.items():
            if source_col in df_successful.columns:
                output_df[target_col] = df_successful[source_col]
            else:
                output_df[target_col] = None
                # Log missing columns for debugging
                if source_col in ['permalink_url', 'thumbnail_url']:
                    worker_logger.warning(f"Batch {batch_num}: Column '{source_col}' not found in DataFrame. Available columns: {list(df_successful.columns)}")

        # Add processing metadata
        output_df['PROCESSED_AT'] = datetime.now()
        output_df['WORKFLOW_VERSION'] = '1.0'

        # Store results to Snowflake
        if len(output_df) > 0:
            # Use data source-specific output configuration
            table_name = data_source_config['output_config']['table_name']
            database = data_source_config['output_config']['database']
            schema = data_source_config['output_config']['schema']

            # Only the first batch should overwrite if requested; later
            # batches append so parallel workers don't clobber each other.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=table_name,
                dataframe=output_df,
                database=database,
                schema=schema,
                overwrite=overwrite
            )

            worker_logger.info(f"Batch {batch_num}: Stored {len(output_df)} records to Snowflake ({table_name})")
        else:
            worker_logger.warning(f"Batch {batch_num}: No successful records to store")

        # Close Snowflake connection
        snowflake.close_connection()

        # Calculate statistics
        # NOTE(review): if IS_ENGLISH was filled with None above (source
        # column missing), `~` on an object-dtype column raises — confirm
        # upstream always supplies is_english before relying on this stat.
        translations = output_df['TRANSLATION_PERFORMED'].sum() if 'TRANSLATION_PERFORMED' in output_df.columns else 0
        non_english = (~output_df['IS_ENGLISH']).sum() if 'IS_ENGLISH' in output_df.columns else 0
        requires_reply = output_df['REQUIRES_REPLY'].sum() if 'REQUIRES_REPLY' in output_df.columns else 0

        return {
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': filtered_count,
            'translations': int(translations),
            'non_english': int(non_english),
            'requires_reply': int(requires_reply),
            'error': None
        }

    except Exception as e:
        # Report the failure through the stats dict instead of raising,
        # so one bad batch cannot abort the whole pool.
        error_msg = f"Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(batch_comments),
            'total_stored': 0,
            'failed_count': len(batch_comments),
            'translations': 0,
            'non_english': 0,
            'requires_reply': 0,
            'error': error_msg
        }
219
+
220
+
221
+ class CommentProcessor:
222
+ """
223
+ Main processor class that orchestrates the entire workflow.
224
+ Supports multiple data sources (social media and Musora internal comments).
225
+ """
226
+
227
+ def __init__(self, config_path: str = None, data_sources_config_path: str = None):
228
+ """
229
+ Initialize the comment processor.
230
+
231
+ Args:
232
+ config_path: Path to configuration file (default: config_files/sentiment_config.json relative to script)
233
+ data_sources_config_path: Path to data sources config (default: config_files/data_sources_config.json)
234
+ """
235
+ # Set default config path if not provided
236
+ if config_path is None:
237
+ config_path = os.path.join(SCRIPT_DIR, 'config_files', 'sentiment_config.json')
238
+
239
+ if data_sources_config_path is None:
240
+ data_sources_config_path = os.path.join(SCRIPT_DIR, 'config_files', 'data_sources_config.json')
241
+
242
+ # Load configuration
243
+ with open(config_path, 'r') as f:
244
+ self.config = json.load(f)
245
+
246
+ # Load data sources configuration
247
+ with open(data_sources_config_path, 'r') as f:
248
+ self.data_sources_config = json.load(f)
249
+
250
+ # Initialize Snowflake connection
251
+ self.snowflake = SnowFlakeConn()
252
+
253
+ # Get OpenAI API key
254
+ self.api_key = os.getenv("OPENAI_API_KEY")
255
+ if not self.api_key:
256
+ raise ValueError("OPENAI_API_KEY not found in environment variables")
257
+
258
+ # Initialize workflow
259
+ self.workflow = CommentProcessingWorkflow(self.config, self.api_key)
260
+
261
+ logger.info("CommentProcessor initialized successfully")
262
+
263
+ def get_enabled_data_sources(self) -> List[Dict[str, Any]]:
264
+ """
265
+ Get list of enabled data sources from configuration.
266
+
267
+ Returns:
268
+ List of enabled data source configurations
269
+ """
270
+ enabled_sources = []
271
+ for source_key, source_config in self.data_sources_config['data_sources'].items():
272
+ if source_config.get('enabled', True):
273
+ enabled_sources.append({
274
+ 'key': source_key,
275
+ 'config': source_config
276
+ })
277
+ return enabled_sources
278
+
279
def fetch_comments(self, data_source_key: str, limit: int = None) -> pd.DataFrame:
    """
    Fetch comments from Snowflake using the SQL query for a specific data source.

    Args:
        data_source_key: Key identifying the data source (e.g., 'social_media', 'musora_comments')
        limit: Optional limit on number of comments to fetch

    Returns:
        DataFrame containing comment data, with column names lowercased and
        rows whose comment_text is NULL/empty removed
    """
    data_source_config = self.data_sources_config['data_sources'][data_source_key]
    source_name = data_source_config['name']

    logger.info(f"Fetching comments from {source_name}...")

    # Read the SQL query file configured for this data source
    sql_file = data_source_config['sql_query_file']
    sql_path = os.path.join(SCRIPT_DIR, sql_file)
    with open(sql_path, 'r', encoding='utf-8') as f:
        query = f.read()

    # Add limit if specified.
    # BUGFIX: strip trailing whitespace BEFORE removing the trailing semicolon.
    # The previous `query.rstrip(';')` never removed a semicolon followed by a
    # newline (SQL files typically end with ";\n"), so appending the LIMIT
    # clause produced a second statement and a malformed query.
    if limit:
        query = query.strip().rstrip(';') + f"\nLIMIT {limit};"

    # Execute query
    df = self.snowflake.run_read_query(query, f"{source_name} comments")

    logger.info(f"Fetched {len(df)} comments from {source_name}")

    # Normalize column names to lowercase for consistent processing
    df.columns = df.columns.str.lower()

    # Additional validation: filter out any empty comments that might have slipped through
    if 'comment_text' in df.columns:
        initial_count = len(df)
        df = df[df['comment_text'].notna() & (df['comment_text'].str.strip() != '')]
        filtered_count = initial_count - len(df)
        if filtered_count > 0:
            logger.info(f"Filtered out {filtered_count} empty comments in post-processing")

    logger.info(f"Final count: {len(df)} non-empty comments")
    return df
323
+
324
def calculate_num_workers(self) -> int:
    """
    Determine how many parallel worker processes to spawn.

    The count is the machine's CPU count minus two, clamped to the
    inclusive range [1, 5].

    Returns:
        Number of workers (at least 1, at most 5)
    """
    total_cpus = cpu_count()
    workers = min(5, total_cpus - 2)
    if workers < 1:
        workers = 1
    logger.info(f"Using {workers} parallel workers (CPU count: {total_cpus})")
    return workers
336
+
337
def process_comments_parallel(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
    """
    Run the agentic workflow over all comments using a multiprocessing pool.

    Args:
        df: DataFrame containing raw comment data
        data_source_config: Configuration for the data source being processed
        overwrite: Whether to overwrite existing Snowflake table

    Returns:
        Dictionary with statistics aggregated across all batches
    """
    records = df.to_dict('records')
    total = len(records)

    logger.info(f"Processing {total} comments using parallel processing...")

    # Worker-pool sizing and batch sizing
    workers = self.calculate_num_workers()
    chunk = calculate_optimal_batch_size(total, workers)
    logger.info(f"Batch size: {chunk} (min: 20, max: 100)")

    # Package each batch with everything a worker process needs
    batches = [
        ((start // chunk) + 1, records[start:start + chunk], self.config,
         self.api_key, overwrite, data_source_config)
        for start in range(0, total, chunk)
    ]
    logger.info(f"Split into {len(batches)} batches")

    # Fan the batches out across the pool
    with Pool(processes=workers) as pool:
        results = pool.map(process_batch_worker, batches)

    # Roll per-batch counters up into totals
    totals = {
        field: sum(r[field] for r in results)
        for field in ('total_processed', 'total_stored', 'failed_count',
                      'translations', 'non_english', 'requires_reply')
    }

    # Report any batches whose worker flagged failure
    failed = [r for r in results if not r['success']]
    if failed:
        logger.error(f"{len(failed)} batch(es) failed:")
        for fb in failed:
            logger.error(f" Batch {fb['batch_num']}: {fb['error']}")

    totals['failed_batches'] = len(failed)
    return totals
400
+
401
def process_comments_sequential(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
    """
    Run the agentic workflow over all comments in one batch, in-process (debug mode).

    Args:
        df: DataFrame containing raw comment data
        data_source_config: Configuration for the data source being processed
        overwrite: Whether to overwrite existing Snowflake table

    Returns:
        Dictionary with aggregated statistics
    """
    logger.info(f"Processing {len(df)} comments using sequential processing (debug mode)...")

    # Use the exact same worker function as the parallel path, but as a
    # single batch executed in the current process.
    outcome = process_batch_worker(
        (1, df.to_dict('records'), self.config, self.api_key, overwrite, data_source_config)
    )

    stats = {
        key: outcome[key]
        for key in ('total_processed', 'total_stored', 'failed_count',
                    'translations', 'non_english', 'requires_reply')
    }
    stats['failed_batches'] = 0 if outcome['success'] else 1
    return stats
431
+
432
def run(self, limit: int = None, overwrite: bool = False, sequential: bool = False, data_source_filter: str = None):
    """
    Run the complete processing pipeline for all enabled data sources.

    Fetches comments per source, processes them (parallel or sequential),
    logs a per-source summary, and always closes the Snowflake connection.

    Args:
        limit: Optional limit on number of comments to process per data source
        overwrite: Whether to overwrite existing Snowflake table
        sequential: If True, use sequential processing instead of parallel (for debugging)
        data_source_filter: Optional filter to process only a specific data source

    Raises:
        Exception: re-raises any error from fetching/processing after logging it.
    """
    try:
        logger.info("=" * 80)
        logger.info("Starting Comment Processing Workflow")
        if sequential:
            logger.info("Mode: SEQUENTIAL (Debug Mode)")
        else:
            logger.info("Mode: PARALLEL")
        logger.info("=" * 80)

        # Get enabled data sources
        enabled_sources = self.get_enabled_data_sources()

        if data_source_filter:
            enabled_sources = [s for s in enabled_sources if s['key'] == data_source_filter]
            if not enabled_sources:
                logger.error(f"Data source '{data_source_filter}' not found or not enabled")
                return

        logger.info(f"Processing {len(enabled_sources)} data source(s)")

        # Process each data source independently
        for source_info in enabled_sources:
            source_key = source_info['key']
            source_config = source_info['config']
            source_name = source_config['name']

            logger.info("=" * 80)
            logger.info(f"Processing Data Source: {source_name}")
            logger.info("=" * 80)

            # Step 1: Fetch comments
            df_comments = self.fetch_comments(data_source_key=source_key, limit=limit)

            if df_comments.empty:
                logger.warning(f"No comments to process from {source_name}")
                continue

            # Step 2: Process comments through workflow (parallel or sequential)
            start_time = datetime.now()

            if sequential:
                stats = self.process_comments_sequential(df_comments, source_config, overwrite=overwrite)
            else:
                stats = self.process_comments_parallel(df_comments, source_config, overwrite=overwrite)

            end_time = datetime.now()
            processing_time = (end_time - start_time).total_seconds()

            # Summary statistics
            logger.info("=" * 80)
            logger.info(f"Processing Summary for {source_name}:")
            logger.info(f" Processing Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f" Output Table: {source_config['output_config']['table_name']}")
            logger.info(f" Total comments processed: {stats['total_processed']}")
            logger.info(f" Successfully stored: {stats['total_stored']}")
            logger.info(f" Failed sentiment analysis (not stored): {stats['failed_count']}")
            if stats.get('failed_batches', 0) > 0:
                logger.info(f" Failed batches: {stats['failed_batches']}")
            logger.info(f" Non-English comments: {stats['non_english']}")
            logger.info(f" Translations performed: {stats['translations']}")
            logger.info(f" Comments requiring reply: {stats['requires_reply']}")
            logger.info(f" Processing time: {processing_time:.2f} seconds")
            # BUGFIX: guard the average against ZeroDivisionError when comments
            # were fetched but none were processed (total_processed == 0).
            if stats['total_processed'] > 0:
                logger.info(f" Average time per comment: {processing_time / stats['total_processed']:.2f} seconds")
            logger.info("=" * 80)

    except Exception as e:
        logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
        raise

    finally:
        # Close main Snowflake connection (workers have their own connections)
        self.snowflake.close_connection()
        logger.info("Snowflake connection closed")
515
+
516
+
517
def main():
    """
    Main entry point: parse CLI arguments, prepare the logs directory, and
    run the comment processor over the selected data source(s).
    """
    parser = argparse.ArgumentParser(
        description="Process comments with language detection, translation, and sentiment analysis from multiple data sources"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=5000,
        # BUGFIX: help text previously claimed "(default: 10000)" while the
        # actual default is 5000; the text now matches the default.
        help='Limit number of comments to process per data source (default: 5000)'
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        default=False,
        help='Overwrite existing Snowflake table (default: False, appends new records)'
    )
    parser.add_argument(
        '--config',
        type=str,
        default=None,
        help='Path to configuration file (default: config_files/sentiment_config.json relative to script)'
    )
    parser.add_argument(
        '--sequential',
        action='store_true',
        default=False,
        help='Use sequential processing instead of parallel (for debugging)'
    )
    parser.add_argument(
        '--data-source',
        type=str,
        default=None,
        help='Process only a specific data source (e.g., social_media, musora_comments). If not specified, all enabled sources are processed.'
    )

    args = parser.parse_args()

    # Create logs directory if it doesn't exist
    logs_dir = os.path.join(SCRIPT_DIR, 'logs')
    os.makedirs(logs_dir, exist_ok=True)

    # Initialize and run processor
    processor = CommentProcessor(config_path=args.config)
    processor.run(
        limit=args.limit,
        overwrite=args.overwrite,
        sequential=args.sequential,
        data_source_filter=args.data_source
    )


if __name__ == "__main__":
    main()
processing_comments/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ snowflake-snowpark-python>=1.0.0
2
+ pandas>=1.3.0
3
+ python-dotenv>=0.19.0
4
+ openai>=1.0.0
5
+ # argparse is part of the Python standard library; the PyPI "argparse" backport is not needed
6
+ langchain>=0.1.0
7
+ langchain-openai>=0.0.5
8
+ langgraph>=0.0.20
9
+ lingua-language-detector>=2.0.0
10
+ pydantic>=2.0.0
processing_comments/sql/create_ml_features_table.sql ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Create table in ML_FEATURES schema to store comment sentiment analysis results
2
+ -- This table stores the output from the language detection, translation, and sentiment analysis workflow
3
+
4
+ USE DATABASE SOCIAL_MEDIA_DB;
5
+ USE SCHEMA ML_FEATURES;
6
+
7
+ CREATE TABLE IF NOT EXISTS COMMENT_SENTIMENT_FEATURES (
8
+ -- Primary identifiers
9
+ COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Surrogate key from FACT_COMMENTS',
10
+ COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Platform comment ID',
11
+ ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text',
12
+ PLATFORM VARCHAR(16777216) COMMENT 'Social platform',
13
+ COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted',
14
+ AUTHOR_NAME VARCHAR(16777216) COMMENT 'Commenter name',
15
+ AUTHOR_ID VARCHAR(16777216) COMMENT 'Platform user ID',
16
+
17
+ -- Parent comment information
18
+ PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply',
19
+ PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context',
20
+
21
+ -- Content references
22
+ CONTENT_SK NUMBER(38,0) COMMENT 'Foreign key to content',
23
+ CONTENT_ID VARCHAR(16777216) COMMENT 'Platform content ID',
24
+ CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content description/message',
25
+
26
+ -- Channel references
27
+ CHANNEL_SK NUMBER(38,0) COMMENT 'Foreign key to channel',
28
+ CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name',
29
+ CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name',
30
+
31
+ -- Language detection features
32
+ DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)',
33
+ LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)',
34
+ IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English',
35
+ LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low',
36
+ DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default',
37
+ HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)',
38
+
39
+ -- Translation features
40
+ TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)',
41
+ TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed',
42
+ TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level',
43
+ TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation',
44
+
45
+ -- Sentiment analysis features
46
+ SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative',
47
+ INTENT VARCHAR(500) COMMENT 'Multi-label intents (comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo',
48
+ REQUIRES_REPLY BOOLEAN COMMENT 'True if comment requires a response (genuine questions/requests only)',
49
+ SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low',
50
+ ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization',
51
+
52
+ -- Processing metadata
53
+ PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully',
54
+ PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing',
55
+ PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed',
56
+ WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow',
57
+
58
+ -- Audit fields
59
+ CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time',
60
+ UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time'
61
+ )
62
+ COMMENT='ML Features table for language detection, translation, and sentiment analysis results from social media comments';
63
+
64
+ -- Create indexes for common queries
65
+ -- Note: Snowflake automatically optimizes queries, but we can define clustering keys
66
+ ALTER TABLE COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME);
67
+
68
+ -- Create view for comments requiring reply
69
+ CREATE OR REPLACE VIEW VW_COMMENTS_REQUIRING_REPLY AS
70
+ SELECT
71
+ COMMENT_SK,
72
+ COMMENT_ID,
73
+ ORIGINAL_TEXT,
74
+ TRANSLATED_TEXT,
75
+ PARENT_COMMENT_ID,
76
+ PARENT_COMMENT_TEXT,
77
+ INTENT,
78
+ SENTIMENT_POLARITY,
79
+ SENTIMENT_CONFIDENCE,
80
+ CHANNEL_NAME,
81
+ AUTHOR_NAME,
82
+ COMMENT_TIMESTAMP,
83
+ PLATFORM,
84
+ CONTENT_DESCRIPTION
85
+ FROM COMMENT_SENTIMENT_FEATURES
86
+ WHERE REQUIRES_REPLY = TRUE
87
+ AND PROCESSING_SUCCESS = TRUE
88
+ ORDER BY COMMENT_TIMESTAMP DESC;
89
+
90
+ -- Create view for sentiment distribution
91
+ CREATE OR REPLACE VIEW VW_SENTIMENT_DISTRIBUTION AS
92
+ SELECT
93
+ CHANNEL_NAME,
94
+ SENTIMENT_POLARITY,
95
+ INTENT,
96
+ COUNT(*) AS COMMENT_COUNT,
97
+ COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED,
98
+ COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS,
99
+ AVG(CASE WHEN SENTIMENT_CONFIDENCE = 'high' THEN 3
100
+ WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2
101
+ WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1
102
+ ELSE 0 END) AS AVG_CONFIDENCE_SCORE,
103
+ MAX(PROCESSED_AT) AS LAST_PROCESSED
104
+ FROM COMMENT_SENTIMENT_FEATURES
105
+ WHERE PROCESSING_SUCCESS = TRUE
106
+ GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT
107
+ ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC;
108
+
109
+ -- Create view for non-English comments
110
+ CREATE OR REPLACE VIEW VW_NON_ENGLISH_COMMENTS AS
111
+ SELECT
112
+ COMMENT_SK,
113
+ COMMENT_ID,
114
+ ORIGINAL_TEXT,
115
+ DETECTED_LANGUAGE,
116
+ LANGUAGE_CODE,
117
+ TRANSLATED_TEXT,
118
+ TRANSLATION_CONFIDENCE,
119
+ SENTIMENT_POLARITY,
120
+ INTENT,
121
+ CHANNEL_NAME,
122
+ COMMENT_TIMESTAMP,
123
+ PLATFORM
124
+ FROM COMMENT_SENTIMENT_FEATURES
125
+ WHERE IS_ENGLISH = FALSE
126
+ AND PROCESSING_SUCCESS = TRUE
127
+ ORDER BY COMMENT_TIMESTAMP DESC;
processing_comments/sql/create_musora_ml_features_table.sql ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Create table in ML_FEATURES schema to store Musora comment sentiment analysis results
2
+ -- This table stores the output from the language detection, translation, and sentiment analysis workflow
3
+ -- Schema matches COMMENT_SENTIMENT_FEATURES with additional Musora-specific fields
4
+
5
+ USE DATABASE SOCIAL_MEDIA_DB;
6
+ USE SCHEMA ML_FEATURES;
7
+
8
+ CREATE TABLE IF NOT EXISTS MUSORA_COMMENT_SENTIMENT_FEATURES (
9
+ -- Primary identifiers
10
+ COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Generated surrogate key (hash of COMMENT_ID and PLATFORM)',
11
+ COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Musora comment ID',
12
+ ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text',
13
+ PLATFORM VARCHAR(16777216) COMMENT 'Musora platform/brand',
14
+ COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted',
15
+ AUTHOR_NAME VARCHAR(16777216) COMMENT 'Commenter name',
16
+ AUTHOR_ID VARCHAR(16777216) COMMENT 'User ID',
17
+
18
+ -- Parent comment information
19
+ PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply',
20
+ PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context',
21
+
22
+ -- Content references
23
+ CONTENT_SK NUMBER(38,0) COMMENT 'Generated surrogate key for content',
24
+ CONTENT_ID VARCHAR(16777216) COMMENT 'Content ID',
25
+ CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content profile/description',
26
+
27
+ -- Channel references
28
+ CHANNEL_SK NUMBER(38,0) COMMENT 'Generated surrogate key for channel',
29
+ CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name',
30
+ CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name',
31
+
32
+ -- Musora-specific fields
33
+ PERMALINK_URL VARCHAR(16777216) COMMENT 'Web URL path of the content',
34
+ THUMBNAIL_URL VARCHAR(16777216) COMMENT 'Thumbnail URL of the content',
35
+
36
+ -- Language detection features
37
+ DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)',
38
+ LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)',
39
+ IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English',
40
+ LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low',
41
+ DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default',
42
+ HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)',
43
+
44
+ -- Translation features
45
+ TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)',
46
+ TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed',
47
+ TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level',
48
+ TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation',
49
+
50
+ -- Sentiment analysis features
51
+ SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative',
52
+ INTENT VARCHAR(500) COMMENT 'Multi-label intents (comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo',
53
+ REQUIRES_REPLY BOOLEAN COMMENT 'True if comment requires a response (genuine questions/requests only)',
54
+ SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low',
55
+ ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization',
56
+
57
+ -- Processing metadata
58
+ PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully',
59
+ PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing',
60
+ PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed',
61
+ WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow',
62
+
63
+ -- Audit fields
64
+ CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time',
65
+ UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time'
66
+ )
67
+ COMMENT='ML Features table for language detection, translation, and sentiment analysis results from Musora internal app comments';
68
+
69
+ -- Create indexes for common queries
70
+ -- Note: Snowflake automatically optimizes queries, but we can define clustering keys
71
+ ALTER TABLE MUSORA_COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME);
72
+
73
+ -- Create view for Musora comments requiring reply
74
+ CREATE OR REPLACE VIEW VW_MUSORA_COMMENTS_REQUIRING_REPLY AS
75
+ SELECT
76
+ COMMENT_SK,
77
+ COMMENT_ID,
78
+ ORIGINAL_TEXT,
79
+ TRANSLATED_TEXT,
80
+ PARENT_COMMENT_ID,
81
+ PARENT_COMMENT_TEXT,
82
+ INTENT,
83
+ SENTIMENT_POLARITY,
84
+ SENTIMENT_CONFIDENCE,
85
+ CHANNEL_NAME,
86
+ AUTHOR_ID,
87
+ COMMENT_TIMESTAMP,
88
+ PLATFORM,
89
+ CONTENT_DESCRIPTION,
90
+ PERMALINK_URL,
91
+ THUMBNAIL_URL
92
+ FROM MUSORA_COMMENT_SENTIMENT_FEATURES
93
+ WHERE REQUIRES_REPLY = TRUE
94
+ AND PROCESSING_SUCCESS = TRUE
95
+ ORDER BY COMMENT_TIMESTAMP DESC;
96
+
97
+ -- Create view for Musora sentiment distribution
98
+ CREATE OR REPLACE VIEW VW_MUSORA_SENTIMENT_DISTRIBUTION AS
99
+ SELECT
100
+ CHANNEL_NAME,
101
+ SENTIMENT_POLARITY,
102
+ INTENT,
103
+ COUNT(*) AS COMMENT_COUNT,
104
+ COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED,
105
+ COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS,
106
+ AVG(CASE WHEN SENTIMENT_CONFIDENCE = 'high' THEN 3
107
+ WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2
108
+ WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1
109
+ ELSE 0 END) AS AVG_CONFIDENCE_SCORE,
110
+ MAX(PROCESSED_AT) AS LAST_PROCESSED
111
+ FROM MUSORA_COMMENT_SENTIMENT_FEATURES
112
+ WHERE PROCESSING_SUCCESS = TRUE
113
+ GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT
114
+ ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC;
115
+
116
+ -- Create view for non-English Musora comments
117
+ CREATE OR REPLACE VIEW VW_MUSORA_NON_ENGLISH_COMMENTS AS
118
+ SELECT
119
+ COMMENT_SK,
120
+ COMMENT_ID,
121
+ ORIGINAL_TEXT,
122
+ DETECTED_LANGUAGE,
123
+ LANGUAGE_CODE,
124
+ TRANSLATED_TEXT,
125
+ TRANSLATION_CONFIDENCE,
126
+ SENTIMENT_POLARITY,
127
+ INTENT,
128
+ CHANNEL_NAME,
129
+ COMMENT_TIMESTAMP,
130
+ PLATFORM,
131
+ PERMALINK_URL
132
+ FROM MUSORA_COMMENT_SENTIMENT_FEATURES
133
+ WHERE IS_ENGLISH = FALSE
134
+ AND PROCESSING_SUCCESS = TRUE
135
+ ORDER BY COMMENT_TIMESTAMP DESC;