diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index a6344aac8c09253b3b630fb776ae94478aa0275b..0000000000000000000000000000000000000000 --- a/.gitattributes +++ /dev/null @@ -1,35 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..d843f340d228380d0809b446083dfc066e5de60a --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of 
file diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 5f51ead59f36f13043e036290df9440e25fe8cbe..0000000000000000000000000000000000000000 --- a/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM python:3.13.5-slim - -WORKDIR /app - -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ - git \ - && rm -rf /var/lib/apt/lists/* - -COPY requirements.txt ./ -COPY src/ ./src/ - -RUN pip3 install -r requirements.txt - -EXPOSE 8501 - -HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health - -ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/README.md b/README.md index b574fef54ea282a7b63c1364a795fb122f75bd84..3e1be22dc2f61f5b03952eb063f1180579bcd575 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,309 @@ +# Musora Sentiment Analysis Dashboard + +A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter) and the **Musora internal app** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora). + +--- + +## Table of Contents + +1. [Project Structure](#project-structure) +2. [How Data Flows](#how-data-flows) +3. [Data Loading Strategy](#data-loading-strategy) +4. [Pages](#pages) +5. [Global Filters & Session State](#global-filters--session-state) +6. [Snowflake Queries](#snowflake-queries) +7. [Adding or Changing Things](#adding-or-changing-things) +8. [Running the App](#running-the-app) +9. [Configuration Reference](#configuration-reference) + +--- + +## Project Structure + +``` +visualization/ +├── app.py # Entry point — routing, sidebar, session state +├── config/ +│ └── viz_config.json # Colors, query strings, dashboard settings +├── data/ +│ └── data_loader.py # All Snowflake queries and caching logic +├── utils/ +│ ├── data_processor.py # Pandas aggregations (intent dist, content summary, etc.) 
+│ └── metrics.py # KPI calculations (sentiment score, urgency, etc.) +├── components/ +│ ├── dashboard.py # Dashboard page renderer +│ ├── sentiment_analysis.py # Sentiment Analysis page renderer +│ └── reply_required.py # Reply Required page renderer +├── visualizations/ +│ ├── sentiment_charts.py # Plotly sentiment chart functions +│ ├── distribution_charts.py # Plotly distribution / heatmap / scatter functions +│ ├── demographic_charts.py # Plotly demographic chart functions +│ └── content_cards.py # Streamlit card components (comment cards, content cards) +├── agents/ +│ └── content_summary_agent.py # AI analysis agent (OpenAI) for comment summarisation +├── img/ +│ └── musora.png # Sidebar logo +└── SnowFlakeConnection.py # Snowflake connection wrapper (Snowpark session) +``` + +--- + +## How Data Flows + +``` +Snowflake + │ + ▼ +data_loader.py ← Three separate loading modes (see below) + │ + ├── load_dashboard_data() ──► st.session_state['dashboard_df'] + │ └─► app.py sidebar (filter options, counts) + │ └─► dashboard.py (all charts) + │ + ├── load_sa_data() ──► st.session_state['sa_contents'] + │ (on-demand, button) st.session_state['sa_comments'] + │ └─► sentiment_analysis.py + │ + └── load_reply_required_data() ► st.session_state['rr_df'] + (on-demand, button) └─► reply_required.py +``` + +**Key principle:** Data is loaded as little as possible, as late as possible. + +- The **Dashboard** uses a lightweight query (no text columns, no content join) cached for 24 hours. +- The **Sentiment Analysis** and **Reply Required** pages never load data automatically — they wait for the user to click **Fetch Data**. +- All data is stored in `st.session_state` so page navigation and widget interactions do not re-trigger Snowflake queries. + +--- + +## Data Loading Strategy + +All loading logic lives in **`data/data_loader.py`** (`SentimentDataLoader` class). + +### `load_dashboard_data()` +- Uses `dashboard_query` from `viz_config.json`. 
+- Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`. +- No text columns, no `DIM_CONTENT` join — significantly faster than the full query. +- Also merges demographics data if `demographics_query` is configured. +- Cached for **24 hours** (`@st.cache_data(ttl=86400)`). +- Called once by `app.py` at startup; result stored in `st.session_state['dashboard_df']`. + +### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, date_range)` +- Runs **two** sequential Snowflake queries: + 1. **Content aggregation** — groups by `content_sk`, counts per sentiment, computes severity score, returns top N. + 2. **Sampled comments** — for the top N `content_sk`s only, fetches up to 50 comments per sentiment group per content (negative, positive, other), using Snowflake `QUALIFY ROW_NUMBER()`. `display_text` is computed in SQL (`CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END`). +- Returns a tuple `(contents_df, comments_df)`. +- Cached for **24 hours**. +- Called only when the user clicks **Fetch Data** on the Sentiment Analysis page. + +### `load_reply_required_data(platforms, brands, date_range)` +- Runs a single query filtering `REQUIRES_REPLY = TRUE`. +- Dynamically includes/excludes the social media table and musora table based on selected platforms. +- `display_text` computed in SQL. +- Cached for **24 hours**. +- Called only when the user clicks **Fetch Data** on the Reply Required page. + +### Important: SQL Column Qualification +Both the social media table (`COMMENT_SENTIMENT_FEATURES`) and the content dimension table (`DIM_CONTENT`) share column names. Any `WHERE` clause inside a query that joins these two tables **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`, `s.CHANNEL_NAME`) to avoid Snowflake `ambiguous column name` errors. 
The musora table (`MUSORA_COMMENT_SENTIMENT_FEATURES`) has no joins so unqualified column names are fine there. + +--- + +## Pages + +### Dashboard (`components/dashboard.py`) + +**Receives:** `filtered_df` — the lightweight dashboard dataframe (after optional global filter applied by `app.py`). + +**Does not need:** text, translations, content URLs. All charts work purely on aggregated columns (sentiment_polarity, brand, platform, intent, requires_reply, comment_timestamp). + +**Key sections:** +- Summary stats + health indicator +- Sentiment distribution (pie + gauge) +- Sentiment by brand and platform (stacked + percentage bar charts) +- Intent analysis +- Brand-Platform heatmap +- Reply requirements + urgency breakdown +- Demographics (age, timezone, experience level) — only rendered if `author_id` is present and demographics were merged + +**To add a new chart:** create the chart function in `visualizations/` and call it from `render_dashboard()`. The function receives `filtered_df`. + +--- + +### Sentiment Analysis (`components/sentiment_analysis.py`) + +**Receives:** `data_loader` instance only (no dataframe). + +**Flow:** +1. Reads `st.session_state['dashboard_df']` for filter option lists (platforms, brands, sentiments, intents). +2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`. +3. Shows filter controls (platform, brand, sentiment, intent, top_n, min_comments, sort_by). +4. On **Fetch Data** click: calls `data_loader.load_sa_data(...)` and stores results in `st.session_state['sa_contents']` and `['sa_comments']`. +5. Renders content cards, per-content sentiment + intent charts, AI analysis buttons, and sampled comment expanders. + +**Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch. + +**Comments:** Sampled (up to 50 negative + 50 positive + 50 neutral per content). These are already in memory after the fetch — no extra query is needed when the user expands a comment section. 
+ +**AI Analysis:** Uses `ContentSummaryAgent` (see `agents/`). Results cached in `st.session_state['content_summaries']`. + --- -title: Sentiment Analysis -emoji: 🚀 -colorFrom: red -colorTo: red -sdk: docker -app_port: 8501 -tags: -- streamlit -pinned: false -short_description: Sentiment Analysis Dashboard for Musora -license: cc-by-4.0 + +### Reply Required (`components/reply_required.py`) + +**Receives:** `data_loader` instance only. + +**Flow:** +1. Reads `st.session_state['dashboard_df']` for filter option lists. +2. Pre-populates platform, brand, and date from `st.session_state['global_filters']`. +3. On **Fetch Data** click: calls `data_loader.load_reply_required_data(...)` and stores result in `st.session_state['rr_df']`. +4. Shows urgency breakdown, in-page view filters (priority, platform, brand, intent — applied in Python, no new query), paginated comment cards, and a "Reply by Content" summary. + +**Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch. + +--- + +## Global Filters & Session State + +Global filters live in the sidebar (`app.py`) and are stored in `st.session_state['global_filters']` as a dict: + +```python +{ + 'platforms': ['facebook', 'instagram'], # list or [] + 'brands': ['drumeo'], + 'sentiments': [], + 'date_range': (date(2025, 1, 1), date(2025, 12, 31)), # or None +} +``` + +- **Dashboard:** `app.py` applies global filters to `dashboard_df` using `data_loader.apply_filters()` and passes the result to `render_dashboard()`. +- **Sentiment Analysis / Reply Required:** global filters are used to pre-populate their own filter widgets. The actual Snowflake query uses those values when the user clicks Fetch. The pages do **not** receive a pre-filtered dataframe. 
+ +### Full session state key reference + +| Key | Set by | Used by | +|-----|--------|---------| +| `dashboard_df` | `app.py` on startup | sidebar (filter options), dashboard, SA + RR (filter option lists) | +| `global_filters` | sidebar "Apply Filters" button | app.py (dashboard filter), SA + RR (pre-populate widgets) | +| `filters_applied` | sidebar buttons | app.py (whether to apply filters) | +| `sa_contents` | SA fetch button | SA page rendering | +| `sa_comments` | SA fetch button | SA page rendering | +| `sa_fetch_key` | SA fetch button | SA page (detect stale data) | +| `rr_df` | RR fetch button | RR page rendering | +| `rr_fetch_key` | RR fetch button | RR page (detect stale data) | +| `sentiment_page` | SA page / fetch | SA pagination | +| `reply_page` | RR page / fetch | RR pagination | +| `content_summaries` | AI analysis buttons | SA AI analysis display | + +--- + +## Snowflake Queries + +All query strings are either stored in `config/viz_config.json` (static queries) or built dynamically in `data/data_loader.py` (page-specific queries). 
+ +### Static queries (in `viz_config.json`) + +| Key | Purpose | +|-----|---------| +| `query` | Full query with all columns (legacy, kept for compatibility) | +| `dashboard_query` | Lightweight query — no text, no DIM_CONTENT join | +| `demographics_query` | Joins `usora_users` with `preprocessed.users` to get age/timezone/experience | + +### Dynamic queries (built in `data_loader.py`) + +| Method | Description | +|--------|-------------| +| `_build_sa_content_query()` | Content aggregation for SA page; filters by platform + brand + date | +| `_build_sa_comments_query()` | Sampled comments for SA page; uses `QUALIFY ROW_NUMBER() <= 50` | +| `_build_rr_query()` | Reply-required comments; filters by platform/brand/date; conditionally includes social media and/or musora table | + +### Data source tables + +| Table | Platform | Notes | +|-------|----------|-------| +| `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT` for `PERMALINK_URL` | +| `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively; platform stored as `'musora'`, mapped to `'musora_app'` in queries | + +--- + +## Adding or Changing Things + +### Add a new chart to the Dashboard +1. Write the chart function in the appropriate `visualizations/` file. +2. Call it from `render_dashboard()` in `components/dashboard.py`, passing `filtered_df`. +3. The chart function receives a lightweight df — it has no text columns but has all the columns listed in `dashboard_query`. + +### Add a new filter to the Dashboard sidebar +1. Add the widget in `app.py` under the "Global Filters" section. +2. Store the selected value in the `global_filters` dict under `st.session_state`. +3. Pass it to `data_loader.apply_filters()`. 
+ +### Change what the Sentiment Analysis page queries +- Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`. +- If you add new columns to the content aggregation result, also update `_process_sa_content_stats()` so they are available in `contents_df`. +- If you add new columns to the comments result, update `_process_sa_comments()`. + +### Change what the Reply Required page queries +- Edit `_build_rr_query()` in `data_loader.py`. +- Remember: all column references inside the social media block (which has a `JOIN`) must be prefixed with `s.` to avoid Snowflake ambiguity errors. + +### Change the cache duration +- `@st.cache_data(ttl=86400)` is set on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, and `load_demographics_data`. +- Change `86400` (seconds) to the desired TTL, or set `ttl=None` for no expiry. +- Users can always force a refresh with the "Reload Data" button in the sidebar (which calls `st.cache_data.clear()` and deletes `st.session_state['dashboard_df']`). + +### Add a new page +1. Create `components/new_page.py` with a `render_new_page(data_loader)` function. +2. Import and add a radio option in `app.py`. +3. If the page needs its own Snowflake data, add a `load_new_page_data()` method to `SentimentDataLoader` following the same pattern as `load_sa_data`. + +### Add a new column to the Dashboard query +- Edit `dashboard_query` in `config/viz_config.json`. +- Both UNION branches must select the same columns in the same order. +- `_process_dashboard_dataframe()` in `data_loader.py` handles basic type casting — add processing there if needed. + +--- + +## Running the App + +```bash +# From the project root +streamlit run visualization/app.py +``` + +**Required environment variables** (in `.env` at project root): + +``` +SNOWFLAKE_USER +SNOWFLAKE_PASSWORD +SNOWFLAKE_ACCOUNT +SNOWFLAKE_ROLE +SNOWFLAKE_DATABASE +SNOWFLAKE_WAREHOUSE +SNOWFLAKE_SCHEMA +``` + --- -# Welcome to Streamlit! 
+## Configuration Reference -Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart: +`config/viz_config.json` controls: -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). +| Section | What it configures | +|---------|-------------------| +| `color_schemes.sentiment_polarity` | Hex colors for each sentiment level | +| `color_schemes.intent` | Hex colors for each intent label | +| `color_schemes.platform` | Hex colors for each platform | +| `color_schemes.brand` | Hex colors for each brand | +| `sentiment_order` | Display order for sentiment categories in charts | +| `intent_order` | Display order for intent categories | +| `negative_sentiments` | Which sentiment values count as "negative" | +| `dashboard.default_date_range_days` | Default date filter window (days) | +| `dashboard.max_comments_display` | Max comments shown per pagination page | +| `dashboard.chart_height` | Default Plotly chart height | +| `dashboard.top_n_contents` | Default top-N for content ranking | +| `snowflake.query` | Full query (legacy, all columns) | +| `snowflake.dashboard_query` | Lightweight dashboard query (no text columns) | +| `snowflake.demographics_query` | Demographics join query | +| `demographics.age_groups` | Age bucket definitions (label → [min, max]) | +| `demographics.experience_groups` | Experience bucket definitions | +| `demographics.top_timezones_count` | How many timezones to show in the geographic chart | \ No newline at end of file diff --git a/processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md b/processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md new file mode 100644 index 0000000000000000000000000000000000000000..8fcab4f8976bfd203b9bb9cf6150dbe1dfc46f97 --- /dev/null +++ b/processing_brand_sentiment/ARCHITECTURE_PROPOSAL.md @@ -0,0 +1,437 @@ +# Brand Sentiment Analysis - Architecture Redesign Proposal + +## Executive Summary + +This document proposes a 
redesigned multi-agent architecture to address accuracy issues identified during manual evaluation. The new design separates **fact extraction** from **analysis**, adds strict validation, and improves content preprocessing. + +--- + +## Current Issues Analysis + +| Issue | Root Cause | Impact | +|-------|------------|--------| +| **B8X/B8 variation** | Word-boundary matching misses aliases | Missing relevant posts | +| **Competitor products attributed to Sabian** | LLM lacks competitor awareness, no strict list enforcement | False positives, wrong product attribution | +| **Short text language detection** | Lingua fails on short brand-heavy text | Skipping valid English posts | +| **False positive relevance** | Single-pass relevance + no verification | Pizza oven marked as Sabian discussion | +| **Long posts with overlapping content** | Poor quote separation, raw thread context | Confusing LLM, extraction from wrong content | + +--- + +## Proposed Architecture + +### Design Principles + +1. **Separation of Concerns**: Fact extraction vs. interpretation/analysis +2. **Strict Validation**: Enforce predefined value lists at every step +3. **Structured Data Flow**: Each agent receives clean, relevant input +4. **Fail-Safe Defaults**: Conservative approach - when uncertain, mark as not relevant + +### New Workflow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 1. 
CONTENT PREPROCESSOR │ +│ (No LLM) │ +│ • Enhanced HTML parsing (better quote separation) │ +│ • Text cleaning and normalization │ +│ • Language detection (skip for short texts < 50 chars) │ +│ • Keyword screening with aliases (B8 → B8X) │ +│ • Extract: cleaned_content, quoted_content, raw_thread_context │ +└─────────────────────────────┬───────────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────┐ + │ Has any Sabian-related │ + │ keywords (primary/contextual)?│ + └───────────────┬───────────────┘ + │ │ + YES NO + │ │ + ▼ ▼ +┌─────────────────────────────────┐ ┌──────────────────┐ +│ 2. RELEVANCE & EXTRACTION │ │ Mark as │ +│ AGENT (LLM #1) │ │ NOT RELEVANT │ +│ │ │ (0 LLM calls) │ +│ INPUT: │ └──────────────────┘ +│ • cleaned_content │ +│ • quoted_content │ +│ • raw_thread_context │ +│ • keywords_found │ +│ │ +│ OUTPUT: │ +│ • IS_RELEVANT: boolean │ +│ • RELEVANCE_CONFIDENCE: h/m/l │ +│ • RELEVANCE_REASON: string │ +│ • PRODUCTS_MENTIONED: [] │ ← STRICT: only from predefined list +│ • SABIAN_MENTION_CONTEXT │ +│ • AUTHOR_ROLE │ +│ • COMPETITORS_MENTIONED: [] │ ← Brand names only, no products +│ • THREAD_CONTEXT_SUMMARY │ ← 1-2 sentence summary +└─────────────────┬───────────────┘ + │ + ▼ + ┌─────────────────┐ + │ IS_RELEVANT? │ + └────────┬────────┘ + │ │ + YES NO + │ │ + ▼ ▼ +┌─────────────────────────────────┐ ┌──────────────────┐ +│ 3. 
SENTIMENT & INTENT │ │ Store with │ +│ ANALYZER (LLM #2) │ │ is_relevant=F │ +│ │ │ (1 LLM call) │ +│ INPUT (structured): │ └──────────────────┘ +│ • cleaned_content │ +│ • PRODUCTS_MENTIONED │ ← Pre-validated list +│ • SABIAN_MENTION_CONTEXT │ +│ • AUTHOR_ROLE │ +│ • COMPETITORS_MENTIONED │ +│ • THREAD_CONTEXT_SUMMARY │ ← Clean, concise context +│ │ +│ OUTPUT: │ +│ • SENTIMENT_LEVEL │ +│ • EMOTION_TYPE │ +│ • SENTIMENT_CONFIDENCE │ +│ • SARCASM_DETECTED │ +│ • PRODUCT_ATTRIBUTES: [] │ +│ • COMPETITOR_PRODUCTS_OWNED: []│ +│ • COMPARISON_TYPE │ +│ • INTENTS: [] │ +│ • PURCHASE_STAGE │ +│ • DECISION_DRIVERS: [] │ +│ • PAIN_POINTS: [] │ +│ • DELIGHT_FACTORS: [] │ +│ • ANALYSIS_NOTES │ +└─────────────────┬───────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ 4. OUTPUT VALIDATOR │ +│ (No LLM - Rule-based) │ +│ │ +│ • Verify all values from lists │ +│ • Check logical consistency │ +│ • Flag anomalies for review │ +│ • Set processing_status │ +└─────────────────────────────────┘ +``` + +--- + +## API Call Summary + +| Scenario | Current Calls | New Calls | Notes | +|----------|--------------|-----------|-------| +| No keywords found | 0 | 0 | Same | +| Primary keywords, relevant | 1 | 2 | +1 for better extraction | +| Primary keywords, not relevant | 1 | 1 | Extraction determines not relevant | +| Ambiguous keywords, relevant | 2 | 2 | Same | +| Ambiguous keywords, not relevant | 2 | 1 | Early exit after extraction | + +**Net Impact**: Slight increase for some cases, but significantly better accuracy. + +--- + +## Agent Specifications + +### Agent 1: Content Preprocessor (No LLM) + +**File**: `workflow/agents/content_preprocessor_agent.py` + +**Improvements over current**: +1. Enhanced HTML parsing with better quote/reply separation +2. Product alias mapping (B8 → B8X, etc.) +3. Skip language detection for texts < 50 characters +4. 
Always process if primary Sabian keywords found (regardless of language detection) + +**Product Aliases** (add to brand_config.json): +```json +"product_aliases": { + "B8": "B8X", + "sbrs": "SBR", + "hand hammered": "HH", + "hand-hammered": "HH" +} +``` + +--- + +### Agent 2: Relevance & Extraction Agent (LLM #1) + +**File**: `workflow/agents/relevance_extraction_agent.py` + +**Purpose**: Determine relevance with HIGH confidence and extract verifiable facts. + +**Key Design Decisions**: + +1. **Strict Product Matching**: + - Provide explicit product list in prompt + - Instruction: "ONLY return products that EXACTLY match items in this list" + - Return empty list if no exact matches (not hallucinated guesses) + +2. **Competitor Awareness**: + - List competitor BRAND names (not products) + - Instruction: "Products like '2002', 'Signature', 'K Custom' belong to competitors, NOT Sabian" + - Prevent cross-brand attribution + +3. **Thread Context Summarization**: + - Summarize in 1-2 sentences maximum + - Focus only on information relevant to understanding the post's context + +4. **Conservative Relevance**: + - When uncertain, mark as NOT relevant + - Require explicit Sabian product/brand mention IN THE POST CONTENT + - Quoted content mentioning Sabian does NOT make post relevant + +**System Prompt Structure**: +``` +You are a brand mention extractor for Sabian cymbals. Your job is to: +1. Determine if the POST CONTENT discusses Sabian products +2. 
Extract ONLY facts, not interpretations + +## CRITICAL RULES + +### Rule 1: Relevance Based on POST CONTENT Only +- The post is relevant ONLY if the POST CONTENT itself mentions Sabian products +- Quoted/parent content mentioning Sabian does NOT make the post relevant +- Generic replies ("Thanks!", "Got it!") are NEVER relevant + +### Rule 2: Strict Product Matching +SABIAN PRODUCTS (use ONLY these exact values): +[HHX, HH, AAX, AA, Artisan, FRX, Omni, Chopper, Stratus, XSR, B8X, SBR] + +- Return ONLY products from this list +- If you see a product not in this list, do NOT include it +- "2002", "Signature", "Sound Edge", "Formula 602" are PAISTE products, NOT Sabian +- "K Custom", "A Custom", "K Zildjian" are ZILDJIAN products, NOT Sabian +- When uncertain, return empty list [] + +### Rule 3: Competitor Brand Awareness +COMPETITOR BRANDS: [Zildjian, Paiste, Meinl, Dream Cymbals, Istanbul Agop, Bosphorus] + +- Only return competitor BRAND names in competitors_mentioned +- Do NOT guess competitor products + +### Rule 4: Thread Context Summary +- Summarize thread context in 1-2 sentences maximum +- Focus on what helps understand the post's topic +- If thread is about pizza ovens, say "Thread discusses pizza ovens and cooking" + +## OUTPUT FORMAT +Return ONLY valid JSON: +{ + "is_relevant": boolean, + "relevance_confidence": "high" | "medium" | "low", + "relevance_reason": "1-2 sentences explaining decision", + "products_mentioned": [], // ONLY from Sabian list above + "sabian_mention_context": "primary_focus" | "significant_mention" | "casual_mention" | "comparison_context" | null, + "author_role": "current_owner" | "past_owner" | "potential_buyer" | "never_owned" | "unknown", + "competitors_mentioned": [], // Brand names only + "thread_context_summary": "1-2 sentence summary" +} +``` + +--- + +### Agent 3: Sentiment & Intent Analyzer (LLM #2) + +**File**: `workflow/agents/sentiment_analyzer_agent.py` + +**Purpose**: Deep analysis on VERIFIED relevant posts with 
STRUCTURED input. + +**Key Design Decisions**: + +1. **Receives Pre-Validated Input**: + - Products already extracted and validated + - Thread context already summarized + - Author role already determined + +2. **Focused Analysis**: + - Sentiment TOWARDS SABIAN ONLY + - Intent classification + - Pain points / Delights (author's own experience only) + - Purchase journey (author's own journey only) + +3. **No Hallucination on Products**: + - Products are GIVEN in input, not re-extracted + - Can only discuss attributes of provided products + +**System Prompt Structure**: +``` +You are a sentiment analyst for Sabian cymbal discussions. + +## INPUT CONTEXT (Pre-validated, trust these values) +- Products mentioned: {products_mentioned} +- Sabian mention context: {sabian_mention_context} +- Author role: {author_role} +- Thread summary: {thread_context_summary} +- Competitors mentioned: {competitors_mentioned} + +## YOUR TASK +Analyze the sentiment, emotions, and intents in this post about Sabian. + +## CRITICAL RULES + +### Rule 1: Sabian-Specific Sentiment +- Sentiment MUST be about Sabian, NOT overall post tone +- Example: "Love my new kit! The SBR cymbals sound terrible." + - Overall: positive | Sabian sentiment: NEGATIVE + +### Rule 2: Author Perspective Only +These fields are ONLY for author's OWN experience: +- purchase_stage, decision_drivers, pain_points, delight_factors +- If author is giving ADVICE to others, these should be null/empty + +### Rule 3: Use Only Valid Values +[List all valid values for each field] + +## OUTPUT FORMAT +{ + "sentiment_level": "...", + "emotion_type": "..." or null, + "sentiment_confidence": "high" | "medium" | "low", + "sarcasm_detected": boolean, + "product_attributes": [], + "competitor_products_owned": [], + "comparison_type": "..." or null, + "intents": [], + "purchase_stage": "..." 
or null, + "decision_drivers": [], + "pain_points": [], + "delight_factors": [], + "analysis_notes": "1-2 sentences" +} +``` + +--- + +### Agent 4: Output Validator (No LLM) + +**File**: `workflow/agents/output_validator_agent.py` + +**Purpose**: Final validation and anomaly detection. + +**Validation Rules**: + +1. **List Validation**: + - All products_mentioned are in Sabian product list + - All competitors_mentioned are in competitor list + - All categorical values are from predefined lists + +2. **Logical Consistency**: + - If is_relevant=True, products_mentioned should not be empty (flag if empty) + - If sabian_mention_context="primary_focus", products_mentioned should have items + - If sentiment_level="very_negative", pain_points should not be empty (warn) + +3. **Anomaly Flagging**: + - Flag for manual review if inconsistencies detected + - Add `validation_flags` field to output + +--- + +## Configuration Changes + +### brand_config.json Updates + +```json +{ + "brand": { + "name": "Sabian", + "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"], + "product_aliases": { + "B8": "B8X", + "sbrs": "SBR", + "hhx's": "HHX", + "aax's": "AAX" + }, + "competitor_products_warning": [ + "2002", "Signature", "Sound Edge", "Formula 602", "Giant Beat", + "K Custom", "A Custom", "K Zildjian", "A Zildjian", "S Family", + "Byzance", "Pure Alloy", "HCS", + "Bliss", "Contact", "Energy" + ], + "competitors": [...] 
+ }, + "preprocessing": { + "min_length_for_language_detection": 50, + "always_process_if_primary_keyword": true + } +} +``` + +--- + +## File Structure + +``` +processing_brand_sentiment/ +├── config_files/ +│ ├── brand_config.json # Updated with aliases, warnings +│ ├── workflow_config.json # Agent configurations +│ └── analysis_categories.json # Category definitions (unchanged) +├── workflow/ +│ ├── orchestrator.py # Updated workflow graph +│ └── agents/ +│ ├── base_agent.py # Base class (unchanged) +│ ├── content_preprocessor_agent.py # Enhanced preprocessing +│ ├── relevance_extraction_agent.py # NEW: Extraction + relevance +│ ├── sentiment_analyzer_agent.py # NEW: Focused analysis +│ └── output_validator_agent.py # NEW: Validation +``` + +--- + +## Migration Path + +### Phase 1: Configuration Updates +1. Update brand_config.json with product aliases +2. Add competitor product warnings +3. Update preprocessing settings + +### Phase 2: New Agents +1. Create relevance_extraction_agent.py +2. Create sentiment_analyzer_agent.py +3. Create output_validator_agent.py +4. Update content_preprocessor_agent.py + +### Phase 3: Orchestrator Update +1. Update workflow graph with new flow +2. Update state definition +3. Add new routing logic + +### Phase 4: Testing & Validation +1. Run on test batch with known issues +2. Compare accuracy metrics +3. Fine-tune prompts based on results + +--- + +## Expected Improvements + +| Issue | Current Behavior | Expected After | +|-------|------------------|----------------| +| B8/B8X | Missed | Caught via alias mapping | +| Paiste products as Sabian | Attributed to Sabian | Correctly identified as competitor | +| Short text language | Marked as Latin | Processed as English | +| False positive (pizza) | Marked relevant | Marked not relevant | +| Long confusing context | Raw text confuses LLM | Summarized 1-2 sentences | + +--- + +## Success Metrics + +1. **Relevance Accuracy**: >99% (currently ~90%) +2. 
**Product Attribution Accuracy**: >99% (currently ~85%) +3. **Sentiment Accuracy**: >95% (current unknown) +4. **False Positive Rate**: <1% +5. **False Negative Rate**: <1% + +--- + +## Questions for Review + +1. Should we add a manual review queue for flagged posts? +2. Should thread_context_summary be stored in output for debugging? +3. Preferred batch size for re-processing existing data? diff --git a/processing_brand_sentiment/README.md b/processing_brand_sentiment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f30f671f4fc3d6019ed0dcf36aeeeef3f8c03310 --- /dev/null +++ b/processing_brand_sentiment/README.md @@ -0,0 +1,402 @@ +# Brand Sentiment Analysis Pipeline + +A modular, scalable system for analyzing forum discussions and social media comments about specific brands using an agentic workflow with LLMs. The initial implementation focuses on **Sabian** (a cymbal manufacturer), but the architecture supports easy addition of new brands through configuration. + +## Overview + +The pipeline fetches data from Snowflake (forum posts and/or social media comments), preprocesses them (parsing HTML for forums or cleaning plain text for comments), detects language, validates brand relevance, performs comprehensive sentiment and intelligence extraction using OpenAI's API, and stores enriched results back to Snowflake. + +## Data Sources + +| Source | Table | Output Table | Description | +|--------|-------|--------------|-------------| +| **Forums** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS` | `SABIAN_BRAND_ANALYSIS` | Forum posts with thread context | +| **Comments** | `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` | `SABIAN_BRAND_ANALYSIS_COMMENTS` | Social media comments with content context | + +## Architecture v4.0 + +The system uses a 4-agent pipeline that separates **fact extraction** from **analysis** for improved accuracy. 
Both data sources share the same extraction, analysis, and validation agents - only the preprocessor differs. + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 1a. CONTENT PREPROCESSOR (Forums) │ +│ (No LLM) │ +│ - HTML parsing with quote/reply separation │ +│ - Product alias mapping (B8 → B8X) │ +│ - Smart language detection │ +│ - Keyword-based relevance screening │ +├─────────────────────────────────────────────────────────────────┤ +│ 1b. COMMENT PREPROCESSOR (Comments) │ +│ (No LLM) │ +│ - Plain text cleaning (no HTML) │ +│ - Product alias mapping (B8 → B8X) │ +│ - Smart language detection │ +│ - Keyword-based relevance screening │ +│ - Context: content title + description + parent comment │ +└─────────────────────────────┬───────────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────┐ + │ Has Sabian-related keywords? │ + └───────────────┬───────────────┘ + │ │ + YES NO + │ │ + ▼ ▼ +┌─────────────────────────────────┐ ┌──────────────────┐ +│ 2. RELEVANCE & EXTRACTION │ │ Mark as │ +│ AGENT (LLM #1) │ │ NOT RELEVANT │ +│ [SHARED] │ │ (0 LLM calls) │ +│ - Validates relevance │ └──────────────────┘ +│ - Extracts products (strict) │ +│ - Identifies author role │ +│ - Summarizes context │ +│ - Detects competitors │ +└─────────────────┬───────────────┘ + │ + ▼ + ┌─────────────────┐ + │ IS_RELEVANT? │ + └────────┬────────┘ + │ │ + YES NO + │ │ + ▼ ▼ +┌─────────────────────────────────┐ ┌──────────────────┐ +│ 3. SENTIMENT & INTENT │ │ Store with │ +│ ANALYZER (LLM #2) │ │ is_relevant=F │ +│ [SHARED] │ │ (1 LLM call) │ +│ - Sabian-specific sentiment │ └──────────────────┘ +│ - Intent classification │ +│ - Pain points / Delights │ +│ - Purchase journey (author) │ +│ - Competitor products owned │ +└─────────────────┬───────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ 4. 
OUTPUT VALIDATOR │ +│ (No LLM - Rule-based) │ +│ [SHARED] │ +│ - Validates all values │ +│ - Checks logical consistency │ +│ - Flags anomalies for review │ +└─────────────────────────────────┘ +``` + +## Features + +- **Multi-Source Support**: Process forums, social media comments, or both +- **4-Agent Pipeline**: Separation of extraction and analysis for improved accuracy +- **Strict Product Matching**: Only returns products from predefined list, preventing hallucination +- **Competitor Awareness**: Knows which products belong to competitors +- **Smart Language Detection**: Skips detection for short texts, always processes if primary keywords found +- **Product Alias Mapping**: Handles variations (B8 → B8X, "hand hammered" → HH) +- **Thread/Comment Context**: LLM summarizes context for clarity +- **Validation & Anomaly Detection**: Rule-based validator catches errors and flags edge cases +- **Author Perspective Tracking**: Distinguishes author's own experience from advice to others +- **Platform Tracking**: Records source platform for each processed item + +## Project Structure + +``` +processing_brand_sentiment/ +├── config_files/ +│ ├── brand_config.json # Brand products, aliases, competitors, keywords, data sources +│ ├── workflow_config.json # LLM settings, batch sizes, output config (forums + comments) +│ └── analysis_categories.json # Sentiment, intent, pain point categories +├── database/ +│ ├── __init__.py +│ ├── snowflake_connection.py # Snowflake connection handler +│ └── sql/ +│ ├── fetch_forum_posts.sql # Query for forum posts with thread context +│ ├── fetch_comments.sql # Query for social media comments with content context +│ ├── create_output_table.sql # Forum output schema with views +│ ├── init_output_table.sql # Forum table initialization +│ ├── create_comments_output_table.sql # Comment output schema with views +│ └── init_comments_output_table.sql # Comment table initialization +├── workflow/ +│ ├── __init__.py +│ ├── orchestrator.py # Forum 
LangGraph workflow coordinator +│ ├── comment_orchestrator.py # Comment LangGraph workflow coordinator +│ └── agents/ +│ ├── __init__.py +│ ├── base_agent.py # Abstract base class +│ ├── content_preprocessor_agent.py # Forum: HTML parsing, alias mapping +│ ├── comment_preprocessor_agent.py # Comments: plain text, comment context +│ ├── sabian_relevance_extraction_agent.py # Shared: relevance + extraction +│ ├── sabian_sentiment_analyzer_agent.py # Shared: sentiment analysis +│ └── output_validator_agent.py # Shared: rule-based validation +├── utils/ +│ ├── __init__.py +│ └── html_parser.py # HTML content extraction (forums only) +├── logs/ # Processing logs (auto-created) +├── main.py # Main execution script (multi-source) +├── .env # Environment variables +└── README.md # This file +``` + +## Setup + +### 1. Install Dependencies + +```bash +pip install langchain-openai langgraph snowflake-snowpark-python python-dotenv pandas beautifulsoup4 lingua-language-detector +``` + +### 2. Configure Environment Variables + +Ensure `.env` file contains: + +```env +# Snowflake +SNOWFLAKE_USER=your_user +SNOWFLAKE_PASSWORD=your_password +SNOWFLAKE_ACCOUNT=your_account +SNOWFLAKE_ROLE=your_role +SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB +SNOWFLAKE_WAREHOUSE=your_warehouse +SNOWFLAKE_SCHEMA=ML_FEATURES + +# OpenAI +OPENAI_API_KEY=your_openai_key +``` + +### 3. 
Initialize Snowflake Tables + +Run the initialization scripts before first processing: + +```sql +-- For forums +database/sql/init_output_table.sql + +-- For social media comments +database/sql/init_comments_output_table.sql +``` + +## Usage + +### Process All Sources (Default) + +```bash +python main.py +``` + +### Process Forums Only + +```bash +python main.py --data-source forums +``` + +### Process Social Media Comments Only + +```bash +python main.py --data-source comments +``` + +### Process Limited Number + +```bash +python main.py --limit 100 +python main.py --data-source comments --limit 50 +``` + +### Sequential Processing (Debug Mode) + +```bash +python main.py --limit 50 --sequential +``` + +### First Run (Overwrite Mode) + +```bash +python main.py --overwrite --limit 100 +``` + +### Command-Line Arguments + +| Argument | Description | Default | +|----------|-------------|---------| +| `--limit N` | Process only N items per source | All unprocessed | +| `--overwrite` | Overwrite existing table | Append mode | +| `--sequential` | Single-threaded processing | Parallel | +| `--config-dir PATH` | Custom config directory | config_files/ | +| `--data-source SOURCE` | Source to process: `forums`, `comments`, `all` | `all` | + +## Configuration + +### brand_config.json + +Key sections: + +```json +{ + "brand": { + "name": "Sabian", + "products": ["HHX", "HH", "AAX", "AA", "Artisan", "FRX", "Omni", "Chopper", "Stratus", "XSR", "B8X", "SBR"], + "product_aliases": { + "b8": "B8X", + "hand hammered": "HH" + }, + "competitor_products_warning": { + "paiste_products": ["2002", "signature", "sound edge", "formula 602"], + "zildjian_products": ["k custom", "a custom", "k zildjian"] + }, + "competitors": [...] 
+ }, + "data_sources": { + "forums": { + "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS", + "platform": "musora_forums" + }, + "comments": { + "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS", + "platform_column": "PLATFORM" + } + } +} +``` + +### analysis_categories.json + +Defines valid values for all categorical fields: + +- `author_role`: current_owner, past_owner, potential_buyer, never_owned, unknown +- `sabian_mention_context`: primary_focus, significant_mention, casual_mention, comparison_context +- `sentiment_level`: very_negative, negative, neutral, positive, very_positive +- `intents`: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion +- `feedback_aspects`: sound_quality, price_value, durability, playability, versatility, customer_service, availability, aesthetics + +## Output Tables + +### Forum Output: `SABIAN_BRAND_ANALYSIS` + +| Category | Key Columns | +|----------|-------------| +| **Identifiers** | POST_ID, THREAD_ID, POST_AUTHOR_ID, PLATFORM | +| **Content** | ORIGINAL_CONTENT, CLEANED_CONTENT, QUOTED_CONTENT, THREAD_CONTEXT_SUMMARY | +| **Thread** | THREAD_TITLE, THREAD_FIRST_POST, POST_CREATED_AT, THREAD_STARTED_AT | +| **Category** | CATEGORY_TITLE, CATEGORY_TOPIC | + +### Comment Output: `SABIAN_BRAND_ANALYSIS_COMMENTS` + +| Category | Key Columns | +|----------|-------------| +| **Identifiers** | COMMENT_SK, COMMENT_ID, PLATFORM, AUTHOR_NAME, AUTHOR_ID | +| **Content** | ORIGINAL_TEXT, COMMENT_TIMESTAMP | +| **Context** | CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT | +| **Channel** | CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME | + +### Shared Analysis Columns (Both Tables) + +| Category | Fields | Notes | +|----------|--------|-------| +| **Language** | DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH | Language detection | +| **Relevance** | IS_RELEVANT, RELEVANCE_CONFIDENCE, RELEVANCE_REASON | 
Brand relevance | +| **Extraction** | PRODUCTS_MENTIONED, AUTHOR_ROLE, SABIAN_MENTION_CONTEXT | From Agent 1 | +| **Sentiment** | SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_CONFIDENCE | Sabian-specific | +| **Intents** | INTENTS (multi-label) | What author is trying to accomplish | +| **Journey** | PURCHASE_STAGE, DECISION_DRIVERS | Author perspective only | +| **Feedback** | PAIN_POINTS, DELIGHT_FACTORS | Author's own experience | +| **Competitive** | COMPETITORS_MENTIONED, COMPETITOR_PRODUCTS_OWNED, COMPARISON_TYPE | Competitive intel | +| **Validation** | VALIDATION_FLAGS, PROCESSING_STATUS | Anomaly detection | + +### Processing Status Values + +| Status | Description | +|--------|-------------| +| `completed` | Successfully processed, no issues | +| `completed_with_flags` | Processed but has anomalies to review | +| `validation_failed` | Validation errors detected | +| `workflow_error` | Unexpected error during processing | + +### Available Views + +#### Forum Views + +| View | Description | +|------|-------------| +| `VW_SABIAN_RELEVANT_ANALYSIS` | Only relevant, successfully processed posts | +| `VW_SABIAN_FLAGGED_POSTS` | Posts with validation flags for review | +| `VW_SABIAN_SENTIMENT_DISTRIBUTION` | Sentiment breakdown statistics | +| `VW_SABIAN_PRODUCT_MENTIONS` | Product mention summary | +| `VW_SABIAN_COMPETITOR_ANALYSIS` | Competitor comparison analysis | +| `VW_SABIAN_PAIN_POINTS` | Pain point frequency analysis | +| `VW_SABIAN_AUTHOR_ROLES` | Author role distribution | +| `VW_SABIAN_COMPETITOR_OWNERSHIP` | Competitor brands owned by authors | +| `VW_SABIAN_VALIDATION_SUMMARY` | Processing status breakdown | + +#### Comment Views + +| View | Description | +|------|-------------| +| `VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS` | Relevant, successful comments | +| `VW_SABIAN_COMMENTS_FLAGGED` | Comments with validation flags | +| `VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION` | Sentiment by platform | +| `VW_SABIAN_COMMENTS_PRODUCT_MENTIONS` | Product mentions by 
platform | +| `VW_SABIAN_COMMENTS_VALIDATION_SUMMARY` | Processing status by platform | + +## API Call Efficiency + +| Scenario | LLM Calls | Notes | +|----------|-----------|-------| +| No keywords found | 0 | Early exit in preprocessor | +| Primary keywords, relevant | 2 | Extraction + Analysis | +| Primary keywords, not relevant | 1 | Only Extraction | +| Non-English content | 0 | Skipped | + +## Key Design Decisions + +### Why Separate Forum and Comment Preprocessors? + +1. **Different input formats**: Forums use HTML (quotes, blockquotes), comments are plain text +2. **Different context sources**: Forums have thread title + first post + category; comments have content title + description + parent comment +3. **Shared analysis**: Both feed into the same extraction and analysis agents + +### Why Separate Output Tables? + +1. **Different identifiers**: Forums use POST_ID/THREAD_ID; comments use COMMENT_SK/COMMENT_ID/PLATFORM +2. **Different metadata**: Forums have thread context; comments have content/channel metadata +3. **Clean separation**: Avoids NULL columns and schema confusion +4. **Shared analysis columns**: All extracted intelligence fields are identical + +### Why Platform Column for Forums? + +The `PLATFORM` column was added to `SABIAN_BRAND_ANALYSIS` (defaulting to `musora_forums`) to enable cross-source analysis and maintain consistency with the comments table which uses the dynamic platform value from the source data. + +## Troubleshooting + +### "Table does not exist" on First Run + +Run the appropriate init SQL in Snowflake first: +- Forums: `database/sql/init_output_table.sql` +- Comments: `database/sql/init_comments_output_table.sql` + +### No Comments Being Processed + +Check that `SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS` table exists and contains data. The query joins with `DIM_CONTENT` and `DIM_CHANNEL` - verify these dimension tables have matching records. 
+ +### Competitor Products Attributed to Sabian + +Check `brand_config.json` for `competitor_products_warning` section. Add any missing competitor products. + +### API Rate Limits + +Use `--sequential` mode or reduce `--limit`: +```bash +python main.py --sequential --limit 50 +``` + +## Schema Version History + +| Version | Changes | +|---------|---------| +| 1.0 | Initial release | +| 2.0 | Added author_role, post_type, sabian_mention_context | +| 3.0 | Removed post_type (merged into intents), unified feedback_aspects | +| 4.0 | 4-agent pipeline, thread_context_summary, validation flags, product aliases | +| 4.0+ | Added social media comments support, PLATFORM column, separate comment output table | + +## License + +Internal use only - Brand sentiment analysis project. diff --git a/processing_brand_sentiment/config_files/analysis_categories.json b/processing_brand_sentiment/config_files/analysis_categories.json new file mode 100644 index 0000000000000000000000000000000000000000..5f7b69b6069e5aabd66bbc27b62dce5ee23e04f5 --- /dev/null +++ b/processing_brand_sentiment/config_files/analysis_categories.json @@ -0,0 +1,123 @@ +{ + "author_role": { + "description": "Author's relationship to Sabian products", + "categories": [ + {"value": "current_owner", "description": "Currently owns/uses Sabian"}, + {"value": "past_owner", "description": "Previously owned, sold/replaced"}, + {"value": "potential_buyer", "description": "Considering purchasing Sabian"}, + {"value": "never_owned", "description": "Explicitly doesn't own Sabian"}, + {"value": "unknown", "description": "Cannot determine from post"} + ] + }, + "sabian_mention_context": { + "description": "How prominently Sabian is discussed", + "categories": [ + {"value": "primary_focus", "description": "Sabian is the main topic"}, + {"value": "significant_mention", "description": "Discussed with detail, not main focus"}, + {"value": "casual_mention", "description": "Brief mention among other topics"}, + {"value": 
"comparison_context", "description": "Mentioned while comparing to competitors"} + ] + }, + "sentiment": { + "brand_specific": true, + "description": "Sentiment TOWARDS SABIAN ONLY (not overall post tone)", + "levels": [ + {"value": "very_negative", "description": "Strong criticism, anger, severe disappointment"}, + {"value": "negative", "description": "Complaints, dissatisfaction, mild criticism"}, + {"value": "neutral", "description": "Factual mention, balanced, no clear sentiment"}, + {"value": "positive", "description": "Satisfaction, appreciation, mild praise"}, + {"value": "very_positive", "description": "Enthusiasm, strong praise, highly recommend"} + ] + }, + "emotions": { + "brand_specific": true, + "description": "Emotion towards SABIAN specifically", + "categories": [ + {"value": "frustration", "description": "Annoyance with product issues"}, + {"value": "disappointment", "description": "Unmet expectations"}, + {"value": "anger", "description": "Strong negative emotion"}, + {"value": "satisfaction", "description": "Expectations met, content"}, + {"value": "excitement", "description": "Eagerness, anticipation"}, + {"value": "curiosity", "description": "Interest, wanting to know more"}, + {"value": "indifference", "description": "No strong feelings"} + ] + }, + "intents": { + "multi_label": true, + "description": "What the author is trying to accomplish (can select multiple)", + "categories": [ + {"value": "seeking_information", "description": "Asking questions, seeking advice/recommendations"}, + {"value": "providing_information", "description": "Answering questions, giving advice, helping others"}, + {"value": "sharing_experience", "description": "Personal experience, review, testimonial, purchase announcement"}, + {"value": "comparing", "description": "Comparing brands/products against each other"}, + {"value": "praising", "description": "Actively endorsing, recommending, advocating for Sabian"}, + {"value": "criticizing", "description": "Actively 
complaining, warning others, reporting issues"}, + {"value": "buying_selling", "description": "Listing gear for sale, looking to buy/trade"}, + {"value": "general_discussion", "description": "General conversation not fitting above"} + ] + }, + "purchase_stage": { + "author_perspective_only": true, + "description": "Author's own purchase journey stage (null if giving advice to others)", + "categories": [ + {"value": "researching", "description": "Gathering info before buying"}, + {"value": "deciding", "description": "Actively comparing, about to decide"}, + {"value": "recently_purchased", "description": "Just bought the product"}, + {"value": "long_term_owner", "description": "Owned for extended period"}, + {"value": "selling_replacing", "description": "Selling or replacing gear"} + ] + }, + "comparison_type": { + "description": "Type of competitive comparison (if comparing)", + "categories": [ + {"value": "direct_comparison", "description": "Side-by-side evaluation"}, + {"value": "preference_statement", "description": "Stating brand preference"}, + {"value": "switching_to_sabian", "description": "Moving or Moved from competitor to Sabian"}, + {"value": "switching_from_sabian", "description": "Moving or Moved from Sabian to competitor"} + ] + }, + "feedback_aspects": { + "description": "Product/brand aspects discussed. 
Used for BOTH pain_points (negative) and delight_factors (positive)", + "categories": [ + {"value": "sound_quality", "description": "Sound, tone, character, audio qualities"}, + {"value": "price_value", "description": "Cost, value for money, deals"}, + {"value": "durability", "description": "Build quality, longevity, cracking/wear"}, + {"value": "playability", "description": "Feel, response, ease of playing"}, + {"value": "versatility", "description": "Range of genres/applications, flexibility"}, + {"value": "customer_service", "description": "Support, warranty, brand interaction"}, + {"value": "availability", "description": "Stock, ease of finding/purchasing"}, + {"value": "aesthetics", "description": "Appearance, finish, visual appeal"} + ] + }, + "decision_drivers": { + "author_perspective_only": true, + "description": "What influenced AUTHOR's own purchase decision (empty if giving advice)", + "categories": [ + {"value": "sound_quality", "description": "Sound characteristics"}, + {"value": "price", "description": "Cost/budget considerations"}, + {"value": "durability", "description": "Build quality, longevity"}, + {"value": "artist_endorsement", "description": "Influenced by endorsed artists"}, + {"value": "peer_recommendation", "description": "Friends/community recommended"}, + {"value": "hands_on_testing", "description": "Tried before buying"}, + {"value": "brand_loyalty", "description": "Previous positive experience"}, + {"value": "versatility", "description": "Multi-genre/application use"}, + {"value": "online_reviews", "description": "Read reviews that influenced"} + ] + }, + "product_attributes": { + "description": "Attributes being discussed about Sabian products", + "categories": [ + {"value": "sound_quality", "description": "Tone, character, audio qualities"}, + {"value": "durability", "description": "Build quality, longevity"}, + {"value": "price", "description": "Cost and value"}, + {"value": "playability", "description": "Feel, response"}, + 
{"value": "aesthetics", "description": "Appearance, finish"}, + {"value": "volume", "description": "Loudness, projection"}, + {"value": "sustain", "description": "How long sound lasts"}, + {"value": "versatility", "description": "Range of applications"} + ] + }, + "analysis_notes_guidelines": { + "description": "Keep to 1-2 sentences. Focus on Sabian-specific insights not captured by other fields." + } +} diff --git a/processing_brand_sentiment/config_files/brand_config.json b/processing_brand_sentiment/config_files/brand_config.json new file mode 100644 index 0000000000000000000000000000000000000000..db4d37670b32c2b49bcf7085c1755ec7d6c4fab4 --- /dev/null +++ b/processing_brand_sentiment/config_files/brand_config.json @@ -0,0 +1,111 @@ +{ + "brand": { + "name": "Sabian", + "description": "Sabian is a Canadian manufacturer of cymbals founded in 1981", + "products": [ + "HHX", + "AAX", + "Artisan", + "FRX", + "Omni", + "Chopper", + "Stratus", + "XSR", + "B8X", + "SBR" + ], + "product_aliases": { + "b8": "B8X", + "sbrs": "SBR", + "hhxs": "HHX", + "aaxs": "AAX", + "hhx's": "HHX", + "aax's": "AAX" + }, + "product_descriptions": { + "HHX": "Hand Hammered Xtreme - Professional series with dark, complex tones", + "AAX": "Bright, cutting cymbals for modern music", + "Artisan": "Premium hand-crafted cymbals with unique character", + "FRX": "Frequency Reduced Xtreme - Lower volume cymbals", + "Omni": "Multi-purpose cymbals for various playing styles", + "Chopper": "Effect cymbals with unique sound", + "Stratus": "Dark, complex sounds for jazz and fusion", + "XSR": "Entry-level professional cymbals", + "B8X": "Bronze entry-level cymbals", + "SBR": "Entry-level brass cymbals" + }, + "competitor_products_warning": { + "description": "Products that belong to competitors - DO NOT attribute to Sabian", + "paiste_products": ["2002", "signature", "sound edge", "formula 602", "giant beat", "pst", "rude", "masters", "traditionals", "twenty", "dark energy"], + "zildjian_products": ["k 
custom", "a custom", "k zildjian", "a zildjian", "s family", "i family", "l80", "kerope", "constantinople", "k sweet"], + "meinl_products": ["byzance", "pure alloy", "hcs", "classics custom", "mb20", "mb10", "soundcaster"], + "dream_products": ["bliss", "contact", "energy", "dark matter", "vintage bliss", "eclipse"], + "istanbul_products": ["agop", "xist", "traditional", "sultan", "mehmet"] + }, + "competitors": [ + { + "name": "Zildjian", + "aliases": ["zildjian", "zil", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"] + }, + { + "name": "Meinl", + "aliases": ["meinl", "byzance", "classics"] + }, + { + "name": "Paiste", + "aliases": ["paiste", "2002", "signature", "formula 602", "sound edge"] + }, + { + "name": "Dream Cymbals", + "aliases": ["dream", "dream cymbals", "bliss"] + }, + { + "name": "Istanbul Agop", + "aliases": ["istanbul", "agop", "istanbul agop", "istanbul mehmet"] + }, + { + "name": "Bosphorus", + "aliases": ["bosphorus"] + } + ] + }, + "relevance_keywords": { + "primary": { + "description": "Keywords that definitively indicate Sabian content", + "keywords": ["sabian", "hhx", "aax", "artisan", "frx", "omni", "chopper", "stratus", "xsr", "b8x", "sbr"] + }, + "contextual": { + "description": "Ambiguous keywords that need context verification", + "keywords": ["b8"] + }, + "cymbal_context": { + "description": "Keywords that provide cymbal-related context for disambiguation", + "keywords": ["cymbal", "cymbals", "crash", "ride", "hi-hat", "hihat", "hi hat", "splash", "china", "bell", "stack", "effects"] + } + }, + "preprocessing": { + "min_length_for_language_detection": 50, + "default_language_for_short_text": "English", + "always_process_if_primary_keyword": true, + "min_content_length": 3 + }, + "filter_conditions": { + "exclude_access_levels": ["team", "house-coach"], + "exclude_post_states": ["deleted", "spam"], + "require_content_length_min": 3 + }, + "data_sources": { + "forums": { + "table": 
"SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS", + "description": "Forum posts mentioning Sabian and their products", + "sql_query_file": "database/sql/fetch_forum_posts.sql", + "platform": "musora_forums" + }, + "comments": { + "table": "SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS", + "description": "Social media comments potentially related to Sabian brand", + "sql_query_file": "database/sql/fetch_comments.sql", + "platform_column": "PLATFORM" + } + } +} diff --git a/processing_brand_sentiment/config_files/workflow_config.json b/processing_brand_sentiment/config_files/workflow_config.json new file mode 100644 index 0000000000000000000000000000000000000000..24bf9901d3f138441335cead6e974b965064bbe8 --- /dev/null +++ b/processing_brand_sentiment/config_files/workflow_config.json @@ -0,0 +1,60 @@ +{ + "llm": { + "default_model": "gpt-5-nano", + "default_temperature": 0.2, + "max_retries": 3, + "timeout": 60 + }, + "agents": { + "preprocessor": { + "name": "PreprocessorAgent", + "description": "Deterministic agent for HTML parsing, text cleaning, language detection", + "model": "gpt-5-nano", + "temperature": 0.0, + "uses_llm": false + }, + "relevance_validator": { + "name": "RelevanceValidatorAgent", + "description": "Lightweight LLM for disambiguation of ambiguous terms (HH, AA)", + "model": "gpt-5-nano", + "temperature": 0.0, + "max_retries": 2 + }, + "brand_analyzer": { + "name": "SabianAnalyzerAgent", + "description": "Comprehensive brand analysis for Sabian products", + "model": "gpt-5-nano", + "temperature": 0.2, + "max_retries": 3 + } + }, + "workflow": { + "parallel_processing": { + "enabled": true, + "worker_calculation": "CPU count - 2, max 5 workers", + "max_workers": 5, + "min_batch_size": 20, + "max_batch_size": 500 + }, + "thread_context": { + "enabled": true, + "include_thread_title": true, + "include_first_post": true + } + }, + "output": { + "table_name": "SABIAN_BRAND_ANALYSIS", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, 
+ "comments_output": { + "table_name": "SABIAN_BRAND_ANALYSIS_COMMENTS", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + "logging": { + "level": "INFO", + "log_directory": "logs", + "log_file_prefix": "brand_sentiment_processing" + } +} \ No newline at end of file diff --git a/processing_brand_sentiment/database/__init__.py b/processing_brand_sentiment/database/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..62548ceeb2222f499bcb28f93041b0b23d18c841 --- /dev/null +++ b/processing_brand_sentiment/database/__init__.py @@ -0,0 +1,8 @@ +""" +Database module for brand sentiment analysis. +Contains Snowflake connection handler and SQL query utilities. +""" + +from .snowflake_connection import SnowFlakeConn + +__all__ = ['SnowFlakeConn'] \ No newline at end of file diff --git a/processing_brand_sentiment/database/snowflake_connection.py b/processing_brand_sentiment/database/snowflake_connection.py new file mode 100644 index 0000000000000000000000000000000000000000..212ffad2c05be1e1873b130b0f9df8943e236d3a --- /dev/null +++ b/processing_brand_sentiment/database/snowflake_connection.py @@ -0,0 +1,240 @@ +""" +Snowflake connection handler for brand sentiment analysis. +Provides methods for reading data, executing queries, and storing results. +""" + +import os +from snowflake.snowpark import Session +from dotenv import load_dotenv +import logging +import pandas as pd +from typing import Optional, List, Any + +logger = logging.getLogger(__name__) + +# Load environment variables +load_dotenv() + + +class SnowFlakeConn: + """ + Handles Snowflake database connections and operations for brand sentiment analysis. + """ + + def __init__(self): + """Initialize Snowflake connection.""" + self.session = self.connect_to_snowflake() + + def connect_to_snowflake(self) -> Session: + """ + Create a connection to Snowflake using environment variables. 
+ + Returns: + Snowflake Session object + """ + conn = dict( + user=self.get_credential("SNOWFLAKE_USER"), + password=self.get_credential("SNOWFLAKE_PASSWORD"), + account=self.get_credential("SNOWFLAKE_ACCOUNT"), + role=self.get_credential("SNOWFLAKE_ROLE"), + database=self.get_credential("SNOWFLAKE_DATABASE"), + warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"), + schema=self.get_credential("SNOWFLAKE_SCHEMA"), + ) + + session = Session.builder.configs(conn).create() + logger.info("Successfully connected to Snowflake") + return session + + def get_credential(self, key: str) -> str: + """ + Get credential from environment variables. + + Args: + key: Environment variable name + + Returns: + Credential value + """ + return os.getenv(key) + + def run_read_query(self, query: str, description: str = "data") -> pd.DataFrame: + """ + Execute a SQL query that fetches data. + + Args: + query: SQL query string + description: Description of what data is being fetched + + Returns: + Pandas DataFrame containing query results + """ + try: + dataframe = self.session.sql(query).to_pandas() + dataframe.columns = dataframe.columns.str.lower() + logger.info(f"Successfully read {len(dataframe)} rows for {description}") + return dataframe + except Exception as e: + logger.error(f"Error reading {description}: {e}") + raise + + def store_df_to_snowflake( + self, + table_name: str, + dataframe: pd.DataFrame, + database: str = "SOCIAL_MEDIA_DB", + schema: str = "ML_FEATURES", + overwrite: bool = False + ) -> None: + """ + Store a DataFrame to Snowflake. 
def load_sql_query(self, sql_file_path: str, limit: Optional[int] = None) -> str:
    """
    Read a SQL query from a file, optionally appending a LIMIT clause.

    Shared by the fetch_* methods so the limit-splicing logic lives in
    exactly one place.

    Args:
        sql_file_path: Path to the SQL query file.
        limit: Optional row limit. Falsy values (None or 0) leave the
            query unchanged, matching the original ``if limit:`` behavior.

    Returns:
        The query text, with ``LIMIT {limit};`` appended when requested.
    """
    with open(sql_file_path, 'r', encoding='utf-8') as f:
        query = f.read()

    if limit:
        # Strip whitespace first, then the semicolon, so the clause also
        # splices cleanly after Windows line endings.
        query = query.strip().rstrip(';') + f"\nLIMIT {limit};"

    return query

def _validate_columns(self, df: pd.DataFrame, required_cols: List[str]) -> None:
    """Log a warning for any expected columns missing from ``df``."""
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        logger.warning(f"Missing expected columns: {missing_cols}")

def execute_sql_file(self, file_path: str) -> Optional[List[Any]]:
    """
    Execute SQL from a file.

    NOTE(review): ``Session.sql`` runs a single statement; a file holding
    multiple ';'-separated statements will fail — confirm callers only
    pass single-statement files.

    Args:
        file_path: Path to SQL file.

    Returns:
        Collected result rows, or None if execution failed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            sql_content = file.read()

        result = self.session.sql(sql_content).collect()
        logger.info(f"Successfully executed SQL from {file_path}")
        return result
    except Exception as e:
        logger.error(f"Error executing SQL file {file_path}: {e}")
        return None

def execute_query(self, query: str, description: str = "query") -> Optional[List[Any]]:
    """
    Execute a SQL query and return results.

    Args:
        query: SQL query string.
        description: Description of the query for logging.

    Returns:
        Collected result rows, or None if execution failed.
    """
    try:
        result = self.session.sql(query).collect()
        logger.info(f"Successfully executed {description}")
        return result
    except Exception as e:
        logger.error(f"Error executing {description}: {e}")
        return None

def fetch_forum_posts_with_context(
    self,
    sql_file_path: str,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """
    Fetch forum posts with thread context from SQL file.

    Args:
        sql_file_path: Path to the SQL query file.
        limit: Optional limit on number of posts to fetch.

    Returns:
        DataFrame containing forum posts with context.

    Raises:
        Exception: Re-raises any error from file I/O or the read query.
    """
    try:
        query = self.load_sql_query(sql_file_path, limit)
        df = self.run_read_query(query, "forum posts with context")

        # Downstream processing needs at least these columns; warn early
        # if the SQL file drifted from the expected schema.
        self._validate_columns(df, ['post_id', 'post_content', 'thread_id'])
        return df
    except Exception as e:
        logger.error(f"Error fetching forum posts: {e}")
        raise

def fetch_comments(
    self,
    sql_file_path: str,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """
    Fetch social media comments with context from SQL file.

    Args:
        sql_file_path: Path to the SQL query file.
        limit: Optional limit on number of comments to fetch.

    Returns:
        DataFrame containing comments with context.

    Raises:
        Exception: Re-raises any error from file I/O or the read query.
    """
    try:
        query = self.load_sql_query(sql_file_path, limit)
        df = self.run_read_query(query, "social media comments with context")

        self._validate_columns(
            df, ['comment_sk', 'comment_id', 'comment_text', 'platform']
        )
        return df
    except Exception as e:
        logger.error(f"Error fetching comments: {e}")
        raise

def close_connection(self) -> None:
    """Close the Snowflake session, logging (not raising) any failure."""
    try:
        self.session.close()
        logger.info("Snowflake connection closed")
    except Exception as e:
        logger.error(f"Error closing connection: {e}")
AUTHOR_ID VARCHAR(16777216), + CONTENT_SK NUMBER(38,0), + CONTENT_ID VARCHAR(16777216), + CONTENT_DESCRIPTION VARCHAR(16777216), + CHANNEL_SK NUMBER(38,0), + CHANNEL_NAME VARCHAR(16777216), + CHANNEL_DISPLAY_NAME VARCHAR(16777216), + PARENT_COMMENT_ID VARCHAR(16777216), + PARENT_COMMENT_TEXT VARCHAR(16777216), + + -- Language detection + DETECTED_LANGUAGE VARCHAR(100), + LANGUAGE_CODE VARCHAR(10), + IS_ENGLISH BOOLEAN, + + -- Relevance assessment + IS_RELEVANT BOOLEAN, + RELEVANCE_CONFIDENCE VARCHAR(20), + RELEVANCE_REASON VARCHAR(500), + + -- Author classification + AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown + SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context + + -- Sentiment analysis + SENTIMENT_LEVEL VARCHAR(20), + EMOTION_TYPE VARCHAR(50), + SENTIMENT_TARGET VARCHAR(50), + SENTIMENT_CONFIDENCE VARCHAR(20), + + -- Product information (stored as JSON arrays) + PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"] + PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + PURCHASE_STAGE VARCHAR(50), + + -- Competitive intelligence + COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"] + COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns + COMPARISON_TYPE VARCHAR(50), + COMPETITIVE_POSITIONING VARCHAR(500), + BRAND_SWITCHING VARCHAR(100), + + -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others) + INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"] + DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"] + PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"] + DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + + -- Analysis notes + ANALYSIS_NOTES VARCHAR(16777216), + SARCASM_DETECTED BOOLEAN, + + -- 
Validation results
    VALIDATION_PASSED BOOLEAN,
    VALIDATION_ERRORS VARCHAR(16777216),    -- JSON array of error messages
    VALIDATION_WARNINGS VARCHAR(16777216),  -- JSON array of warning messages
    VALIDATION_FLAGS VARCHAR(16777216),     -- JSON array of anomaly flags

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50),          -- completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian social media comments. Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.';

-- NOTE(review): Snowflake does not support CREATE INDEX on standard tables
-- (secondary indexes exist only for hybrid tables), and the
-- "CREATE INDEX IF NOT EXISTS" form is not valid Snowflake syntax, so the
-- original index statements would fail at runtime. Search optimization
-- accelerates the same selective point-lookup patterns (COMMENT_SK,
-- PLATFORM, IS_RELEVANT, SENTIMENT_LEVEL, etc.).
-- Requires Enterprise Edition — confirm before relying on it; otherwise
-- consider CLUSTER BY on the dominant filter column.
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS
    ADD SEARCH OPTIMIZATION;

-- Create view for relevant comments only
CREATE OR REPLACE VIEW
SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_RELEVANT_ANALYSIS AS +SELECT * +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE; + +-- Create view for comments needing review (flagged by validator) +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_FLAGGED AS +SELECT + COMMENT_SK, + COMMENT_ID, + PLATFORM, + ORIGINAL_TEXT, + IS_RELEVANT, + RELEVANCE_CONFIDENCE, + RELEVANCE_REASON, + PRODUCTS_MENTIONED, + SABIAN_MENTION_CONTEXT, + SENTIMENT_LEVEL, + VALIDATION_FLAGS, + VALIDATION_WARNINGS, + PROCESSING_STATUS +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS +WHERE PROCESSING_STATUS = 'completed_with_flags' + OR VALIDATION_PASSED = FALSE +ORDER BY PROCESSED_AT DESC; + +-- Create view for sentiment distribution +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_SENTIMENT_DISTRIBUTION AS +SELECT + PLATFORM, + SENTIMENT_LEVEL, + EMOTION_TYPE, + SENTIMENT_TARGET, + COUNT(*) AS COMMENT_COUNT, + COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE +GROUP BY PLATFORM, SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET +ORDER BY COMMENT_COUNT DESC; + +-- Create view for product mentions summary +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_PRODUCT_MENTIONS AS +SELECT + PLATFORM, + TRIM(product.VALUE::STRING) AS PRODUCT, + SENTIMENT_LEVEL, + COUNT(*) AS MENTION_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS, + LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE + AND PRODUCTS_MENTIONED IS NOT NULL 
+GROUP BY PLATFORM, TRIM(product.VALUE::STRING), SENTIMENT_LEVEL +ORDER BY MENTION_COUNT DESC; + +-- Create view for validation summary +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMMENTS_VALIDATION_SUMMARY AS +SELECT + PLATFORM, + PROCESSING_STATUS, + VALIDATION_PASSED, + COUNT(*) AS COMMENT_COUNT, + COUNT(CASE WHEN IS_RELEVANT = TRUE THEN 1 END) AS RELEVANT_COUNT, + COUNT(CASE WHEN IS_RELEVANT = FALSE THEN 1 END) AS NOT_RELEVANT_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS +GROUP BY PLATFORM, PROCESSING_STATUS, VALIDATION_PASSED +ORDER BY COMMENT_COUNT DESC; diff --git a/processing_brand_sentiment/database/sql/create_output_table.sql b/processing_brand_sentiment/database/sql/create_output_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..614dba51f9449f250e20f798983a69130069e5bb --- /dev/null +++ b/processing_brand_sentiment/database/sql/create_output_table.sql @@ -0,0 +1,250 @@ +-- Create the output table for Sabian brand sentiment analysis +-- Stores processed forum posts with extracted brand intelligence +-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status + +CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS ( + -- Source identifiers + POST_ID NUMBER(38,0) PRIMARY KEY, + THREAD_ID NUMBER(38,0), + POST_AUTHOR_ID NUMBER(38,0), + + -- Original and processed content + ORIGINAL_CONTENT VARCHAR(16777216), + CLEANED_CONTENT VARCHAR(16777216), + QUOTED_CONTENT VARCHAR(16777216), + THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy) + THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context for analysis + + -- Thread metadata + THREAD_TITLE VARCHAR(16777216), + THREAD_FIRST_POST VARCHAR(16777216), + + -- Timestamps + POST_CREATED_AT TIMESTAMP_LTZ(9), + THREAD_STARTED_AT TIMESTAMP_LTZ(9), + + -- Category information + CATEGORY_TITLE VARCHAR(16777216), + CATEGORY_TOPIC VARCHAR(16777216), + + -- 
Language detection + DETECTED_LANGUAGE VARCHAR(100), + LANGUAGE_CODE VARCHAR(10), + IS_ENGLISH BOOLEAN, + + -- Relevance assessment + IS_RELEVANT BOOLEAN, + RELEVANCE_CONFIDENCE VARCHAR(20), + RELEVANCE_REASON VARCHAR(500), + + -- Author classification + AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown + SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context + + -- Sentiment analysis + SENTIMENT_LEVEL VARCHAR(20), + EMOTION_TYPE VARCHAR(50), + SENTIMENT_TARGET VARCHAR(50), + SENTIMENT_CONFIDENCE VARCHAR(20), + + -- Product information (stored as JSON arrays) + PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"] + PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + PURCHASE_STAGE VARCHAR(50), + + -- Competitive intelligence + COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"] + COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns + COMPARISON_TYPE VARCHAR(50), + COMPETITIVE_POSITIONING VARCHAR(500), + BRAND_SWITCHING VARCHAR(100), + + -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others) + INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"] + DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"] + PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"] + DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + + -- Analysis notes + ANALYSIS_NOTES VARCHAR(16777216), + SARCASM_DETECTED BOOLEAN, + + -- Validation results (NEW v4.0) + VALIDATION_PASSED BOOLEAN, + VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages + VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages + VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags (e.g., "sarcasm_detected", 
"low_confidence_relevant")

    -- Platform identifier
    PLATFORM VARCHAR(50) DEFAULT 'musora_forums',

    -- Processing metadata
    PROCESSING_SUCCESS BOOLEAN,
    PROCESSING_STATUS VARCHAR(50),  -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error
    PROCESSING_ERRORS VARCHAR(16777216),
    PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(),
    WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0'
)
COMMENT = 'Brand sentiment analysis results for Sabian forum posts. Schema v4.0: 4-agent pipeline with extraction/analysis separation, thread context summarization, and validation.';

-- NOTE(review): Snowflake does not support CREATE INDEX on standard tables
-- (secondary indexes exist only for hybrid tables), and the
-- "CREATE INDEX IF NOT EXISTS" form is not valid Snowflake syntax, so the
-- original index statements would fail at runtime. Search optimization
-- covers the equivalent equality-lookup access paths (THREAD_ID,
-- IS_RELEVANT, SENTIMENT_LEVEL, AUTHOR_ROLE, PROCESSING_STATUS, etc.).
-- Requires Enterprise Edition — confirm before relying on it; otherwise
-- consider CLUSTER BY on the dominant filter column.
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
    ADD SEARCH OPTIMIZATION;

-- Create view for relevant posts only
CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_RELEVANT_ANALYSIS AS
SELECT *
FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS
WHERE IS_RELEVANT = TRUE
  AND PROCESSING_SUCCESS = TRUE;

-- Create view for posts needing review (flagged by validator)
CREATE OR REPLACE VIEW
SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_FLAGGED_POSTS AS +SELECT + POST_ID, + THREAD_ID, + CLEANED_CONTENT, + THREAD_CONTEXT_SUMMARY, + IS_RELEVANT, + RELEVANCE_CONFIDENCE, + RELEVANCE_REASON, + PRODUCTS_MENTIONED, + SABIAN_MENTION_CONTEXT, + SENTIMENT_LEVEL, + VALIDATION_FLAGS, + VALIDATION_WARNINGS, + PROCESSING_STATUS +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS +WHERE PROCESSING_STATUS = 'completed_with_flags' + OR VALIDATION_PASSED = FALSE +ORDER BY PROCESSED_AT DESC; + +-- Create view for sentiment distribution +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_SENTIMENT_DISTRIBUTION AS +SELECT + SENTIMENT_LEVEL, + EMOTION_TYPE, + SENTIMENT_TARGET, + COUNT(*) AS POST_COUNT, + COUNT(CASE WHEN SARCASM_DETECTED = TRUE THEN 1 END) AS SARCASM_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE +GROUP BY SENTIMENT_LEVEL, EMOTION_TYPE, SENTIMENT_TARGET +ORDER BY POST_COUNT DESC; + +-- Create view for product mentions summary +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PRODUCT_MENTIONS AS +SELECT + TRIM(product.VALUE::STRING) AS PRODUCT, + SENTIMENT_LEVEL, + COUNT(*) AS MENTION_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS, + LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PRODUCTS_MENTIONED)) AS product +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE + AND PRODUCTS_MENTIONED IS NOT NULL +GROUP BY TRIM(product.VALUE::STRING), SENTIMENT_LEVEL +ORDER BY MENTION_COUNT DESC; + +-- Create view for competitor analysis +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_ANALYSIS AS +SELECT + TRIM(competitor.VALUE::STRING) AS COMPETITOR, + COMPARISON_TYPE, + BRAND_SWITCHING, + COUNT(*) AS MENTION_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN 
('positive', 'very_positive') THEN 1 END) AS POSITIVE_SENTIMENT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_SENTIMENT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS, + LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITORS_MENTIONED)) AS competitor +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE + AND COMPETITORS_MENTIONED IS NOT NULL +GROUP BY TRIM(competitor.VALUE::STRING), COMPARISON_TYPE, BRAND_SWITCHING +ORDER BY MENTION_COUNT DESC; + +-- Create view for pain points analysis +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_PAIN_POINTS AS +SELECT + TRIM(pain_point.VALUE::STRING) AS PAIN_POINT, + COUNT(*) AS OCCURRENCE_COUNT, + ARRAY_AGG(DISTINCT SENTIMENT_LEVEL) AS SENTIMENT_LEVELS +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS, + LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(PAIN_POINTS)) AS pain_point +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE + AND PAIN_POINTS IS NOT NULL +GROUP BY TRIM(pain_point.VALUE::STRING) +ORDER BY OCCURRENCE_COUNT DESC; + +-- Create view for author role analysis +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_AUTHOR_ROLES AS +SELECT + AUTHOR_ROLE, + SABIAN_MENTION_CONTEXT, + COUNT(*) AS POST_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('positive', 'very_positive') THEN 1 END) AS POSITIVE_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL = 'neutral' THEN 1 END) AS NEUTRAL_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE +GROUP BY AUTHOR_ROLE, SABIAN_MENTION_CONTEXT +ORDER BY POST_COUNT DESC; + +-- Create view for competitor ownership analysis +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_COMPETITOR_OWNERSHIP AS +SELECT + TRIM(competitor.VALUE::STRING) AS COMPETITOR_OWNED, + AUTHOR_ROLE, + COUNT(*) AS AUTHOR_COUNT, + COUNT(CASE WHEN SENTIMENT_LEVEL IN 
('positive', 'very_positive') THEN 1 END) AS POSITIVE_TOWARD_SABIAN, + COUNT(CASE WHEN SENTIMENT_LEVEL IN ('negative', 'very_negative') THEN 1 END) AS NEGATIVE_TOWARD_SABIAN +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS, + LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(COMPETITOR_PRODUCTS_OWNED)) AS competitor +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE + AND COMPETITOR_PRODUCTS_OWNED IS NOT NULL +GROUP BY TRIM(competitor.VALUE::STRING), AUTHOR_ROLE +ORDER BY AUTHOR_COUNT DESC; + +-- Create view for mention context by sentiment +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_MENTION_DEPTH AS +SELECT + SABIAN_MENTION_CONTEXT, + SENTIMENT_LEVEL, + COUNT(*) AS POST_COUNT, + AVG(CASE + WHEN SENTIMENT_LEVEL = 'very_positive' THEN 2 + WHEN SENTIMENT_LEVEL = 'positive' THEN 1 + WHEN SENTIMENT_LEVEL = 'neutral' THEN 0 + WHEN SENTIMENT_LEVEL = 'negative' THEN -1 + WHEN SENTIMENT_LEVEL = 'very_negative' THEN -2 + ELSE 0 + END) AS AVG_SENTIMENT_SCORE +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS +WHERE IS_RELEVANT = TRUE + AND PROCESSING_SUCCESS = TRUE +GROUP BY SABIAN_MENTION_CONTEXT, SENTIMENT_LEVEL +ORDER BY SABIAN_MENTION_CONTEXT, POST_COUNT DESC; + +-- Create view for validation flags analysis (NEW v4.0) +CREATE OR REPLACE VIEW SOCIAL_MEDIA_DB.ML_FEATURES.VW_SABIAN_VALIDATION_SUMMARY AS +SELECT + PROCESSING_STATUS, + VALIDATION_PASSED, + COUNT(*) AS POST_COUNT, + COUNT(CASE WHEN IS_RELEVANT = TRUE THEN 1 END) AS RELEVANT_COUNT, + COUNT(CASE WHEN IS_RELEVANT = FALSE THEN 1 END) AS NOT_RELEVANT_COUNT +FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS +GROUP BY PROCESSING_STATUS, VALIDATION_PASSED +ORDER BY POST_COUNT DESC; diff --git a/processing_brand_sentiment/database/sql/fetch_comments.sql b/processing_brand_sentiment/database/sql/fetch_comments.sql new file mode 100644 index 0000000000000000000000000000000000000000..e33fb7dd63879ef18ab916ad7aece3271e102675 --- /dev/null +++ 
b/processing_brand_sentiment/database/sql/fetch_comments.sql @@ -0,0 +1,82 @@ +-- Query to fetch social media comments with context for brand sentiment analysis +-- Source: SOCIAL_MEDIA_DB.brand_sentiment.SABIAN_comments (same structure as CORE.FACT_COMMENTS) +-- Includes: comment content, parent comment text, content metadata, channel info +-- Excludes: official accounts, already-processed comments, empty comments + +SELECT + -- Comment identifiers + fc.COMMENT_SK, + fc.COMMENT_ID, + fc.PLATFORM, + fc.MESSAGE AS COMMENT_TEXT, + fc.CREATED_TIME AS COMMENT_TIMESTAMP, + fc.AUTHOR_NAME, + fc.AUTHOR_ID, + fc.LIKE_COUNT, + fc.PARENT_COMMENT_ID, + fc.REPLIES_COUNT, + fc.COMMENT_LENGTH, + fc.IS_ACTIVE AS COMMENT_IS_ACTIVE, + + -- Parent comment information (self-join to get parent comment text) + parent_fc.MESSAGE AS PARENT_COMMENT_TEXT, + + -- Content information + dc.CONTENT_SK, + dc.CONTENT_ID, + dc.CONTENT_TYPE, + dc.MESSAGE AS CONTENT_DESCRIPTION, + dc.TITLE AS CONTENT_TITLE, + dc.PERMALINK_URL, + dc.CREATED_TIME AS CONTENT_TIMESTAMP, + + -- Channel information + dch.CHANNEL_SK, + dch.CHANNEL_NAME, + dch.CHANNEL_DISPLAY_NAME + +FROM + SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS fc + +-- Left join to get parent comment text if it exists +LEFT JOIN + SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_COMMENTS parent_fc + ON fc.PARENT_COMMENT_ID = parent_fc.COMMENT_ID + AND fc.PLATFORM = parent_fc.PLATFORM + +INNER JOIN + SOCIAL_MEDIA_DB.CORE.DIM_CONTENT dc + ON fc.CONTENT_SK = dc.CONTENT_SK + +INNER JOIN + SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL dch + ON dc.CHANNEL_NAME = dch.CHANNEL_NAME + AND dc.PLATFORM = dch.PLATFORM + +-- Left join with output table to exclude already-processed comments +LEFT JOIN + SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS sba + ON fc.COMMENT_SK = sba.COMMENT_SK + +WHERE + -- Active records only + fc.IS_ACTIVE = TRUE + AND dc.IS_ACTIVE = TRUE + AND dch.IS_ACTIVE = TRUE + + -- Exclude official accounts + AND (fc.AUTHOR_NAME IS NULL OR 
fc.AUTHOR_NAME NOT IN ( + 'Musora', 'Drumeo', 'Pianote', + '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial' + )) + + -- Exclude already-processed comments + AND sba.COMMENT_SK IS NULL + + -- Ensure comment has content + AND fc.MESSAGE IS NOT NULL + AND TRIM(fc.MESSAGE) != '' + AND LENGTH(TRIM(fc.MESSAGE)) > 0 + +ORDER BY + fc.CREATED_TIME DESC; diff --git a/processing_brand_sentiment/database/sql/fetch_forum_posts.sql b/processing_brand_sentiment/database/sql/fetch_forum_posts.sql new file mode 100644 index 0000000000000000000000000000000000000000..32962b6659be6f591ceb8b2db1c7ce924021d17c --- /dev/null +++ b/processing_brand_sentiment/database/sql/fetch_forum_posts.sql @@ -0,0 +1,106 @@ +-- Query to fetch forum posts with thread context for brand sentiment analysis +-- Includes: post content, thread context (title, first post), parent relationships +-- Excludes: team/house-coach posts, already-processed posts, deleted posts + +WITH thread_first_posts AS ( + -- Get the first post (by creation date) for each thread to use as context + -- Using ROW_NUMBER for reliable first post identification + SELECT + THREAD_ID, + POST_CONTENT AS FIRST_POST_CONTENT, + POST_AUTHOR_ID AS FIRST_POST_AUTHOR_ID, + POST_CREATED_AT AS FIRST_POST_CREATED_AT + FROM ( + SELECT + THREAD_ID, + POST_CONTENT, + POST_AUTHOR_ID, + POST_CREATED_AT, + ROW_NUMBER() OVER (PARTITION BY THREAD_ID ORDER BY POST_CREATED_AT ASC) AS rn + FROM SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS + WHERE POST_CONTENT IS NOT NULL + AND TRIM(POST_CONTENT) != '' + ) ranked + WHERE rn = 1 +) + +SELECT + -- Post identifiers + fp.POST_ID, + fp.POST_AUTHOR_ID, + fp.THREAD_ID, + + -- Post content (may contain HTML with quoted parent) + fp.POST_CONTENT, + + -- Post timestamps + fp.POST_CREATED_AT, + fp.POST_EDITED_ON, + fp.POST_PUBLISHED_ON, + fp.POST_STATE, + + -- Parent/Child relationships (for context) + fp.PROMPTING_POST_ID, + fp.PARENT_ID, + fp.PARENT_CONTENT, + fp.PARENT_AUTHOR_ID, + fp.PARENT_CREATED_AT, 
+ fp.CHILD_ID, + fp.CHILD_CONTENT, + + -- Thread context + fp.THREAD_TITLE, + fp.THREAD_SLUG, + fp.THREAD_STATE, + fp.THREAD_LOCKED, + fp.THREAD_PINNED, + fp.THREAD_POST_COUNT, + fp.THREAD_PUBLISHED_ON, + + -- First post of the thread (for context) + tfp.FIRST_POST_CONTENT AS THREAD_FIRST_POST, + tfp.FIRST_POST_CREATED_AT AS THREAD_STARTED_AT, + + -- Category information + fp.CATEGORY_ID, + fp.CATEGORY_BRAND, + fp.CATEGORY_DESCRIPTION, + fp.CATEGORY_TITLE, + fp.CATEGORY_TOPIC, + fp.CATEGORY_SLUG, + + -- Access levels (for filtering) + fp.POST_AUTHOR_ACCESS_LEVEL, + fp.PARENT_AUTHOR_ACCESS_LEVEL, + fp.CHILD_AUTHOR_ACCESS_LEVEL + +FROM + SOCIAL_MEDIA_DB.BRAND_SENTIMENT.SABIAN_FORUM_POSTS fp + +-- Join to get thread's first post for context +LEFT JOIN + thread_first_posts tfp ON fp.THREAD_ID = tfp.THREAD_ID + +-- Left join with output table to exclude already-processed posts +LEFT JOIN + SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS sba + ON fp.POST_ID = sba.POST_ID + +WHERE + -- Exclude team and house-coach posts (internal comments) + (fp.POST_AUTHOR_ACCESS_LEVEL IS NULL OR fp.POST_AUTHOR_ACCESS_LEVEL NOT IN ('team', 'house-coach')) + + -- Exclude deleted posts + AND (fp.POST_STATE IS NULL OR fp.POST_STATE != 'deleted') + AND fp.POST_DELETED_AT IS NULL + + -- Exclude already-processed posts + AND sba.POST_ID IS NULL + + -- Ensure post has content + AND fp.POST_CONTENT IS NOT NULL + AND TRIM(fp.POST_CONTENT) != '' + AND LENGTH(TRIM(fp.POST_CONTENT)) > 0 + +ORDER BY + fp.POST_CREATED_AT DESC; diff --git a/processing_brand_sentiment/database/sql/init_comments_output_table.sql b/processing_brand_sentiment/database/sql/init_comments_output_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..da0a9b43a53b1193662002e6b4ff6559029695e9 --- /dev/null +++ b/processing_brand_sentiment/database/sql/init_comments_output_table.sql @@ -0,0 +1,78 @@ +-- Initialize empty output table for Sabian brand sentiment analysis on social media comments +-- Run 
this script BEFORE the first processing run to create the table structure +-- This prevents "table not found" errors when the fetch query tries to check for already-processed comments + +CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS ( + -- Source identifiers (comment-specific) + COMMENT_SK NUMBER(38,0), + COMMENT_ID VARCHAR(16777216), + ORIGINAL_TEXT VARCHAR(16777216), + PLATFORM VARCHAR(16777216), + COMMENT_TIMESTAMP TIMESTAMP_NTZ(9), + AUTHOR_NAME VARCHAR(16777216), + AUTHOR_ID VARCHAR(16777216), + CONTENT_SK NUMBER(38,0), + CONTENT_ID VARCHAR(16777216), + CONTENT_DESCRIPTION VARCHAR(16777216), + CHANNEL_SK NUMBER(38,0), + CHANNEL_NAME VARCHAR(16777216), + CHANNEL_DISPLAY_NAME VARCHAR(16777216), + PARENT_COMMENT_ID VARCHAR(16777216), + PARENT_COMMENT_TEXT VARCHAR(16777216), + + -- Language detection + DETECTED_LANGUAGE VARCHAR(100), + LANGUAGE_CODE VARCHAR(10), + IS_ENGLISH BOOLEAN, + + -- Relevance assessment + IS_RELEVANT BOOLEAN, + RELEVANCE_CONFIDENCE VARCHAR(20), + RELEVANCE_REASON VARCHAR(500), + + -- Author classification + AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown + SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, casual_mention, comparison_context + + -- Sentiment analysis + SENTIMENT_LEVEL VARCHAR(20), + EMOTION_TYPE VARCHAR(50), + SENTIMENT_TARGET VARCHAR(50), + SENTIMENT_CONFIDENCE VARCHAR(20), + + -- Product information (stored as JSON arrays) + PRODUCTS_MENTIONED VARCHAR(16777216), -- JSON array: ["HHX", "AAX"] + PRODUCT_ATTRIBUTES VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + PURCHASE_STAGE VARCHAR(50), + + -- Competitive intelligence + COMPETITORS_MENTIONED VARCHAR(16777216), -- JSON array: ["Zildjian", "Meinl"] + COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns + COMPARISON_TYPE VARCHAR(50), + COMPETITIVE_POSITIONING VARCHAR(500), + BRAND_SWITCHING VARCHAR(100), + + 
-- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others) + INTENTS VARCHAR(16777216), -- JSON array (multi-label): ["sharing_experience", "praising"] + DECISION_DRIVERS VARCHAR(16777216), -- JSON array: ["sound_quality", "price"] + PAIN_POINTS VARCHAR(16777216), -- JSON array: ["price_value", "availability"] + DELIGHT_FACTORS VARCHAR(16777216), -- JSON array: ["sound_quality", "durability"] + + -- Analysis notes + ANALYSIS_NOTES VARCHAR(16777216), + SARCASM_DETECTED BOOLEAN, + + -- Validation results + VALIDATION_PASSED BOOLEAN, + VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages + VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages + VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags + + -- Processing metadata + PROCESSING_SUCCESS BOOLEAN, + PROCESSING_STATUS VARCHAR(50), -- completed, completed_with_flags, validation_failed, workflow_error + PROCESSING_ERRORS VARCHAR(16777216), + PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(), + WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0' +) +COMMENT = 'Brand sentiment analysis results for Sabian social media comments. 
Schema v4.0: 4-agent pipeline with extraction/analysis separation and validation.'; diff --git a/processing_brand_sentiment/database/sql/init_output_table.sql b/processing_brand_sentiment/database/sql/init_output_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..7cc54ea023bb3718945b0983e350634f4cc262e4 --- /dev/null +++ b/processing_brand_sentiment/database/sql/init_output_table.sql @@ -0,0 +1,89 @@ +-- Initialize empty output table for Sabian brand sentiment analysis +-- Run this script BEFORE the first processing run to create the table structure +-- This prevents "table not found" errors when the fetch query tries to check for already-processed posts +-- Schema Version 4.0: Added THREAD_CONTEXT_SUMMARY, validation fields, and processing status + +CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS ( + -- Source identifiers + POST_ID NUMBER(38,0) PRIMARY KEY, + THREAD_ID NUMBER(38,0), + POST_AUTHOR_ID NUMBER(38,0), + + -- Original and processed content + ORIGINAL_CONTENT VARCHAR(16777216), + CLEANED_CONTENT VARCHAR(16777216), + QUOTED_CONTENT VARCHAR(16777216), + THREAD_CONTEXT VARCHAR(16777216), -- Raw thread context (legacy) + THREAD_CONTEXT_SUMMARY VARCHAR(4096), -- NEW v4.0: Summarized thread context + + -- Thread metadata + THREAD_TITLE VARCHAR(16777216), + THREAD_FIRST_POST VARCHAR(16777216), + + -- Timestamps + POST_CREATED_AT TIMESTAMP_LTZ(9), + THREAD_STARTED_AT TIMESTAMP_LTZ(9), + + -- Category information + CATEGORY_TITLE VARCHAR(16777216), + CATEGORY_TOPIC VARCHAR(16777216), + + -- Language detection + DETECTED_LANGUAGE VARCHAR(100), + LANGUAGE_CODE VARCHAR(10), + IS_ENGLISH BOOLEAN, + + -- Relevance assessment + IS_RELEVANT BOOLEAN, + RELEVANCE_CONFIDENCE VARCHAR(20), + RELEVANCE_REASON VARCHAR(500), + + -- Author classification + AUTHOR_ROLE VARCHAR(50), -- current_owner, past_owner, potential_buyer, never_owned, unknown + SABIAN_MENTION_CONTEXT VARCHAR(50), -- primary_focus, significant_mention, 
casual_mention, comparison_context + + -- Sentiment analysis + SENTIMENT_LEVEL VARCHAR(20), + EMOTION_TYPE VARCHAR(50), + SENTIMENT_TARGET VARCHAR(50), + SENTIMENT_CONFIDENCE VARCHAR(20), + + -- Product information (stored as JSON arrays) + PRODUCTS_MENTIONED VARCHAR(16777216), + PRODUCT_ATTRIBUTES VARCHAR(16777216), + + -- Competitive intelligence + COMPETITORS_MENTIONED VARCHAR(16777216), + COMPETITOR_PRODUCTS_OWNED VARCHAR(16777216), -- JSON array: competitor brands the AUTHOR owns + COMPARISON_TYPE VARCHAR(50), + COMPETITIVE_POSITIONING VARCHAR(500), + BRAND_SWITCHING VARCHAR(100), + + -- Customer journey (AUTHOR PERSPECTIVE ONLY - null if giving advice to others) + INTENTS VARCHAR(16777216), -- Multi-label: seeking_information, providing_information, sharing_experience, comparing, praising, criticizing, buying_selling, general_discussion + PURCHASE_STAGE VARCHAR(50), -- AUTHOR's own stage only + DECISION_DRIVERS VARCHAR(16777216), -- AUTHOR's own decision drivers only + PAIN_POINTS VARCHAR(16777216), -- AUTHOR's negative feedback aspects (uses feedback_aspects categories) + DELIGHT_FACTORS VARCHAR(16777216), -- AUTHOR's positive feedback aspects (uses feedback_aspects categories) + + -- Analysis notes + ANALYSIS_NOTES VARCHAR(16777216), + SARCASM_DETECTED BOOLEAN, + + -- Validation results (NEW v4.0) + VALIDATION_PASSED BOOLEAN, + VALIDATION_ERRORS VARCHAR(16777216), -- JSON array of error messages + VALIDATION_WARNINGS VARCHAR(16777216), -- JSON array of warning messages + VALIDATION_FLAGS VARCHAR(16777216), -- JSON array of anomaly flags + + -- Platform identifier + PLATFORM VARCHAR(50) DEFAULT 'musora_forums', + + -- Processing metadata + PROCESSING_SUCCESS BOOLEAN, + PROCESSING_STATUS VARCHAR(50), -- NEW v4.0: completed, completed_with_flags, validation_failed, workflow_error + PROCESSING_ERRORS VARCHAR(16777216), + PROCESSED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP(), + WORKFLOW_VERSION VARCHAR(20) DEFAULT '4.0' +) +COMMENT = 'Brand sentiment 
def load_configs(config_dir: str = None) -> Dict[str, Dict]:
    """
    Load workflow, brand, and analysis-category configuration files.

    Args:
        config_dir: Directory containing config files. Defaults to the
            'config_files' directory next to this script.

    Returns:
        Dictionary with 'workflow', 'brand', and 'categories' configurations.
    """
    base_dir = config_dir if config_dir is not None else os.path.join(SCRIPT_DIR, 'config_files')

    # Map each result key to its JSON file name; load order matches the
    # historical one so failure modes (missing file, bad JSON) are unchanged.
    config_sources = {
        'workflow': 'workflow_config.json',
        'brand': 'brand_config.json',
        'categories': 'analysis_categories.json',
    }

    loaded = {}
    for key, filename in config_sources.items():
        with open(os.path.join(base_dir, filename), 'r') as fh:
            loaded[key] = json.load(fh)
    return loaded


# ============================================================
# Batch Processing Utilities
# ============================================================

def calculate_optimal_batch_size(
    total_posts: int,
    num_workers: int,
    min_batch: int = 20,
    max_batch: int = 500
) -> int:
    """
    Pick a batch size that spreads posts evenly across workers.

    Args:
        total_posts: Total number of posts to process
        num_workers: Number of parallel workers
        min_batch: Minimum batch size
        max_batch: Maximum batch size

    Returns:
        Batch size clamped to [min_batch, max_batch]; workloads no larger
        than a single minimum batch run as one batch.
    """
    # Tiny workloads are processed as a single batch.
    if total_posts <= min_batch:
        return total_posts

    # Even split across workers, clamped to the configured bounds.
    per_worker = total_posts // num_workers
    return max(min_batch, min(max_batch, per_worker))
def safe_to_json(value: Any) -> Any:
    """
    Normalize a value for JSON-string storage.

    Non-empty lists become JSON strings; null-ish and empty values become
    None; everything else passes through untouched.

    Args:
        value: Value to convert

    Returns:
        JSON string if list, None if null/empty, original value otherwise
    """
    # Null-ish inputs (None or a float NaN) map to None.
    if value is None:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    # Non-empty lists are serialized; empty lists collapse to None.
    if isinstance(value, list):
        return json.dumps(value) if value else None
    # Strings pass through, except the empty string which becomes None.
    if isinstance(value, str):
        return value or None
    return value


def safe_json_list_length(value: Any) -> int:
    """
    Count the elements of a JSON-encoded array.

    Args:
        value: Expected JSON string of an array; tolerates None, NaN,
            non-strings, empty strings, and malformed JSON.

    Returns:
        Number of elements, or 0 when the input is not a valid JSON array.
    """
    # Only a non-empty string can hold a JSON array (None/NaN/non-strings
    # all fail the isinstance check).
    if not isinstance(value, str) or not value:
        return 0
    if value in ('[]', 'null'):
        return 0
    try:
        decoded = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return 0
    return len(decoded) if isinstance(decoded, list) else 0
def calculate_batch_stats(df: pd.DataFrame) -> Dict[str, int]:
    """
    Derive summary counters from a batch of processed results.

    Every field is optional and null-tolerant: missing columns simply
    leave their counters at zero.

    Args:
        df: DataFrame with processed results

    Returns:
        Dictionary with statistics
    """
    counters = dict.fromkeys(
        (
            'relevant_count',
            'not_relevant_count',
            'products_mentioned_count',
            'competitors_mentioned_count',
            'positive_sentiment_count',
            'negative_sentiment_count',
            'current_owner_count',
            'potential_buyer_count',
            'primary_focus_count',
        ),
        0,
    )

    # Nothing to count for an empty batch.
    if df.empty:
        return counters

    # Relevance split (nulls are excluded from both buckets).
    if 'IS_RELEVANT' in df.columns:
        flags = df['IS_RELEVANT']
        known = flags.notna()
        if known.any():
            as_bool = flags[known].astype(bool)
            counters['relevant_count'] = int(as_bool.sum())
            counters['not_relevant_count'] = int((~as_bool).sum())

    # Total mentions across the JSON-array columns.
    if 'PRODUCTS_MENTIONED' in df.columns:
        counters['products_mentioned_count'] = int(
            df['PRODUCTS_MENTIONED'].apply(safe_json_list_length).sum()
        )
    if 'COMPETITORS_MENTIONED' in df.columns:
        counters['competitors_mentioned_count'] = int(
            df['COMPETITORS_MENTIONED'].apply(safe_json_list_length).sum()
        )

    # Sentiment polarity buckets.
    if 'SENTIMENT_LEVEL' in df.columns:
        sentiments = df['SENTIMENT_LEVEL'].dropna()
        if not sentiments.empty:
            counters['positive_sentiment_count'] = int(
                sentiments.isin(['positive', 'very_positive']).sum()
            )
            counters['negative_sentiment_count'] = int(
                sentiments.isin(['negative', 'very_negative']).sum()
            )

    # Author-role breakdown.
    if 'AUTHOR_ROLE' in df.columns:
        roles = df['AUTHOR_ROLE'].dropna()
        if not roles.empty:
            counters['current_owner_count'] = int((roles == 'current_owner').sum())
            counters['potential_buyer_count'] = int((roles == 'potential_buyer').sum())

    # Mention-context breakdown.
    if 'SABIAN_MENTION_CONTEXT' in df.columns:
        contexts = df['SABIAN_MENTION_CONTEXT'].dropna()
        if not contexts.empty:
            counters['primary_focus_count'] = int((contexts == 'primary_focus').sum())

    return counters


def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Combine per-batch statistics into workflow-level totals.

    Args:
        results: List of batch result dictionaries

    Returns:
        Aggregated statistics dictionary (including failed-batch count)
    """
    summed_keys = (
        'total_processed', 'total_stored', 'failed_count',
        'relevant_count', 'not_relevant_count',
        'products_mentioned_count', 'competitors_mentioned_count',
        'positive_sentiment_count', 'negative_sentiment_count',
        'current_owner_count', 'potential_buyer_count', 'primary_focus_count',
    )
    aggregated = {key: sum(r.get(key, 0) for r in results) for key in summed_keys}

    failed_batches = [r for r in results if not r.get('success', False)]
    aggregated['failed_batches'] = len(failed_batches)

    # Surface failures in the log so they are not silently dropped.
    if failed_batches:
        logger.error(f"{len(failed_batches)} batch(es) failed:")
        for fb in failed_batches:
            logger.error(f" Batch {fb.get('batch_num')}: {fb.get('error')}")

    return aggregated
# Columns that should be converted from lists to JSON strings
FORUM_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from forum workflow state to output table.
# NOTE: insertion order determines the output column order.
FORUM_COLUMN_MAPPING = {
    'post_id': 'POST_ID',
    'thread_id': 'THREAD_ID',
    'post_author_id': 'POST_AUTHOR_ID',
    'original_content': 'ORIGINAL_CONTENT',
    'cleaned_content': 'CLEANED_CONTENT',
    'quoted_content': 'QUOTED_CONTENT',
    'raw_thread_context': 'THREAD_CONTEXT',
    'thread_context_summary': 'THREAD_CONTEXT_SUMMARY',
    'thread_title': 'THREAD_TITLE',
    'thread_first_post': 'THREAD_FIRST_POST',
    'post_created_at': 'POST_CREATED_AT',
    'thread_started_at': 'THREAD_STARTED_AT',
    'category_title': 'CATEGORY_TITLE',
    'category_topic': 'CATEGORY_TOPIC',
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'purchase_stage': 'PURCHASE_STAGE',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}


def prepare_forum_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map forum workflow results onto the Snowflake output schema.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    for state_field, table_column in FORUM_COLUMN_MAPPING.items():
        if state_field not in df.columns:
            # Missing state fields still produce a (null) output column.
            prepared[table_column] = None
            continue
        series = df[state_field].copy()
        # List-valued fields are stored as JSON strings.
        if state_field in FORUM_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        prepared[table_column] = series

    # Fixed metadata columns.
    prepared['PLATFORM'] = 'musora_forums'
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
def process_forum_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of forum posts.
    Runs in a separate process.

    Fixes over the previous version:
    - An empty result set yields a DataFrame with no 'success' column;
      indexing it raised KeyError and misreported the batch as failed.
    - The Snowflake connection is now closed in a ``finally`` block so it
      is released even when storage raises.

    Args:
        batch_data: Tuple containing (batch_num, posts, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics
    """
    batch_num, posts, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"ForumWorker-{batch_num}")
    snowflake = None

    try:
        worker_logger.info(f"Forum Batch {batch_num}: Starting processing of {len(posts)} posts")

        # Each worker process needs its own connection and workflow instance.
        snowflake = SnowFlakeConn()
        workflow = BrandAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process posts
        results = workflow.process_batch(posts)
        results_df = pd.DataFrame(results)

        # Filter successful results; guard the empty / missing-column case.
        initial_count = len(results_df)
        if initial_count > 0 and 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Forum Batch {batch_num}: Processed {initial_count} posts, {len(df_successful)} successful")

        # Prepare output DataFrame (empty batches skip the column mapping).
        output_df = prepare_forum_output_dataframe(df_successful) if initial_count > 0 else pd.DataFrame()

        # Store results
        if len(output_df) > 0:
            # Only the very first batch may overwrite the target table.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Forum Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Forum Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Forum Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(posts),
            'total_stored': 0,
            'failed_count': len(posts),
            'error': error_msg
        }

    finally:
        # Always release the per-worker connection, even on failure.
        if snowflake is not None:
            snowflake.close_connection()
# Columns that should be converted from lists to JSON strings (same analysis fields)
COMMENT_JSON_ARRAY_COLUMNS = [
    'products_mentioned', 'product_attributes', 'competitors_mentioned',
    'competitor_products_owned', 'intents', 'decision_drivers',
    'pain_points', 'delight_factors', 'processing_errors', 'relevance_keywords_found',
    'validation_errors', 'validation_warnings', 'validation_flags'
]

# Column mapping from comment workflow state to output table.
# NOTE: insertion order determines the output column order.
COMMENT_COLUMN_MAPPING = {
    # Comment-specific identifiers
    'comment_sk': 'COMMENT_SK',
    'comment_id': 'COMMENT_ID',
    'original_text': 'ORIGINAL_TEXT',
    'platform': 'PLATFORM',
    'comment_timestamp': 'COMMENT_TIMESTAMP',
    'author_name': 'AUTHOR_NAME',
    'author_id': 'AUTHOR_ID',
    'content_sk': 'CONTENT_SK',
    'content_id': 'CONTENT_ID',
    'content_description': 'CONTENT_DESCRIPTION',
    'channel_sk': 'CHANNEL_SK',
    'channel_name': 'CHANNEL_NAME',
    'channel_display_name': 'CHANNEL_DISPLAY_NAME',
    'parent_comment_id': 'PARENT_COMMENT_ID',
    'parent_comment_text': 'PARENT_COMMENT_TEXT',
    # Analysis fields (same as forums)
    'detected_language': 'DETECTED_LANGUAGE',
    'language_code': 'LANGUAGE_CODE',
    'is_english': 'IS_ENGLISH',
    'is_relevant': 'IS_RELEVANT',
    'relevance_confidence': 'RELEVANCE_CONFIDENCE',
    'relevance_reason': 'RELEVANCE_REASON',
    'author_role': 'AUTHOR_ROLE',
    'sabian_mention_context': 'SABIAN_MENTION_CONTEXT',
    'sentiment_level': 'SENTIMENT_LEVEL',
    'emotion_type': 'EMOTION_TYPE',
    'sentiment_target': 'SENTIMENT_TARGET',
    'sentiment_confidence': 'SENTIMENT_CONFIDENCE',
    'products_mentioned': 'PRODUCTS_MENTIONED',
    'product_attributes': 'PRODUCT_ATTRIBUTES',
    'purchase_stage': 'PURCHASE_STAGE',
    'competitors_mentioned': 'COMPETITORS_MENTIONED',
    'competitor_products_owned': 'COMPETITOR_PRODUCTS_OWNED',
    'comparison_type': 'COMPARISON_TYPE',
    'competitive_positioning': 'COMPETITIVE_POSITIONING',
    'brand_switching': 'BRAND_SWITCHING',
    'intents': 'INTENTS',
    'decision_drivers': 'DECISION_DRIVERS',
    'pain_points': 'PAIN_POINTS',
    'delight_factors': 'DELIGHT_FACTORS',
    'analysis_notes': 'ANALYSIS_NOTES',
    'sarcasm_detected': 'SARCASM_DETECTED',
    'validation_passed': 'VALIDATION_PASSED',
    'validation_errors': 'VALIDATION_ERRORS',
    'validation_warnings': 'VALIDATION_WARNINGS',
    'validation_flags': 'VALIDATION_FLAGS',
    'success': 'PROCESSING_SUCCESS',
    'processing_status': 'PROCESSING_STATUS',
    'processing_errors': 'PROCESSING_ERRORS'
}


def prepare_comment_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map comment workflow results onto the Snowflake output schema.

    Args:
        df: DataFrame with processing results

    Returns:
        DataFrame ready for Snowflake storage
    """
    prepared = pd.DataFrame()

    for state_field, table_column in COMMENT_COLUMN_MAPPING.items():
        if state_field not in df.columns:
            # Missing state fields still produce a (null) output column.
            prepared[table_column] = None
            continue
        series = df[state_field].copy()
        # List-valued fields are stored as JSON strings.
        if state_field in COMMENT_JSON_ARRAY_COLUMNS:
            series = series.apply(safe_to_json)
        prepared[table_column] = series

    # Fixed metadata columns (PLATFORM comes from the source data here,
    # unlike the forum variant where it is a constant).
    prepared['PROCESSED_AT'] = datetime.now()
    prepared['WORKFLOW_VERSION'] = '4.0'

    return prepared
def process_comment_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """
    Worker function to process a single batch of social media comments.
    Runs in a separate process.

    Fixes over the previous version:
    - An empty result set yields a DataFrame with no 'success' column;
      indexing it raised KeyError and misreported the batch as failed.
    - The Snowflake connection is now closed in a ``finally`` block so it
      is released even when storage raises.

    Args:
        batch_data: Tuple containing (batch_num, comments, configs, api_key,
            overwrite_first_batch, output_config)

    Returns:
        Dictionary with batch statistics
    """
    batch_num, comments, configs, api_key, overwrite_first_batch, output_config = batch_data

    worker_logger = logging.getLogger(f"CommentWorker-{batch_num}")
    snowflake = None

    try:
        worker_logger.info(f"Comment Batch {batch_num}: Starting processing of {len(comments)} comments")

        # Each worker process needs its own connection and workflow instance.
        snowflake = SnowFlakeConn()
        workflow = CommentAnalysisWorkflow(
            workflow_config=configs['workflow'],
            brand_config=configs['brand'],
            analysis_categories=configs['categories'],
            api_key=api_key
        )

        # Process comments
        results = workflow.process_batch(comments)
        results_df = pd.DataFrame(results)

        # Filter successful results; guard the empty / missing-column case.
        initial_count = len(results_df)
        if initial_count > 0 and 'success' in results_df.columns:
            df_successful = results_df[results_df['success'] == True].copy()
        else:
            df_successful = results_df.copy()
        failed_count = initial_count - len(df_successful)

        worker_logger.info(f"Comment Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful")

        # Prepare output DataFrame (empty batches skip the column mapping).
        output_df = prepare_comment_output_dataframe(df_successful) if initial_count > 0 else pd.DataFrame()

        # Store results
        if len(output_df) > 0:
            # Only the very first batch may overwrite the target table.
            overwrite = overwrite_first_batch and batch_num == 1

            snowflake.store_df_to_snowflake(
                table_name=output_config['table_name'],
                dataframe=output_df,
                database=output_config['database'],
                schema=output_config['schema'],
                overwrite=overwrite
            )

            worker_logger.info(f"Comment Batch {batch_num}: Stored {len(output_df)} records to Snowflake")
        else:
            worker_logger.warning(f"Comment Batch {batch_num}: No successful records to store")

        # Calculate statistics
        stats = calculate_batch_stats(output_df)
        stats.update({
            'batch_num': batch_num,
            'success': True,
            'total_processed': initial_count,
            'total_stored': len(output_df),
            'failed_count': failed_count,
            'error': None
        })

        return stats

    except Exception as e:
        error_msg = f"Comment Batch {batch_num} failed: {str(e)}"
        worker_logger.error(error_msg)
        worker_logger.error(traceback.format_exc())

        return {
            'batch_num': batch_num,
            'success': False,
            'total_processed': len(comments),
            'total_stored': 0,
            'failed_count': len(comments),
            'error': error_msg
        }

    finally:
        # Always release the per-worker connection, even on failure.
        if snowflake is not None:
            snowflake.close_connection()
class BrandSentimentProcessor:
    """
    Main processor class that orchestrates the entire workflow.
    Supports processing forums, social media comments, or both.

    Lifecycle: constructed with config files loaded and a Snowflake
    connection opened; `run()` closes that connection in its `finally`
    block, so a processor instance is single-use.
    """

    def __init__(self, config_dir: str = None):
        """
        Initialize the processor.

        Args:
            config_dir: Directory containing configuration files

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment.
        """
        # Load configurations
        self.configs = load_configs(config_dir)

        # Initialize Snowflake connection
        self.snowflake = SnowFlakeConn()

        # Get OpenAI API key
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Get output configurations (fall back to hard-coded defaults when
        # the workflow config omits them)
        self.forum_output_config = self.configs['workflow'].get('output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        self.comment_output_config = self.configs['workflow'].get('comments_output', {
            'table_name': 'SABIAN_BRAND_ANALYSIS_COMMENTS',
            'database': 'SOCIAL_MEDIA_DB',
            'schema': 'ML_FEATURES'
        })

        logger.info("BrandSentimentProcessor initialized successfully")

    def fetch_forum_posts(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch forum posts from Snowflake.

        Args:
            limit: Optional limit on number of posts

        Returns:
            DataFrame containing post data
        """
        logger.info("Fetching forum posts...")

        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_forum_posts.sql')
        df = self.snowflake.fetch_forum_posts_with_context(sql_path, limit)

        logger.info(f"Fetched {len(df)} forum posts")
        return df

    def fetch_comments(self, limit: int = None) -> pd.DataFrame:
        """
        Fetch social media comments from Snowflake.

        Args:
            limit: Optional limit on number of comments

        Returns:
            DataFrame containing comment data
        """
        logger.info("Fetching social media comments...")

        sql_path = os.path.join(SCRIPT_DIR, 'database', 'sql', 'fetch_comments.sql')
        df = self.snowflake.fetch_comments(sql_path, limit)

        logger.info(f"Fetched {len(df)} social media comments")
        return df

    def calculate_num_workers(self) -> int:
        """
        Calculate number of parallel workers.

        Returns:
            Number of workers
        """
        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        max_workers = parallel_config.get('max_workers', 5)

        # Reserve two CPUs for the parent process / OS; never drop below one.
        num_cpus = cpu_count()
        num_workers = max(1, min(max_workers, num_cpus - 2))

        logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
        return num_workers

    # ---- Forum Processing ----

    def process_forums_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts using parallel workers.

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        posts = df.to_dict('records')
        total_posts = len(posts)

        logger.info(f"Processing {total_posts} forum posts using parallel processing...")

        num_workers = self.calculate_num_workers()

        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        min_batch = parallel_config.get('min_batch_size', 20)
        max_batch = parallel_config.get('max_batch_size', 400)

        batch_size = calculate_optimal_batch_size(total_posts, num_workers, min_batch, max_batch)
        logger.info(f"Forum batch size: {batch_size}")

        # Create batches. Each tuple is self-contained so the worker can
        # run in a fresh process (no shared state with this instance).
        batches = []
        for i in range(0, total_posts, batch_size):
            batch = posts[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.forum_output_config))

        total_batches = len(batches)
        logger.info(f"Split into {total_batches} forum batches")

        # Process in parallel
        with Pool(processes=num_workers) as pool:
            results = pool.map(process_forum_batch_worker, batches)

        return aggregate_results(results)

    def process_forums_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process forum posts sequentially (for debugging).

        Args:
            df: DataFrame containing posts
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        logger.info(f"Processing {len(df)} forum posts using sequential processing...")

        # Reuse the batch worker with a single batch covering everything.
        posts = df.to_dict('records')
        batch_data = (1, posts, self.configs, self.api_key, overwrite, self.forum_output_config)
        result = process_forum_batch_worker(batch_data)

        return {
            'total_processed': result.get('total_processed', 0),
            'total_stored': result.get('total_stored', 0),
            'failed_count': result.get('failed_count', 0),
            'relevant_count': result.get('relevant_count', 0),
            'not_relevant_count': result.get('not_relevant_count', 0),
            'products_mentioned_count': result.get('products_mentioned_count', 0),
            'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
            'positive_sentiment_count': result.get('positive_sentiment_count', 0),
            'negative_sentiment_count': result.get('negative_sentiment_count', 0),
            'current_owner_count': result.get('current_owner_count', 0),
            'potential_buyer_count': result.get('potential_buyer_count', 0),
            'primary_focus_count': result.get('primary_focus_count', 0),
            'failed_batches': 0 if result.get('success', False) else 1
        }

    # ---- Comment Processing ----

    def process_comments_parallel(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments using parallel workers.

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with aggregated statistics
        """
        comments = df.to_dict('records')
        total_comments = len(comments)

        logger.info(f"Processing {total_comments} comments using parallel processing...")

        num_workers = self.calculate_num_workers()

        parallel_config = self.configs['workflow'].get('workflow', {}).get('parallel_processing', {})
        min_batch = parallel_config.get('min_batch_size', 20)
        max_batch = parallel_config.get('max_batch_size', 400)

        batch_size = calculate_optimal_batch_size(total_comments, num_workers, min_batch, max_batch)
        logger.info(f"Comment batch size: {batch_size}")

        # Create batches. Each tuple is self-contained so the worker can
        # run in a fresh process (no shared state with this instance).
        batches = []
        for i in range(0, total_comments, batch_size):
            batch = comments[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            batches.append((batch_num, batch, self.configs, self.api_key, overwrite, self.comment_output_config))

        total_batches = len(batches)
        logger.info(f"Split into {total_batches} comment batches")

        # Process in parallel
        with Pool(processes=num_workers) as pool:
            results = pool.map(process_comment_batch_worker, batches)

        return aggregate_results(results)

    def process_comments_sequential(self, df: pd.DataFrame, overwrite: bool = False) -> Dict[str, Any]:
        """
        Process social media comments sequentially (for debugging).

        Args:
            df: DataFrame containing comments
            overwrite: Whether to overwrite existing table

        Returns:
            Dictionary with statistics
        """
        logger.info(f"Processing {len(df)} comments using sequential processing...")

        # Reuse the batch worker with a single batch covering everything.
        comments = df.to_dict('records')
        batch_data = (1, comments, self.configs, self.api_key, overwrite, self.comment_output_config)
        result = process_comment_batch_worker(batch_data)

        return {
            'total_processed': result.get('total_processed', 0),
            'total_stored': result.get('total_stored', 0),
            'failed_count': result.get('failed_count', 0),
            'relevant_count': result.get('relevant_count', 0),
            'not_relevant_count': result.get('not_relevant_count', 0),
            'products_mentioned_count': result.get('products_mentioned_count', 0),
            'competitors_mentioned_count': result.get('competitors_mentioned_count', 0),
            'positive_sentiment_count': result.get('positive_sentiment_count', 0),
            'negative_sentiment_count': result.get('negative_sentiment_count', 0),
            'current_owner_count': result.get('current_owner_count', 0),
            'potential_buyer_count': result.get('potential_buyer_count', 0),
            'primary_focus_count': result.get('primary_focus_count', 0),
            'failed_batches': 0 if result.get('success', False) else 1
        }

    # ---- Unified Processing ----

    def _log_source_summary(self, source_name: str, stats: Dict[str, Any], processing_time: float) -> None:
        """
        Log processing summary for a data source.

        Args:
            source_name: Name of the data source
            stats: Processing statistics
            processing_time: Time taken in seconds
        """
        logger.info(f" --- {source_name} ---")
        logger.info(f" Total processed: {stats.get('total_processed', 0)}")
        logger.info(f" Successfully stored: {stats.get('total_stored', 0)}")
        logger.info(f" Failed: {stats.get('failed_count', 0)}")
        logger.info(f" Relevant: {stats.get('relevant_count', 0)}")
        logger.info(f" Not relevant: {stats.get('not_relevant_count', 0)}")
        logger.info(f" Product mentions: {stats.get('products_mentioned_count', 0)}")
        logger.info(f" Competitor mentions: {stats.get('competitors_mentioned_count', 0)}")
        logger.info(f" Positive sentiment: {stats.get('positive_sentiment_count', 0)}")
        logger.info(f" Negative sentiment: {stats.get('negative_sentiment_count', 0)}")
        logger.info(f" Current owners: {stats.get('current_owner_count', 0)}")
        logger.info(f" Potential buyers: {stats.get('potential_buyer_count', 0)}")
        logger.info(f" Primary focus: {stats.get('primary_focus_count', 0)}")
        if stats.get('failed_batches', 0) > 0:
            logger.info(f" Failed batches: {stats['failed_batches']}")
        logger.info(f" Processing time: {processing_time:.2f} seconds")
        if stats.get('total_processed', 0) > 0:
            logger.info(f" Average per item: {processing_time / stats['total_processed']:.2f} seconds")

    def run(
        self,
        limit: int = None,
        overwrite: bool = False,
        sequential: bool = False,
        data_source: str = 'all'
    ):
        """
        Run the complete processing pipeline.

        Closes the Snowflake connection on exit (success or failure), so
        the processor cannot be reused after this call.

        Args:
            limit: Optional limit on items to process per source.
                NOTE: the same limit is applied to each source independently.
            overwrite: Whether to overwrite existing table
            sequential: Use sequential processing instead of parallel
            data_source: Which data source to process ('forums', 'comments', 'all')
        """
        try:
            logger.info("=" * 80)
            logger.info("Starting Brand Sentiment Analysis Workflow")
            logger.info(f"Brand: {self.configs['brand'].get('brand', {}).get('name', 'Unknown')}")
            logger.info(f"Mode: {'SEQUENTIAL' if sequential else 'PARALLEL'}")
            logger.info(f"Data source: {data_source}")
            logger.info("=" * 80)

            process_forums = data_source in ('forums', 'all')
            process_comments = data_source in ('comments', 'all')

            # Track results for summary
            forum_stats = None
            forum_time = 0.0
            comment_stats = None
            comment_time = 0.0

            # ---- Process Forums ----
            if process_forums:
                logger.info("-" * 40)
                logger.info("Processing FORUMS")
                logger.info("-" * 40)

                df_posts = self.fetch_forum_posts(limit)

                if df_posts.empty:
                    logger.warning("No forum posts to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        forum_stats = self.process_forums_sequential(df_posts, overwrite)
                    else:
                        forum_stats = self.process_forums_parallel(df_posts, overwrite)

                    forum_time = (datetime.now() - start_time).total_seconds()

            # ---- Process Comments ----
            if process_comments:
                logger.info("-" * 40)
                logger.info("Processing SOCIAL MEDIA COMMENTS")
                logger.info("-" * 40)

                df_comments = self.fetch_comments(limit)

                if df_comments.empty:
                    logger.warning("No social media comments to process")
                else:
                    start_time = datetime.now()

                    if sequential:
                        comment_stats = self.process_comments_sequential(df_comments, overwrite)
                    else:
                        comment_stats = self.process_comments_parallel(df_comments, overwrite)

                    comment_time = (datetime.now() - start_time).total_seconds()

            # ---- Summary ----
            logger.info("=" * 80)
            logger.info("Processing Summary:")
            logger.info(f" Mode: {'Sequential' if sequential else 'Parallel'}")
            logger.info(f" Data source: {data_source}")

            if forum_stats is not None:
                self._log_source_summary("Forums", forum_stats, forum_time)

            if comment_stats is not None:
                self._log_source_summary("Social Media Comments", comment_stats, comment_time)

            logger.info("=" * 80)

        except Exception as e:
            logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
            raise

        finally:
            self.snowflake.close_connection()
            logger.info("Snowflake connection closed")


# ============================================================
# Legacy compatibility - keep old function names working
# ============================================================

def prepare_output_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Legacy wrapper for forum output preparation."""
    return prepare_forum_output_dataframe(df)


def process_batch_worker(batch_data: tuple) -> Dict[str, Any]:
    """Legacy wrapper for forum batch worker."""
    return process_forum_batch_worker(batch_data)
logger = logging.getLogger(__name__)


class HTMLParser:
    """
    Parses HTML content from forum posts to extract the actual reply content
    and quoted parent content separately.

    Forum posts embed the quoted parent post inside a ``<blockquote>``;
    anything outside the blockquote is the poster's own reply.
    """

    def __init__(self):
        """Initialize the HTML parser (stateless; kept for API symmetry)."""
        pass

    def parse_post_content(self, html_content: str) -> Dict[str, Optional[str]]:
        """
        Parse HTML post content to extract reply and quoted content.

        Structure handled:
        - ``<blockquote>`` holds the quoted parent post, with an optional
          heading paragraph (class ``quote-heading``) containing the author
          in ``<strong>`` and the date in ``<em>``, plus ``post-id`` spans
          that are stripped out.
        - Content outside the blockquote is the actual reply.

        Args:
            html_content: Raw HTML content from POST_CONTENT field

        Returns:
            Dictionary with:
            - reply_content: The actual reply text (cleaned)
            - quoted_content: The quoted parent text (cleaned), if any
            - quoted_author: Author of the quoted post, if any
            - quoted_date: Date of the quoted post, if any
            - has_quote: Boolean indicating if post contains a quote
        """
        if not html_content or not html_content.strip():
            return {
                "reply_content": "",
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            quoted_content = None
            quoted_author = None
            quoted_date = None
            has_quote = False

            blockquotes = soup.find_all('blockquote')

            if blockquotes:
                has_quote = True
                quote_parts = []

                for blockquote in blockquotes:
                    # Extract quote heading info (author and date).
                    # NOTE: if several blockquotes carry headings, the last
                    # one wins for quoted_author / quoted_date.
                    quote_heading = blockquote.find('p', class_='quote-heading')
                    if quote_heading:
                        author_tag = quote_heading.find('strong')
                        if author_tag:
                            quoted_author = author_tag.get_text(strip=True)

                        date_tag = quote_heading.find('em')
                        if date_tag:
                            # Remove a single leading " - " separator only.
                            # (The previous str.lstrip(' - ') stripped ANY run
                            # of spaces and hyphens and could eat into a
                            # legitimate leading character of the date.)
                            quoted_date = re.sub(
                                r'^\s*-\s*', '', date_tag.get_text(strip=True)
                            )

                    # Remove the heading so get_text() yields only the quote body
                    if quote_heading:
                        quote_heading.decompose()

                    # Remove post-id spans
                    for post_id_span in blockquote.find_all('span', class_='post-id'):
                        post_id_span.decompose()

                    quote_text = self._clean_text(blockquote.get_text())
                    if quote_text:
                        quote_parts.append(quote_text)

                    # Drop the blockquote so the remaining soup is the reply
                    blockquote.decompose()

                quoted_content = " ".join(quote_parts) if quote_parts else None

            # Get the remaining content (actual reply)
            reply_content = self._clean_text(soup.get_text())

            return {
                "reply_content": reply_content,
                "quoted_content": quoted_content,
                "quoted_author": quoted_author,
                "quoted_date": quoted_date,
                "has_quote": has_quote
            }

        except Exception as e:
            logger.warning(f"Error parsing HTML content: {e}")
            # Fallback: regex-strip tags so we still return usable text
            return {
                "reply_content": self._clean_text(self._strip_html_tags(html_content)),
                "quoted_content": None,
                "quoted_author": None,
                "quoted_date": None,
                "has_quote": False
            }

    def _clean_text(self, text: str) -> str:
        """
        Clean extracted text by decoding HTML entities, collapsing all
        whitespace runs to single spaces, and trimming the ends.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Decode HTML entities
        text = html.unescape(text)

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

    def _strip_html_tags(self, html_content: str) -> str:
        """
        Fallback method to strip HTML tags if BeautifulSoup fails.

        Args:
            html_content: HTML content

        Returns:
            Text without HTML tags
        """
        # Remove HTML tags
        clean = re.sub(r'<[^>]+>', ' ', html_content)
        # Decode entities
        clean = html.unescape(clean)
        # Clean whitespace
        clean = re.sub(r'\s+', ' ', clean)
        return clean.strip()

    def extract_plain_text(self, html_content: str) -> str:
        """
        Extract plain text from HTML content, preserving readability.

        Args:
            html_content: HTML content

        Returns:
            Plain text version
        """
        if not html_content:
            return ""

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Add newlines for block elements
            # NOTE(review): _clean_text() below collapses all whitespace,
            # so these newlines do not survive into the return value.
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for p in soup.find_all('p'):
                p.append('\n')

            text = soup.get_text()
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Error extracting plain text: {e}")
            return self._clean_text(self._strip_html_tags(html_content))

    def build_thread_context(
        self,
        thread_title: Optional[str],
        first_post_content: Optional[str],
        category_title: Optional[str] = None,
        category_topic: Optional[str] = None
    ) -> str:
        """
        Build a context string from thread information.

        Args:
            thread_title: Title of the discussion thread
            first_post_content: Content of the first post in the thread
            category_title: Category title
            category_topic: Category topic

        Returns:
            Formatted context string ("part | part | ..."), empty if no parts
        """
        context_parts = []

        if category_title:
            context_parts.append(f"Category: {category_title}")

        if category_topic:
            context_parts.append(f"Topic: {category_topic}")

        if thread_title:
            context_parts.append(f"Thread: {thread_title}")

        if first_post_content:
            # Parse and clean the first post content
            parsed = self.parse_post_content(first_post_content)
            first_post_text = parsed.get("reply_content", "")
            if first_post_text:
                # Truncate if too long
                if len(first_post_text) > 500:
                    first_post_text = first_post_text[:500] + "..."
                context_parts.append(f"Original discussion: {first_post_text}")

        return " | ".join(context_parts) if context_parts else ""

    def is_empty_content(self, html_content: str) -> bool:
        """
        Check if HTML content is effectively empty.

        Args:
            html_content: HTML content to check

        Returns:
            True if content is empty or contains no meaningful text
        """
        if not html_content:
            return True

        text = self.extract_plain_text(html_content)
        return len(text.strip()) == 0
logger = logging.getLogger(__name__)


class BaseAgent(ABC):
    """
    Abstract base for every agent in the brand sentiment analysis workflow.

    Concrete agents implement process() and validate_input(); this class
    supplies shared configuration handling, tagged logging, uniform error
    payloads, and helpers for parsing LLM JSON responses.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        """
        Set up common agent state from the configuration dictionary.

        Args:
            name: Name of the agent
            config: Configuration dictionary for the agent
        """
        self.name = name
        self.config = config
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.

        Every concrete agent must implement this.

        Args:
            input_data: Dictionary containing input data for processing

        Returns:
            Dictionary containing processing results
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.

        Args:
            input_data: Dictionary containing input data

        Returns:
            True if input is valid, False otherwise
        """
        pass

    def get_name(self) -> str:
        """Get the agent name."""
        return self.name

    def get_config(self) -> Dict[str, Any]:
        """Get the agent configuration."""
        return self.config

    def log_processing(self, message: str, level: str = "info"):
        """
        Emit a log record tagged with this agent's name.

        Args:
            message: Log message
            level: Log level (info, warning, error, debug); unknown
                levels fall back to info.
        """
        emit = getattr(logger, level, logger.info)
        emit(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Log an exception and produce a uniform error payload.

        Args:
            error: The exception that occurred
            context: Additional context about the error

        Returns:
            Error dictionary with details
        """
        suffix = f" ({context})" if context else ""
        logger.error(f"Error in {self.name}{suffix}: {str(error)}")

        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context
        }

    def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]:
        """
        Parse an LLM response that may be wrapped in a markdown code fence.

        Handles both ```json and bare ``` fences before delegating to
        json.loads.

        Args:
            response_content: Raw response content from LLM

        Returns:
            Parsed JSON dictionary

        Raises:
            json.JSONDecodeError: If JSON cannot be parsed
        """
        payload = response_content.strip()

        # Try the specific fence first so "```json" is not half-stripped
        # by the generic "```" branch.
        for fence in ("```json", "```"):
            if payload.startswith(fence):
                payload = payload[len(fence):]
                if payload.endswith("```"):
                    payload = payload[:-3]
                payload = payload.strip()
                break

        return json.loads(payload)

    def _safe_get(self, data: Dict[str, Any], key: str, default: Any = None) -> Any:
        """
        Safely get a value from a dictionary with a default.

        Args:
            data: Dictionary to get value from
            key: Key to look up
            default: Default value if key not found

        Returns:
            Value from dictionary or default
        """
        return data.get(key, default)

    def _ensure_list(self, value: Any) -> list:
        """
        Coerce a value to a list.

        None becomes [], lists pass through, strings are split on commas
        (empty pieces dropped), and any other scalar is wrapped.

        Args:
            value: Value to convert

        Returns:
            List version of value
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            pieces = (piece.strip() for piece in value.split(","))
            return [piece for piece in pieces if piece]
        return [value]
class CommentPreprocessorAgent(ContentPreprocessorAgent):
    """
    Agent that preprocesses social media comments for brand sentiment analysis.

    Inherits keyword detection, product alias mapping, language detection,
    and relevance screening from ContentPreprocessorAgent.

    Key differences from forum preprocessor:
    - No HTML parsing (comments are plain text)
    - Context built from content title + description + parent comment
    - Different input field names (comment_text vs post_content)
    """

    def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]):
        """
        Initialize the Comment Preprocessor Agent.

        Args:
            config: Agent configuration
            brand_config: Brand-specific configuration with keywords, products, and aliases
        """
        super().__init__(config, brand_config)
        # Override the name assigned by the parent constructor so log lines
        # attribute work to this agent rather than the forum preprocessor.
        self.name = "CommentPreprocessorAgent"

        logger.info(
            f"CommentPreprocessorAgent initialized with {len(self.primary_keywords)} primary keywords, "
            f"{len(self.product_aliases)} product aliases"
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields for comment processing.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        # Comments are keyed by comment_sk (vs post_id for forum posts).
        required_fields = ["comment_sk", "comment_text"]
        return all(field in input_data for field in required_fields)

    def _build_comment_context(
        self,
        content_title: Optional[str] = None,
        content_description: Optional[str] = None,
        parent_comment_text: Optional[str] = None
    ) -> str:
        """
        Build context string from social media content and parent comment information.

        Args:
            content_title: Title of the social media post/content
            content_description: Description/message of the social media post
            parent_comment_text: Text of the parent comment (if this is a reply)

        Returns:
            Formatted context string ("part | part | ..."), empty if no parts
        """
        context_parts = []

        if content_title:
            context_parts.append(f"Post title: {content_title}")

        if content_description:
            # Truncate if too long (500 chars, matching the forum-context cap)
            truncated = content_description[:500] + "..." if len(content_description) > 500 else content_description
            context_parts.append(f"Post description: {truncated}")

        if parent_comment_text:
            truncated = parent_comment_text[:500] + "..." if len(parent_comment_text) > 500 else parent_comment_text
            context_parts.append(f"Parent comment: {truncated}")

        return " | ".join(context_parts) if context_parts else ""

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a social media comment through the preprocessing pipeline.

        Unlike forum posts, comments are plain text (no HTML parsing needed).
        Context is built from content title, description, and parent comment.

        Args:
            input_data: Dictionary containing comment data with at least:
                - comment_sk: Comment surrogate key
                - comment_text: Raw comment text (plain text)
                - content_title: Title of the post (optional)
                - content_description: Description of the post (optional)
                - parent_comment_text: Parent comment text if reply (optional)

        Returns:
            Dictionary with preprocessing results, or a handle_error()
            payload on unexpected failure.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields (comment_sk, comment_text)",
                    **input_data
                }

            comment_text = input_data.get("comment_text", "")

            # Step 1: Clean text (plain text - no HTML parsing needed)
            cleaned_content = comment_text.strip() if comment_text else ""

            # Check for empty / too-short content: short-circuit as
            # not-relevant without running detection steps.
            if not cleaned_content or len(cleaned_content) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": cleaned_content,
                    "quoted_content": None,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    # NOTE(review): the input spread comes last, so incoming
                    # keys would override the computed ones above if names
                    # collide — presumably intentional to preserve original
                    # data; verify against callers.
                    **{k: v for k, v in input_data.items() if k != "comment_text"}
                }

            # Step 2: Check relevance (reused from parent class)
            relevance_result = self._check_relevance(cleaned_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build comment context
            raw_thread_context = self._build_comment_context(
                content_title=input_data.get("content_title"),
                content_description=input_data.get("content_description"),
                parent_comment_text=input_data.get("parent_comment_text")
            )

            # Step 4: Detect language (reused from parent class)
            lang_result = self._detect_language(cleaned_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions (reused from parent class)
            products_found = self._extract_mentioned_products(cleaned_content)
            competitors_found = self._extract_mentioned_competitors(cleaned_content)

            # Determine quoted content (parent comment serves as quoted context)
            parent_comment = input_data.get("parent_comment_text")
            has_parent = parent_comment is not None and str(parent_comment).strip() != ""

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": cleaned_content,
                "quoted_content": parent_comment if has_parent else None,
                "has_quote": has_parent,
                "quoted_author": None,
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data (exclude raw text to avoid duplication).
                # NOTE(review): spread last — input keys override the computed
                # keys above on collision; confirm this precedence is intended.
                **{k: v for k, v in input_data.items() if k not in ["comment_text"]}
            }

            # Keep original content for reference
            result["original_text"] = comment_text

            self.log_processing(
                f"Processed comment {input_data.get('comment_sk')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing comment {input_data.get('comment_sk')}")
    def _build_keyword_sets(self) -> None:
        """Build keyword sets from brand configuration for efficient relevance checking."""
        relevance_config = self.brand_config.get("relevance_keywords", {})

        # Primary keywords - definitive Sabian mentions
        primary = relevance_config.get("primary", {}).get("keywords", [])
        self.primary_keywords: Set[str] = set(k.lower() for k in primary)

        # Contextual keywords - need disambiguation (HH, AA)
        contextual = relevance_config.get("contextual", {}).get("keywords", [])
        self.contextual_keywords: Set[str] = set(k.lower() for k in contextual)

        # Cymbal context keywords - help disambiguate contextual terms
        cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", [])
        self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context)

        # Competitor names and aliases for detection
        competitors = self.brand_config.get("brand", {}).get("competitors", [])
        self.competitor_keywords: Set[str] = set()
        self.competitor_name_map: Dict[str, str] = {}  # alias -> canonical name

        for comp in competitors:
            # Competitors may be dicts ({"name": ..., "aliases": [...]})
            # or plain strings; both forms are supported.
            if isinstance(comp, dict):
                name = comp.get("name", "")
                self.competitor_keywords.add(name.lower())
                self.competitor_name_map[name.lower()] = name
                for alias in comp.get("aliases", []):
                    alias_lower = alias.lower()
                    self.competitor_keywords.add(alias_lower)
                    self.competitor_name_map[alias_lower] = name
            else:
                comp_str = str(comp).lower()
                self.competitor_keywords.add(comp_str)
                self.competitor_name_map[comp_str] = str(comp)

        # Product names (lowercased set for matching; original-case list kept
        # so canonical spellings can be recovered later)
        products = self.brand_config.get("brand", {}).get("products", [])
        self.product_keywords: Set[str] = set(p.lower() for p in products)
        self.products_list = products  # Keep original case

        logger.debug(
            f"Built keyword sets: {len(self.primary_keywords)} primary, "
            f"{len(self.contextual_keywords)} contextual, "
            f"{len(self.product_keywords)} products, "
            f"{len(self.competitor_keywords)} competitor terms"
        )

    def _build_alias_mappings(self) -> None:
        """Build product alias mappings from brand configuration."""
        aliases = self.brand_config.get("brand", {}).get("product_aliases", {})

        # Build alias -> canonical product mapping (keys lowercased)
        self.product_aliases: Dict[str, str] = {}
        for alias, canonical in aliases.items():
            self.product_aliases[alias.lower()] = canonical

        # Also add primary keywords that are aliases to contextual keywords
        # e.g., "b8" should trigger contextual check since it maps to "B8X"
        for alias in self.product_aliases.keys():
            if alias not in self.primary_keywords:
                self.contextual_keywords.add(alias)

        logger.debug(f"Built {len(self.product_aliases)} product alias mappings")

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required_fields = ["post_id", "post_content"]
        return all(field in input_data for field in required_fields)

    def _detect_language(self, text: str, has_primary_keywords: bool = False) -> Dict[str, Any]:
        """
        Detect the language of text using lingua library.

        Enhanced logic:
        - Skip detection for short texts (< min_length_for_lang_detection chars)
        - Always return English if primary Sabian keywords are found

        Args:
            text: Text to analyze
            has_primary_keywords: Whether primary Sabian keywords were found

        Returns:
            Dictionary with language detection results: language,
            language_code, is_english, confidence, detection_skipped,
            skip_reason, plus override/error keys on those branches.
        """
        try:
            cleaned_text = text.strip()

            # If text is too short, default to English (detection on short
            # snippets is unreliable)
            if len(cleaned_text) < self.min_length_for_lang_detection:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": True,
                    "skip_reason": f"Text too short ({len(cleaned_text)} < {self.min_length_for_lang_detection} chars)"
                }

            # If primary keywords found and always_process_primary is True, treat as English
            if has_primary_keywords and self.always_process_primary:
                # Still try to detect, but override if non-English
                detected = self.language_detector.detect_language_of(cleaned_text)

                if detected == Language.ENGLISH:
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "high",
                        "detection_skipped": False,
                        "skip_reason": None
                    }
                else:
                    # Primary keyword found but detected as non-English.
                    # Force to English since Sabian is explicitly mentioned;
                    # the original detection is preserved for auditing.
                    lang_name = detected.name.capitalize() if detected else "Unknown"
                    return {
                        "language": "English",
                        "language_code": "en",
                        "is_english": True,
                        "confidence": "medium",
                        "detection_skipped": False,
                        "skip_reason": None,
                        "original_detected_language": lang_name,
                        "override_reason": "Primary Sabian keyword found, treating as English"
                    }

            # Standard detection
            detected = self.language_detector.detect_language_of(cleaned_text)

            # lingua returns None when it cannot decide; fall back to default
            if detected is None:
                return {
                    "language": self.default_language,
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "low",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            if detected == Language.ENGLISH:
                return {
                    "language": "English",
                    "language_code": "en",
                    "is_english": True,
                    "confidence": "high",
                    "detection_skipped": False,
                    "skip_reason": None
                }

            # Non-English: map to ISO 639-1 via the class table
            lang_code = self.LINGUA_TO_ISO.get(detected, "unknown")
            lang_name = detected.name.capitalize()

            return {
                "language": lang_name,
                "language_code": lang_code,
                "is_english": False,
                "confidence": "high",
                "detection_skipped": False,
                "skip_reason": None
            }

        except Exception as e:
            # Never let detection failure break the pipeline; default to English
            logger.warning(f"Language detection failed: {e}")
            return {
                "language": self.default_language,
                "language_code": "en",
                "is_english": True,
                "confidence": "low",
                "detection_skipped": False,
                "skip_reason": None,
                "detection_error": str(e)
            }
+ + Args: + found_products: List of product terms found + + Returns: + List of canonical product names + """ + normalized = [] + for product in found_products: + product_lower = product.lower() + + # Check if it's an alias + if product_lower in self.product_aliases: + canonical = self.product_aliases[product_lower] + if canonical not in normalized: + normalized.append(canonical) + # Check if it's a direct product match + elif product_lower in self.product_keywords: + # Find the original case version + for p in self.products_list: + if p.lower() == product_lower: + if p not in normalized: + normalized.append(p) + break + + return normalized + + def _check_relevance(self, text: str) -> Dict[str, Any]: + """ + Check if text is relevant to the brand using keyword matching. + + Enhanced to handle product aliases. + + Returns: + Dictionary with relevance assessment + """ + text_lower = text.lower() + + # Tokenize for word boundary matching + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + # Also check for multi-word phrases (for aliases like "hand hammered") + all_aliases = set(self.product_aliases.keys()) + + # Check for primary keywords (definitive matches) + found_primary = self.primary_keywords.intersection(words) + + # Check for product aliases in text + found_aliases = [] + for alias in all_aliases: + if ' ' in alias: + # Multi-word alias - check in full text + if alias in text_lower: + found_aliases.append(alias) + elif alias in words: + found_aliases.append(alias) + + # Map aliases to canonical products + alias_products = [] + for alias in found_aliases: + if alias in self.product_aliases: + canonical = self.product_aliases[alias] + if canonical not in alias_products: + alias_products.append(canonical) + + if found_primary or alias_products: + all_found = list(found_primary) + found_aliases + return { + "preliminary_relevant": True, + "needs_relevance_validation": False, + "found_keywords": all_found, + "mapped_products": alias_products, + 
"relevance_type": "primary", + "relevance_confidence": "high", + "has_primary_keywords": True + } + + # Check for contextual keywords (need validation) + found_contextual = self.contextual_keywords.intersection(words) + if found_contextual: + # Check if there's cymbal context + found_cymbal_context = self.cymbal_context_keywords.intersection(words) + has_cymbal_context = len(found_cymbal_context) > 0 + + return { + "preliminary_relevant": True, + "needs_relevance_validation": True, + "found_keywords": list(found_contextual), + "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [], + "has_cymbal_context": has_cymbal_context, + "mapped_products": [], + "relevance_type": "contextual", + "relevance_confidence": "medium" if has_cymbal_context else "low", + "has_primary_keywords": False + } + + # Check for competitor mentions (might be comparative discussion) + found_competitors = self.competitor_keywords.intersection(words) + if found_competitors: + return { + "preliminary_relevant": False, + "needs_relevance_validation": True, + "found_keywords": list(found_competitors), + "mapped_products": [], + "relevance_type": "competitor_only", + "relevance_confidence": "low", + "has_primary_keywords": False + } + + # No relevant keywords found + return { + "preliminary_relevant": False, + "needs_relevance_validation": False, + "found_keywords": [], + "mapped_products": [], + "relevance_type": "none", + "relevance_confidence": "high", + "has_primary_keywords": False + } + + def _extract_mentioned_products(self, text: str) -> List[str]: + """ + Extract product names mentioned in the text, including aliases. 
+ + Args: + text: Text to search + + Returns: + List of canonical product names found + """ + text_lower = text.lower() + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + found_products = [] + + # Check direct product mentions + for product in self.products_list: + if product.lower() in words: + if product not in found_products: + found_products.append(product) + + # Check aliases + for alias, canonical in self.product_aliases.items(): + if ' ' in alias: + # Multi-word alias + if alias in text_lower: + if canonical not in found_products: + found_products.append(canonical) + elif alias in words: + if canonical not in found_products: + found_products.append(canonical) + + return found_products + + def _extract_mentioned_competitors(self, text: str) -> List[str]: + """ + Extract competitor brand names mentioned in the text. + + Args: + text: Text to search + + Returns: + List of canonical competitor names found + """ + text_lower = text.lower() + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + found_competitors = set() + + for alias in self.competitor_keywords: + if ' ' in alias: + # Multi-word check + if alias in text_lower: + canonical = self.competitor_name_map.get(alias, alias) + found_competitors.add(canonical) + elif alias in words: + canonical = self.competitor_name_map.get(alias, alias) + found_competitors.add(canonical) + + return list(found_competitors) + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a forum post through preprocessing pipeline. 
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a forum post through preprocessing pipeline.

        Pipeline: HTML parse -> empty check -> keyword relevance (first, so
        primary-keyword hits can steer language detection) -> thread context
        -> language detection -> product/competitor extraction.

        Args:
            input_data: Dictionary containing post data with at least:
                - post_id: Post identifier
                - post_content: Raw HTML content
                - thread_title: Thread title (optional)
                - thread_first_post: First post content (optional)
                - category_title: Category title (optional)
                - category_topic: Category topic (optional)

        Returns:
            Dictionary with preprocessing results
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    **input_data
                }

            post_content = input_data.get("post_content", "")

            # Step 1: Parse HTML content
            parsed = self.html_parser.parse_post_content(post_content)
            reply_content = parsed.get("reply_content", "")
            quoted_content = parsed.get("quoted_content")

            # Check for empty content
            # min_content_length presumably comes from agent config — TODO confirm.
            if not reply_content or len(reply_content.strip()) < self.min_content_length:
                return {
                    "success": True,
                    "cleaned_content": reply_content,
                    "quoted_content": quoted_content,
                    "is_empty": True,
                    "preliminary_relevant": False,
                    "needs_relevance_validation": False,
                    **{k: v for k, v in input_data.items() if k != "post_content"}
                }

            # Step 2: Check relevance FIRST (needed for language detection logic)
            relevance_result = self._check_relevance(reply_content)
            has_primary_keywords = relevance_result.get("has_primary_keywords", False)

            # Step 3: Build thread context (raw - will be summarized by extraction agent)
            raw_thread_context = self.html_parser.build_thread_context(
                thread_title=input_data.get("thread_title"),
                first_post_content=input_data.get("thread_first_post"),
                category_title=input_data.get("category_title"),
                category_topic=input_data.get("category_topic")
            )

            # Step 4: Detect language (with smart handling)
            lang_result = self._detect_language(reply_content, has_primary_keywords)

            # Step 5: Extract product and competitor mentions from actual post content
            products_found = self._extract_mentioned_products(reply_content)
            competitors_found = self._extract_mentioned_competitors(reply_content)

            # Build result
            result = {
                "success": True,
                "is_empty": False,

                # Cleaned content
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "has_quote": parsed.get("has_quote", False),
                "quoted_author": parsed.get("quoted_author"),
                "raw_thread_context": raw_thread_context,

                # Language detection
                "detected_language": lang_result["language"],
                "language_code": lang_result["language_code"],
                "is_english": lang_result["is_english"],
                "language_confidence": lang_result["confidence"],
                "language_detection_skipped": lang_result.get("detection_skipped", False),

                # Relevance assessment
                "preliminary_relevant": relevance_result["preliminary_relevant"],
                "needs_relevance_validation": relevance_result["needs_relevance_validation"],
                "relevance_keywords_found": relevance_result["found_keywords"],
                "relevance_type": relevance_result["relevance_type"],
                "relevance_confidence": relevance_result["relevance_confidence"],
                "has_primary_keywords": has_primary_keywords,

                # Initial extractions
                "products_detected": products_found,
                "competitors_detected": competitors_found,

                # Preserve original data
                # NOTE(review): this spread runs last, so a colliding key in
                # input_data (e.g. an incoming "is_english") would silently
                # overwrite the freshly computed value — confirm the upstream
                # schema never supplies these keys.
                **{k: v for k, v in input_data.items() if k not in ["post_content"]}
            }

            # Keep original content for reference
            result["original_content"] = post_content

            self.log_processing(
                f"Processed post {input_data.get('post_id')}: "
                f"lang={lang_result['language']}, "
                f"relevant={relevance_result['preliminary_relevant']}, "
                f"needs_validation={relevance_result['needs_relevance_validation']}, "
                f"products={products_found}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
b/processing_brand_sentiment/workflow/agents/output_validator_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..334de8c2bc10f0ad08bc103035e4a658d9effaf9 --- /dev/null +++ b/processing_brand_sentiment/workflow/agents/output_validator_agent.py @@ -0,0 +1,408 @@ +""" +Output Validator Agent for brand sentiment analysis. + +This agent performs rule-based validation on the final output to ensure: +1. All values are from predefined lists +2. Logical consistency between fields +3. Anomaly detection for manual review flagging + +This is a deterministic agent (no LLM calls) that acts as a quality gate. +""" + +from typing import Dict, Any, List, Set +import logging + +from .base_agent import BaseAgent + +logger = logging.getLogger(__name__) + + +class OutputValidatorAgent(BaseAgent): + """ + Agent that validates the final output for consistency and quality. + + Performs rule-based checks without LLM calls to ensure data quality + and flag posts that may need manual review. + """ + + def __init__( + self, + config: Dict[str, Any], + brand_config: Dict[str, Any], + analysis_categories: Dict[str, Any] + ): + """ + Initialize the Output Validator Agent. 
+ + Args: + config: Agent configuration + brand_config: Brand-specific configuration + analysis_categories: Category definitions for validation + """ + super().__init__("OutputValidatorAgent", config) + self.brand_config = brand_config + self.analysis_categories = analysis_categories + + # Build valid value sets for validation + self._build_valid_value_sets() + + logger.info("OutputValidatorAgent initialized") + + def _build_valid_value_sets(self) -> None: + """Build sets of valid values for efficient validation.""" + brand = self.brand_config.get("brand", {}) + + # Products + self.valid_products: Set[str] = set( + p.lower() for p in brand.get("products", []) + ) + self.products_canonical = {p.lower(): p for p in brand.get("products", [])} + + # Competitors + self.valid_competitors: Set[str] = set() + self.competitors_canonical = {} + for comp in brand.get("competitors", []): + if isinstance(comp, dict): + name = comp.get("name", "") + self.valid_competitors.add(name.lower()) + self.competitors_canonical[name.lower()] = name + + # Extract all category values + self.valid_values = {} + + category_configs = { + "author_role": self.analysis_categories.get("author_role", {}), + "sabian_mention_context": self.analysis_categories.get("sabian_mention_context", {}), + "sentiment_level": self.analysis_categories.get("sentiment", {}), + "emotion_type": self.analysis_categories.get("emotions", {}), + "intents": self.analysis_categories.get("intents", {}), + "purchase_stage": self.analysis_categories.get("purchase_stage", {}), + "comparison_type": self.analysis_categories.get("comparison_type", {}), + "feedback_aspects": self.analysis_categories.get("feedback_aspects", {}), + "decision_drivers": self.analysis_categories.get("decision_drivers", {}), + "product_attributes": self.analysis_categories.get("product_attributes", {}), + } + + for key, config in category_configs.items(): + if "categories" in config: + self.valid_values[key] = set( + c["value"].lower() for c in 
config["categories"] + ) + elif "levels" in config: + self.valid_values[key] = set( + c["value"].lower() for c in config["levels"] + ) + else: + self.valid_values[key] = set() + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """Validate that input contains required fields.""" + # The validator accepts any input - it will validate what's there + return True + + def _validate_list_values( + self, + values: List[Any], + valid_set: Set[str], + field_name: str + ) -> Dict[str, Any]: + """ + Validate list values against a set of valid values. + + Returns: + Dictionary with validation results + """ + if not values: + return {"valid": True, "invalid_values": [], "field": field_name} + + invalid = [] + for v in values: + if isinstance(v, str) and v.lower() not in valid_set: + invalid.append(v) + + return { + "valid": len(invalid) == 0, + "invalid_values": invalid, + "field": field_name + } + + def _validate_single_value( + self, + value: Any, + valid_set: Set[str], + field_name: str, + allow_none: bool = True + ) -> Dict[str, Any]: + """ + Validate a single value against a set of valid values. + + Returns: + Dictionary with validation results + """ + if value is None: + return {"valid": allow_none, "invalid_value": None if allow_none else value, "field": field_name} + + if isinstance(value, str) and value.lower() in valid_set: + return {"valid": True, "invalid_value": None, "field": field_name} + + return {"valid": False, "invalid_value": value, "field": field_name} + + def _check_logical_consistency(self, data: Dict[str, Any]) -> List[str]: + """ + Check for logical consistency between fields. + + Note: Empty products_mentioned is OK even when relevant - users may + discuss the Sabian brand generally without specific products. 
+ + Returns: + List of inconsistency warnings + """ + warnings = [] + is_relevant = data.get("is_relevant", False) + + # Check 1: If not relevant, certain fields should be empty/null + if not is_relevant: + if data.get("sabian_mention_context"): + warnings.append( + "sabian_mention_context should be null when is_relevant=False" + ) + if data.get("sentiment_level") and data.get("sentiment_level") != "neutral": + warnings.append( + "sentiment_level should be null/neutral when is_relevant=False" + ) + + # Check 2: Comparison type should only be set if comparing intent exists + if data.get("comparison_type"): + intents = data.get("intents", []) + if "comparing" not in intents: + warnings.append( + "comparison_type is set but 'comparing' not in intents" + ) + + # Check 3: Author perspective fields consistency + # If author is giving advice (providing_information) without sharing experience, + # pain_points and delight_factors should typically be empty + intents = data.get("intents", []) + if "providing_information" in intents and "sharing_experience" not in intents: + if data.get("pain_points") or data.get("delight_factors"): + warnings.append( + "pain_points/delight_factors set for advice-giving post without sharing_experience intent" + ) + + return warnings + + def _fix_overlapping_feedback(self, data: Dict[str, Any]) -> Dict[str, Any]: + """ + Fix overlapping values between pain_points and delight_factors. + + Rule: The same aspect cannot be both a pain point and a delight factor. + Resolution: Use sentiment to determine which to keep, or clear both if neutral. 
+ + Args: + data: Dictionary with analysis results + + Returns: + Updated dictionary with fixed pain_points and delight_factors + """ + pain_points = data.get("pain_points", []) or [] + delight_factors = data.get("delight_factors", []) or [] + + if not pain_points or not delight_factors: + return data + + # Find overlapping values + pain_set = set(p.lower() if isinstance(p, str) else p for p in pain_points) + delight_set = set(d.lower() if isinstance(d, str) else d for d in delight_factors) + overlap = pain_set.intersection(delight_set) + + if not overlap: + return data + + # Get sentiment to determine which to keep + sentiment = data.get("sentiment_level", "neutral") + + # Create new lists without overlapping values + if sentiment in ["positive", "very_positive"]: + # Keep in delight_factors, remove from pain_points + new_pain_points = [p for p in pain_points if p.lower() not in overlap] + new_delight_factors = delight_factors + elif sentiment in ["negative", "very_negative"]: + # Keep in pain_points, remove from delight_factors + new_pain_points = pain_points + new_delight_factors = [d for d in delight_factors if d.lower() not in overlap] + else: + # Neutral sentiment - clear both (can't determine intent) + new_pain_points = [p for p in pain_points if p.lower() not in overlap] + new_delight_factors = [d for d in delight_factors if d.lower() not in overlap] + + # Update data + data["pain_points"] = new_pain_points + data["delight_factors"] = new_delight_factors + + logger.debug( + f"Fixed overlapping feedback: removed {overlap} from " + f"{'pain_points' if sentiment in ['positive', 'very_positive'] else 'delight_factors' if sentiment in ['negative', 'very_negative'] else 'both'}" + ) + + return data + + def _detect_anomalies(self, data: Dict[str, Any]) -> List[str]: + """ + Detect anomalies that might need manual review. 
+ + Returns: + List of anomaly flags + """ + anomalies = [] + + # Anomaly 1: Low confidence relevance + if data.get("is_relevant") and data.get("relevance_confidence") == "low": + anomalies.append("low_confidence_relevant") + + # Anomaly 2: Sarcasm detected - sentiment might be inverted + if data.get("sarcasm_detected"): + anomalies.append("sarcasm_detected") + + # Anomaly 3: Very short content marked as relevant + content = data.get("cleaned_content", "") + if data.get("is_relevant") and len(content) < 20: + anomalies.append("short_relevant_content") + + # Anomaly 4: Switching behavior detected + comparison_type = data.get("comparison_type", "") + if comparison_type in ["switching_to_sabian", "switching_from_sabian"]: + anomalies.append(f"brand_switching_{comparison_type}") + + return anomalies + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process and validate the analysis output. + + Args: + input_data: Dictionary with all analysis results + + Returns: + Dictionary with validation results added + """ + try: + validation_errors = [] + validation_warnings = [] + + # Skip detailed validation for non-relevant or skipped posts + if not input_data.get("is_relevant", False) or input_data.get("analysis_skipped", False): + return { + **input_data, + "validation_passed": True, + "validation_errors": [], + "validation_warnings": [], + "validation_flags": [], + "processing_status": "completed" + } + + # Fix overlapping pain_points and delight_factors (safety net) + input_data = self._fix_overlapping_feedback(input_data) + + # Validate products_mentioned + products_result = self._validate_list_values( + input_data.get("products_mentioned", []), + self.valid_products, + "products_mentioned" + ) + if not products_result["valid"]: + validation_errors.append( + f"Invalid products: {products_result['invalid_values']}" + ) + + # Validate competitors_mentioned + competitors_result = self._validate_list_values( + input_data.get("competitors_mentioned", 
[]), + self.valid_competitors, + "competitors_mentioned" + ) + if not competitors_result["valid"]: + validation_errors.append( + f"Invalid competitors: {competitors_result['invalid_values']}" + ) + + # Validate categorical fields + categorical_validations = [ + ("author_role", "author_role", True), + ("sabian_mention_context", "sabian_mention_context", True), + ("sentiment_level", "sentiment_level", True), + ("emotion_type", "emotion_type", True), + ("purchase_stage", "purchase_stage", True), + ("comparison_type", "comparison_type", True), + ] + + for field, valid_key, allow_none in categorical_validations: + result = self._validate_single_value( + input_data.get(field), + self.valid_values.get(valid_key, set()), + field, + allow_none + ) + if not result["valid"]: + validation_errors.append( + f"Invalid {field}: {result['invalid_value']}" + ) + + # Validate list fields + list_validations = [ + ("intents", "intents"), + ("product_attributes", "product_attributes"), + ("pain_points", "feedback_aspects"), + ("delight_factors", "feedback_aspects"), + ("decision_drivers", "decision_drivers"), + ] + + for field, valid_key in list_validations: + result = self._validate_list_values( + input_data.get(field, []), + self.valid_values.get(valid_key, set()), + field + ) + if not result["valid"]: + validation_warnings.append( + f"Invalid values in {field}: {result['invalid_values']}" + ) + + # Check logical consistency + consistency_warnings = self._check_logical_consistency(input_data) + validation_warnings.extend(consistency_warnings) + + # Detect anomalies + anomalies = self._detect_anomalies(input_data) + + # Determine overall validation status + validation_passed = len(validation_errors) == 0 + + # Set processing status + if validation_errors: + processing_status = "validation_failed" + elif anomalies: + processing_status = "completed_with_flags" + else: + processing_status = "completed" + + result = { + **input_data, + "validation_passed": validation_passed, + 
"validation_errors": validation_errors, + "validation_warnings": validation_warnings, + "validation_flags": anomalies, + "processing_status": processing_status + } + + if validation_errors or validation_warnings or anomalies: + self.log_processing( + f"Validation complete: passed={validation_passed}, " + f"errors={len(validation_errors)}, warnings={len(validation_warnings)}, " + f"flags={anomalies}", + "debug" + ) + + return result + + except Exception as e: + return self.handle_error(e, "output validation") diff --git a/processing_brand_sentiment/workflow/agents/preprocessor_agent.py b/processing_brand_sentiment/workflow/agents/preprocessor_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..391d955ec65fbf1f752392c4c0ac5075d8199306 --- /dev/null +++ b/processing_brand_sentiment/workflow/agents/preprocessor_agent.py @@ -0,0 +1,408 @@ +""" +Preprocessor Agent for brand sentiment analysis. +Handles HTML parsing, text cleaning, language detection, and initial relevance screening. +This is a deterministic agent (no LLM calls except for language detection fallback). 
+""" + +import re +from typing import Dict, Any, List, Optional, Set +from lingua import Language, LanguageDetectorBuilder +import logging + +from .base_agent import BaseAgent +from utils.html_parser import HTMLParser + +logger = logging.getLogger(__name__) + + +class PreprocessorAgent(BaseAgent): + """ + Agent that preprocesses forum posts: + - Parses HTML to extract reply and quoted content + - Cleans and normalizes text + - Detects language + - Performs initial keyword-based relevance screening + """ + + # Lingua to ISO 639-1 language code mapping + LINGUA_TO_ISO = { + Language.ENGLISH: "en", + Language.SPANISH: "es", + Language.FRENCH: "fr", + Language.GERMAN: "de", + Language.ITALIAN: "it", + Language.PORTUGUESE: "pt", + Language.RUSSIAN: "ru", + Language.JAPANESE: "ja", + Language.KOREAN: "ko", + Language.CHINESE: "zh", + Language.ARABIC: "ar", + Language.HINDI: "hi", + Language.DUTCH: "nl", + Language.SWEDISH: "sv", + Language.POLISH: "pl", + Language.TURKISH: "tr" + } + + def __init__(self, config: Dict[str, Any], brand_config: Dict[str, Any]): + """ + Initialize the Preprocessor Agent. 
+ + Args: + config: Agent configuration + brand_config: Brand-specific configuration with keywords and products + """ + super().__init__("PreprocessorAgent", config) + self.brand_config = brand_config + self.html_parser = HTMLParser() + + # Initialize lingua detector + self.language_detector = LanguageDetectorBuilder.from_all_languages().build() + + # Build keyword sets for efficient lookup + self._build_keyword_sets() + + logger.info("PreprocessorAgent initialized") + + def _build_keyword_sets(self) -> None: + """Build keyword sets from brand configuration for efficient relevance checking.""" + relevance_config = self.brand_config.get("relevance_keywords", {}) + + # Primary keywords - definitive Sabian mentions + primary = relevance_config.get("primary", {}).get("keywords", []) + self.primary_keywords: Set[str] = set(k.lower() for k in primary) + + # Contextual keywords - need disambiguation (HH, AA) + contextual = relevance_config.get("contextual", {}).get("keywords", []) + self.contextual_keywords: Set[str] = set(k.lower() for k in contextual) + + # Cymbal context keywords - help disambiguate contextual terms + cymbal_context = relevance_config.get("cymbal_context", {}).get("keywords", []) + self.cymbal_context_keywords: Set[str] = set(k.lower() for k in cymbal_context) + + # Competitor names for detection + competitors = self.brand_config.get("brand", {}).get("competitors", []) + self.competitor_keywords: Set[str] = set() + for comp in competitors: + if isinstance(comp, dict): + self.competitor_keywords.add(comp.get("name", "").lower()) + for alias in comp.get("aliases", []): + self.competitor_keywords.add(alias.lower()) + else: + self.competitor_keywords.add(str(comp).lower()) + + # Product names + products = self.brand_config.get("brand", {}).get("products", []) + self.product_keywords: Set[str] = set(p.lower() for p in products) + + logger.info(f"Built keyword sets: {len(self.primary_keywords)} primary, " + f"{len(self.contextual_keywords)} contextual, " + 
f"{len(self.product_keywords)} products") + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate that input contains required fields. + + Args: + input_data: Input dictionary + + Returns: + True if valid, False otherwise + """ + required_fields = ["post_id", "post_content"] + return all(field in input_data for field in required_fields) + + def _detect_language(self, text: str) -> Dict[str, Any]: + """ + Detect the language of text using lingua library. + + Args: + text: Text to analyze + + Returns: + Dictionary with language detection results + """ + try: + cleaned_text = text.strip() + if not cleaned_text or len(cleaned_text) < 3: + return { + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "low" + } + + detected = self.language_detector.detect_language_of(cleaned_text) + + if detected is None: + return { + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "low" + } + + if detected == Language.ENGLISH: + return { + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "high" + } + + lang_code = self.LINGUA_TO_ISO.get(detected, "unknown") + lang_name = detected.name.capitalize() + + return { + "language": lang_name, + "language_code": lang_code, + "is_english": False, + "confidence": "high" + } + + except Exception as e: + logger.warning(f"Language detection failed: {e}") + return { + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "low" + } + + def _check_relevance(self, text: str) -> Dict[str, Any]: + """ + Check if text is relevant to the brand using keyword matching. 
+ + Returns: + Dictionary with relevance assessment: + - preliminary_relevant: Initial relevance assessment + - needs_relevance_validation: True if contains ambiguous terms needing LLM check + - found_keywords: Keywords found in the text + - relevance_type: 'primary', 'contextual', or 'none' + """ + text_lower = text.lower() + + # Tokenize for word boundary matching + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + # Check for primary keywords (definitive matches) + found_primary = self.primary_keywords.intersection(words) + if found_primary: + return { + "preliminary_relevant": True, + "needs_relevance_validation": False, + "found_keywords": list(found_primary), + "relevance_type": "primary", + "relevance_confidence": "high" + } + + # Check for contextual keywords (need validation) + found_contextual = self.contextual_keywords.intersection(words) + if found_contextual: + # Check if there's cymbal context + found_cymbal_context = self.cymbal_context_keywords.intersection(words) + has_cymbal_context = len(found_cymbal_context) > 0 + + return { + "preliminary_relevant": True, # Potentially relevant + "needs_relevance_validation": True, # Needs LLM confirmation + "found_keywords": list(found_contextual), + "cymbal_context_found": list(found_cymbal_context) if found_cymbal_context else [], + "has_cymbal_context": has_cymbal_context, + "relevance_type": "contextual", + "relevance_confidence": "medium" if has_cymbal_context else "low" + } + + # Check for competitor mentions (might be comparative discussion) + found_competitors = self.competitor_keywords.intersection(words) + if found_competitors: + # Has competitor mention but no Sabian mention + # Could still be relevant in a comparison context + return { + "preliminary_relevant": False, + "needs_relevance_validation": True, # LLM should check context + "found_keywords": list(found_competitors), + "relevance_type": "competitor_only", + "relevance_confidence": "low" + } + + # No relevant keywords found + 
return { + "preliminary_relevant": False, + "needs_relevance_validation": False, + "found_keywords": [], + "relevance_type": "none", + "relevance_confidence": "high" + } + + def _extract_mentioned_products(self, text: str) -> List[str]: + """ + Extract product names mentioned in the text. + + Args: + text: Text to search + + Returns: + List of product names found + """ + text_lower = text.lower() + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + found_products = [] + products = self.brand_config.get("brand", {}).get("products", []) + + for product in products: + if product.lower() in words: + found_products.append(product) + + return found_products + + def _extract_mentioned_competitors(self, text: str) -> List[str]: + """ + Extract competitor names mentioned in the text. + + Args: + text: Text to search + + Returns: + List of competitor names found + """ + text_lower = text.lower() + words = set(re.findall(r'\b[a-zA-Z0-9]+\b', text_lower)) + + found_competitors = [] + competitors = self.brand_config.get("brand", {}).get("competitors", []) + + for comp in competitors: + if isinstance(comp, dict): + name = comp.get("name", "") + aliases = comp.get("aliases", []) + + # Check name and aliases + if name.lower() in words: + if name not in found_competitors: + found_competitors.append(name) + else: + for alias in aliases: + if alias.lower() in words: + if name not in found_competitors: + found_competitors.append(name) + break + else: + if str(comp).lower() in words: + found_competitors.append(str(comp)) + + return found_competitors + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a forum post through preprocessing pipeline. 
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a forum post through preprocessing pipeline.

    Pipeline: HTML parsing -> empty-content short-circuit -> thread
    context assembly -> language detection -> keyword relevance check ->
    product/competitor extraction.  Relevance and extraction run ONLY on
    the author's own reply text, never on quoted/context text.

    Args:
        input_data: Dictionary containing post data with at least:
            - post_id: Post identifier
            - post_content: Raw HTML content
            - thread_title: Thread title (optional)
            - thread_first_post: First post content (optional)
            - category_title: Category title (optional)
            - category_topic: Category topic (optional)

    Returns:
        Dictionary with preprocessing results.  On success the original
        input keys (minus "post_content") are carried through; the raw
        HTML is preserved under "original_content".
    """
    try:
        # Validate input
        if not self.validate_input(input_data):
            return {
                "success": False,
                "error": "Invalid input: missing required fields",
                **input_data
            }

        post_content = input_data.get("post_content", "")

        # Step 1: Parse HTML content into the author's own reply and any
        # quoted/parent text.
        parsed = self.html_parser.parse_post_content(post_content)
        reply_content = parsed.get("reply_content", "")
        quoted_content = parsed.get("quoted_content")

        # Check for empty content: replies under 3 chars carry no signal,
        # so skip all downstream analysis for them.
        if not reply_content or len(reply_content.strip()) < 3:
            return {
                "success": True,
                "cleaned_content": reply_content,
                "quoted_content": quoted_content,
                "is_empty": True,
                "preliminary_relevant": False,
                "needs_relevance_validation": False,
                **{k: v for k, v in input_data.items() if k != "post_content"}
            }

        # Step 2: Build thread context (title / first post / category)
        # used downstream for disambiguation only.
        thread_context = self.html_parser.build_thread_context(
            thread_title=input_data.get("thread_title"),
            first_post_content=input_data.get("thread_first_post"),
            category_title=input_data.get("category_title"),
            category_topic=input_data.get("category_topic")
        )

        # Step 3: Detect language
        lang_result = self._detect_language(reply_content)

        # Step 4: Check relevance - ONLY on the actual post content, NOT quoted/context
        # The quoted content and thread context are for understanding, not for relevance determination
        relevance_result = self._check_relevance(reply_content)

        # Step 5: Extract product and competitor mentions - ONLY from actual post content
        # We don't want to extract from quoted content as that will be processed separately
        products_found = self._extract_mentioned_products(reply_content)
        competitors_found = self._extract_mentioned_competitors(reply_content)

        # Build result
        result = {
            "success": True,
            "is_empty": False,

            # Cleaned content
            "cleaned_content": reply_content,
            "quoted_content": quoted_content,
            "has_quote": parsed.get("has_quote", False),
            "quoted_author": parsed.get("quoted_author"),
            "thread_context": thread_context,

            # Language detection
            "detected_language": lang_result["language"],
            "language_code": lang_result["language_code"],
            "is_english": lang_result["is_english"],
            "language_confidence": lang_result["confidence"],

            # Relevance assessment
            "preliminary_relevant": relevance_result["preliminary_relevant"],
            "needs_relevance_validation": relevance_result["needs_relevance_validation"],
            "relevance_keywords_found": relevance_result["found_keywords"],
            "relevance_type": relevance_result["relevance_type"],
            "relevance_confidence": relevance_result["relevance_confidence"],

            # Initial extractions
            "products_detected": products_found,
            "competitors_detected": competitors_found,

            # Preserve original data.  NOTE: spread comes last, so any
            # same-named keys in input_data overwrite the computed values
            # above -- presumably the input never carries these keys;
            # TODO confirm against the caller.
            **{k: v for k, v in input_data.items() if k not in ["post_content"]}
        }

        # Keep original content for reference
        result["original_content"] = post_content

        self.log_processing(
            f"Processed post {input_data.get('post_id')}: "
            f"lang={lang_result['language']}, "
            f"relevant={relevance_result['preliminary_relevant']}, "
            f"needs_validation={relevance_result['needs_relevance_validation']}",
            "debug"
        )

        return result

    except Exception as e:
        # Delegate uniform error formatting to BaseAgent.
        return self.handle_error(e, f"preprocessing post {input_data.get('post_id')}")
"""
Relevance Validator Agent for brand sentiment analysis.
Lightweight LLM-based agent that confirms whether ambiguous terms (HH, AA)
refer to Sabian products or generic terms.
"""

from typing import Dict, Any
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import logging

from .base_agent import BaseAgent

logger = logging.getLogger(__name__)


class RelevanceValidatorAgent(BaseAgent):
    """
    Agent that validates whether posts with ambiguous terms (like HH, AA)
    are actually referring to Sabian products or generic terms.

    This is a lightweight LLM call specifically for disambiguation.
    Invoked only when the keyword preprocessor sets
    ``needs_relevance_validation``; on LLM/parse failure it deliberately
    defaults to "relevant" so ambiguous posts are not silently dropped.
    """

    def __init__(self, config: Dict[str, Any], api_key: str, brand_config: Dict[str, Any]):
        """
        Initialize the Relevance Validator Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with product info
        """
        super().__init__("RelevanceValidatorAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config

        # NOTE(review): self.model / self.temperature are presumably set
        # by BaseAgent.__init__ from `config` -- confirm in base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Build disambiguation context from brand config
        self._build_disambiguation_context()

        logger.info("RelevanceValidatorAgent initialized")

    def _build_disambiguation_context(self) -> None:
        """Build context strings for disambiguation from brand config.

        Populates ``self.disambiguation_info`` (term -> description +
        context clues) and ``self.product_descriptions``.
        """
        brand = self.brand_config.get("brand", {})
        ambiguous = brand.get("ambiguous_terms", {})

        self.disambiguation_info = {}
        for term, info in ambiguous.items():
            if isinstance(info, dict):
                self.disambiguation_info[term] = {
                    "description": info.get("description", ""),
                    "context_clues": info.get("disambiguation_context", [])
                }
            else:
                # Plain-string config entries become a description with
                # no context clues.
                self.disambiguation_info[term] = {
                    "description": str(info),
                    "context_clues": []
                }

        # Product descriptions for context
        self.product_descriptions = brand.get("product_descriptions", {})

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required = ["cleaned_content", "relevance_keywords_found"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance validation.

        The prompt embeds the brand's product lines and per-term
        disambiguation rules from config.  The prompt text is the
        agent's behavior -- edit with care.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")
        products = self.brand_config.get("brand", {}).get("products", [])

        # Build disambiguation rules
        disambiguation_rules = []
        for term, info in self.disambiguation_info.items():
            desc = info.get("description", "")
            clues = info.get("context_clues", [])
            rule = f"- '{term}': {desc}"
            if clues:
                rule += f" Context clues for {brand_name}: {', '.join(clues)}"
            disambiguation_rules.append(rule)

        disambiguation_text = "\n".join(disambiguation_rules) if disambiguation_rules else "No specific disambiguation rules."

        system_prompt = f"""You are an expert at identifying brand mentions in drum/cymbal forum discussions.

Your task is to determine if the POST CONTENT itself discusses {brand_name} products.

**CRITICAL RULE:**
- You must determine relevance based ONLY on the POST CONTENT
- The context (thread info, quoted/parent content) is provided to help you understand ambiguous terms
- But if the POST CONTENT itself does not mention or discuss {brand_name}, it is NOT relevant
- Example: If quoted content mentions Sabian but the post just says "Got it! Thanks!" → NOT relevant

**{brand_name} Product Lines:**
{', '.join(products)}

**Ambiguous Terms to Watch For:**
{disambiguation_text}

**Key Disambiguation Rules:**
- "HH" alone usually means "Hi-Hat" (a type of cymbal), NOT Sabian HH series
- "HH" WITH Sabian context IN THE POST (e.g., "Sabian HH", "HH crashes", "my HH ride") likely refers to Sabian
- "AA" alone might be a general abbreviation, NOT Sabian AA series
- "AA" WITH Sabian context IN THE POST (e.g., "Sabian AA", "AA cymbals", "AA medium ride") likely refers to Sabian
- Generic replies like "Thanks!", "Got it!", "Good point!" are NOT relevant even if context mentions {brand_name}

**Return JSON with:**
- is_relevant: boolean - true ONLY if the POST CONTENT itself discusses {brand_name} products
- confidence: "high", "medium", or "low"
- reason: brief explanation (1-2 sentences) - explain what IN THE POST made you decide
- detected_products: list of {brand_name} products mentioned IN THE POST (empty if none)

Return only valid JSON."""

        return system_prompt

    def validate_relevance(
        self,
        content: str,
        keywords_found: list,
        thread_context: str = "",
        quoted_content: str = ""
    ) -> Dict[str, Any]:
        """
        Validate whether content is relevant to the brand.

        Args:
            content: The cleaned post content
            keywords_found: Keywords that triggered validation
            thread_context: Thread context for additional context
            quoted_content: Quoted content if any

        Returns:
            Dictionary with validation results.  On JSON-parse failure or
            any other error this intentionally returns is_relevant=True
            (low confidence) so the post is not dropped on a transient
            LLM failure.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build context for the LLM (understanding only -- the prompt
        # explicitly forbids basing relevance on it).
        context_parts = []
        if thread_context:
            context_parts.append(f"Thread context: {thread_context}")
        if quoted_content:
            # Quoted content is capped at 300 chars to bound token usage.
            context_parts.append(f"Replying to: {quoted_content[:300]}...")

        context_str = "\n".join(context_parts) if context_parts else "No additional context."

        user_prompt = f"""Determine if this POST CONTENT discusses {brand_name} cymbal products.

**Keywords found in post:** {', '.join(keywords_found)}

**CONTEXT (for understanding ambiguous terms only - do NOT base relevance on this):**
{context_str}

**POST CONTENT TO EVALUATE (base your relevance decision ONLY on this):**
"{content}"

Does the POST CONTENT itself discuss {brand_name} products? Remember: generic replies are NOT relevant even if context mentions {brand_name}. Return JSON only."""

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            # NOTE(review): _parse_llm_json_response presumably comes from
            # BaseAgent and raises json.JSONDecodeError on bad output --
            # confirm in base_agent.py.
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                "is_relevant": result.get("is_relevant", False),
                "relevance_confidence": result.get("confidence", "low"),
                "relevance_reason": result.get("reason", ""),
                "detected_products": result.get("detected_products", [])
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error in relevance validation: {e}", "warning")
            # Default to relevant if we can't determine
            return {
                "success": True,
                "is_relevant": True,
                "relevance_confidence": "low",
                "relevance_reason": "Could not parse LLM response, defaulting to relevant",
                "detected_products": []
            }

        except Exception as e:
            self.log_processing(f"Relevance validation error: {e}", "error")
            return {
                "success": False,
                "is_relevant": True,  # Default to relevant on error
                "relevance_confidence": "low",
                "relevance_reason": f"Error during validation: {str(e)}",
                "detected_products": [],
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a post to validate its relevance to the brand.

        Skips the LLM call entirely when the preprocessor did not flag
        the post (``needs_relevance_validation`` falsy) and reuses the
        preliminary keyword assessment instead.

        Args:
            input_data: Dictionary containing:
                - cleaned_content: Cleaned post text
                - relevance_keywords_found: Keywords that triggered validation
                - thread_context: Optional thread context
                - quoted_content: Optional quoted content

        Returns:
            Dictionary with validation results and original data
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: missing required fields",
                    "is_relevant": True,  # Default to relevant
                    "relevance_confidence": "low",
                    **input_data
                }

            # Check if validation is actually needed
            if not input_data.get("needs_relevance_validation", False):
                # No validation needed, use preliminary assessment
                return {
                    "success": True,
                    "is_relevant": input_data.get("preliminary_relevant", False),
                    "relevance_confidence": input_data.get("relevance_confidence", "high"),
                    "relevance_reason": "No validation needed - preliminary assessment used",
                    "validation_performed": False,
                    **input_data
                }

            # Perform LLM validation
            validation_result = self.validate_relevance(
                content=input_data.get("cleaned_content", ""),
                keywords_found=input_data.get("relevance_keywords_found", []),
                thread_context=input_data.get("thread_context", ""),
                quoted_content=input_data.get("quoted_content", "")
            )

            # Merge results: input_data first so the validator's verdict
            # overwrites any earlier same-named keys.
            result = {
                **input_data,
                "is_relevant": validation_result["is_relevant"],
                "relevance_confidence": validation_result["relevance_confidence"],
                "relevance_reason": validation_result["relevance_reason"],
                "validation_performed": True,
                "success": validation_result["success"]
            }

            # Update products detected if LLM found any
            if validation_result.get("detected_products"):
                existing_products = input_data.get("products_detected", [])
                llm_products = validation_result["detected_products"]
                # Merge without duplicates (set union; ordering not preserved)
                all_products = list(set(existing_products + llm_products))
                result["products_detected"] = all_products

            if "error" in validation_result:
                result["validation_error"] = validation_result["error"]

            self.log_processing(
                f"Validated relevance for post: is_relevant={result['is_relevant']}, "
                f"confidence={result['relevance_confidence']}",
                "debug"
            )

            return result

        except Exception as e:
            return self.handle_error(e, "relevance validation")
+ """ + + def __init__( + self, + config: Dict[str, Any], + api_key: str, + brand_config: Dict[str, Any], + analysis_categories: Dict[str, Any] + ): + super().__init__("SabianAnalyzerAgent", config) + self.api_key = api_key + self.brand_config = brand_config + self.analysis_categories = analysis_categories + + self.llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=self.api_key + ) + + # Pre-compute valid values for validation + self._valid_values = self._compute_valid_values() + logger.info("SabianAnalyzerAgent initialized") + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + required = ["cleaned_content", "is_relevant"] + return all(field in input_data for field in required) + + def _compute_valid_values(self) -> Dict[str, List[str]]: + """Pre-compute all valid values from config for validation.""" + valid = {} + + # Products from brand config + valid["products"] = self.brand_config.get("brand", {}).get("products", []) + + # Competitors + competitor_names = [] + for comp in self.brand_config.get("brand", {}).get("competitors", []): + if isinstance(comp, dict): + competitor_names.append(comp.get("name", "")) + valid["competitors"] = competitor_names + + # Extract category values from analysis_categories + category_map = { + "author_role": "author_role", + "sabian_mention_context": "sabian_mention_context", + "sentiment_level": "sentiment", + "emotion_type": "emotions", + "intents": "intents", + "purchase_stage": "purchase_stage", + "comparison_type": "comparison_type", + "feedback_aspects": "feedback_aspects", + "decision_drivers": "decision_drivers", + "product_attributes": "product_attributes", + } + + for key, config_key in category_map.items(): + config_section = self.analysis_categories.get(config_key, {}) + if "categories" in config_section: + valid[key] = [c["value"] for c in config_section["categories"]] + elif "levels" in config_section: + valid[key] = [c["value"] for c in config_section["levels"]] + else: + 
valid[key] = [] + + return valid + + def _get_category_list(self, key: str) -> List[str]: + """Get list of valid values for a category.""" + return self._valid_values.get(key, []) + + def _build_system_prompt(self) -> str: + """Build optimized system prompt for brand analysis.""" + brand = self.brand_config.get("brand", {}) + brand_name = brand.get("name", "Sabian") + products = brand.get("products", []) + + competitors = [c.get("name", "") for c in brand.get("competitors", []) if isinstance(c, dict)] + + # Get all valid values + v = self._valid_values + + return f"""You are a brand analyst extracting insights from forum posts about {brand_name} cymbals. + +## STRICT RULES +1. Extract ONLY from POST CONTENT, never from quoted/context text +2. Use ONLY values from the lists below - return null/[] if no match +3. Sentiment must be about {brand_name} specifically, NOT overall post tone +4. pain_points/delight_factors use SAME value list (feedback_aspects) - classification determines positive vs negative + +## VALID VALUES + +**{brand_name} Products:** {products} +**Competitors:** {competitors} + +| Field | Valid Values | +|-------|--------------| +| author_role | {v.get('author_role', [])} | +| sabian_mention_context | {v.get('sabian_mention_context', [])} | +| sentiment_level | {v.get('sentiment_level', [])} | +| emotion_type | {v.get('emotion_type', [])} | +| intents (multi) | {v.get('intents', [])} | +| purchase_stage | {v.get('purchase_stage', [])} | +| comparison_type | {v.get('comparison_type', [])} | +| feedback_aspects | {v.get('feedback_aspects', [])} | +| decision_drivers | {v.get('decision_drivers', [])} | +| product_attributes | {v.get('product_attributes', [])} | + +## KEY DISTINCTIONS + +**Sentiment vs Intent:** +- sentiment_level = How author FEELS about {brand_name} (positive/negative/neutral) +- praising/criticizing intent = Author is actively ENDORSING or WARNING others + +**Author-only fields (null if giving advice to others):** +- purchase_stage, 
decision_drivers, pain_points, delight_factors + +**Example - Sabian-specific sentiment:** +Post: "Love my new drum kit! The SBR cymbals sound terrible though." +- Overall post: positive (happy about kit) +- {brand_name} sentiment: NEGATIVE (dislikes SBR sound) +- pain_points: ["sound_quality"] + +## OUTPUT JSON +```json +{{ + "author_role": "value from list", + "sabian_mention_context": "value from list", + "sentiment_level": "value from list", + "emotion_type": "value or null", + "sentiment_confidence": "high|medium|low", + "sarcasm_detected": false, + "products_mentioned": [], + "product_attributes": [], + "competitors_mentioned": [], + "competitor_products_owned": [], + "comparison_type": "value or null", + "intents": [], + "purchase_stage": "value or null", + "decision_drivers": [], + "pain_points": [], + "delight_factors": [], + "analysis_notes": "1-2 sentences on key {brand_name}-specific insights" +}} +``` + +Return ONLY valid JSON.""" + + def analyze_post( + self, + content: str, + thread_context: str = "", + quoted_content: str = "" + ) -> Dict[str, Any]: + """Perform brand analysis on a post.""" + brand_name = self.brand_config.get("brand", {}).get("name", "Sabian") + + context_str = "" + if thread_context: + context_str += f"[Thread: {thread_context[:200]}] " + if quoted_content: + context_str += f"[Replying to: {quoted_content[:200]}...]" + + user_prompt = f"""Analyze this post about {brand_name}. 
+ +CONTEXT (for understanding only, DO NOT extract from): {context_str or "None"} + +POST CONTENT (extract from THIS only): +"{content}" + +Return JSON only.""" + + try: + messages = [ + SystemMessage(content=self._build_system_prompt()), + HumanMessage(content=user_prompt) + ] + + response = self.llm.invoke(messages) + result = self._parse_llm_json_response(response.content) + validated = self._validate_and_normalize(result) + + return {"success": True, **validated} + + except json.JSONDecodeError as e: + self.log_processing(f"JSON decode error: {e}", "warning") + return { + "success": False, + "error": f"JSON parse error: {str(e)}", + "sentiment_level": "neutral", + "intents": ["general_discussion"] + } + except Exception as e: + self.log_processing(f"Analysis error: {e}", "error") + return {"success": False, "error": str(e)} + + def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any: + """Validate single value against list, return canonical form or default.""" + if value is None: + return default + if isinstance(value, str): + val_lower = value.lower() + for v in valid_list: + if v.lower() == val_lower: + return v + return default + + def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]: + """Validate list values, return only valid items in canonical form.""" + if not values: + return [] + if not isinstance(values, list): + values = [values] + + validated = [] + valid_lower = {v.lower(): v for v in valid_list} + for val in values: + if isinstance(val, str) and val.lower() in valid_lower: + validated.append(valid_lower[val.lower()]) + return validated + + def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]: + """Validate all fields against predefined values and normalize.""" + v = self._valid_values + + normalized = { + # Classification + "author_role": self._validate_single( + result.get("author_role"), v["author_role"], "unknown" + ), + "sabian_mention_context": 
self._validate_single( + result.get("sabian_mention_context"), v["sabian_mention_context"], "casual_mention" + ), + + # Sentiment + "sentiment_level": self._validate_single( + result.get("sentiment_level"), v["sentiment_level"], "neutral" + ), + "emotion_type": self._validate_single( + result.get("emotion_type"), v["emotion_type"], None + ), + "sentiment_confidence": result.get("sentiment_confidence", "medium"), + "sarcasm_detected": bool(result.get("sarcasm_detected", False)), + + # Products + "products_mentioned": self._validate_list( + result.get("products_mentioned"), v["products"] + ), + "product_attributes": self._validate_list( + result.get("product_attributes"), v["product_attributes"] + ), + + # Competitors + "competitors_mentioned": self._validate_list( + result.get("competitors_mentioned"), v["competitors"] + ), + "competitor_products_owned": self._validate_list( + result.get("competitor_products_owned"), v["competitors"] + ), + "comparison_type": self._validate_single( + result.get("comparison_type"), v["comparison_type"], None + ), + + # Intents + "intents": self._validate_list( + result.get("intents"), v["intents"] + ) or ["general_discussion"], + + # Author journey (null if advising others) + "purchase_stage": self._validate_single( + result.get("purchase_stage"), v["purchase_stage"], None + ), + "decision_drivers": self._validate_list( + result.get("decision_drivers"), v["decision_drivers"] + ), + + # Feedback - both use feedback_aspects + "pain_points": self._validate_list( + result.get("pain_points"), v["feedback_aspects"] + ), + "delight_factors": self._validate_list( + result.get("delight_factors"), v["feedback_aspects"] + ), + + # Notes + "analysis_notes": result.get("analysis_notes", ""), + } + + # Log filtered values for debugging + for field in ["products_mentioned", "product_attributes", "pain_points", "delight_factors"]: + original = result.get(field, []) + if isinstance(original, list) and len(original) > len(normalized[field]): + 
filtered = set(str(x) for x in original) - set(normalized[field]) + if filtered: + logger.debug(f"Filtered invalid {field}: {filtered}") + + return normalized + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """Process a post through brand analysis.""" + try: + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields", + **input_data + } + + # Skip non-relevant posts + if not input_data.get("is_relevant", False): + return { + "success": True, + "analysis_skipped": True, + "analysis_skip_reason": "Post marked as not relevant", + "author_role": None, + "sabian_mention_context": None, + "sentiment_level": None, + "emotion_type": None, + "products_mentioned": [], + "competitors_mentioned": [], + "competitor_products_owned": [], + "intents": [], + "purchase_stage": None, + "decision_drivers": [], + "pain_points": [], + "delight_factors": [], + **input_data + } + + # Skip non-English posts + if not input_data.get("is_english", True): + return { + "success": True, + "analysis_skipped": True, + "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}", + "author_role": None, + "sabian_mention_context": None, + "sentiment_level": None, + "emotion_type": None, + "intents": [], + "competitor_products_owned": [], + **input_data + } + + # Perform analysis + analysis_result = self.analyze_post( + content=input_data.get("cleaned_content", ""), + thread_context=input_data.get("thread_context", ""), + quoted_content=input_data.get("quoted_content", "") + ) + + result = { + **input_data, + **analysis_result, + "analysis_skipped": False + } + + self.log_processing( + f"Analyzed: sentiment={result.get('sentiment_level')}, " + f"products={len(result.get('products_mentioned', []))}, " + f"intents={result.get('intents', [])}", + "debug" + ) + + return result + + except Exception as e: + return self.handle_error(e, "brand analysis") diff --git 
"""
Sabian Relevance & Extraction Agent for brand sentiment analysis.

This agent performs two critical functions:
1. Determines relevance with HIGH confidence using strict rules
2. Extracts verifiable facts (products, author role, context summary)

Key Design Principles:
- Strict product matching: ONLY return products from predefined list
- Competitor awareness: Know what products belong to competitors
- Conservative relevance: When uncertain, mark as NOT relevant
- Thread context summarization: Provide clean, concise context for next agent
"""

from typing import Dict, Any, List
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import logging

from .base_agent import BaseAgent

logger = logging.getLogger(__name__)


class SabianRelevanceExtractionAgent(BaseAgent):
    """
    Agent that validates relevance and extracts key facts from posts.

    This agent is the first LLM call in the pipeline and serves as the
    gatekeeper for relevance while also extracting structured information
    for downstream analysis.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        api_key: str,
        brand_config: Dict[str, Any],
        analysis_categories: Dict[str, Any]
    ):
        """
        Initialize the Relevance & Extraction Agent.

        Args:
            config: Agent configuration
            api_key: OpenAI API key
            brand_config: Brand-specific configuration with products and competitors
            analysis_categories: Category definitions for validation
        """
        super().__init__("SabianRelevanceExtractionAgent", config)
        self.api_key = api_key
        self.brand_config = brand_config
        self.analysis_categories = analysis_categories

        # NOTE(review): self.model / self.temperature are presumably set
        # by BaseAgent.__init__ from `config` -- confirm in base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

        # Pre-compute valid values
        self._build_valid_values()
        self._build_competitor_product_warnings()

        logger.info("SabianRelevanceExtractionAgent initialized")

    def _build_valid_values(self) -> None:
        """Build valid value lists for validation.

        Populates self.valid_products, self.valid_competitors,
        self.valid_author_roles and self.valid_mention_contexts.
        """
        brand = self.brand_config.get("brand", {})

        # Products
        self.valid_products = brand.get("products", [])

        # Competitors (brand names only)
        self.valid_competitors = []
        for comp in brand.get("competitors", []):
            if isinstance(comp, dict):
                self.valid_competitors.append(comp.get("name", ""))
            else:
                self.valid_competitors.append(str(comp))

        # Author roles from categories
        author_role_config = self.analysis_categories.get("author_role", {})
        self.valid_author_roles = [
            c["value"] for c in author_role_config.get("categories", [])
        ]

        # Sabian mention context from categories
        mention_context_config = self.analysis_categories.get("sabian_mention_context", {})
        self.valid_mention_contexts = [
            c["value"] for c in mention_context_config.get("categories", [])
        ]

    def _build_competitor_product_warnings(self) -> None:
        """Build list of competitor products to warn about in prompts.

        Config keys look like "paiste_products"; the "description" key is
        metadata, not a brand entry, so it is skipped.
        """
        warnings = self.brand_config.get("brand", {}).get("competitor_products_warning", {})

        self.competitor_products_by_brand = {}
        for key, products in warnings.items():
            if key == "description":
                continue
            # Extract brand name from key (e.g., "paiste_products" -> "Paiste")
            brand_name = key.replace("_products", "").capitalize()
            self.competitor_products_by_brand[brand_name] = products

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """Validate input contains required fields."""
        required = ["cleaned_content"]
        return all(field in input_data for field in required)

    def _build_system_prompt(self) -> str:
        """Build the system prompt for relevance and extraction.

        The prompt text IS the agent's behavior -- edit with care.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        # Build competitor product warnings
        competitor_warnings = []
        for brand, products in self.competitor_products_by_brand.items():
            products_str = ", ".join(f'"{p}"' for p in products[:5])  # Limit to 5 examples
            if len(products) > 5:
                products_str += f" (and {len(products)-5} more)"
            competitor_warnings.append(f"- {brand}: {products_str}")

        competitor_warnings_text = "\n".join(competitor_warnings) if competitor_warnings else "None specified"

        return f"""You are a brand mention extractor for {brand_name} cymbals. Your job is to:
1. Determine if the POST CONTENT discusses {brand_name} products or brand
2. Extract ONLY verifiable facts, not interpretations

## CRITICAL RULES

### Rule 1: Relevance Based on POST CONTENT Only
- The post is relevant ONLY if the POST CONTENT itself mentions {brand_name} brand or products
- Quoted/parent content mentioning {brand_name} does NOT make the post relevant
- Generic replies ("Thanks!", "Got it!", "Good point!") are NEVER relevant
- Posts can be relevant even without specific product mentions if they discuss the {brand_name} brand

### Rule 2: Strict Product Matching
{brand_name.upper()} PRODUCTS (use ONLY these exact values):
{self.valid_products}

CRITICAL:
- Return ONLY products from this exact list above
- If you see a product not in this list, do NOT include it
- Return empty list [] if no products from the list are mentioned
- It's OK to have empty products_mentioned if the post discusses {brand_name} brand generally

### Rule 3: Competitor Product Awareness
These products belong to COMPETITORS, NOT {brand_name}:
{competitor_warnings_text}

COMPETITOR BRANDS: {self.valid_competitors}
- Only return competitor BRAND names in competitors_mentioned (not their products)
- If you see "2002", "Signature", "Sound Edge", "Formula 602" - these are PAISTE, not {brand_name}
- If you see "K Custom", "A Custom" - these are ZILDJIAN, not {brand_name}

### Rule 4: Thread Context Summary
- Summarize thread context in 1-2 sentences MAXIMUM
- Focus only on what helps understand what the post is responding to
- If thread is about unrelated topics (pizza, general life), say so briefly
- Keep it factual and concise

### Rule 5: Author Role Classification
Determine the author's relationship to {brand_name}:
- current_owner: Currently owns/uses {brand_name} products
- past_owner: Previously owned but sold/replaced
- potential_buyer: Considering purchasing {brand_name}
- never_owned: Explicitly states they don't own {brand_name}
- unknown: Cannot determine from post content

### Rule 6: Mention Context Classification
How prominently is {brand_name} discussed IN THE POST CONTENT:
- primary_focus: {brand_name} is the main topic of the post
- significant_mention: {brand_name} discussed with some detail, but not main focus
- casual_mention: Brief mention among other topics
- comparison_context: Mentioned while comparing to competitors
- null: Not relevant (use when is_relevant=false)

## OUTPUT FORMAT
Return ONLY valid JSON with these exact fields:
```json
{{
  "is_relevant": true/false,
  "relevance_confidence": "high" | "medium" | "low",
  "relevance_reason": "1-2 sentences explaining your decision",
  "products_mentioned": [],
  "sabian_mention_context": "value from list" | null,
  "author_role": "value from list",
  "competitors_mentioned": [],
  "thread_context_summary": "1-2 sentence summary of thread context"
}}
```

IMPORTANT: Return ONLY the JSON object, no additional text."""

    def _build_user_prompt(
        self,
        content: str,
        quoted_content: str,
        raw_thread_context: str,
        keywords_found: List[str]
    ) -> str:
        """Build the user prompt with post content and context.

        Context and quote are truncated (1000/500 chars) to bound tokens.
        """
        brand_name = self.brand_config.get("brand", {}).get("name", "Sabian")

        context_section = ""
        if raw_thread_context:
            # Truncate if too long
            truncated_context = raw_thread_context[:1000] if len(raw_thread_context) > 1000 else raw_thread_context
            context_section += f"THREAD CONTEXT (for understanding only):\n{truncated_context}\n\n"

        if quoted_content:
            truncated_quote = quoted_content[:500] if len(quoted_content) > 500 else quoted_content
            context_section += f"QUOTED/PARENT CONTENT (for understanding only):\n{truncated_quote}\n\n"

        keywords_info = ""
        if keywords_found:
            keywords_info = f"Keywords detected by preprocessor: {', '.join(keywords_found)}\n\n"

        return f"""Analyze this post for {brand_name} relevance and extract facts.

{keywords_info}{context_section}POST CONTENT TO EVALUATE (base your decision ONLY on this):
\"\"\"{content}\"\"\"

Remember:
- is_relevant=true ONLY if POST CONTENT discusses {brand_name}
- products_mentioned must be from the exact product list provided
- competitors_mentioned should be brand names only (Zildjian, Paiste, etc.)
- thread_context_summary should be 1-2 sentences max

Return JSON only."""

    def extract_and_validate(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Perform relevance check and fact extraction.

        Unlike RelevanceValidatorAgent, failures here default to
        is_relevant=False (conservative gatekeeping).

        Args:
            input_data: Preprocessed post data

        Returns:
            Dictionary with extraction results
        """
        content = input_data.get("cleaned_content", "")
        quoted_content = input_data.get("quoted_content", "")
        raw_thread_context = input_data.get("raw_thread_context", "")
        keywords_found = input_data.get("relevance_keywords_found", [])

        try:
            messages = [
                SystemMessage(content=self._build_system_prompt()),
                HumanMessage(content=self._build_user_prompt(
                    content, quoted_content, raw_thread_context, keywords_found
                ))
            ]

            response = self.llm.invoke(messages)
            # NOTE(review): _parse_llm_json_response presumably lives on
            # BaseAgent -- confirm in base_agent.py.
            result = self._parse_llm_json_response(response.content)

            # Validate and normalize the response
            validated = self._validate_response(result)

            return {
                "success": True,
                **validated
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {
                "success": False,
                "error": f"JSON parse error: {str(e)}",
                "is_relevant": False,
                "relevance_confidence": "low",
                "relevance_reason": "Failed to parse LLM response"
            }

        except Exception as e:
            self.log_processing(f"Extraction error: {e}", "error")
            return {
                "success": False,
                "error": str(e),
                "is_relevant": False,
                "relevance_confidence": "low",
                "relevance_reason": f"Error during extraction: {str(e)}"
            }

    def _validate_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and normalize LLM response against allowed
values.""" + + # Validate products + products = result.get("products_mentioned", []) + if not isinstance(products, list): + products = [] + valid_products = [ + p for p in products + if any(p.lower() == vp.lower() for vp in self.valid_products) + ] + # Normalize to canonical case + normalized_products = [] + for p in valid_products: + for vp in self.valid_products: + if p.lower() == vp.lower(): + normalized_products.append(vp) + break + + # Validate competitors + competitors = result.get("competitors_mentioned", []) + if not isinstance(competitors, list): + competitors = [] + valid_competitors = [ + c for c in competitors + if any(c.lower() == vc.lower() for vc in self.valid_competitors) + ] + # Normalize to canonical case + normalized_competitors = [] + for c in valid_competitors: + for vc in self.valid_competitors: + if c.lower() == vc.lower(): + normalized_competitors.append(vc) + break + + # Validate author_role + author_role = result.get("author_role", "unknown") + if author_role not in self.valid_author_roles: + author_role = "unknown" + + # Validate sabian_mention_context + mention_context = result.get("sabian_mention_context") + is_relevant = result.get("is_relevant", False) + + if not is_relevant: + mention_context = None + elif mention_context and mention_context not in self.valid_mention_contexts: + mention_context = "casual_mention" # Default for relevant posts + + # Validate confidence + confidence = result.get("relevance_confidence", "medium") + if confidence not in ["high", "medium", "low"]: + confidence = "medium" + + return { + "is_relevant": bool(is_relevant), + "relevance_confidence": confidence, + "relevance_reason": result.get("relevance_reason", ""), + "products_mentioned": normalized_products, + "sabian_mention_context": mention_context, + "author_role": author_role, + "competitors_mentioned": normalized_competitors, + "thread_context_summary": result.get("thread_context_summary", "") + } + + def process(self, input_data: Dict[str, Any]) -> 
Dict[str, Any]: + """ + Process a post through relevance validation and fact extraction. + + Args: + input_data: Dictionary from preprocessor containing: + - cleaned_content: Cleaned post text + - quoted_content: Quoted content if any + - raw_thread_context: Raw thread context + - relevance_keywords_found: Keywords from preprocessor + - preliminary_relevant: Preprocessor's relevance assessment + - needs_relevance_validation: Whether LLM validation needed + + Returns: + Dictionary with extraction results and original data + """ + try: + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields", + "is_relevant": False, + **input_data + } + + # Skip if already determined not relevant and no validation needed + if (not input_data.get("preliminary_relevant", False) and + not input_data.get("needs_relevance_validation", False)): + return { + "success": True, + "is_relevant": False, + "relevance_confidence": "high", + "relevance_reason": "No Sabian-related keywords found in post", + "products_mentioned": [], + "sabian_mention_context": None, + "author_role": "unknown", + "competitors_mentioned": input_data.get("competitors_detected", []), + "thread_context_summary": "", + "extraction_performed": False, + **input_data + } + + # Skip non-English posts + if not input_data.get("is_english", True): + return { + "success": True, + "is_relevant": False, + "relevance_confidence": "high", + "relevance_reason": f"Non-English post: {input_data.get('detected_language')}", + "products_mentioned": [], + "sabian_mention_context": None, + "author_role": "unknown", + "competitors_mentioned": [], + "thread_context_summary": "", + "extraction_performed": False, + **input_data + } + + # Perform LLM extraction + extraction_result = self.extract_and_validate(input_data) + + # Merge results + result = { + **input_data, + **extraction_result, + "extraction_performed": True + } + + # Log the result + self.log_processing( + 
f"Extraction complete: is_relevant={result.get('is_relevant')}, " + f"products={result.get('products_mentioned')}, " + f"context={result.get('sabian_mention_context')}", + "debug" + ) + + return result + + except Exception as e: + return self.handle_error(e, "relevance extraction") diff --git a/processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py b/processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5d88303866bd5897632e863e1170f41b2229ca --- /dev/null +++ b/processing_brand_sentiment/workflow/agents/sabian_sentiment_analyzer_agent.py @@ -0,0 +1,434 @@ +""" +Sabian Sentiment & Intent Analyzer Agent for brand sentiment analysis. + +This agent performs deep analysis on VERIFIED relevant posts with STRUCTURED input. +It receives pre-validated data from the Relevance Extraction Agent including: +- Products already extracted and validated +- Thread context already summarized +- Author role already determined + +Key Design Principles: +- Focused analysis: Only sentiment, intents, and customer journey +- No re-extraction: Products are given, not re-detected +- Sabian-specific sentiment: How author feels about Sabian, not overall post tone +- Author perspective: Pain points/delights only from author's own experience +""" + +from typing import Dict, Any, List +import json +from langchain_openai import ChatOpenAI +from langchain.schema import HumanMessage, SystemMessage +import logging + +from .base_agent import BaseAgent + +logger = logging.getLogger(__name__) + + +class SabianSentimentAnalyzerAgent(BaseAgent): + """ + Agent that performs deep sentiment and intent analysis on relevant posts. + + This agent is the second LLM call in the pipeline and focuses purely on + analysis, not extraction. It receives structured input from the extraction + agent and produces sentiment, intent, and customer journey insights. 
+ """ + + def __init__( + self, + config: Dict[str, Any], + api_key: str, + brand_config: Dict[str, Any], + analysis_categories: Dict[str, Any] + ): + """ + Initialize the Sentiment Analyzer Agent. + + Args: + config: Agent configuration + api_key: OpenAI API key + brand_config: Brand-specific configuration + analysis_categories: Category definitions for analysis + """ + super().__init__("SabianSentimentAnalyzerAgent", config) + self.api_key = api_key + self.brand_config = brand_config + self.analysis_categories = analysis_categories + + self.llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=self.api_key + ) + + # Pre-compute valid values for validation + self._valid_values = self._compute_valid_values() + + logger.info("SabianSentimentAnalyzerAgent initialized") + + def _compute_valid_values(self) -> Dict[str, List[str]]: + """Pre-compute all valid values from config for validation.""" + valid = {} + + # Products from brand config + valid["products"] = self.brand_config.get("brand", {}).get("products", []) + + # Competitors + competitor_names = [] + for comp in self.brand_config.get("brand", {}).get("competitors", []): + if isinstance(comp, dict): + competitor_names.append(comp.get("name", "")) + valid["competitors"] = competitor_names + + # Extract category values from analysis_categories + category_map = { + "sentiment_level": "sentiment", + "emotion_type": "emotions", + "intents": "intents", + "purchase_stage": "purchase_stage", + "comparison_type": "comparison_type", + "feedback_aspects": "feedback_aspects", + "decision_drivers": "decision_drivers", + "product_attributes": "product_attributes", + } + + for key, config_key in category_map.items(): + config_section = self.analysis_categories.get(config_key, {}) + if "categories" in config_section: + valid[key] = [c["value"] for c in config_section["categories"]] + elif "levels" in config_section: + valid[key] = [c["value"] for c in config_section["levels"]] + else: + valid[key] = [] 
+ + return valid + + def _get_valid_list(self, key: str) -> List[str]: + """Get list of valid values for a category.""" + return self._valid_values.get(key, []) + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """Validate that input contains required fields.""" + required = ["cleaned_content", "is_relevant"] + return all(field in input_data for field in required) + + def _build_system_prompt(self) -> str: + """Build optimized system prompt for sentiment analysis.""" + brand_name = self.brand_config.get("brand", {}).get("name", "Sabian") + v = self._valid_values + + return f"""You are a sentiment analyst for {brand_name} cymbal discussions. + +## YOUR TASK +Analyze the sentiment, emotions, and intents in posts about {brand_name}. +You will receive PRE-VALIDATED context (products, author role, etc.) - trust these values. + +## CRITICAL RULES + +### Rule 1: Neutral by Default +Sentiment defaults to NEUTRAL unless there is EXPLICIT positive or negative language toward {brand_name}. +- Factual statements = neutral +- Comparative statements ("sounds different", "not the same as") = neutral (different ≠ worse) +- Advice-giving without personal opinion = neutral + +Only assign positive/negative sentiment when the author CLEARLY expresses satisfaction or dissatisfaction with {brand_name}. + +### Rule 2: {brand_name}-Specific Sentiment +Sentiment MUST be about {brand_name} specifically, NOT overall post tone or other products. + +EXAMPLE: +Post: "I have SBR cymbals and bought a Pearl crash. The Pearl sounds different from the SBR. Go with what feels best!" +- This is NEUTRAL toward {brand_name} - "different" is not criticism +- The author owns SBR (no complaint), is giving advice +- pain_points: [] (no negative experience expressed) +- delight_factors: [] (no positive experience expressed) + +### Rule 3: Mutually Exclusive Feedback +pain_points and delight_factors CANNOT contain the same values. 
+- If an aspect is positive → delight_factors only +- If an aspect is negative → pain_points only +- Never both + +### Rule 4: Author Perspective Only +These fields are ONLY for author's OWN experience, not advice to others: +- purchase_stage, decision_drivers, pain_points, delight_factors + +If author is primarily giving ADVICE to someone else, these should be null/empty. + +### Rule 5: Valid Values + +| Field | Valid Values | +|-------|--------------| +| sentiment_level | {v.get('sentiment_level', [])} | +| emotion_type | {v.get('emotion_type', [])} | +| intents (multi-select) | {v.get('intents', [])} | +| purchase_stage | {v.get('purchase_stage', [])} | +| comparison_type | {v.get('comparison_type', [])} | +| feedback_aspects | {v.get('feedback_aspects', [])} | +| decision_drivers | {v.get('decision_drivers', [])} | +| product_attributes | {v.get('product_attributes', [])} | +| competitor brands | {v.get('competitors', [])} | + +### Rule 6: Intent Classification +- seeking_information: Asking questions, seeking advice +- providing_information: Answering questions, giving advice +- sharing_experience: Personal experience, review, testimonial +- comparing: Comparing brands/products +- praising: Actively endorsing {brand_name} +- criticizing: Actively complaining about {brand_name} +- buying_selling: Listing gear for sale/trade +- general_discussion: General conversation + +## OUTPUT FORMAT +```json +{{ + "sentiment_level": "neutral unless explicit positive/negative", + "emotion_type": "value or null", + "sentiment_confidence": "high" | "medium" | "low", + "sarcasm_detected": false, + "product_attributes": [], + "competitor_products_owned": [], + "comparison_type": "value or null", + "intents": [], + "purchase_stage": "value or null", + "decision_drivers": [], + "pain_points": [], + "delight_factors": [], + "analysis_notes": "1-2 sentences" +}} +``` + +Return ONLY valid JSON.""" + + def _build_user_prompt(self, input_data: Dict[str, Any]) -> str: + """Build user 
prompt with structured context.""" + brand_name = self.brand_config.get("brand", {}).get("name", "Sabian") + + content = input_data.get("cleaned_content", "") + products_mentioned = input_data.get("products_mentioned", []) + sabian_context = input_data.get("sabian_mention_context", "") + author_role = input_data.get("author_role", "unknown") + thread_summary = input_data.get("thread_context_summary", "") + competitors_mentioned = input_data.get("competitors_mentioned", []) + + context_section = f"""## PRE-VALIDATED CONTEXT (trust these values) +- Products mentioned: {products_mentioned if products_mentioned else 'None specific'} +- {brand_name} mention context: {sabian_context} +- Author role: {author_role} +- Competitors mentioned: {competitors_mentioned if competitors_mentioned else 'None'} +- Thread summary: {thread_summary if thread_summary else 'Not available'} +""" + + return f"""Analyze this post about {brand_name} for sentiment and intents. + +{context_section} +## POST CONTENT TO ANALYZE: +\"\"\"{content}\"\"\" + +Remember: +- Sentiment is about {brand_name} ONLY, not overall post tone +- pain_points/delight_factors only from author's OWN experience +- Use only values from the valid lists provided + +Return JSON only.""" + + def analyze_post(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Perform sentiment and intent analysis. 
+ + Args: + input_data: Structured data from extraction agent + + Returns: + Dictionary with analysis results + """ + try: + messages = [ + SystemMessage(content=self._build_system_prompt()), + HumanMessage(content=self._build_user_prompt(input_data)) + ] + + response = self.llm.invoke(messages) + result = self._parse_llm_json_response(response.content) + + # Validate and normalize + validated = self._validate_and_normalize(result) + + return {"success": True, **validated} + + except json.JSONDecodeError as e: + self.log_processing(f"JSON decode error: {e}", "warning") + return { + "success": False, + "error": f"JSON parse error: {str(e)}", + "sentiment_level": "neutral", + "intents": ["general_discussion"] + } + + except Exception as e: + self.log_processing(f"Analysis error: {e}", "error") + return {"success": False, "error": str(e)} + + def _validate_single(self, value: Any, valid_list: List[str], default: Any = None) -> Any: + """Validate single value against list, return canonical form or default.""" + if value is None: + return default + if isinstance(value, str): + val_lower = value.lower() + for v in valid_list: + if v.lower() == val_lower: + return v + return default + + def _validate_list(self, values: Any, valid_list: List[str]) -> List[str]: + """Validate list values, return only valid items in canonical form.""" + if not values: + return [] + if not isinstance(values, list): + values = [values] + + validated = [] + valid_lower = {v.lower(): v for v in valid_list} + for val in values: + if isinstance(val, str) and val.lower() in valid_lower: + validated.append(valid_lower[val.lower()]) + return validated + + def _validate_and_normalize(self, result: Dict[str, Any]) -> Dict[str, Any]: + """Validate all fields against predefined values and normalize.""" + v = self._valid_values + + normalized = { + # Sentiment + "sentiment_level": self._validate_single( + result.get("sentiment_level"), v["sentiment_level"], "neutral" + ), + "emotion_type": 
self._validate_single( + result.get("emotion_type"), v["emotion_type"], None + ), + "sentiment_confidence": result.get("sentiment_confidence", "medium"), + "sarcasm_detected": bool(result.get("sarcasm_detected", False)), + + # Product info + "product_attributes": self._validate_list( + result.get("product_attributes"), v["product_attributes"] + ), + + # Competitors + "competitor_products_owned": self._validate_list( + result.get("competitor_products_owned"), v["competitors"] + ), + "comparison_type": self._validate_single( + result.get("comparison_type"), v["comparison_type"], None + ), + + # Intents + "intents": self._validate_list( + result.get("intents"), v["intents"] + ) or ["general_discussion"], + + # Author journey (null if advising others) + "purchase_stage": self._validate_single( + result.get("purchase_stage"), v["purchase_stage"], None + ), + "decision_drivers": self._validate_list( + result.get("decision_drivers"), v["decision_drivers"] + ), + + # Feedback - both use feedback_aspects + "pain_points": self._validate_list( + result.get("pain_points"), v["feedback_aspects"] + ), + "delight_factors": self._validate_list( + result.get("delight_factors"), v["feedback_aspects"] + ), + + # Notes + "analysis_notes": result.get("analysis_notes", ""), + } + + # Validate confidence + if normalized["sentiment_confidence"] not in ["high", "medium", "low"]: + normalized["sentiment_confidence"] = "medium" + + return normalized + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a post through sentiment and intent analysis. 
+ + Args: + input_data: Dictionary from extraction agent containing: + - cleaned_content: Post text + - is_relevant: Relevance determination + - products_mentioned: Pre-validated products + - sabian_mention_context: How Sabian is discussed + - author_role: Author's relationship to Sabian + - thread_context_summary: Summarized context + - competitors_mentioned: Competitor brands + + Returns: + Dictionary with analysis results and original data + """ + try: + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields", + **input_data + } + + # Skip non-relevant posts + if not input_data.get("is_relevant", False): + return { + "success": True, + "analysis_skipped": True, + "analysis_skip_reason": "Post marked as not relevant", + "sentiment_level": None, + "emotion_type": None, + "sentiment_confidence": None, + "sarcasm_detected": False, + "product_attributes": [], + "competitor_products_owned": [], + "comparison_type": None, + "intents": [], + "purchase_stage": None, + "decision_drivers": [], + "pain_points": [], + "delight_factors": [], + "analysis_notes": "", + **input_data + } + + # Skip non-English posts (should already be filtered, but double-check) + if not input_data.get("is_english", True): + return { + "success": True, + "analysis_skipped": True, + "analysis_skip_reason": f"Non-English: {input_data.get('detected_language')}", + "sentiment_level": None, + "emotion_type": None, + "intents": [], + **input_data + } + + # Perform analysis + analysis_result = self.analyze_post(input_data) + + result = { + **input_data, + **analysis_result, + "analysis_skipped": False + } + + self.log_processing( + f"Analyzed: sentiment={result.get('sentiment_level')}, " + f"intents={result.get('intents')}, " + f"pain_points={result.get('pain_points')}", + "debug" + ) + + return result + + except Exception as e: + return self.handle_error(e, "sentiment analysis") diff --git 
a/processing_brand_sentiment/workflow/comment_orchestrator.py b/processing_brand_sentiment/workflow/comment_orchestrator.py new file mode 100644 index 0000000000000000000000000000000000000000..e0d5540f4af682653a96396609c7f86691b1a503 --- /dev/null +++ b/processing_brand_sentiment/workflow/comment_orchestrator.py @@ -0,0 +1,558 @@ +""" +Comment Analysis Workflow Orchestrator using LangGraph. + +Coordinates the 4-agent pipeline for social media comments: +1. CommentPreprocessorAgent - Plain text cleaning, keyword detection (no LLM) +2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) [shared] +3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) [shared] +4. OutputValidatorAgent - Rule-based validation (no LLM) [shared] + +Architecture v4.0: +- Same analysis pipeline as forums, different preprocessing and state +- Plain text input (no HTML parsing) +- Context from social media content metadata and parent comments +- Comment-specific identifiers (comment_sk, comment_id, platform, etc.) +""" + +from typing import Dict, Any, List, TypedDict, Annotated, Optional +import operator +import json +import os +from langgraph.graph import StateGraph, END +import logging + +from .agents.comment_preprocessor_agent import CommentPreprocessorAgent +from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent +from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent +from .agents.output_validator_agent import OutputValidatorAgent + +logger = logging.getLogger(__name__) + + +class CommentAnalysisState(TypedDict): + """ + State definition for the comment analysis workflow v4.0. + + Uses comment-specific identifiers but shares the same analysis fields + as the forum workflow for consistent output. 
+ """ + # ============== Source Identifiers (Comment-specific) ============== + comment_sk: int + comment_id: str + platform: str + comment_timestamp: Any + author_name: str + author_id: str + parent_comment_id: str + parent_comment_text: str + + # Content metadata + content_sk: int + content_id: str + content_description: str + content_title: str + channel_sk: int + channel_name: str + channel_display_name: str + + # ============== Original Content ============== + comment_text: str + original_text: str + + # ============== Preprocessor Output ============== + cleaned_content: str + quoted_content: str + has_quote: bool + quoted_author: str + raw_thread_context: str # Comment context (reuses field name for agent compatibility) + is_empty: bool + + # Language detection + detected_language: str + language_code: str + is_english: bool + language_confidence: str + language_detection_skipped: bool + + # Preliminary relevance (keyword-based) + preliminary_relevant: bool + needs_relevance_validation: bool + relevance_keywords_found: List[str] + relevance_type: str + has_primary_keywords: bool + + # Initial detections + products_detected: List[str] + competitors_detected: List[str] + + # ============== Extraction Agent Output ============== + is_relevant: bool + relevance_confidence: str + relevance_reason: str + extraction_performed: bool + + # Extracted facts + products_mentioned: List[str] + sabian_mention_context: str + author_role: str + competitors_mentioned: List[str] + thread_context_summary: str + + # ============== Sentiment Analyzer Output ============== + sentiment_level: str + emotion_type: str + sentiment_confidence: str + sarcasm_detected: bool + + # Product information + product_attributes: List[str] + + # Competitive intelligence + competitor_products_owned: List[str] + comparison_type: str + + # Customer journey (AUTHOR PERSPECTIVE ONLY) + intents: List[str] + purchase_stage: str + decision_drivers: List[str] + pain_points: List[str] + delight_factors: 
List[str] + + # Analysis notes + analysis_notes: str + analysis_skipped: bool + analysis_skip_reason: str + + # ============== Validator Output ============== + validation_passed: bool + validation_errors: List[str] + validation_warnings: List[str] + validation_flags: List[str] + processing_status: str + + # ============== Processing Metadata ============== + processing_errors: Annotated[List[str], operator.add] + success: bool + + +class CommentAnalysisWorkflow: + """ + LangGraph-based workflow for comment brand sentiment analysis v4.0. + + Pipeline: + 1. Comment Preprocessor (no LLM) - plain text, comment context + 2. Relevance & Extraction Agent (LLM #1) - shared with forums + 3. Sentiment Analyzer Agent (LLM #2) - shared with forums + 4. Output Validator (no LLM) - shared with forums + """ + + def __init__( + self, + workflow_config: Dict[str, Any], + brand_config: Dict[str, Any], + analysis_categories: Dict[str, Any], + api_key: str + ): + """ + Initialize the workflow with agents and configuration. + + Args: + workflow_config: Workflow and agent configuration + brand_config: Brand-specific configuration + analysis_categories: Analysis category definitions + api_key: OpenAI API key + """ + self.workflow_config = workflow_config + self.brand_config = brand_config + self.analysis_categories = analysis_categories + self.api_key = api_key + + # Initialize agents + self._init_agents() + + # Build the workflow graph + self.workflow = self._build_workflow() + + logger.info("CommentAnalysisWorkflow v4.0 initialized successfully") + + def _init_agents(self) -> None: + """Initialize all agents with their configurations.""" + agents_config = self.workflow_config.get("agents", {}) + + # 1. Comment Preprocessor Agent (no LLM) - comment-specific + preprocessor_config = agents_config.get("preprocessor", {}) + self.preprocessor = CommentPreprocessorAgent( + preprocessor_config, + self.brand_config + ) + + # 2. 
Relevance & Extraction Agent (LLM #1) - shared with forums + extraction_config = agents_config.get("relevance_extraction", + agents_config.get("relevance_validator", {}) + ) + self.extraction_agent = SabianRelevanceExtractionAgent( + extraction_config, + self.api_key, + self.brand_config, + self.analysis_categories + ) + + # 3. Sentiment Analyzer Agent (LLM #2) - shared with forums + analyzer_config = agents_config.get("sentiment_analyzer", + agents_config.get("brand_analyzer", {}) + ) + self.sentiment_analyzer = SabianSentimentAnalyzerAgent( + analyzer_config, + self.api_key, + self.brand_config, + self.analysis_categories + ) + + # 4. Output Validator Agent (no LLM) - shared with forums + validator_config = agents_config.get("output_validator", {}) + self.output_validator = OutputValidatorAgent( + validator_config, + self.brand_config, + self.analysis_categories + ) + + logger.info("All 4 agents initialized for comment processing") + + def _build_workflow(self) -> StateGraph: + """ + Build the LangGraph workflow. 
+ + Flow: + preprocessing -> extraction -> (analysis if relevant) -> validation -> END + + Returns: + Compiled StateGraph workflow + """ + workflow = StateGraph(CommentAnalysisState) + + # Add nodes + workflow.add_node("preprocessing", self._preprocessing_node) + workflow.add_node("extraction", self._extraction_node) + workflow.add_node("analysis", self._analysis_node) + workflow.add_node("validation", self._validation_node) + + # Set entry point + workflow.set_entry_point("preprocessing") + + # Define edges + workflow.add_conditional_edges( + "preprocessing", + self._route_after_preprocessing, + { + "extract": "extraction", + "skip_to_validation": "validation" + } + ) + + workflow.add_conditional_edges( + "extraction", + self._route_after_extraction, + { + "analyze": "analysis", + "skip_to_validation": "validation" + } + ) + + workflow.add_edge("analysis", "validation") + workflow.add_edge("validation", END) + + return workflow.compile() + + def _preprocessing_node(self, state: CommentAnalysisState) -> CommentAnalysisState: + """ + Preprocessing node: Plain text cleaning, language detection, keyword check. 
+ """ + try: + input_data = { + "comment_sk": state.get("comment_sk"), + "comment_text": state.get("comment_text", ""), + "content_title": state.get("content_title"), + "content_description": state.get("content_description"), + "parent_comment_text": state.get("parent_comment_text") + } + + result = self.preprocessor.process(input_data) + + if result.get("success", False): + # Content + state["cleaned_content"] = result.get("cleaned_content", "") + state["quoted_content"] = result.get("quoted_content") + state["has_quote"] = result.get("has_quote", False) + state["quoted_author"] = result.get("quoted_author") + state["raw_thread_context"] = result.get("raw_thread_context", "") + state["is_empty"] = result.get("is_empty", False) + state["original_text"] = result.get("original_text", state.get("comment_text", "")) + + # Language + state["detected_language"] = result.get("detected_language", "English") + state["language_code"] = result.get("language_code", "en") + state["is_english"] = result.get("is_english", True) + state["language_confidence"] = result.get("language_confidence", "low") + state["language_detection_skipped"] = result.get("language_detection_skipped", False) + + # Relevance + state["preliminary_relevant"] = result.get("preliminary_relevant", False) + state["needs_relevance_validation"] = result.get("needs_relevance_validation", False) + state["relevance_keywords_found"] = result.get("relevance_keywords_found", []) + state["relevance_type"] = result.get("relevance_type", "none") + state["has_primary_keywords"] = result.get("has_primary_keywords", False) + + # Detections + state["products_detected"] = result.get("products_detected", []) + state["competitors_detected"] = result.get("competitors_detected", []) + + state["success"] = True + else: + error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["success"] = False + + 
logger.debug(f"Preprocessing complete for comment {state.get('comment_sk')}") + return state + + except Exception as e: + error_msg = f"Preprocessing node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["success"] = False + return state + + def _extraction_node(self, state: CommentAnalysisState) -> CommentAnalysisState: + """ + Extraction node: LLM-based relevance validation and fact extraction. + Reuses the same extraction agent as forums. + """ + try: + input_data = { + "cleaned_content": state.get("cleaned_content", ""), + "quoted_content": state.get("quoted_content"), + "raw_thread_context": state.get("raw_thread_context", ""), + "relevance_keywords_found": state.get("relevance_keywords_found", []), + "preliminary_relevant": state.get("preliminary_relevant", False), + "needs_relevance_validation": state.get("needs_relevance_validation", True), + "products_detected": state.get("products_detected", []), + "competitors_detected": state.get("competitors_detected", []), + "is_english": state.get("is_english", True), + "detected_language": state.get("detected_language", "English") + } + + result = self.extraction_agent.process(input_data) + + # Update state with extraction results + state["is_relevant"] = result.get("is_relevant", False) + state["relevance_confidence"] = result.get("relevance_confidence", "low") + state["relevance_reason"] = result.get("relevance_reason", "") + state["extraction_performed"] = result.get("extraction_performed", True) + + # Extracted facts + state["products_mentioned"] = result.get("products_mentioned", []) + state["sabian_mention_context"] = result.get("sabian_mention_context") + state["author_role"] = result.get("author_role", "unknown") + state["competitors_mentioned"] = result.get("competitors_mentioned", []) + state["thread_context_summary"] = result.get("thread_context_summary", "") + + if not result.get("success", False) and result.get("error"): + 
state["processing_errors"] = state.get("processing_errors", []) + [result["error"]] + + logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}") + return state + + except Exception as e: + error_msg = f"Extraction node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["is_relevant"] = False + state["relevance_confidence"] = "low" + return state + + def _analysis_node(self, state: CommentAnalysisState) -> CommentAnalysisState: + """ + Analysis node: Deep sentiment and intent analysis for relevant comments. + Reuses the same sentiment analyzer as forums. + """ + try: + input_data = { + "cleaned_content": state.get("cleaned_content", ""), + "is_relevant": state.get("is_relevant", True), + "is_english": state.get("is_english", True), + "detected_language": state.get("detected_language", "English"), + "products_mentioned": state.get("products_mentioned", []), + "sabian_mention_context": state.get("sabian_mention_context"), + "author_role": state.get("author_role", "unknown"), + "competitors_mentioned": state.get("competitors_mentioned", []), + "thread_context_summary": state.get("thread_context_summary", "") + } + + result = self.sentiment_analyzer.process(input_data) + + if result.get("success", False): + # Sentiment + state["sentiment_level"] = result.get("sentiment_level") + state["emotion_type"] = result.get("emotion_type") + state["sentiment_confidence"] = result.get("sentiment_confidence", "medium") + state["sarcasm_detected"] = result.get("sarcasm_detected", False) + + # Products + state["product_attributes"] = result.get("product_attributes", []) + + # Competitive + state["competitor_products_owned"] = result.get("competitor_products_owned", []) + state["comparison_type"] = result.get("comparison_type") + + # Journey + state["intents"] = result.get("intents", []) + state["purchase_stage"] = result.get("purchase_stage") + state["decision_drivers"] = 
result.get("decision_drivers", []) + state["pain_points"] = result.get("pain_points", []) + state["delight_factors"] = result.get("delight_factors", []) + + # Notes + state["analysis_notes"] = result.get("analysis_notes", "") + state["analysis_skipped"] = result.get("analysis_skipped", False) + state["analysis_skip_reason"] = result.get("analysis_skip_reason", "") + else: + error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + + logger.debug(f"Analysis complete for comment {state.get('comment_sk')}") + return state + + except Exception as e: + error_msg = f"Analysis node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + return state + + def _validation_node(self, state: CommentAnalysisState) -> CommentAnalysisState: + """ + Validation node: Rule-based validation and anomaly detection. + Reuses the same output validator as forums. 
+ """ + try: + result = self.output_validator.process(dict(state)) + + state["validation_passed"] = result.get("validation_passed", True) + state["validation_errors"] = result.get("validation_errors", []) + state["validation_warnings"] = result.get("validation_warnings", []) + state["validation_flags"] = result.get("validation_flags", []) + state["processing_status"] = result.get("processing_status", "completed") + + # Set overall success + has_errors = len(state.get("processing_errors", [])) > 0 + state["success"] = not has_errors or state.get("is_relevant") is not None + + logger.debug(f"Validation complete: status={state['processing_status']}") + return state + + except Exception as e: + error_msg = f"Validation node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["validation_passed"] = False + state["processing_status"] = "validation_failed" + state["success"] = False + return state + + def _route_after_preprocessing(self, state: CommentAnalysisState) -> str: + """Determine routing after preprocessing.""" + if state.get("is_empty", False): + state["is_relevant"] = False + state["relevance_reason"] = "Empty content" + return "skip_to_validation" + + if not state.get("is_english", True): + state["is_relevant"] = False + state["relevance_reason"] = f"Non-English: {state.get('detected_language')}" + return "skip_to_validation" + + if (not state.get("preliminary_relevant", False) and + not state.get("needs_relevance_validation", False)): + state["is_relevant"] = False + state["relevance_reason"] = "No relevant keywords found" + return "skip_to_validation" + + return "extract" + + def _route_after_extraction(self, state: CommentAnalysisState) -> str: + """Determine routing after extraction.""" + if state.get("is_relevant", False): + return "analyze" + return "skip_to_validation" + + def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single 
social media comment through the workflow. + + Args: + comment_data: Dictionary containing comment data + + Returns: + Dictionary with processed results + """ + try: + initial_state = { + # Comment identifiers + "comment_sk": comment_data.get("comment_sk"), + "comment_id": comment_data.get("comment_id"), + "platform": comment_data.get("platform"), + "comment_timestamp": comment_data.get("comment_timestamp"), + "author_name": comment_data.get("author_name"), + "author_id": comment_data.get("author_id"), + "parent_comment_id": comment_data.get("parent_comment_id"), + "parent_comment_text": comment_data.get("parent_comment_text"), + + # Content metadata + "content_sk": comment_data.get("content_sk"), + "content_id": comment_data.get("content_id"), + "content_description": comment_data.get("content_description"), + "content_title": comment_data.get("content_title"), + "channel_sk": comment_data.get("channel_sk"), + "channel_name": comment_data.get("channel_name"), + "channel_display_name": comment_data.get("channel_display_name"), + + # Comment text + "comment_text": comment_data.get("comment_text", ""), + + # Processing metadata + "processing_errors": [], + "success": True + } + + final_state = self.workflow.invoke(initial_state) + + return dict(final_state) + + except Exception as e: + logger.error(f"Workflow execution error: {str(e)}") + return { + **comment_data, + "success": False, + "processing_errors": [str(e)], + "processing_status": "workflow_error" + } + + def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Process a batch of social media comments. 
+ + Args: + comments: List of comment dictionaries + + Returns: + List of processed comment dictionaries + """ + results = [] + total = len(comments) + + for idx, comment in enumerate(comments, 1): + logger.info(f"Processing comment {idx}/{total} (SK: {comment.get('comment_sk')})") + result = self.process_comment(comment) + results.append(result) + + logger.info(f"Batch processing complete: {total} comments processed") + return results diff --git a/processing_brand_sentiment/workflow/orchestrator.py b/processing_brand_sentiment/workflow/orchestrator.py new file mode 100644 index 0000000000000000000000000000000000000000..69dbef9a670ab31d596bbbf83f663f93c6b852c3 --- /dev/null +++ b/processing_brand_sentiment/workflow/orchestrator.py @@ -0,0 +1,551 @@ +""" +Brand Analysis Workflow Orchestrator using LangGraph. + +Coordinates the 4-agent pipeline: +1. ContentPreprocessorAgent - HTML parsing, cleaning, keyword detection (no LLM) +2. SabianRelevanceExtractionAgent - Relevance + fact extraction (LLM #1) +3. SabianSentimentAnalyzerAgent - Deep sentiment analysis (LLM #2) +4. OutputValidatorAgent - Rule-based validation (no LLM) + +Architecture v4.0: +- Separation of concerns: extraction vs analysis +- Strict validation at every step +- Structured data flow between agents +- Conservative relevance determination +""" + +from typing import Dict, Any, List, TypedDict, Annotated, Optional +import operator +import json +import os +from langgraph.graph import StateGraph, END +import logging + +from .agents.content_preprocessor_agent import ContentPreprocessorAgent +from .agents.sabian_relevance_extraction_agent import SabianRelevanceExtractionAgent +from .agents.sabian_sentiment_analyzer_agent import SabianSentimentAnalyzerAgent +from .agents.output_validator_agent import OutputValidatorAgent + +logger = logging.getLogger(__name__) + + +class BrandAnalysisState(TypedDict): + """ + State definition for the brand analysis workflow v4.0. 
+ + This state flows through all agents, accumulating data at each step. + """ + # ============== Source Identifiers ============== + post_id: int + thread_id: int + post_author_id: int + + # ============== Original Content ============== + post_content: str + original_content: str + + # ============== Thread Context ============== + thread_title: str + thread_first_post: str + thread_started_at: Any + category_title: str + category_topic: str + + # ============== Timestamps ============== + post_created_at: Any + + # ============== Preprocessor Output ============== + cleaned_content: str + quoted_content: str + has_quote: bool + quoted_author: str + raw_thread_context: str # Raw context for extraction agent + is_empty: bool + + # Language detection + detected_language: str + language_code: str + is_english: bool + language_confidence: str + language_detection_skipped: bool + + # Preliminary relevance (keyword-based) + preliminary_relevant: bool + needs_relevance_validation: bool + relevance_keywords_found: List[str] + relevance_type: str + has_primary_keywords: bool + + # Initial detections + products_detected: List[str] + competitors_detected: List[str] + + # ============== Extraction Agent Output ============== + is_relevant: bool + relevance_confidence: str + relevance_reason: str + extraction_performed: bool + + # Extracted facts + products_mentioned: List[str] + sabian_mention_context: str # primary_focus, significant_mention, casual_mention, comparison_context + author_role: str # current_owner, past_owner, potential_buyer, never_owned, unknown + competitors_mentioned: List[str] + thread_context_summary: str # NEW: Summarized context for storage and analysis + + # ============== Sentiment Analyzer Output ============== + sentiment_level: str + emotion_type: str + sentiment_confidence: str + sarcasm_detected: bool + + # Product information + product_attributes: List[str] + + # Competitive intelligence + competitor_products_owned: List[str] + comparison_type: 
str + + # Customer journey (AUTHOR PERSPECTIVE ONLY) + intents: List[str] + purchase_stage: str + decision_drivers: List[str] + pain_points: List[str] + delight_factors: List[str] + + # Analysis notes + analysis_notes: str + analysis_skipped: bool + analysis_skip_reason: str + + # ============== Validator Output ============== + validation_passed: bool + validation_errors: List[str] + validation_warnings: List[str] + validation_flags: List[str] + processing_status: str # completed, completed_with_flags, validation_failed + + # ============== Processing Metadata ============== + processing_errors: Annotated[List[str], operator.add] + success: bool + + +class BrandAnalysisWorkflow: + """ + LangGraph-based workflow for brand sentiment analysis v4.0. + + Pipeline: + 1. Content Preprocessor (no LLM) + 2. Relevance & Extraction Agent (LLM #1) + 3. Sentiment Analyzer Agent (LLM #2) - only for relevant posts + 4. Output Validator (no LLM) + """ + + def __init__( + self, + workflow_config: Dict[str, Any], + brand_config: Dict[str, Any], + analysis_categories: Dict[str, Any], + api_key: str + ): + """ + Initialize the workflow with agents and configuration. + + Args: + workflow_config: Workflow and agent configuration + brand_config: Brand-specific configuration + analysis_categories: Analysis category definitions + api_key: OpenAI API key + """ + self.workflow_config = workflow_config + self.brand_config = brand_config + self.analysis_categories = analysis_categories + self.api_key = api_key + + # Initialize agents + self._init_agents() + + # Build the workflow graph + self.workflow = self._build_workflow() + + logger.info("BrandAnalysisWorkflow v4.0 initialized successfully") + + def _init_agents(self) -> None: + """Initialize all agents with their configurations.""" + agents_config = self.workflow_config.get("agents", {}) + + # 1. 
Content Preprocessor Agent (no LLM) + preprocessor_config = agents_config.get("preprocessor", {}) + self.preprocessor = ContentPreprocessorAgent( + preprocessor_config, + self.brand_config + ) + + # 2. Relevance & Extraction Agent (LLM #1) + extraction_config = agents_config.get("relevance_extraction", + agents_config.get("relevance_validator", {}) # Fallback to old config + ) + self.extraction_agent = SabianRelevanceExtractionAgent( + extraction_config, + self.api_key, + self.brand_config, + self.analysis_categories + ) + + # 3. Sentiment Analyzer Agent (LLM #2) + analyzer_config = agents_config.get("sentiment_analyzer", + agents_config.get("brand_analyzer", {}) # Fallback to old config + ) + self.sentiment_analyzer = SabianSentimentAnalyzerAgent( + analyzer_config, + self.api_key, + self.brand_config, + self.analysis_categories + ) + + # 4. Output Validator Agent (no LLM) + validator_config = agents_config.get("output_validator", {}) + self.output_validator = OutputValidatorAgent( + validator_config, + self.brand_config, + self.analysis_categories + ) + + logger.info("All 4 agents initialized") + + def _build_workflow(self) -> StateGraph: + """ + Build the LangGraph workflow. 
+ + Flow: + preprocessing -> extraction -> (analysis if relevant) -> validation -> END + + Returns: + Compiled StateGraph workflow + """ + workflow = StateGraph(BrandAnalysisState) + + # Add nodes + workflow.add_node("preprocessing", self._preprocessing_node) + workflow.add_node("extraction", self._extraction_node) + workflow.add_node("analysis", self._analysis_node) + workflow.add_node("validation", self._validation_node) + + # Set entry point + workflow.set_entry_point("preprocessing") + + # Define edges + # Preprocessing -> conditional routing + workflow.add_conditional_edges( + "preprocessing", + self._route_after_preprocessing, + { + "extract": "extraction", + "skip_to_validation": "validation" + } + ) + + # Extraction -> conditional routing + workflow.add_conditional_edges( + "extraction", + self._route_after_extraction, + { + "analyze": "analysis", + "skip_to_validation": "validation" + } + ) + + # Analysis -> validation + workflow.add_edge("analysis", "validation") + + # Validation -> END + workflow.add_edge("validation", END) + + return workflow.compile() + + def _preprocessing_node(self, state: BrandAnalysisState) -> BrandAnalysisState: + """ + Preprocessing node: HTML parsing, cleaning, language detection, keyword check. 
+ """ + try: + input_data = { + "post_id": state.get("post_id"), + "post_content": state.get("post_content", ""), + "thread_title": state.get("thread_title"), + "thread_first_post": state.get("thread_first_post"), + "category_title": state.get("category_title"), + "category_topic": state.get("category_topic") + } + + result = self.preprocessor.process(input_data) + + if result.get("success", False): + # Content + state["cleaned_content"] = result.get("cleaned_content", "") + state["quoted_content"] = result.get("quoted_content") + state["has_quote"] = result.get("has_quote", False) + state["quoted_author"] = result.get("quoted_author") + state["raw_thread_context"] = result.get("raw_thread_context", "") + state["is_empty"] = result.get("is_empty", False) + state["original_content"] = result.get("original_content", state.get("post_content", "")) + + # Language + state["detected_language"] = result.get("detected_language", "English") + state["language_code"] = result.get("language_code", "en") + state["is_english"] = result.get("is_english", True) + state["language_confidence"] = result.get("language_confidence", "low") + state["language_detection_skipped"] = result.get("language_detection_skipped", False) + + # Relevance + state["preliminary_relevant"] = result.get("preliminary_relevant", False) + state["needs_relevance_validation"] = result.get("needs_relevance_validation", False) + state["relevance_keywords_found"] = result.get("relevance_keywords_found", []) + state["relevance_type"] = result.get("relevance_type", "none") + state["has_primary_keywords"] = result.get("has_primary_keywords", False) + + # Detections + state["products_detected"] = result.get("products_detected", []) + state["competitors_detected"] = result.get("competitors_detected", []) + + state["success"] = True + else: + error_msg = f"Preprocessing failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["success"] = 
False + + logger.debug(f"Preprocessing complete for post {state.get('post_id')}") + return state + + except Exception as e: + error_msg = f"Preprocessing node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["success"] = False + return state + + def _extraction_node(self, state: BrandAnalysisState) -> BrandAnalysisState: + """ + Extraction node: LLM-based relevance validation and fact extraction. + """ + try: + input_data = { + "cleaned_content": state.get("cleaned_content", ""), + "quoted_content": state.get("quoted_content"), + "raw_thread_context": state.get("raw_thread_context", ""), + "relevance_keywords_found": state.get("relevance_keywords_found", []), + "preliminary_relevant": state.get("preliminary_relevant", False), + "needs_relevance_validation": state.get("needs_relevance_validation", True), + "products_detected": state.get("products_detected", []), + "competitors_detected": state.get("competitors_detected", []), + "is_english": state.get("is_english", True), + "detected_language": state.get("detected_language", "English") + } + + result = self.extraction_agent.process(input_data) + + # Update state with extraction results + state["is_relevant"] = result.get("is_relevant", False) + state["relevance_confidence"] = result.get("relevance_confidence", "low") + state["relevance_reason"] = result.get("relevance_reason", "") + state["extraction_performed"] = result.get("extraction_performed", True) + + # Extracted facts + state["products_mentioned"] = result.get("products_mentioned", []) + state["sabian_mention_context"] = result.get("sabian_mention_context") + state["author_role"] = result.get("author_role", "unknown") + state["competitors_mentioned"] = result.get("competitors_mentioned", []) + state["thread_context_summary"] = result.get("thread_context_summary", "") + + if not result.get("success", False) and result.get("error"): + state["processing_errors"] = 
state.get("processing_errors", []) + [result["error"]] + + logger.debug(f"Extraction complete: is_relevant={state['is_relevant']}") + return state + + except Exception as e: + error_msg = f"Extraction node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["is_relevant"] = False + state["relevance_confidence"] = "low" + return state + + def _analysis_node(self, state: BrandAnalysisState) -> BrandAnalysisState: + """ + Analysis node: Deep sentiment and intent analysis for relevant posts. + """ + try: + input_data = { + "cleaned_content": state.get("cleaned_content", ""), + "is_relevant": state.get("is_relevant", True), + "is_english": state.get("is_english", True), + "detected_language": state.get("detected_language", "English"), + "products_mentioned": state.get("products_mentioned", []), + "sabian_mention_context": state.get("sabian_mention_context"), + "author_role": state.get("author_role", "unknown"), + "competitors_mentioned": state.get("competitors_mentioned", []), + "thread_context_summary": state.get("thread_context_summary", "") + } + + result = self.sentiment_analyzer.process(input_data) + + if result.get("success", False): + # Sentiment + state["sentiment_level"] = result.get("sentiment_level") + state["emotion_type"] = result.get("emotion_type") + state["sentiment_confidence"] = result.get("sentiment_confidence", "medium") + state["sarcasm_detected"] = result.get("sarcasm_detected", False) + + # Products + state["product_attributes"] = result.get("product_attributes", []) + + # Competitive + state["competitor_products_owned"] = result.get("competitor_products_owned", []) + state["comparison_type"] = result.get("comparison_type") + + # Journey + state["intents"] = result.get("intents", []) + state["purchase_stage"] = result.get("purchase_stage") + state["decision_drivers"] = result.get("decision_drivers", []) + state["pain_points"] = result.get("pain_points", []) + 
state["delight_factors"] = result.get("delight_factors", []) + + # Notes + state["analysis_notes"] = result.get("analysis_notes", "") + state["analysis_skipped"] = result.get("analysis_skipped", False) + state["analysis_skip_reason"] = result.get("analysis_skip_reason", "") + else: + error_msg = f"Analysis failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + + logger.debug(f"Analysis complete for post {state.get('post_id')}") + return state + + except Exception as e: + error_msg = f"Analysis node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + return state + + def _validation_node(self, state: BrandAnalysisState) -> BrandAnalysisState: + """ + Validation node: Rule-based validation and anomaly detection. + """ + try: + result = self.output_validator.process(dict(state)) + + state["validation_passed"] = result.get("validation_passed", True) + state["validation_errors"] = result.get("validation_errors", []) + state["validation_warnings"] = result.get("validation_warnings", []) + state["validation_flags"] = result.get("validation_flags", []) + state["processing_status"] = result.get("processing_status", "completed") + + # Set overall success + has_errors = len(state.get("processing_errors", [])) > 0 + state["success"] = not has_errors or state.get("is_relevant") is not None + + logger.debug(f"Validation complete: status={state['processing_status']}") + return state + + except Exception as e: + error_msg = f"Validation node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["validation_passed"] = False + state["processing_status"] = "validation_failed" + state["success"] = False + return state + + def _route_after_preprocessing(self, state: BrandAnalysisState) -> str: + """ + Determine routing after preprocessing. 
+ """ + # If empty content, skip to validation + if state.get("is_empty", False): + state["is_relevant"] = False + state["relevance_reason"] = "Empty content" + return "skip_to_validation" + + # If not English, skip to validation + if not state.get("is_english", True): + state["is_relevant"] = False + state["relevance_reason"] = f"Non-English: {state.get('detected_language')}" + return "skip_to_validation" + + # If no keywords found and no need for validation, skip + if (not state.get("preliminary_relevant", False) and + not state.get("needs_relevance_validation", False)): + state["is_relevant"] = False + state["relevance_reason"] = "No relevant keywords found" + return "skip_to_validation" + + # Otherwise, go to extraction + return "extract" + + def _route_after_extraction(self, state: BrandAnalysisState) -> str: + """ + Determine routing after extraction. + """ + if state.get("is_relevant", False): + return "analyze" + return "skip_to_validation" + + def process_post(self, post_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single forum post through the workflow. 
+ + Args: + post_data: Dictionary containing post data + + Returns: + Dictionary with processed results + """ + try: + initial_state = { + "post_id": post_data.get("post_id"), + "thread_id": post_data.get("thread_id"), + "post_author_id": post_data.get("post_author_id"), + "post_content": post_data.get("post_content", ""), + "thread_title": post_data.get("thread_title"), + "thread_first_post": post_data.get("thread_first_post"), + "thread_started_at": post_data.get("thread_started_at"), + "category_title": post_data.get("category_title"), + "category_topic": post_data.get("category_topic"), + "post_created_at": post_data.get("post_created_at"), + "processing_errors": [], + "success": True + } + + final_state = self.workflow.invoke(initial_state) + + return dict(final_state) + + except Exception as e: + logger.error(f"Workflow execution error: {str(e)}") + return { + **post_data, + "success": False, + "processing_errors": [str(e)], + "processing_status": "workflow_error" + } + + def process_batch(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Process a batch of forum posts. 
+ + Args: + posts: List of post dictionaries + + Returns: + List of processed post dictionaries + """ + results = [] + total = len(posts) + + for idx, post in enumerate(posts, 1): + logger.info(f"Processing post {idx}/{total} (ID: {post.get('post_id')})") + result = self.process_post(post) + results.append(result) + + logger.info(f"Batch processing complete: {total} posts processed") + return results diff --git a/processing_comments/.dockerignore b/processing_comments/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..3058d0c46e000fd35c75eda33ae751a420dd0f3f --- /dev/null +++ b/processing_comments/.dockerignore @@ -0,0 +1,8 @@ +**/__pycache__/ +**/*.pyc +.git +.gitignore +.env +*.log +dist +build diff --git a/processing_comments/LICENSE b/processing_comments/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/processing_comments/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/processing_comments/README.md b/processing_comments/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd563035a178cc917a2dff3c9c062bd938216025 --- /dev/null +++ b/processing_comments/README.md @@ -0,0 +1,726 @@ +# Comment Processing with Agentic Workflow + +A scalable, modular system for processing comments from multiple data sources using OpenAI API, LangChain, and LangGraph. The system performs language detection, translation, and context-aware sentiment analysis using an agentic workflow architecture. + +## Data Sources Supported + +- **Social Media Comments**: External platforms (Facebook, Instagram, YouTube, etc.) 
+- **Musora Internal Comments**: Comments from Musora internal applications +- **Extensible Architecture**: Easily add new data sources via configuration + +## Features + +- **Multi-Source Support**: Process comments from multiple data sources with a single codebase +- **Configuration-Driven**: Add new data sources without code changes +- **Parent Comment Context**: Automatically includes parent comment text for reply analysis +- **Modular Agent Architecture**: Extensible base classes for easy addition of new agents +- **Language Detection**: Hybrid approach using lingua library for fast English detection, with LLM fallback for non-English languages +- **Translation**: High-quality translation for non-English comments using OpenAI models +- **Context-Aware Sentiment Analysis**: + - Uses content description for context + - Includes parent comment text when analyzing replies + - Multi-label intent classification +- **LangGraph Workflow**: Flexible graph-based orchestration of agent operations +- **Snowflake Integration**: Seamless data fetching and storage with source-specific tables +- **Parallel Processing**: Multiprocessing support for high-performance batch processing +- **Dynamic Batch Sizing**: Intelligent batch size calculation based on workload and available resources +- **Independent Batch Execution**: Each batch processes and stores results independently +- **Comprehensive Logging**: Detailed logging for monitoring and debugging +- **Scalable Configuration**: Easy-to-modify sentiment categories and intents via JSON config + +## Project Structure + +``` +musora-sentiment-analysis/ +├── agents/ +│ ├── __init__.py +│ ├── base_agent.py # Base class for all agents +│ ├── language_detection_agent.py # Language detection agent +│ ├── translation_agent.py # Translation agent +│ └── sentiment_analysis_agent.py # Sentiment analysis agent (parent context support) +├── workflow/ +│ ├── __init__.py +│ └── comment_processor.py # LangGraph workflow orchestrator +├── sql/ 
+│ ├── fetch_comments.sql # Query for social media comments (with parent join) +│ ├── fetch_musora_comments.sql # Query for Musora internal comments (with parent join) +│ ├── create_ml_features_table.sql # Schema for social media table (with parent fields) +│ ├── init_musora_table.sql # Initialize empty Musora table (run first!) +│ └── create_musora_ml_features_table.sql # Full Musora schema with views (optional) +├── config_files/ +│ ├── data_sources_config.json # Data source configuration (NEW) +│ ├── sentiment_config.json # Configuration for agents and workflow +│ └── sentiment_analysis_config.json # Sentiment categories and intents +├── logs/ # Processing logs (auto-created) +├── LLM.py # LLM utility class +├── SnowFlakeConnection.py # Snowflake connection handler +├── main.py # Main execution script (multi-source support) +├── requirements.txt # Python dependencies +├── .env # Environment variables (not in git) +├── README.md # This file +└── CLAUDE.md # Detailed technical documentation +``` + +## Setup + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. Configure Environment Variables + +Ensure your `.env` file contains the required credentials: + +```env +# Snowflake +SNOWFLAKE_USER=your_user +SNOWFLAKE_PASSWORD=your_password +SNOWFLAKE_ACCOUNT=your_account +SNOWFLAKE_ROLE=your_role +SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB +SNOWFLAKE_WAREHOUSE=your_warehouse +SNOWFLAKE_SCHEMA=ML_FEATURES + +# OpenAI +OPENAI_API_KEY=your_openai_key +``` + +### 3. Create Snowflake Tables + +Run the SQL scripts to create the output tables: + +```bash +# Execute the SQL files in Snowflake +# For social media comments (if not already exists) +sql/create_ml_features_table.sql + +# For Musora internal comments - INITIAL SETUP (First time only) +# This creates the empty table structure +sql/init_musora_table.sql +``` + +**Note**: Run `init_musora_table.sql` before the first Musora comments processing run. 
After that, you can optionally run `create_musora_ml_features_table.sql` to create the additional views if needed. + +## Usage + +### Basic Usage (Process All Data Sources) + +Process unprocessed comments from all enabled data sources: + +```bash +python main.py +``` + +This will: +- Process all enabled data sources (social media and Musora comments) +- Fetch only comments that haven't been processed yet +- Process them through the workflow using parallel workers (CPU count - 2, max 5) +- Each batch processes and stores to Snowflake independently +- Append new results to the existing tables (no overwrite) + +### Process Specific Data Source + +Process only social media comments: + +```bash +python main.py --data-source social_media +``` + +Process only Musora internal comments: + +```bash +python main.py --data-source musora_comments +``` + +### Process Limited Number of Comments + +Limit applies per data source: + +```bash +# Process 100 comments from each enabled data source +python main.py --limit 100 + +# Process 100 comments from only Musora source +python main.py --limit 100 --data-source musora_comments +``` + +### Sequential Processing (Debug Mode) + +For debugging purposes, use sequential processing: + +```bash +python main.py --limit 100 --sequential +``` + +This processes all comments in a single batch, making it easier to debug issues. + +### First Run for New Data Source + +For the first run of Musora comments: + +1. **First**: Run the initialization SQL script in Snowflake: + ```sql + -- Execute in Snowflake + sql/init_musora_table.sql + ``` + +2. 
**Then**: Run the processing with the overwrite flag:
pipeline is designed for incremental processing: +- **Automatic deduplication**: SQL query excludes comments already in `COMMENT_SENTIMENT_FEATURES` +- **Append-only by default**: New results are added without overwriting existing data +- **Failed comment retry**: Comments with `success=False` are not stored and will be retried in future runs +- **Run regularly**: Safe to run daily/weekly to process new comments + +## Configuration + +### Data Sources Configuration + +The `config_files/data_sources_config.json` file defines available data sources: + +```json +{ + "data_sources": { + "social_media": { + "name": "Social Media Comments", + "enabled": true, + "sql_query_file": "sql/fetch_comments.sql", + "output_config": { + "table_name": "COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + } + }, + "musora_comments": { + "name": "Musora Internal Comments", + "enabled": true, + "sql_query_file": "sql/fetch_musora_comments.sql", + "output_config": { + "table_name": "MUSORA_COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + "additional_fields": [ + "PERMALINK_URL", + "THUMBNAIL_URL" + ] + } + } +} +``` + +**To add a new data source**: Simply add a new entry to this config file and create the corresponding SQL query file. 
+ +### Agent Configuration + +The `config_files/sentiment_config.json` file controls agent behavior: + +```json +{ + "agents": { + "language_detection": { + "model": "gpt-5-nano", + "temperature": 0.0, + "max_retries": 3 + }, + "translation": { + "model": "gpt-5-nano", + "temperature": 0.3, + "max_retries": 3 + }, + "sentiment_analysis": { + "model": "gpt-5-nano", + "temperature": 0.2, + "max_retries": 3 + } + }, + "workflow": { + "description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)", + "parallel_processing": { + "enabled": true, + "worker_calculation": "CPU count - 2, max 5 workers", + "min_batch_size": 20, + "max_batch_size": 1000 + } + }, + "snowflake": { + "output_table": "COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + } +} +``` + +**Note**: Batch size is now calculated dynamically and no longer needs to be configured manually. + +### Sentiment Categories Configuration + +The `config_files/sentiment_analysis_config.json` file defines sentiment categories and intents (easily extensible): + +```json +{ + "sentiment_polarity": { + "categories": [ + {"value": "very_positive", "label": "Very Positive", "description": "..."}, + {"value": "positive", "label": "Positive", "description": "..."}, + {"value": "neutral", "label": "Neutral", "description": "..."}, + {"value": "negative", "label": "Negative", "description": "..."}, + {"value": "very_negative", "label": "Very Negative", "description": "..."} + ] + }, + "intent": { + "categories": [ + {"value": "praise", "label": "Praise", "description": "..."}, + {"value": "question", "label": "Question", "description": "..."}, + {"value": "request", "label": "Request", "description": "..."}, + {"value": "feedback_negative", "label": "Negative Feedback", "description": "..."}, + {"value": "suggestion", "label": "Suggestion", "description": "..."}, + {"value": "humor_sarcasm", "label": "Humor/Sarcasm", "description": "..."}, + {"value": 
"off_topic", "label": "Off Topic", "description": "..."}, + {"value": "spam_selfpromo", "label": "Spam/Self-Promotion", "description": "..."} + ] + }, + "reply_policy": { + "requires_reply_intents": ["question", "request"], + "description": "Comments with these intents should be flagged for reply" + }, + "intent_settings": { + "multi_label": true, + "description": "Intent can have multiple labels as a comment can express multiple intents" + } +} +``` + +## Adding New Agents + +The system is designed for easy extensibility. To add a new agent: + +### 1. Create Agent Class + +```python +from agents.base_agent import BaseAgent +from typing import Dict, Any + +class MyNewAgent(BaseAgent): + def __init__(self, config: Dict[str, Any], api_key: str): + super().__init__("MyNewAgent", config) + # Initialize your agent-specific components + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + # Validate input data + return True + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + # Implement your agent logic + return {"success": True, "result": "..."} +``` + +### 2. Update Workflow + +Add the agent to `workflow/comment_processor.py`: + +```python +# Add to CommentState TypedDict +new_agent_result: str + +# Add node +workflow.add_node("my_new_agent", self._my_new_agent_node) + +# Add edges +workflow.add_edge("translation", "my_new_agent") +workflow.add_edge("my_new_agent", END) +``` + +### 3. 
Update Configuration + +Add agent config to `sentiment_config.json`: + +```json +{ + "agents": { + "my_new_agent": { + "name": "MyNewAgent", + "model": "gpt-4o-mini", + "temperature": 0.5, + "max_retries": 3 + } + } +} +``` + +## Output Schema + +### Social Media Comments Table +Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` + +### Musora Comments Table +Stored in `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` + +### Common Columns (Both Tables) + +| Column | Type | Description | +|--------|------|-------------| +| COMMENT_SK | NUMBER | Surrogate key from source | +| COMMENT_ID | VARCHAR | Platform comment ID | +| ORIGINAL_TEXT | VARCHAR | Original comment text | +| **PARENT_COMMENT_ID** | **VARCHAR** | **ID of parent comment if this is a reply** | +| **PARENT_COMMENT_TEXT** | **VARCHAR** | **Text of parent comment for context** | +| DETECTED_LANGUAGE | VARCHAR | Detected language name | +| LANGUAGE_CODE | VARCHAR | ISO 639-1 code | +| IS_ENGLISH | BOOLEAN | Is comment in English | +| TRANSLATED_TEXT | VARCHAR | English translation | +| TRANSLATION_PERFORMED | BOOLEAN | Was translation performed | +| SENTIMENT_POLARITY | VARCHAR | Sentiment (very_positive, positive, neutral, negative, very_negative) | +| INTENT | VARCHAR | Multi-label intents (comma-separated) | +| REQUIRES_REPLY | BOOLEAN | Does comment need a response | +| SENTIMENT_CONFIDENCE | VARCHAR | Analysis confidence (high, medium, low) | +| PROCESSING_SUCCESS | BOOLEAN | Processing status | +| PROCESSED_AT | TIMESTAMP | Processing timestamp | + +### Musora-Specific Additional Columns + +| Column | Type | Description | +|--------|------|-------------| +| PERMALINK_URL | VARCHAR | Web URL path of the content | +| THUMBNAIL_URL | VARCHAR | Thumbnail URL of the content | + +### Available Views + +**Social Media:** +- `VW_COMMENTS_REQUIRING_REPLY`: Comments that need responses (includes parent comment info) +- `VW_SENTIMENT_DISTRIBUTION`: Sentiment and intent statistics by 
channel (includes reply comment count) +- `VW_NON_ENGLISH_COMMENTS`: Filtered view of non-English comments + +**Musora:** +- `VW_MUSORA_COMMENTS_REQUIRING_REPLY`: Musora comments needing responses +- `VW_MUSORA_SENTIMENT_DISTRIBUTION`: Musora sentiment and intent statistics +- `VW_MUSORA_NON_ENGLISH_COMMENTS`: Non-English Musora comments + +## Workflow Architecture + +The system uses LangGraph to create a flexible, state-based workflow: + +``` +┌─────────────────────┐ +│ Fetch Comments │ +│ from Snowflake │ +│ (Unprocessed Only) │ +└──────────┬──────────┘ + │ + ▼ +┌─────────────────────┐ +│ Language Detection │ +│ Agent │ +└──────────┬──────────┘ + │ + ▼ + ┌────┴────┐ + │ English?│ + └────┬────┘ + │ + ┌─────┴─────┐ + │ │ + Yes No + │ │ + │ ▼ + │ ┌─────────────┐ + │ │ Translation │ + │ │ Agent │ + │ └──────┬──────┘ + │ │ + └─────┬─────┘ + │ + ▼ + ┌──────────────────┐ + │ Sentiment │ + │ Analysis Agent │ + └─────────┬────────┘ + │ + ▼ + ┌──────────────┐ + │Store Results │ + │to Snowflake │ + │(Append Mode) │ + └──────────────┘ +``` + +**Note**: The fetch step automatically excludes comments already present in `COMMENT_SENTIMENT_FEATURES`, enabling incremental processing. + +## Logging + +Logs are automatically created in the `logs/` directory with timestamps: + +``` +logs/comment_processing_20251001_143022.log +``` + +## Adding New Data Sources + +The system is designed to make adding new data sources easy: + +### Steps to Add a New Source: + +1. **Update Configuration** (`config_files/data_sources_config.json`): + ```json + "your_new_source": { + "name": "Your New Source Name", + "enabled": true, + "sql_query_file": "sql/fetch_your_source.sql", + "output_config": { + "table_name": "YOUR_SOURCE_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + "additional_fields": ["FIELD1", "FIELD2"] // Optional + } + ``` + +2. 
**Create SQL Query File** (`sql/fetch_your_source.sql`): + - Fetch comments with consistent column names + - Include self-join for parent comments if available + - Exclude already-processed comments (LEFT JOIN with output table) + +3. **Create Table Initialization Script** (`sql/init_your_source_table.sql`): + - Creates empty table structure + - Base schema on `init_musora_table.sql` + - Add source-specific fields as needed + - **Run this in Snowflake FIRST before processing** + +4. **Create Full Schema** (optional): + - Base schema on `create_musora_ml_features_table.sql` + - Include views and indexes + +5. **Run First Time**: + ```bash + # Step 1: Run init script in Snowflake + sql/init_your_source_table.sql + + # Step 2: Process first batch + python main.py --overwrite --data-source your_new_source --limit 100 + ``` + +**No code changes required!** + +## Best Practices + +1. **Testing**: Always test with `--limit` flag first (e.g., `--limit 100`) +2. **New Data Sources**: Test new sources with `--sequential --limit 100` first +3. **Debugging**: Use `--sequential` flag for easier debugging of processing issues +4. **Incremental Processing**: Run regularly without `--overwrite` to process only new comments +5. **Monitoring**: Check logs for processing errors and batch completion +6. **Performance**: Use default parallel mode for production workloads +7. **Extensibility**: Follow the base agent pattern for consistency +8. **Error Handling**: All agents include robust error handling +9. **Failed Comments**: Review logs for failed comments - they'll be automatically retried in future runs +10. **Resource Management**: System automatically adapts to available CPU resources +11. 
**Parent Comments**: Ensure SQL queries include parent comment joins for best accuracy + +## Sentiment Analysis Features + +### Multi-Label Intent Classification + +The system supports **multi-label intent classification**, meaning a single comment can have multiple intents: + +- **Example**: "This is amazing! What scale are you using?" → `["praise", "question"]` +- **Example**: "Love this but can you make a tutorial on it?" → `["praise", "request"]` + +### Context-Aware Analysis with Parent Comment Support + +The sentiment analysis agent provides rich context understanding: + +1. **Content Context**: Uses the `content_description` field to understand what the comment is about +2. **Parent Comment Context** (NEW): When analyzing reply comments, the system: + - Automatically detects when a comment is a reply + - Fetches the parent comment text from the database + - Includes parent comment in the LLM prompt + - Explicitly instructs the LLM that this is a reply comment + - Results in more accurate sentiment and intent classification + +**Example**: +- Parent Comment: "Does anyone know how to play this riff?" +- Reply Comment: "Yes!" +- Without parent context: Might be classified as unclear/off-topic +- With parent context: Correctly classified as answering a question + +This dramatically improves accuracy for: +- Short reply comments ("Yes", "Thanks!", "Agreed") +- Sarcastic replies (context crucial for understanding) +- Continuation of discussions +- Agreement/disagreement comments + +### Failure Handling & Reprocessing + +Comments that fail sentiment analysis (missing critical fields like sentiment_polarity or intents) are: +- Marked as `success=False` in the workflow +- **NOT stored in Snowflake** +- **Automatically available for reprocessing** in future runs + +This ensures only successfully processed comments are stored, while failed comments remain available for retry. 
+ +### Incremental Processing & Deduplication + +The pipeline automatically handles incremental processing: +- **SQL-level deduplication**: Query excludes comments already in `COMMENT_SENTIMENT_FEATURES` using `LEFT JOIN` +- **Automatic retry**: Failed comments (not stored) are automatically retried on next run +- **Append-only mode**: Default behavior appends new records without overwriting +- **Production-ready**: Safe to run daily/weekly/monthly to process new comments + +### Scalable Configuration + +To add or modify sentiment categories or intents: + +1. Edit `config_files/sentiment_analysis_config.json` +2. Add/modify categories in the `sentiment_polarity` or `intent` sections +3. Update `reply_policy.requires_reply_intents` if needed +4. No code changes required! + +## Future Extensions + +The modular architecture supports easy addition of: + +- Topic classification agent +- Entity extraction agent +- Engagement score prediction agent +- Named entity recognition agent + +Simply create a new agent inheriting from `BaseAgent` and add it to the workflow graph. + +## Troubleshooting + +### Issue: "Object does not exist or not authorized" on First Run + +**Error**: `Object 'SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES' does not exist or not authorized` + +**Cause**: The fetch query tries to check for already-processed comments, but the output table doesn't exist yet on first run. + +**Solution**: +1. Run the initialization script first: + ```sql + -- Execute in Snowflake + sql/init_musora_table.sql + ``` +2. 
Then run the processing: + ```bash + python main.py --overwrite --data-source musora_comments --limit 100 + ``` + +### Issue: API Rate Limits + +If hitting API rate limits, reduce the number of parallel workers or process fewer comments: +```bash +# Process fewer comments at a time +python main.py --limit 500 + +# Or use sequential mode +python main.py --sequential --limit 100 +``` + +### Issue: Memory Issues + +Process in smaller batches using `--limit`: +```bash +python main.py --limit 500 +``` + +### Issue: Debugging Processing Errors + +Use sequential mode to debug issues more easily: +```bash +python main.py --sequential --limit 50 +``` + +This processes all comments in a single batch with clearer error messages. + +### Issue: Connection Timeouts + +Check Snowflake credentials in `.env` and network connectivity. + +### Issue: Parallel Processing Not Working + +If multiprocessing issues occur, use sequential mode: +```bash +python main.py --sequential +``` + +## Performance + +### Expected Speedup + +Parallel processing provides significant performance improvements: + +- **Sequential**: 1x (baseline) +- **2 workers**: ~1.8-1.9x faster +- **5 workers**: ~4-4.5x faster + +Speedup isn't perfectly linear due to: +- Snowflake connection overhead +- LLM API rate limits (shared across workers) +- I/O operations + +### Monitoring Performance + +The processing summary includes: +- Total processing time +- Average time per comment +- Number of workers used +- Batch size calculations +- Failed batches (if any) + +## License + +Internal use only - Musora sentiment analysis project. 
\ No newline at end of file diff --git a/processing_comments/SnowFlakeConnection.py b/processing_comments/SnowFlakeConnection.py new file mode 100644 index 0000000000000000000000000000000000000000..271a0f09942a3465d752d1692168674e4a371f45 --- /dev/null +++ b/processing_comments/SnowFlakeConnection.py @@ -0,0 +1,121 @@ +""" +This class create a connection to Snowflake, run queries (read and write) +""" +import json +import os +from snowflake.snowpark import Session +from dotenv import load_dotenv +import logging +logger = logging.getLogger() +load_dotenv() + +class SnowFlakeConn: + def __init__(self): + self. session = self.connect_to_snowflake() + + + # ========================================================= + def connect_to_snowflake(self): + # --- Snowflake connection via env vars --- + conn = dict( + user=self.get_credential("SNOWFLAKE_USER"), + password=self.get_credential("SNOWFLAKE_PASSWORD"), + account=self.get_credential("SNOWFLAKE_ACCOUNT"), + role=self.get_credential("SNOWFLAKE_ROLE"), + database=self.get_credential("SNOWFLAKE_DATABASE"), + warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"), + schema=self.get_credential("SNOWFLAKE_SCHEMA"), + ) + + session = Session.builder.configs(conn).create() + return session + + # ========================================================= + def get_credential(self, key): + return os.getenv(key) + + # ========================================================= + def run_read_query(self, query, data): + """ + Executes a SQL query on Snowflake that fetch the data + :return: Pandas dataframe containing the query results + """ + + # Connect to Snowflake + try: + dataframe = self.session.sql(query).to_pandas() + dataframe.columns = dataframe.columns.str.lower() + print(f"reading {data} table successfully") + return dataframe + except Exception as e: + print(f"Error in creating/updating table: {e}") + + # ========================================================= + def store_df_to_snowflake(self, table_name, dataframe, 
database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False): + """ + Executes a SQL query on Snowflake that write the preprocessed data on new tables + :param query: SQL query string to be executed + :return: None + """ + + try: + self.session.use_database(database) + self.session.use_schema(schema) + + dataframe = dataframe.reset_index(drop=True) + dataframe.columns = dataframe.columns.str.upper() + + self.session.write_pandas(df=dataframe, + table_name=table_name.strip().upper(), + auto_create_table=True, + overwrite=overwrite, + use_logical_type=True) + print(f"Data inserted into {table_name} successfully.") + + except Exception as e: + print(f"Error in creating/updating/inserting table: {e}") + + # ========================================================= + def execute_sql_file(self, file_path): + """ + Executes SQL queries from a file + :param file_path: Path to SQL file + :return: Query result or None for DDL/DML + """ + try: + with open(file_path, 'r', encoding='utf-8') as file: + sql_content = file.read() + + result = self.session.sql(sql_content).collect() + print(f"Successfully executed SQL from {file_path}") + return result + except Exception as e: + print(f"Error executing SQL file {file_path}: {e}") + return None + + # ========================================================= + def execute_query(self, query, description="query"): + """ + Executes a SQL query and returns results + :param query: SQL query string + :param description: Description of the query for logging + :return: Query results + """ + try: + result = self.session.sql(query).collect() + print(f"Successfully executed {description}") + return result + except Exception as e: + print(f"Error executing {description}: {e}") + return None + + + # ========================================================= + def get_data(self, data): + # get any sort of data based on requirement --> comments, contents, etc + pass + + # ========================================================= + def 
close_connection(self): + self.session.close() + diff --git a/processing_comments/agents/README.md b/processing_comments/agents/README.md new file mode 100644 index 0000000000000000000000000000000000000000..419d66a9aad160446668754684b6ecbdadb77a29 --- /dev/null +++ b/processing_comments/agents/README.md @@ -0,0 +1,1571 @@ +# Agents Architecture Documentation + +## Table of Contents +- [Overview](#overview) +- [Agent Architecture](#agent-architecture) +- [Existing Agents](#existing-agents) +- [How Agents Work](#how-agents-work) +- [Adding New Agents](#adding-new-agents) +- [Modifying Existing Agents](#modifying-existing-agents) +- [Configuration System](#configuration-system) +- [Best Practices](#best-practices) +- [Troubleshooting](#troubleshooting) + +## Overview + +The agent system in this project is built on a modular, extensible architecture that processes social media comments through a series of specialized agents. Each agent performs a specific task (language detection, translation, sentiment analysis) and is orchestrated through a LangGraph workflow. + +### Key Design Principles + +1. **Modularity**: Each agent handles a single responsibility +2. **Extensibility**: Easy to add new agents without modifying existing code +3. **Consistency**: All agents inherit from a common base class +4. **Configuration-Driven**: Agent behavior controlled through JSON config files +5. 
**Error Resilience**: Robust error handling at every level + +### Technology Stack + +- **LangChain**: For LLM interactions and agent framework +- **LangGraph**: For workflow orchestration +- **OpenAI API**: LLM backend for NLP tasks +- **Lingua**: Fast language detection library +- **Python 3.x**: Core language + +## Agent Architecture + +### Directory Structure + +``` +agents/ +├── __init__.py # Module exports +├── base_agent.py # Abstract base class +├── language_detection_agent.py # Language detection agent +├── translation_agent.py # Translation agent +├── sentiment_analysis_agent.py # Sentiment analysis agent +└── README.md # This file +``` + +### Base Agent Class + +All agents inherit from `BaseAgent` (`base_agent.py`), which provides: + +```python +class BaseAgent(ABC): + """Abstract base class for all agents""" + + # Common attributes + - name: str # Agent name + - config: Dict[str, Any] # Configuration dictionary + - model: str # LLM model to use + - temperature: float # LLM temperature + - max_retries: int # Maximum retry attempts + + # Abstract methods (must be implemented) + @abstractmethod + def process(input_data: Dict) -> Dict + @abstractmethod + def validate_input(input_data: Dict) -> bool + + # Common methods (inherited) + def get_name() -> str + def get_config() -> Dict + def log_processing(message: str, level: str) + def handle_error(error: Exception, context: str) -> Dict +``` + +### Workflow Integration + +Agents are orchestrated through `workflow/comment_processor.py` using LangGraph: + +``` +┌─────────────────────┐ +│ Language Detection │ +│ Agent │ +└──────────┬──────────┘ + │ + ▼ + ┌────┴────┐ + │ English?│ + └────┬────┘ + │ + ┌─────┴─────┐ + │ │ + Yes No + │ │ + │ ▼ + │ ┌─────────────┐ + │ │ Translation │ + │ │ Agent │ + │ └──────┬──────┘ + │ │ + └─────┬─────┘ + │ + ▼ + ┌──────────────────┐ + │ Sentiment │ + │ Analysis Agent │ + └──────────────────┘ +``` + +## Existing Agents + +### 1. 
Language Detection Agent + +**File**: `language_detection_agent.py` + +**Purpose**: Detects the language of comment text using a hybrid approach. + +**Strategy**: +- Uses **Lingua library** for fast English detection +- Falls back to **LLM** for non-English languages (higher accuracy) +- Returns language name, ISO code, and confidence level + +**Key Methods**: +```python +def detect_with_lingua(text: str) -> tuple[str, str, bool] + # Fast detection using lingua library + # Returns: (language_code, language_name, is_english) + +def detect_with_llm(text: str) -> Dict[str, Any] + # LLM-based detection for nuanced analysis + # Returns: {language, language_code, confidence, has_text} + +def process(input_data: Dict) -> Dict + # Main processing: lingua first, LLM if not English +``` + +**Configuration** (`sentiment_config.json`): +```json +{ + "language_detection": { + "model": "gpt-5-nano", + "temperature": 0.0, + "max_retries": 3 + } +} +``` + +**Input Requirements**: +- `comment_text`: str + +**Output**: +- `language`: str (e.g., "English", "Spanish") +- `language_code`: str (ISO 639-1, e.g., "en", "es") +- `is_english`: bool +- `confidence`: str ("high", "medium", "low") +- `detection_method`: str ("lingua", "llm", "default") +- `has_text`: bool + +### 2. Translation Agent + +**File**: `translation_agent.py` + +**Purpose**: Translates non-English comments to English using LLM. 
+ +**Strategy**: +- Skips translation if already English +- Uses LLM for context-aware, high-quality translation +- Preserves tone, intent, emojis, and special characters +- Specialized for music/education social media content + +**Key Methods**: +```python +def translate_text(text: str, source_language: str) -> Dict + # LLM-based translation with context preservation + # Returns: {translated_text, translation_confidence, notes} + +def process(input_data: Dict) -> Dict + # Main processing: checks is_english, translates if needed +``` + +**Configuration**: +```json +{ + "translation": { + "model": "gpt-5-nano", + "temperature": 0.3, + "max_retries": 3 + } +} +``` + +**Input Requirements**: +- `comment_text`: str +- `is_english`: bool +- `language`: str (optional, for context) + +**Output**: +- `translated_text`: str +- `translation_performed`: bool +- `translation_confidence`: str +- `translation_notes`: str + +### 3. Sentiment Analysis Agent + +**File**: `sentiment_analysis_agent.py` + +**Purpose**: Analyzes sentiment polarity, intent, and determines if reply is needed. 
+ +**Strategy**: +- Uses content description for context +- Supports parent comment context for reply analysis +- Multi-label intent classification +- Differentiates genuine vs rhetorical/sarcastic questions +- Platform-aware analysis (YouTube, Facebook, Instagram) + +**Key Features**: +- **Context-Aware**: Uses content description and parent comment +- **Multi-Label**: Can assign multiple intents to a single comment +- **Reply Policy**: Flags comments requiring responses +- **Rhetorical Detection**: Identifies sarcastic/rhetorical questions + +**Key Methods**: +```python +def _build_context_string( + content_description: str, + parent_comment_text: str = None, + platform: str = None, + content_title: str = None +) -> str + # Builds context for LLM prompt + # Handles YouTube title+description vs other platforms + +def analyze_sentiment( + comment_text: str, + content_description: str, + parent_comment_text: str = None, + platform: str = None, + content_title: str = None +) -> Dict + # Performs sentiment analysis with full context + # Returns: {sentiment_polarity, intent, requires_reply, confidence, analysis_notes} + +def process(input_data: Dict) -> Dict + # Main processing: validates input, analyzes sentiment +``` + +**Configuration**: +Uses two config files: + +1. **Agent Config** (`sentiment_config.json`): +```json +{ + "sentiment_analysis": { + "model": "gpt-5-nano", + "temperature": 0.2, + "max_retries": 3 + } +} +``` + +2. 
**Categories Config** (`sentiment_analysis_config.json`): +```json +{ + "sentiment_polarity": { + "categories": [ + {"value": "very_positive", "label": "Very Positive", "description": "..."}, + {"value": "positive", "label": "Positive", "description": "..."}, + {"value": "neutral", "label": "Neutral", "description": "..."}, + {"value": "negative", "label": "Negative", "description": "..."}, + {"value": "very_negative", "label": "Very Negative", "description": "..."} + ] + }, + "intent": { + "categories": [ + {"value": "praise", "description": "..."}, + {"value": "question", "description": "..."}, + {"value": "request", "description": "..."}, + {"value": "feedback_negative", "description": "..."}, + {"value": "suggestion", "description": "..."}, + {"value": "humor_sarcasm", "description": "..."}, + {"value": "off_topic", "description": "..."}, + {"value": "spam_selfpromo", "description": "..."} + ] + }, + "reply_policy": { + "requires_reply_intents": ["question", "request"], + "not_include": ["humor_sarcasm"] + } +} +``` + +**Input Requirements**: +- `comment_text`: str +- `content_description`: str +- `parent_comment_text`: str (optional) +- `platform`: str (optional, e.g., "youtube", "facebook") +- `content_title`: str (optional, mainly for YouTube) + +**Output**: +- `sentiment_polarity`: str (one of: very_positive, positive, neutral, negative, very_negative) +- `intent`: str (comma-separated list, e.g., "praise, question") +- `requires_reply`: bool +- `sentiment_confidence`: str ("high", "medium", "low") +- `analysis_notes`: str (1-2 sentence summary) +- `success`: bool (False if critical fields missing) + +### Common Patterns Across All Agents + +1. **JSON Response Parsing**: All agents have `_parse_llm_json_response()` method to handle markdown-wrapped JSON +2. **Error Handling**: All use `handle_error()` from base class +3. **Logging**: All use `log_processing()` for consistent logging +4. **Validation**: All implement `validate_input()` before processing +5. 
**State Preservation**: All preserve original input data in output + +## How Agents Work + +### Workflow Execution Flow + +1. **Initialization** (`CommentProcessingWorkflow.__init__`): + ```python + # Load configurations + lang_detect_config = config["agents"]["language_detection"] + translation_config = config["agents"]["translation"] + sentiment_config = config["agents"]["sentiment_analysis"] + + # Initialize agents + self.language_agent = LanguageDetectionAgent(lang_detect_config, api_key) + self.translation_agent = TranslationAgent(translation_config, api_key) + self.sentiment_agent = SentimentAnalysisAgent(sentiment_config, api_key, sentiment_categories) + + # Build workflow graph + self.workflow = self._build_workflow() + ``` + +2. **Workflow Graph** (`_build_workflow()`): + ```python + workflow = StateGraph(CommentState) + + # Add nodes (agents) + workflow.add_node("language_detection", self._language_detection_node) + workflow.add_node("translation", self._translation_node) + workflow.add_node("sentiment_analysis", self._sentiment_analysis_node) + + # Define edges (control flow) + workflow.set_entry_point("language_detection") + workflow.add_conditional_edges( + "language_detection", + self._should_translate, + {"translate": "translation", "skip_translation": "sentiment_analysis"} + ) + workflow.add_edge("translation", "sentiment_analysis") + workflow.add_edge("sentiment_analysis", END) + + return workflow.compile() + ``` + +3. **Node Execution** (Example: `_language_detection_node`): + ```python + def _language_detection_node(self, state: CommentState) -> CommentState: + try: + # Prepare input + input_data = {"comment_text": state["comment_text"]} + + # Process with agent + result = self.language_agent.process(input_data) + + # Update state + if result.get("success", False): + state["language"] = result.get("language", "English") + state["language_code"] = result.get("language_code", "en") + state["is_english"] = result.get("is_english", True) + # ... 
more fields + else: + # Handle error, set defaults + state["processing_errors"].append(result.get("error")) + + return state + except Exception as e: + # Error handling + state["processing_errors"].append(str(e)) + return state + ``` + +4. **Decision Points** (Example: `_should_translate`): + ```python + def _should_translate(self, state: CommentState) -> str: + if state.get("is_english", True) or not state.get("has_text", True): + # Set defaults for skipped translation + state["translated_text"] = state["comment_text"] + state["translation_performed"] = False + return "skip_translation" + else: + return "translate" + ``` + +5. **Comment Processing** (`process_comment()`): + ```python + def process_comment(self, comment_data: Dict) -> Dict: + # Initialize state + initial_state = { + "comment_sk": comment_data.get("comment_sk"), + "comment_text": comment_data.get("comment_text"), + # ... all fields + "processing_errors": [], + "success": True + } + + # Run workflow + final_state = self.workflow.invoke(initial_state) + + # Merge and return + return dict(final_state) + ``` + +### State Management + +The workflow uses a `CommentState` TypedDict to pass data between agents: + +```python +class CommentState(TypedDict): + # Input fields + comment_sk: int + comment_id: str + comment_text: str + # ... more fields + + # Processing fields (populated by agents) + language: str + language_code: str + is_english: bool + translated_text: str + sentiment_polarity: str + intent: str + # ... more fields + + # Metadata + processing_errors: Annotated[List[str], operator.add] + success: bool +``` + +### Error Handling Strategy + +1. **Agent Level**: Each agent returns `{"success": False, "error": "..."}` on failure +2. **Node Level**: Nodes catch exceptions, set defaults, append to `processing_errors` +3. **Workflow Level**: Workflow continues even if an agent fails (graceful degradation) +4. 
**Critical Failures**: Sentiment agent marks `success=False` if critical fields missing (comment not stored) + +## Adding New Agents + +### Step-by-Step Guide + +#### Step 1: Create the Agent Class + +Create a new file in the `agents/` directory (e.g., `topic_classification_agent.py`): + +```python +""" +Topic Classification Agent +Extracts topics and themes from comments +""" + +from typing import Dict, Any +import json +from langchain_openai import ChatOpenAI +from langchain.schema import HumanMessage, SystemMessage +from agents.base_agent import BaseAgent +import logging + +logger = logging.getLogger(__name__) + + +class TopicClassificationAgent(BaseAgent): + """ + Agent that classifies comments into predefined topics/themes. + """ + + def __init__(self, config: Dict[str, Any], api_key: str, topic_categories: Dict[str, Any]): + """ + Initialize the Topic Classification Agent. + + Args: + config: Configuration dictionary + api_key: OpenAI API key + topic_categories: Dictionary with topic categories + """ + super().__init__("TopicClassificationAgent", config) + self.api_key = api_key + self.topic_categories = topic_categories + self.llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=self.api_key + ) + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate that input contains required fields. + + Args: + input_data: Input dictionary + + Returns: + True if valid, False otherwise + """ + required_fields = ["comment_text"] + return all(field in input_data for field in required_fields) + + def classify_topics(self, comment_text: str) -> Dict[str, Any]: + """ + Classify comment into topics using LLM. 
+ + Args: + comment_text: The comment text to analyze + + Returns: + Dictionary with topic classification results + """ + # Build topic options from config + topic_options = "\n".join([ + f"- {cat['value']}: {cat['description']}" + for cat in self.topic_categories["topics"]["categories"] + ]) + + system_prompt = f"""You are an expert at classifying music-related comments into topics. + +Available Topics: +{topic_options} + +Return your response in JSON format with the following fields: +- topics: array of topic values (multi-label, can have multiple topics) +- confidence: your confidence level (high, medium, low) +- reasoning: brief explanation of your classification +""" + + user_prompt = f"""Classify this comment into relevant topics: + +Comment: "{comment_text}" + +Return JSON only.""" + + try: + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=user_prompt) + ] + + response = self.llm.invoke(messages) + result = self._parse_llm_json_response(response.content) + + topics = result.get("topics", []) + if isinstance(topics, str): + topics = [topics] + + topic_str = ", ".join(topics) if topics else None + + return { + "success": True, + "topics": topic_str, + "topic_confidence": result.get("confidence", "medium"), + "topic_reasoning": result.get("reasoning", "") + } + + except json.JSONDecodeError as e: + self.log_processing(f"JSON decode error: {str(e)}", "warning") + return { + "success": False, + "error": str(e) + } + except Exception as e: + self.log_processing(f"Topic classification failed: {str(e)}", "error") + return { + "success": False, + "error": str(e) + } + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process comment and extract topics. 
+ + Args: + input_data: Dictionary containing comment data + + Returns: + Dictionary with topic classification results + """ + try: + # Validate input + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields" + } + + comment_text = input_data["comment_text"] + + self.log_processing(f"Classifying topics for comment", "debug") + + # Perform classification + classification_result = self.classify_topics(comment_text) + + result = { + "success": classification_result.get("success", False), + "topics": classification_result.get("topics"), + "topic_confidence": classification_result.get("topic_confidence"), + "topic_reasoning": classification_result.get("topic_reasoning", "") + } + + if "error" in classification_result: + result["topic_error"] = classification_result["error"] + + # Preserve all original data + for key, value in input_data.items(): + if key not in result: + result[key] = value + + return result + + except Exception as e: + return self.handle_error(e, "topic_classification") + + def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]: + """ + Parse LLM response that may contain JSON wrapped in markdown code blocks. + + Args: + response_content: Raw response content from LLM + + Returns: + Parsed JSON dictionary + + Raises: + json.JSONDecodeError: If JSON cannot be parsed + """ + content = response_content.strip() + + # Check if response is wrapped in markdown code block + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + elif content.startswith("```"): + content = content[3:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + return json.loads(content) +``` + +#### Step 2: Update `__init__.py` + +Add your agent to `agents/__init__.py`: + +```python +""" +Agents module for the sentiment analysis workflow. 
+Provides modular, extensible agents for various NLP tasks. +""" + +from agents.base_agent import BaseAgent +from agents.language_detection_agent import LanguageDetectionAgent +from agents.translation_agent import TranslationAgent +from agents.sentiment_analysis_agent import SentimentAnalysisAgent +from agents.topic_classification_agent import TopicClassificationAgent # ADD THIS + +__all__ = [ + "BaseAgent", + "LanguageDetectionAgent", + "TranslationAgent", + "SentimentAnalysisAgent", + "TopicClassificationAgent" # ADD THIS +] +``` + +#### Step 3: Update Configuration Files + +Add agent configuration to `config_files/sentiment_config.json`: + +```json +{ + "agents": { + "language_detection": { ... }, + "translation": { ... }, + "sentiment_analysis": { ... }, + "topic_classification": { + "name": "TopicClassificationAgent", + "model": "gpt-5-nano", + "temperature": 0.2, + "max_retries": 3, + "description": "Classifies comments into topic categories" + } + } +} +``` + +Create topic categories config (or add to existing `sentiment_analysis_config.json`): + +```json +{ + "topics": { + "categories": [ + { + "value": "technique", + "label": "Technique", + "description": "Playing technique, finger positioning, hand coordination" + }, + { + "value": "theory", + "label": "Music Theory", + "description": "Scales, chords, harmony, composition theory" + }, + { + "value": "equipment", + "label": "Equipment", + "description": "Instruments, gear, accessories, software" + }, + { + "value": "performance", + "label": "Performance", + "description": "Stage presence, live playing, performance anxiety" + }, + { + "value": "practice", + "label": "Practice", + "description": "Practice routines, discipline, improvement tips" + } + ] + } +} +``` + +#### Step 4: Update Workflow State + +Add fields to `CommentState` in `workflow/comment_processor.py`: + +```python +class CommentState(TypedDict): + # ... existing fields ... 
+ + # Topic classification fields + topics: str + topic_confidence: str + topic_reasoning: str +``` + +#### Step 5: Add Workflow Node + +Add the node method to `CommentProcessingWorkflow` class: + +```python +def _topic_classification_node(self, state: CommentState) -> CommentState: + """ + Node for topic classification. + + Args: + state: Current workflow state + + Returns: + Updated state with topic classification results + """ + try: + # Prepare input + input_data = { + "comment_text": state.get("translated_text", state["comment_text"]) + } + + # Process with topic classification agent + result = self.topic_agent.process(input_data) + + # Update state + if result.get("success", False): + state["topics"] = result.get("topics") + state["topic_confidence"] = result.get("topic_confidence") + state["topic_reasoning"] = result.get("topic_reasoning", "") + else: + error_msg = f"Topic classification failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["topics"] = None + state["topic_confidence"] = None + state["topic_reasoning"] = "Topic classification failed" + + logger.debug(f"Topics: {state['topics']}") + return state + + except Exception as e: + error_msg = f"Topic classification node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + state["topics"] = None + state["topic_confidence"] = None + state["topic_reasoning"] = "Error during topic classification" + return state +``` + +#### Step 6: Initialize Agent in Workflow + +Update `__init__` method: + +```python +def __init__(self, config: Dict[str, Any], api_key: str): + # ... existing initialization ... 
+ + # Load topic categories + topic_categories_path = config.get("topic_categories_config", "config_files/topic_categories.json") + with open(topic_categories_path, 'r') as f: + topic_categories = json.load(f) + + # Initialize topic agent + topic_config = config["agents"]["topic_classification"] + self.topic_agent = TopicClassificationAgent(topic_config, api_key, topic_categories) +``` + +#### Step 7: Update Workflow Graph + +Modify `_build_workflow()`: + +```python +def _build_workflow(self) -> StateGraph: + workflow = StateGraph(CommentState) + + # Add nodes + workflow.add_node("language_detection", self._language_detection_node) + workflow.add_node("translation", self._translation_node) + workflow.add_node("sentiment_analysis", self._sentiment_analysis_node) + workflow.add_node("topic_classification", self._topic_classification_node) # ADD THIS + + # Define edges + workflow.set_entry_point("language_detection") + workflow.add_conditional_edges( + "language_detection", + self._should_translate, + {"translate": "translation", "skip_translation": "sentiment_analysis"} + ) + workflow.add_edge("translation", "sentiment_analysis") + workflow.add_edge("sentiment_analysis", "topic_classification") # ADD THIS + workflow.add_edge("topic_classification", END) # MODIFY THIS + + return workflow.compile() +``` + +#### Step 8: Update Database Schema + +Add columns to your Snowflake table: + +```sql +ALTER TABLE COMMENT_SENTIMENT_FEATURES +ADD COLUMN TOPICS VARCHAR(500), +ADD COLUMN TOPIC_CONFIDENCE VARCHAR(20), +ADD COLUMN TOPIC_REASONING VARCHAR(1000); +``` + +#### Step 9: Test Your Agent + +Test with a small batch first: + +```bash +python main.py --limit 10 --sequential +``` + +Check logs for any errors and verify output in Snowflake. 
+ +### Quick Checklist for Adding New Agents + +- [ ] Create agent class inheriting from `BaseAgent` +- [ ] Implement `validate_input()` method +- [ ] Implement `process()` method +- [ ] Implement `_parse_llm_json_response()` if using LLM +- [ ] Add agent to `agents/__init__.py` +- [ ] Add configuration to `sentiment_config.json` +- [ ] Create/update category config file if needed +- [ ] Add fields to `CommentState` TypedDict +- [ ] Create node method in `CommentProcessingWorkflow` +- [ ] Initialize agent in `__init__` +- [ ] Add node to workflow graph +- [ ] Update edges in workflow +- [ ] Update database schema +- [ ] Test with small batch + +## Modifying Existing Agents + +### Common Modifications + +#### 1. Change LLM Model + +Update `config_files/sentiment_config.json`: + +```json +{ + "agents": { + "sentiment_analysis": { + "model": "gpt-4o", // Change from gpt-5-nano + "temperature": 0.2, + "max_retries": 3 + } + } +} +``` + +No code changes needed! Configuration is loaded dynamically. + +#### 2. Add New Sentiment Category + +Update `config_files/sentiment_analysis_config.json`: + +```json +{ + "sentiment_polarity": { + "categories": [ + // ... existing categories ... + { + "value": "mixed", + "label": "Mixed", + "description": "Contains both positive and negative elements" + } + ] + } +} +``` + +The agent will automatically include this in prompts. No code changes needed. + +#### 3. Add New Intent Category + +Update `config_files/sentiment_analysis_config.json`: + +```json +{ + "intent": { + "categories": [ + // ... existing categories ... + { + "value": "collaboration", + "label": "Collaboration", + "description": "Seeking or offering collaboration opportunities" + } + ] + } +} +``` + +#### 4. 
Modify Reply Policy + +Update `config_files/sentiment_analysis_config.json`: + +```json +{ + "reply_policy": { + "requires_reply_intents": ["question", "request", "feedback_negative"], // Added feedback_negative + "not_include": ["humor_sarcasm", "spam_selfpromo"] // Added spam_selfpromo + } +} +``` + +#### 5. Adjust Temperature for Better Results + +If getting inconsistent results, adjust temperature: + +```json +{ + "agents": { + "sentiment_analysis": { + "model": "gpt-5-nano", + "temperature": 0.1, // Lower = more consistent, less creative + "max_retries": 3 + } + } +} +``` + +#### 6. Add Context to Sentiment Analysis + +Modify `_build_context_string()` in `sentiment_analysis_agent.py`: + +```python +def _build_context_string(self, content_description: str, parent_comment_text: str = None, + platform: str = None, content_title: str = None, + channel_name: str = None) -> str: # ADD channel_name + """Build context string for sentiment analysis.""" + context_parts = [] + + # ... existing code ... + + # ADD THIS + if channel_name: + context_parts.append(f"Channel: {channel_name}") + + return "\n".join(context_parts) +``` + +Then update the `analyze_sentiment()` method to accept and pass `channel_name`. + +#### 7. Improve Language Detection Accuracy + +Modify `language_detection_agent.py` to add more languages to LINGUA_TO_ISO: + +```python +LINGUA_TO_ISO = { + # ... existing mappings ... + Language.VIETNAMESE: "vi", + Language.THAI: "th", + Language.INDONESIAN: "id", + # Add more as needed +} +``` + +#### 8. Customize Translation Prompt + +Modify `translate_text()` in `translation_agent.py`: + +```python +system_prompt = """You are a professional translator specializing in social media content related to music and education. +Translate the given text from the source language to English. The text is a comment on a musical content. +Preserve the tone, intent, and any emojis or special characters. 
+For informal social media language, maintain the casual tone in translation. + +// ADD THESE GUIDELINES: +Special Instructions: +- Preserve musical terminology (e.g., "legato", "staccato") untranslated +- Translate instrument names (e.g., "guitarra" → "guitar") +- Keep artist names and brand names in original language +- Maintain slang and colloquialisms when possible + +Return your response in JSON format with the following fields: +- translated_text: The English translation +- translation_confidence: Your confidence level (high, medium, low) +- notes: Any important notes about the translation (optional) +""" +``` + +#### 9. Add Retry Logic for Failed Analyses + +Modify `process()` in `sentiment_analysis_agent.py`: + +```python +def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + try: + # ... existing validation code ... + + # ADD RETRY LOGIC + max_attempts = self.max_retries + for attempt in range(max_attempts): + analysis_result = self.analyze_sentiment( + comment_text, content_description, + parent_comment_text, platform, content_title + ) + + if analysis_result.get("success"): + break + + if attempt < max_attempts - 1: + self.log_processing(f"Attempt {attempt + 1} failed, retrying...", "warning") + + # ... rest of existing code ... +``` + +#### 10. 
Add Custom Validation Rules + +Modify `validate_input()` in any agent: + +```python +def validate_input(self, input_data: Dict[str, Any]) -> bool: + """Validate that input contains required fields.""" + required_fields = ["comment_text", "content_description"] + + # Check required fields exist + if not all(field in input_data for field in required_fields): + return False + + # ADD CUSTOM VALIDATION + # Ensure comment_text is not empty or too short + comment_text = input_data.get("comment_text", "") + if not comment_text or len(comment_text.strip()) < 2: + self.log_processing("Comment text too short or empty", "warning") + return False + + # Ensure content_description exists + content_desc = input_data.get("content_description", "") + if not content_desc or content_desc.strip() == "": + self.log_processing("Content description missing", "warning") + return False + + return True +``` + +### Testing Modified Agents + +After making modifications, always test: + +```bash +# Test with a small batch +python main.py --limit 10 --sequential + +# Check specific data source +python main.py --limit 10 --sequential --data-source social_media + +# Review logs for errors +tail -f logs/comment_processing_*.log +``` + +## Configuration System + +### Configuration Files Overview + +``` +config_files/ +├── sentiment_config.json # Agent behavior config +├── sentiment_analysis_config.json # Sentiment categories and intents +└── data_sources_config.json # Data source configuration +``` + +### Agent Configuration Structure + +**File**: `sentiment_config.json` + +```json +{ + "agents": { + "agent_name": { + "name": "AgentClassName", + "model": "gpt-5-nano", // LLM model to use + "temperature": 0.0, // Creativity (0.0 = deterministic, 1.0 = creative) + "max_retries": 3, // Max retry attempts + "description": "What this agent does" + } + }, + "workflow": { + "parallel_processing": { + "enabled": true, + "worker_calculation": "CPU count - 2, max 5 workers", + "min_batch_size": 20, + 
"max_batch_size": 1000 + } + } +} +``` + +### Temperature Guidelines + +- **0.0 - 0.1**: Deterministic, consistent (good for classification) +- **0.2 - 0.4**: Slight variation, mostly consistent (good for sentiment analysis) +- **0.5 - 0.7**: Balanced creativity and consistency (good for translation) +- **0.8 - 1.0**: Creative, varied (good for content generation) + +### Model Selection Guidelines + +- **gpt-5-nano**: Fast, cheap, good for simple tasks +- **gpt-4o-mini**: Balanced speed/quality, good for most tasks +- **gpt-4o**: High quality, slower, good for complex analysis + +### Category Configuration Structure + +**File**: `sentiment_analysis_config.json` + +```json +{ + "category_type": { + "categories": [ + { + "value": "machine_readable_value", // Used in code/DB + "label": "Human Readable Label", // Used in UI + "description": "Detailed description for LLM prompt" + } + ] + } +} +``` + +### Loading Configuration in Code + +```python +# In workflow/__init__ or agent __init__ +import json +import os + +# Load agent config +with open('config_files/sentiment_config.json', 'r') as f: + config = json.load(f) + +agent_config = config["agents"]["agent_name"] + +# Load category config +with open('config_files/sentiment_analysis_config.json', 'r') as f: + categories = json.load(f) + +sentiment_categories = categories["sentiment_polarity"]["categories"] +``` + +## Best Practices + +### Agent Development + +1. **Single Responsibility**: Each agent should do one thing well +2. **Fail Gracefully**: Always return structured error responses +3. **Preserve Data**: Never lose original input data - pass it through +4. **Log Everything**: Use `log_processing()` for debugging +5. **Validate Early**: Check inputs before processing +6. **Configuration Over Code**: Use config files for behavior changes +7. **Test Incrementally**: Test with `--limit 10 --sequential` first + +### Prompt Engineering + +1. **Be Specific**: Clearly define expected output format +2. 
**Use Examples**: Include few-shot examples in prompts +3. **Request JSON**: Always request JSON format for structured data +4. **Handle Edge Cases**: Document edge cases in prompts +5. **Provide Context**: Give LLM all relevant context +6. **Set Constraints**: Clearly define boundaries and limitations + +Example of good prompt structure: + +```python +system_prompt = """You are an expert at [TASK]. + +Your task is to: +1. [Step 1] +2. [Step 2] +3. [Step 3] + +Context: [Explain the context] + +Rules: +- Rule 1 +- Rule 2 +- Rule 3 + +Examples: +- Input: "..." → Output: {...} +- Input: "..." → Output: {...} + +Return your response in JSON format with the following fields: +- field1: description +- field2: description +""" +``` + +### Error Handling + +1. **Try-Catch Everything**: Wrap all processing in try-catch +2. **Specific Error Messages**: Make errors actionable +3. **Graceful Degradation**: Continue workflow even if one agent fails +4. **Error Accumulation**: Collect errors in `processing_errors` list +5. **Critical vs Non-Critical**: Distinguish between recoverable and fatal errors + +Example: + +```python +def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + try: + # Validate + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields", + **input_data # Preserve original data + } + + # Process + result = self.do_processing(input_data) + + # Check result + if not result.get("success"): + return { + "success": False, + "error": result.get("error", "Unknown error"), + **input_data + } + + # Return success + return { + "success": True, + "output_field": result["output"], + **input_data + } + + except Exception as e: + return self.handle_error(e, "process") +``` + +### Testing + +1. **Unit Test Agents**: Test agents independently before integration +2. **Small Batches**: Always test with `--limit 10` first +3. **Sequential Mode**: Use `--sequential` for debugging +4. 
**Check Logs**: Review logs after every test run +5. **Validate Output**: Check Snowflake results +6. **Test Edge Cases**: Empty text, emojis only, very long text, special characters + +Test script example: + +```python +# test_agent.py +from agents.sentiment_analysis_agent import SentimentAnalysisAgent +import json + +# Load config +with open('config_files/sentiment_config.json', 'r') as f: + config = json.load(f) +with open('config_files/sentiment_analysis_config.json', 'r') as f: + categories = json.load(f) + +# Initialize agent +agent = SentimentAnalysisAgent( + config["agents"]["sentiment_analysis"], + "your-api-key", + categories +) + +# Test cases +test_cases = [ + {"comment_text": "This is amazing!", "content_description": "Guitar tutorial"}, + {"comment_text": "😊😊😊", "content_description": "Piano cover"}, + {"comment_text": "What scale is this?", "content_description": "Blues solo"}, +] + +for test in test_cases: + result = agent.process(test) + print(f"Input: {test['comment_text']}") + print(f"Result: {result}") + print("---") +``` + +### Performance Optimization + +1. **Batch Processing**: Process comments in batches (handled by workflow) +2. **Parallel Workers**: Use multiprocessing for large batches +3. **Minimize LLM Calls**: Cache results when possible +4. **Optimize Prompts**: Shorter prompts = faster responses +5. **Choose Right Model**: Use gpt-5-nano for simple tasks + +### Code Organization + +1. **One Agent Per File**: Don't combine multiple agents +2. **Helper Methods**: Use private methods (\_method\_name) for internal logic +3. **Type Hints**: Always use type hints for parameters and returns +4. **Docstrings**: Document all public methods +5. **Constants**: Define constants at class level + +Example structure: + +```python +class MyAgent(BaseAgent): + # Constants + DEFAULT_VALUE = "default" + MAX_LENGTH = 1000 + + def __init__(self, config, api_key): + """Initialize agent.""" + super().__init__("MyAgent", config) + # ... 
initialization + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """Validate input data.""" + # ... validation + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """Main processing method.""" + # ... processing + + def _helper_method(self, data: str) -> str: + """Private helper method.""" + # ... helper logic + + def _parse_llm_json_response(self, response: str) -> Dict[str, Any]: + """Parse LLM JSON response.""" + # ... parsing +``` + +## Troubleshooting + +### Common Issues + +#### Issue 1: Agent Returns Empty Results + +**Symptoms**: Agent succeeds but returns None or empty strings for key fields + +**Causes**: +- LLM not following JSON format +- JSON parsing failing silently +- Missing fields in LLM response + +**Solutions**: +1. Check logs for JSON parsing warnings +2. Add validation after LLM call: + ```python + result = self._parse_llm_json_response(response.content) + + # Validate result + if not result.get("sentiment_polarity"): + return { + "success": False, + "error": "Missing sentiment_polarity in LLM response" + } + ``` +3. Improve prompt to be more specific about required fields +4. Add examples to prompt showing exact JSON structure + +#### Issue 2: JSON Parsing Errors + +**Symptoms**: `JSON decode error` in logs + +**Causes**: +- LLM returns markdown-wrapped JSON +- LLM includes explanatory text before/after JSON +- Malformed JSON from LLM + +**Solutions**: +1. Use `_parse_llm_json_response()` helper (already handles markdown) +2. Add more explicit prompt: + ```python + user_prompt = """... + + Return ONLY valid JSON, no explanation or markdown. Just the raw JSON object. + """ + ``` +3. 
Add fallback parsing: + ```python + try: + result = json.loads(content) + except json.JSONDecodeError: + # Try to extract JSON from text + import re + json_match = re.search(r'\{.*\}', content, re.DOTALL) + if json_match: + result = json.loads(json_match.group()) + else: + raise + ``` + +#### Issue 3: Inconsistent Results + +**Symptoms**: Same comment gets different classifications on reruns + +**Causes**: +- Temperature too high +- Prompt too vague +- Model inconsistency + +**Solutions**: +1. Lower temperature to 0.0 - 0.2 for classification tasks +2. Make prompt more specific and rule-based +3. Add examples to prompt +4. Use a more consistent model (gpt-5-nano vs gpt-4o) + +#### Issue 4: Agent Too Slow + +**Symptoms**: Processing takes very long + +**Causes**: +- Large LLM model +- Complex prompts +- Sequential processing +- API rate limits + +**Solutions**: +1. Use faster model (gpt-5-nano instead of gpt-4o) +2. Simplify prompt (shorter = faster) +3. Enable parallel processing (already default) +4. Increase batch size (if not hitting rate limits) +5. Consider caching repeated analyses + +#### Issue 5: Agent Failing Validation + +**Symptoms**: `validate_input()` returns False, agent skips processing + +**Causes**: +- Missing required fields in input +- Empty or None values +- Wrong data types + +**Solutions**: +1. Check workflow node - ensure all required fields passed: + ```python + input_data = { + "comment_text": state.get("translated_text", state["comment_text"]), + "content_description": state["content_description"], + # Add all required fields + } + ``` +2. 
Add logging to validation: + ```python + def validate_input(self, input_data: Dict[str, Any]) -> bool: + for field in required_fields: + if field not in input_data: + self.log_processing(f"Missing field: {field}", "error") + return False + return True + ``` + +#### Issue 6: Workflow Not Running New Agent + +**Symptoms**: New agent not being called, no logs from new agent + +**Causes**: +- Forgot to add node to workflow graph +- Forgot to initialize agent +- Workflow edges not connected + +**Solutions**: +1. Verify agent initialized in `__init__`: + ```python + self.new_agent = NewAgent(config, api_key) + ``` +2. Verify node added: + ```python + workflow.add_node("new_agent", self._new_agent_node) + ``` +3. Verify edges: + ```python + workflow.add_edge("previous_agent", "new_agent") + workflow.add_edge("new_agent", END) + ``` +4. Check for exceptions in workflow compilation + +#### Issue 7: Database Insert Fails + +**Symptoms**: Processing succeeds but data not in Snowflake + +**Causes**: +- Missing columns in database +- Data type mismatch +- Field name mismatch + +**Solutions**: +1. Check column exists: + ```sql + DESC TABLE COMMENT_SENTIMENT_FEATURES; + ``` +2. Add column if missing: + ```sql + ALTER TABLE COMMENT_SENTIMENT_FEATURES + ADD COLUMN NEW_FIELD VARCHAR(500); + ``` +3. Check field names match exactly (case-sensitive) +4. Check main.py result_df construction includes new fields + +### Debugging Tips + +1. **Enable Debug Logging**: Set log level to DEBUG in main.py +2. **Print State**: Add print statements in workflow nodes to see state +3. **Test Agent Directly**: Test agent outside workflow first +4. **Use Sequential Mode**: `--sequential` flag for clearer debugging +5. **Check API Logs**: Review OpenAI API dashboard for errors +6. **Validate JSON**: Use online JSON validator for config files +7. **Check Git Status**: Ensure all files saved and changes committed + +### Getting Help + +1. **Check Logs**: Always check `logs/` directory first +2. 
**Review This README**: Answers to most questions are here +3. **Test Incrementally**: Isolate the problem to one agent +4. **Use Small Batches**: Test with `--limit 5` for faster iteration +5. **Document Issues**: Keep notes on what you tried + +## Conclusion + +This agent architecture provides a flexible, maintainable foundation for processing social media comments. Key takeaways: + +- **Base class pattern** ensures consistency +- **LangGraph workflow** enables flexible orchestration +- **Configuration-driven** design minimizes code changes +- **Error resilience** at every level +- **Extensible by design** - easy to add new agents + +For questions or issues, refer to the main project README or review the existing agent implementations for patterns and examples. + +--- + +**Last Updated**: 2026-01-15 +**Version**: 1.0 +**Maintainer**: Musora Development Team diff --git a/processing_comments/agents/__init__.py b/processing_comments/agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4190db134bccfd598da7194ed5c12e44c6c41cbd --- /dev/null +++ b/processing_comments/agents/__init__.py @@ -0,0 +1,14 @@ +""" +Agents module for the sentiment analysis workflow. +Provides modular, extensible agents for various NLP tasks. 
+""" + +from agents.base_agent import BaseAgent +from agents.language_detection_agent import LanguageDetectionAgent +from agents.translation_agent import TranslationAgent + +__all__ = [ + "BaseAgent", + "LanguageDetectionAgent", + "TranslationAgent" +] \ No newline at end of file diff --git a/processing_comments/agents/base_agent.py b/processing_comments/agents/base_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..38a37b00e3c26ed0ff303f89b1b48ae138925b3a --- /dev/null +++ b/processing_comments/agents/base_agent.py @@ -0,0 +1,104 @@ +""" +Base Agent class for all agents in the workflow +This provides a common interface and structure for extensibility +""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional +import json +import logging + +logger = logging.getLogger(__name__) + + +class BaseAgent(ABC): + """ + Abstract base class for all agents in the agentic workflow. + Provides common functionality and enforces consistent interface. + """ + + def __init__(self, name: str, config: Dict[str, Any]): + """ + Initialize the base agent. + + Args: + name: Name of the agent + config: Configuration dictionary for the agent + """ + self.name = name + self.config = config + self.model = config.get("model", "gpt-4o-mini") + self.temperature = config.get("temperature", 0.7) + self.max_retries = config.get("max_retries", 3) + logger.info(f"Initialized {self.name} with model {self.model}") + + @abstractmethod + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process input data and return results. + This method must be implemented by all concrete agent classes. + + Args: + input_data: Dictionary containing input data for processing + + Returns: + Dictionary containing processing results + """ + pass + + @abstractmethod + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate input data before processing. 
+ + Args: + input_data: Dictionary containing input data + + Returns: + True if input is valid, False otherwise + """ + pass + + def get_name(self) -> str: + """Get the agent name.""" + return self.name + + def get_config(self) -> Dict[str, Any]: + """Get the agent configuration.""" + return self.config + + def log_processing(self, message: str, level: str = "info"): + """ + Log processing information. + + Args: + message: Log message + level: Log level (info, warning, error, debug) + """ + log_method = getattr(logger, level, logger.info) + log_method(f"[{self.name}] {message}") + + def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]: + """ + Handle errors consistently across all agents. + + Args: + error: The exception that occurred + context: Additional context about the error + + Returns: + Error dictionary with details + """ + error_msg = f"Error in {self.name}" + if context: + error_msg += f" ({context})" + error_msg += f": {str(error)}" + + logger.error(error_msg) + + return { + "success": False, + "error": str(error), + "agent": self.name, + "context": context + } \ No newline at end of file diff --git a/processing_comments/agents/language_detection_agent.py b/processing_comments/agents/language_detection_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..b8916720623821be11ab184bf8862fba18496fbb --- /dev/null +++ b/processing_comments/agents/language_detection_agent.py @@ -0,0 +1,292 @@ +""" +Language Detection Agent +Detects the language of social media comments using lingua library and LLM fallback +""" + +from typing import Dict, Any +import json +from lingua import Language, LanguageDetectorBuilder +from langchain_openai import ChatOpenAI +from langchain.schema import HumanMessage, SystemMessage +from agents.base_agent import BaseAgent +import logging + +logger = logging.getLogger(__name__) + + +class LanguageDetectionAgent(BaseAgent): + """ + Agent that detects the language of text comments. 
+ Uses lingua library for fast English detection, then LLM for non-English languages. + """ + + # Lingua to ISO 639-1 language code mapping + LINGUA_TO_ISO = { + Language.ENGLISH: "en", + Language.SPANISH: "es", + Language.FRENCH: "fr", + Language.GERMAN: "de", + Language.ITALIAN: "it", + Language.PORTUGUESE: "pt", + Language.RUSSIAN: "ru", + Language.JAPANESE: "ja", + Language.KOREAN: "ko", + Language.CHINESE: "zh", + Language.ARABIC: "ar", + Language.HINDI: "hi", + Language.DUTCH: "nl", + Language.SWEDISH: "sv", + Language.POLISH: "pl", + Language.TURKISH: "tr" + } + + def __init__(self, config: Dict[str, Any], api_key: str): + """ + Initialize the Language Detection Agent. + + Args: + config: Configuration dictionary + api_key: OpenAI API key + """ + super().__init__("LanguageDetectionAgent", config) + self.api_key = api_key + self.llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=self.api_key + ) + + # Initialize lingua detector with all languages + self.detector = LanguageDetectorBuilder.from_all_languages().build() + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate that input contains required fields. + + Args: + input_data: Input dictionary + + Returns: + True if valid, False otherwise + """ + return "comment_text" in input_data and input_data["comment_text"] + + def detect_with_lingua(self, text: str) -> tuple[str, str, bool]: + """ + Detect language using lingua library. 
+ + Args: + text: Text to analyze + + Returns: + Tuple of (language_code, language_name, is_english) + """ + try: + # Clean text + cleaned_text = text.strip() + if not cleaned_text or len(cleaned_text) < 3: + return "en", "English", True # Default for very short text + + # Detect language with lingua + detected_language = self.detector.detect_language_of(cleaned_text) + + if detected_language is None: + # If detection fails, default to English + return "en", "English", True + + # Check if it's English + if detected_language == Language.ENGLISH: + return "en", "English", True + + # Map lingua language to ISO code + lang_code = self.LINGUA_TO_ISO.get(detected_language, "unknown") + lang_name = detected_language.name.capitalize() + + return lang_code, lang_name, False + + except Exception as e: + logger.warning(f"Lingua detection failed: {str(e)}") + # If detection fails, default to English + return "en", "English", True + + def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]: + """ + Parse LLM response that may contain JSON wrapped in markdown code blocks. + + Args: + response_content: Raw response content from LLM + + Returns: + Parsed JSON dictionary + + Raises: + json.JSONDecodeError: If JSON cannot be parsed + """ + content = response_content.strip() + + # Check if response is wrapped in markdown code block + if content.startswith("```json"): + # Remove ```json prefix and ``` suffix + content = content[7:] # Remove ```json + if content.endswith("```"): + content = content[:-3] # Remove trailing ``` + content = content.strip() + elif content.startswith("```"): + # Remove generic ``` code block + content = content[3:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + # Parse the cleaned JSON + return json.loads(content) + + def detect_with_llm(self, text: str) -> Dict[str, Any]: + """ + Detect language using LLM for more nuanced detection. 
+ + Args: + text: Text to analyze + + Returns: + Dictionary with language detection results + """ + system_prompt = """You are a language detection expert. Analyze the given text and detect its language. +For text with only emojis, special characters, or minimal content, classify as "English". Comment is about a music content, so having links or using musician name is normal and still be english. +Return your response in JSON format with the following fields: +- language: The detected language name (e.g., "English", "Spanish", "French") +- language_code: ISO 639-1 language code (e.g., "en", "es", "fr") +- confidence: Your confidence level (high, medium, low) +- has_text: boolean indicating if there is actual textual content (not just emojis/symbols) +""" + + user_prompt = f"""Detect the language of this comment related to a musical content: + +"{text}" + +Return JSON only.""" + + try: + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=user_prompt) + ] + + response = self.llm.invoke(messages) + + # Parse the response using helper function + result = self._parse_llm_json_response(response.content) + + # If no text content, default to English + if not result.get("has_text", True): + result["language"] = "English" + result["language_code"] = "en" + + return result + + except json.JSONDecodeError as e: + self.log_processing(f"LLM response JSON parsing failed: {str(e)}", "warning") + self.log_processing(f"Raw response: {response.content[:200]}", "debug") + return { + "language": "English", + "language_code": "en", + "confidence": "low", + "has_text": True + } + except Exception as e: + self.log_processing(f"LLM detection failed: {str(e)}", "warning") + return { + "language": "English", + "language_code": "en", + "confidence": "low", + "has_text": True + } + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process comment and detect its language. + Strategy: Use lingua first. If English, done. 
If not English, use LLM for better accuracy. + + Args: + input_data: Dictionary containing comment_text and other metadata + + Returns: + Dictionary with language detection results + """ + try: + # Validate input + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing comment_text", + "language": "English", + "language_code": "en", + "is_english": True + } + + comment_text = input_data["comment_text"] + + # Check for empty or emoji-only content + if not comment_text or len(comment_text.strip()) == 0: + return { + "success": True, + "comment_text": comment_text, + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "high", + "detection_method": "default", + "has_text": False + } + + # Step 1: Use lingua for initial detection + lingua_lang_code, lingua_lang_name, is_english = self.detect_with_lingua(comment_text) + + # Step 2: If English, we're done (lingua is good at detecting English) + if is_english: + result = { + "success": True, + "comment_text": comment_text, + "language": "English", + "language_code": "en", + "is_english": True, + "confidence": "high", + "detection_method": "lingua", + "has_text": True + } + else: + # Step 3: If not English, use LLM for more accurate detection + llm_result = self.detect_with_llm(comment_text) + language = llm_result.get("language", lingua_lang_name) + language_code = llm_result.get("language_code", lingua_lang_code) + confidence = llm_result.get("confidence", "medium") + has_text = llm_result.get("has_text", True) + if language_code == "en" or language == "English": + is_english=True + + result = { + "success": True, + "comment_text": comment_text, + "language": language, + "language_code": language_code, + "is_english": is_english, + "confidence": confidence, + "detection_method": "llm", + "has_text": has_text + } + + # Preserve original metadata + for key, value in input_data.items(): + if key not in result: + result[key] = value + + 
self.log_processing( + f"Detected language: {result['language']} ({result['language_code']}) - " + f"Method: {result['detection_method']}", + "debug" + ) + + return result + + except Exception as e: + return self.handle_error(e, "language detection") \ No newline at end of file diff --git a/processing_comments/agents/sentiment_analysis_agent.py b/processing_comments/agents/sentiment_analysis_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..ec92464316a36de566eb7a60ae457ea485a76026 --- /dev/null +++ b/processing_comments/agents/sentiment_analysis_agent.py @@ -0,0 +1,381 @@ +""" +Sentiment Analysis Agent +Extracts sentiment polarity, intent, and determines if reply is needed +""" + +from typing import Dict, Any, List, Optional +import json +import re +from langchain_openai import ChatOpenAI +from langchain.schema import HumanMessage, SystemMessage +from agents.base_agent import BaseAgent +import logging + +logger = logging.getLogger(__name__) + +# Reply policy constants — must stay in sync with reply_policy in sentiment_analysis_config.json +_REQUIRES_REPLY_INTENTS = {"question", "request", "subscription"} +_NO_REPLY_INTENTS = {"humor_sarcasm"} + +# Compiled regexes for content description parsing (compiled once at module load) +_RE_FOLLOW_SECTION = re.compile(r"^Follow\b", re.IGNORECASE) +_RE_ARROW_LINK = re.compile(r"^►") +_RE_URL_ONLY = re.compile(r"^https?://\S+$") +_RE_TIMESTAMP = re.compile(r"^\d+:\d+\s*[-–]\s*(.*)") + + +class SentimentAnalysisAgent(BaseAgent): + """ + Agent that analyzes comment sentiment, intent, and reply requirements. 
+ + Design decisions: + - System prompt is built once at init (static across all calls) + - requires_reply is computed deterministically in Python, not by the LLM + - LLM output is validated against config-defined allowed value sets + - Content descriptions are parsed to strip URLs, timestamps, and social sections + - Parent comments are passed as read-only context; classification targets the + TARGET comment only + """ + + def __init__(self, config: Dict[str, Any], api_key: str, sentiment_categories: Dict[str, Any]): + """ + Initialize the Sentiment Analysis Agent. + + Args: + config: Agent configuration dictionary + api_key: OpenAI API key + sentiment_categories: Loaded sentiment_analysis_config.json dict + """ + super().__init__("SentimentAnalysisAgent", config) + self.api_key = api_key + self.sentiment_categories = sentiment_categories + + # Pre-compute valid value sets from config for O(1) validation + self._valid_polarities = { + cat["value"] for cat in sentiment_categories["sentiment_polarity"]["categories"] + } + self._valid_intents = { + cat["value"] for cat in sentiment_categories["intent"]["categories"] + } + + self.llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=self.api_key, + model_kwargs={"response_format": {"type": "json_object"}} + ) + + # Build system prompt once at init — reused for every LLM call + self._system_prompt = self._build_system_prompt() + + # ------------------------------------------------------------------ + # Prompt construction + # ------------------------------------------------------------------ + + def _build_system_prompt(self) -> str: + """ + Build a compact, static system prompt from the sentiment config. + Pulls category descriptions directly from config so changes to + sentiment_analysis_config.json are automatically reflected. 
+ """ + polarity_lines = "\n".join( + f"- {cat['value']}: {cat['description']}" + for cat in self.sentiment_categories["sentiment_polarity"]["categories"] + ) + + intent_lines = "\n".join( + f"- {cat['value']}: {cat['description']}" + for cat in self.sentiment_categories["intent"]["categories"] + ) + + return ( + "Classify a social media comment about musical content.\n\n" + "RULE: Analyze ONLY the TARGET comment. " + "The parent comment is context only — do not extract sentiment or intent from it.\n\n" + "Return JSON only:\n" + '{"sentiment_polarity": , "intents": [], ' + '"confidence": "high"|"medium"|"low", "analysis_notes": "<1-2 sentences>"}\n\n' + f"POLARITY (pick one):\n{polarity_lines}\n\n" + f"INTENTS (multi-label, pick all that apply):\n{intent_lines}\n\n" + "Rhetorical/sarcasm rules:\n" + "- Rhetorical questions → humor_sarcasm or feedback_negative, NOT question\n" + "- Sarcastic suggestions → feedback_negative, NOT suggestion\n" + "- Sarcastic requests → feedback_negative, NOT request\n" + "- Only use question/request/suggestion for GENUINE expressions" + ) + + def _build_user_prompt( + self, + comment_text: str, + content_description: str, + parent_comment_text: Optional[str] = None, + platform: Optional[str] = None, + content_title: Optional[str] = None, + ) -> str: + """ + Build the compact user prompt with parsed, truncated context. + + YouTube stores the video title separately from the description, so they + are combined here. Other platforms already embed the title in the + description, so only the parsed description is used. 
+ """ + parsed_description = self._parse_content_description(content_description) + + if platform and platform.lower() == "youtube" and content_title and str(content_title).strip(): + content_context = f"{content_title.strip()} — {parsed_description}"[:500] + else: + content_context = parsed_description + + parts = [f"Content: {content_context}"] + + if parent_comment_text and str(parent_comment_text).strip(): + parent_snippet = str(parent_comment_text).strip()[:500] + parts.append(f'Parent (context only): "{parent_snippet}"') + + parts.append(f'TARGET: "{comment_text}"') + + return "\n".join(parts) + + # ------------------------------------------------------------------ + # Content description parsing + # ------------------------------------------------------------------ + + @staticmethod + def _parse_content_description(text: str) -> str: + """ + Extract meaningful narrative text from a raw content description. + + Strips noise common in YouTube/social descriptions: + - "Follow [name]:" blocks and everything after them + - Lines starting with ► (hyperlinks) + - Lines that are a bare URL + - Timestamp chapter markers: "01:08 - Active listening" → "Active listening" + + Returns at most 500 characters of joined clean text. 
+ """ + if not text or not str(text).strip(): + return "" + + cleaned = [] + for line in str(text).splitlines(): + stripped = line.strip() + + # Stop at social-media "Follow" blocks + if _RE_FOLLOW_SECTION.match(stripped): + break + + # Skip ► link lines + if _RE_ARROW_LINK.match(stripped): + continue + + # Skip bare URL lines + if _RE_URL_ONLY.match(stripped): + continue + + # Convert "MM:SS - Chapter label" → keep just the label + ts_match = _RE_TIMESTAMP.match(stripped) + if ts_match: + label = ts_match.group(1).strip() + if label: + cleaned.append(label) + continue + + if stripped: + cleaned.append(stripped) + + return " ".join(cleaned)[:500] + + # ------------------------------------------------------------------ + # Output validation and reply computation + # ------------------------------------------------------------------ + + def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]: + """ + Validate LLM output against config-defined allowed value sets. + + - Invalid polarity → fail (comment will not be stored) + - Invalid intent values → filtered out; if none remain → fail + - Invalid confidence → silently corrected to "medium" + + Returns a success dict with cleaned fields, or a failure dict with + an explanatory error message. + """ + sentiment_polarity = raw.get("sentiment_polarity") + + if not sentiment_polarity or sentiment_polarity not in self._valid_polarities: + return { + "success": False, + "error": ( + f"Invalid sentiment_polarity '{sentiment_polarity}'. " + f"Expected one of: {sorted(self._valid_polarities)}" + ), + } + + # Normalize intents to a list + intents = raw.get("intents", raw.get("intent", [])) + if isinstance(intents, str): + intents = [i.strip() for i in intents.split(",")] + if not isinstance(intents, list): + intents = [] + + valid_intents = [i for i in intents if i in self._valid_intents] + if not valid_intents: + return { + "success": False, + "error": ( + f"No valid intents in response: {intents}. 
" + f"Expected values from: {sorted(self._valid_intents)}" + ), + } + + confidence = raw.get("confidence", "medium") + if confidence not in {"high", "medium", "low"}: + confidence = "medium" + + return { + "success": True, + "sentiment_polarity": sentiment_polarity, + "intents": valid_intents, + "confidence": confidence, + "analysis_notes": str(raw.get("analysis_notes", "")).strip(), + } + + @staticmethod + def _compute_requires_reply(intents: List[str]) -> bool: + """ + Deterministically decide if the comment requires a reply. + + True when the comment contains at least one reply-required intent + (question, request, subscription) AND no no-reply intents (humor_sarcasm). + This mirrors the reply_policy section of sentiment_analysis_config.json + without delegating the decision to the LLM. + """ + intent_set = set(intents) + return ( + bool(intent_set & _REQUIRES_REPLY_INTENTS) + and not bool(intent_set & _NO_REPLY_INTENTS) + ) + + # ------------------------------------------------------------------ + # Core analysis + # ------------------------------------------------------------------ + + def analyze_sentiment( + self, + comment_text: str, + content_description: str, + parent_comment_text: Optional[str] = None, + platform: Optional[str] = None, + content_title: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Call the LLM to classify the TARGET comment's sentiment and intents. + + Args: + comment_text: The comment to analyze (translated to English if needed) + content_description: Raw content description (will be parsed internally) + parent_comment_text: Optional parent comment — context only, max 500 chars + platform: Platform name; drives YouTube title-handling logic + content_title: YouTube video title (YouTube only) + + Returns: + Success dict with sentiment_polarity, intent (comma-separated str), + requires_reply, sentiment_confidence, analysis_notes + — or a failure dict with an error key. 
+ """ + user_prompt = self._build_user_prompt( + comment_text, content_description, parent_comment_text, platform, content_title + ) + + try: + messages = [ + SystemMessage(content=self._system_prompt), + HumanMessage(content=user_prompt), + ] + + response = self.llm.invoke(messages) + raw = json.loads(response.content) + + validated = self._validate_result(raw) + if not validated["success"]: + self.log_processing(f"Validation failed: {validated['error']}", "warning") + return validated + + requires_reply = self._compute_requires_reply(validated["intents"]) + intent_str = ", ".join(validated["intents"]) + + return { + "success": True, + "sentiment_polarity": validated["sentiment_polarity"], + "intent": intent_str, + "requires_reply": requires_reply, + "sentiment_confidence": validated["confidence"], + "analysis_notes": validated["analysis_notes"], + } + + except json.JSONDecodeError as e: + self.log_processing(f"JSON decode error: {e}", "warning") + return {"success": False, "error": f"JSON parse error: {e}"} + + except Exception as e: + self.log_processing(f"Sentiment analysis failed: {e}", "error") + return {"success": False, "error": str(e)} + + # ------------------------------------------------------------------ + # Agent interface + # ------------------------------------------------------------------ + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + return all(field in input_data for field in ("comment_text", "content_description")) + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a comment and return sentiment analysis results merged with + the original input fields. + + Args: + input_data: Must contain comment_text and content_description. + May contain parent_comment_text, platform, content_title, + and any additional source fields (permalink_url, etc.) + + Returns: + Dict with sentiment fields merged on top of original input_data. 
+ """ + try: + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields (comment_text, content_description)", + } + + self.log_processing("Analyzing sentiment for comment", "debug") + + analysis_result = self.analyze_sentiment( + comment_text=input_data["comment_text"], + content_description=input_data["content_description"], + parent_comment_text=input_data.get("parent_comment_text"), + platform=input_data.get("platform"), + content_title=input_data.get("content_title"), + ) + + result = { + "success": analysis_result.get("success", False), + "sentiment_polarity": analysis_result.get("sentiment_polarity"), + "intent": analysis_result.get("intent"), + "requires_reply": analysis_result.get("requires_reply", False), + "sentiment_confidence": analysis_result.get("sentiment_confidence"), + "analysis_notes": analysis_result.get("analysis_notes", ""), + } + + if "error" in analysis_result: + result["sentiment_error"] = analysis_result["error"] + + # Preserve all original input fields (e.g. 
permalink_url, thumbnail_url)
            for key, value in input_data.items():
                if key not in result:
                    result[key] = value

            return result

        except Exception as e:
            # handle_error is presumably the BaseAgent failure envelope —
            # TODO confirm against base_agent.py.
            return self.handle_error(e, "sentiment_analysis")
\ No newline at end of file
diff --git a/processing_comments/agents/translation_agent.py b/processing_comments/agents/translation_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..619013de07a1d8e9f2c1773ca456e71d584682c3
--- /dev/null
+++ b/processing_comments/agents/translation_agent.py
@@ -0,0 +1,210 @@
"""
Translation Agent
Translates non-English comments to English using LLM
"""

from typing import Dict, Any
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from agents.base_agent import BaseAgent
import logging

logger = logging.getLogger(__name__)


class TranslationAgent(BaseAgent):
    """
    Agent that translates text from source language to English.
    Uses LLM for high-quality, context-aware translation.
    """

    def __init__(self, config: Dict[str, Any], api_key: str):
        """
        Initialize the Translation Agent.

        Args:
            config: Configuration dictionary
            api_key: OpenAI API key
        """
        super().__init__("TranslationAgent", config)
        self.api_key = api_key
        # NOTE(review): self.model / self.temperature look like they are set
        # by BaseAgent from config — confirm against base_agent.py.
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key
        )

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate that input contains required fields.

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        # is_english is required so process() can decide whether to translate.
        required_fields = ["comment_text", "is_english"]
        return all(field in input_data for field in required_fields)

    def translate_text(self, text: str, source_language: str) -> Dict[str, Any]:
        """
        Translate text from source language to English using LLM.

        Args:
            text: Text to translate
            source_language: Source language name

        Returns:
            Dictionary with translation results.  On any failure the original
            text is returned as translated_text with confidence "low", so the
            pipeline can continue on a best-effort basis.
        """
        system_prompt = """You are a professional translator specializing in social media content related to music and education.
Translate the given text from the source language to English. The text is a comment on a musical content.
Preserve the tone, intent, and any emojis or special characters.
For informal social media language, maintain the casual tone in translation.

Return your response in JSON format with the following fields:
- translated_text: The English translation
- translation_confidence: Your confidence level (high, medium, low)
- notes: Any important notes about the translation (optional)
"""

        user_prompt = f"""Translate this {source_language} comment to English:

"{text}"

Return JSON only."""

        try:
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            response = self.llm.invoke(messages)
            # Strips an optional markdown code fence before parsing.
            result = self._parse_llm_json_response(response.content)

            return {
                "success": True,
                # Fall back to the original text if the model omitted a field.
                "translated_text": result.get("translated_text", text),
                "translation_confidence": result.get("translation_confidence", "medium"),
                "translation_notes": result.get("notes", "")
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {str(e)}", "warning")
            # Try to extract text from response
            return {
                "success": False,
                "translated_text": text,
                "translation_confidence": "low",
                "translation_notes": "JSON parsing failed",
                "error": str(e)
            }

        except Exception as e:
            self.log_processing(f"Translation failed: {str(e)}", "error")
            return {
                "success": False,
                "translated_text": text,
                "translation_confidence": "low",
                "translation_notes": "Translation error",
                "error": str(e)
            }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process comment and translate if needed.
+ + Args: + input_data: Dictionary containing comment data with language info + + Returns: + Dictionary with translation results + """ + try: + # Validate input + if not self.validate_input(input_data): + return { + "success": False, + "error": "Invalid input: missing required fields", + "translated_text": input_data.get("comment_text", ""), + "translation_performed": False + } + + comment_text = input_data["comment_text"] + is_english = input_data["is_english"] + source_language = input_data.get("language", "Unknown") + + # If already English, no translation needed + if is_english: + result = { + "success": True, + "translated_text": comment_text, + "translation_performed": False, + "translation_confidence": "N/A", + "translation_notes": "Original text is English" + } + self.log_processing("Text is already English, skipping translation", "debug") + else: + # Perform translation + self.log_processing( + f"Translating from {source_language} to English", + "debug" + ) + + translation_result = self.translate_text(comment_text, source_language) + + result = { + "success": translation_result.get("success", True), + "translated_text": translation_result.get("translated_text", comment_text), + "translation_performed": True, + "translation_confidence": translation_result.get("translation_confidence", "medium"), + "translation_notes": translation_result.get("translation_notes", "") + } + + if "error" in translation_result: + result["translation_error"] = translation_result["error"] + + # Preserve all original data + for key, value in input_data.items(): + if key not in result: + result[key] = value + + return result + + except Exception as e: + return self.handle_error(e, "translation") + + def _parse_llm_json_response(self, response_content: str) -> Dict[str, Any]: + """ + Parse LLM response that may contain JSON wrapped in markdown code blocks. 
+ + Args: + response_content: Raw response content from LLM + + Returns: + Parsed JSON dictionary + + Raises: + json.JSONDecodeError: If JSON cannot be parsed + """ + content = response_content.strip() + + # Check if response is wrapped in markdown code block + if content.startswith("```json"): + # Remove ```json prefix and ``` suffix + content = content[7:] # Remove ```json + if content.endswith("```"): + content = content[:-3] # Remove trailing ``` + content = content.strip() + elif content.startswith("```"): + # Remove generic ``` code block + content = content[3:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + # Parse the cleaned JSON + return json.loads(content) \ No newline at end of file diff --git a/processing_comments/config_files/data_sources_config.json b/processing_comments/config_files/data_sources_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9930fc44e0e19578ced03f206332079780297fc6 --- /dev/null +++ b/processing_comments/config_files/data_sources_config.json @@ -0,0 +1,56 @@ +{ + "data_sources": { + "social_media": { + "name": "Social Media Comments", + "description": "Comments from external social media platforms (Facebook, Instagram, YouTube, etc.)", + "enabled": true, + "sql_query_file": "sql/fetch_comments.sql", + "output_config": { + "table_name": "COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + "source_columns": { + "comment_sk": "COMMENT_SK", + "comment_id": "COMMENT_ID", + "comment_text": "COMMENT_TEXT", + "parent_comment_id": "PARENT_COMMENT_ID", + "parent_comment_text": "PARENT_COMMENT_TEXT", + "platform": "PLATFORM", + "content_description": "CONTENT_DESCRIPTION" + } + }, + "musora_comments": { + "name": "Musora Internal Comments", + "description": "Comments from Musora internal applications", + "enabled": true, + "sql_query_file": "sql/fetch_musora_comments.sql", + "output_config": { + "table_name": 
"MUSORA_COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + "source_columns": { + "comment_sk": "COMMENT_SK (generated via HASH)", + "comment_id": "COMMENT_ID", + "comment_text": "COMMENT_TEXT (aliased from MESSAGE)", + "parent_comment_id": "PARENT_COMMENT_ID", + "parent_comment_text": "PARENT_COMMENT_TEXT", + "platform": "PLATFORM", + "content_description": "CONTENT_DESCRIPTION (aliased from CONTENT_PROFILE)", + "author_id": "AUTHOR_ID (aliased from USER_ID)", + "permalink_url": "PERMALINK_URL (aliased from WEB_URL_PATH)", + "thumbnail_url": "THUMBNAIL_URL" + }, + "additional_fields": [ + "PERMALINK_URL", + "THUMBNAIL_URL" + ] + } + }, + "processing": { + "default_limit": 10000, + "enable_parent_context": true, + "parent_context_description": "When a comment is a reply, include the parent comment text for better sentiment analysis context" + } +} diff --git a/processing_comments/config_files/sentiment_analysis_config.json b/processing_comments/config_files/sentiment_analysis_config.json new file mode 100644 index 0000000000000000000000000000000000000000..95409ed7c68f60efa7bc86da70ddfebfd4ee7a9e --- /dev/null +++ b/processing_comments/config_files/sentiment_analysis_config.json @@ -0,0 +1,96 @@ +{ + "sentiment_polarity": { + "categories": [ + { + "value": "very_positive", + "label": "Very Positive", + "description": "Extremely enthusiastic, excited, deeply grateful, or highly satisfied" + }, + { + "value": "positive", + "label": "Positive", + "description": "Generally positive, appreciative, supportive, or encouraging" + }, + { + "value": "neutral", + "label": "Neutral", + "description": "Factual, informational, balanced, or lacking clear emotional tone" + }, + { + "value": "negative", + "label": "Negative", + "description": "Disappointed, critical, frustrated, or mildly dissatisfied" + }, + { + "value": "very_negative", + "label": "Very Negative", + "description": "Highly critical, angry, abusive, or extremely 
dissatisfied" + } + ] + }, + "intent": { + "categories": [ + { + "value": "praise", + "label": "Praise", + "description": "Compliments, thanks, admiration, excitement, and similar positive expressions" + }, + { + "value": "question", + "label": "Question", + "description": "Information seeking (e.g., 'what scale?', 'when's it out?', How to get account?)" + }, + { + "value": "request", + "label": "Request", + "description": "Asking for something actionable (tutorial, feature, sheet music, etc.)" + }, + { + "value": "feedback_negative", + "label": "Negative Feedback", + "description": "Critical feedback about the content or issues (mixing, performance, composition) without abuse" + }, + { + "value": "suggestion", + "label": "Suggestion", + "description": "Constructive ideas/improvements (e.g., 'try slower tempo', 'add captions')" + }, + { + "value": "humor_sarcasm", + "label": "Humor/Sarcasm", + "description": "Joking, teasing, memes, irony (non-toxic)" + }, + { + "value": "off_topic", + "label": "Off Topic", + "description": "Unrelated chatter or unclear/no discernible intent" + }, + { + "value": "spam_selfpromo", + "label": "Spam/Self-Promotion", + "description": "Ads, links, promos, scams" + }, + { + "value": "subscription", + "label": "Subscription", + "description": "Questions about subscribing (e.g., 'How do I subscribe?', 'What's the cost?') or requests to unsubscribe/cancel (e.g., 'I want to cancel', 'How to unsubscribe?')" + } + ] + }, + "reply_policy": { + "requires_reply_intents": ["question", "request", "subscription"], + "not_include": ["humor_sarcasm"], + "description": "Comments with these intents should be flagged for reply" + }, + "intent_settings": { + "multi_label": true, + "description": "Intent can have multiple labels as a comment can express multiple intents", + "rhetorical_sarcasm_handling": true, + "rhetorical_sarcasm_description": "System differentiates between genuine questions/suggestions/requests and rhetorical/sarcastic ones" + }, + 
"analysis_notes_policy": { + "max_length": "1-2 sentences", + "include_topics": true, + "description": "Concise notes including key topics/highlights not covered by other categories for future summarization" + } +} \ No newline at end of file diff --git a/processing_comments/config_files/sentiment_config.json b/processing_comments/config_files/sentiment_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b7ca9f0c5e80b597b777104e7a04559791c72d9 --- /dev/null +++ b/processing_comments/config_files/sentiment_config.json @@ -0,0 +1,49 @@ +{ + "LLM_models": ["gpt-5-nano", "gpt-4o-mini"], + "reasoning": ["gpt-5-nano"], + + "agents": { + "language_detection": { + "name": "LanguageDetectionAgent", + "model": "gpt-5-nano", + "temperature": 0.0, + "max_retries": 3, + "description": "Detects language of comments and identifies non-English content" + }, + "translation": { + "name": "TranslationAgent", + "model": "gpt-5-nano", + "temperature": 0.3, + "max_retries": 3, + "description": "Translates non-English comments to English" + }, + "sentiment_analysis": { + "name": "SentimentAnalysisAgent", + "model": "gpt-5-nano", + "temperature": 0.0, + "max_retries": 3, + "description": "Analyzes sentiment polarity, intent, and determines if reply is needed" + } + }, + + "workflow": { + "description": "Batch size is calculated dynamically based on number of workers (min: 20, max: 1000)", + "parallel_processing": { + "enabled": true, + "worker_calculation": "CPU count - 2, max 5 workers", + "min_batch_size": 20, + "max_batch_size": 1000 + } + }, + + "snowflake": { + "output_table": "COMMENT_SENTIMENT_FEATURES", + "database": "SOCIAL_MEDIA_DB", + "schema": "ML_FEATURES" + }, + + "default_language": "English" +} + + + diff --git a/processing_comments/main.py b/processing_comments/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f555a63051fc8f60786bb8172f02492076391ac3 --- /dev/null +++ b/processing_comments/main.py @@ -0,0 +1,572 @@ 
+""" +Main execution script for comment processing workflow. +Orchestrates data fetching, processing, and storage using agentic workflow. +Supports parallel processing with multiprocessing for improved performance. +Supports multiple data sources (social media and Musora internal comments). +""" + +import json +import os +import logging +import argparse +from datetime import datetime +import pandas as pd +from dotenv import load_dotenv +from multiprocessing import Pool, cpu_count, Manager +from functools import partial +import traceback +from typing import Dict, Any, List + +from SnowFlakeConnection import SnowFlakeConn +from workflow.comment_processor import CommentProcessingWorkflow + +# Get the directory where this script is located +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Load environment variables from root directory (parent of processing_comments) +ROOT_DIR = os.path.dirname(SCRIPT_DIR) +load_dotenv(os.path.join(ROOT_DIR, '.env')) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(os.path.join(SCRIPT_DIR, 'logs', f'comment_processing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + + +def calculate_optimal_batch_size(total_comments: int, num_workers: int, min_batch: int = 20, max_batch: int = 100) -> int: + """ + Calculate optimal batch size based on total comments and number of workers. 
+ + Args: + total_comments: Total number of comments to process + num_workers: Number of parallel workers + min_batch: Minimum batch size (default: 20) + max_batch: Maximum batch size (default: 1000) + + Returns: + Optimal batch size + """ + if total_comments <= min_batch: + return total_comments + + # Calculate batch size to distribute work evenly among workers + batch_size = total_comments // num_workers + + # Apply constraints + batch_size = max(min_batch, min(max_batch, batch_size)) + + return batch_size + + +def process_batch_worker(batch_data: tuple) -> dict: + """ + Worker function to process a single batch of comments. + This function runs in a separate process. + + Args: + batch_data: Tuple containing (batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config) + + Returns: + Dictionary with batch statistics and results + """ + batch_num, batch_comments, config, api_key, overwrite_first_batch, data_source_config = batch_data + + # Configure logging for this worker + worker_logger = logging.getLogger(f"Worker-{batch_num}") + + try: + worker_logger.info(f"Batch {batch_num}: Starting processing of {len(batch_comments)} comments") + + # Initialize Snowflake connection for this worker + snowflake = SnowFlakeConn() + + # Initialize workflow for this worker + workflow = CommentProcessingWorkflow(config, api_key) + + # Process comments through workflow + results = workflow.process_batch(batch_comments) + + # Convert to DataFrame + results_df = pd.DataFrame(results) + + # Filter successful results + initial_count = len(results_df) + df_successful = results_df[results_df['success'] == True].copy() + filtered_count = initial_count - len(df_successful) + + worker_logger.info(f"Batch {batch_num}: Processed {initial_count} comments, {len(df_successful)} successful") + + # Prepare output data with base columns + output_columns = { + 'comment_sk': 'COMMENT_SK', + 'comment_id': 'COMMENT_ID', + 'comment_text': 'ORIGINAL_TEXT', + 'platform': 
'PLATFORM', + 'comment_timestamp': 'COMMENT_TIMESTAMP', + 'author_name': 'AUTHOR_NAME', + 'author_id': 'AUTHOR_ID', + 'parent_comment_id': 'PARENT_COMMENT_ID', + 'parent_comment_text': 'PARENT_COMMENT_TEXT', + 'content_sk': 'CONTENT_SK', + 'content_id': 'CONTENT_ID', + 'content_description': 'CONTENT_DESCRIPTION', + 'channel_sk': 'CHANNEL_SK', + 'channel_name': 'CHANNEL_NAME', + 'channel_display_name': 'CHANNEL_DISPLAY_NAME', + 'language': 'DETECTED_LANGUAGE', + 'language_code': 'LANGUAGE_CODE', + 'is_english': 'IS_ENGLISH', + 'language_confidence': 'LANGUAGE_CONFIDENCE', + 'detection_method': 'DETECTION_METHOD', + 'has_text': 'HAS_TEXT', + 'translated_text': 'TRANSLATED_TEXT', + 'translation_performed': 'TRANSLATION_PERFORMED', + 'translation_confidence': 'TRANSLATION_CONFIDENCE', + 'translation_notes': 'TRANSLATION_NOTES', + 'sentiment_polarity': 'SENTIMENT_POLARITY', + 'intent': 'INTENT', + 'requires_reply': 'REQUIRES_REPLY', + 'sentiment_confidence': 'SENTIMENT_CONFIDENCE', + 'analysis_notes': 'ANALYSIS_NOTES', + 'success': 'PROCESSING_SUCCESS' + } + + # Add data source-specific columns if present + if 'additional_fields' in data_source_config: + for field in data_source_config['additional_fields']: + field_lower = field.lower() + output_columns[field_lower] = field + worker_logger.debug(f"Batch {batch_num}: Added {len(data_source_config['additional_fields'])} additional fields") + + output_df = pd.DataFrame() + for source_col, target_col in output_columns.items(): + if source_col in df_successful.columns: + output_df[target_col] = df_successful[source_col] + else: + output_df[target_col] = None + # Log missing columns for debugging + if source_col in ['permalink_url', 'thumbnail_url']: + worker_logger.warning(f"Batch {batch_num}: Column '{source_col}' not found in DataFrame. 
Available columns: {list(df_successful.columns)}") + + # Add processing metadata + output_df['PROCESSED_AT'] = datetime.now() + output_df['WORKFLOW_VERSION'] = '1.0' + + # Store results to Snowflake + if len(output_df) > 0: + # Use data source-specific output configuration + table_name = data_source_config['output_config']['table_name'] + database = data_source_config['output_config']['database'] + schema = data_source_config['output_config']['schema'] + + # Only the first batch should overwrite if requested + overwrite = overwrite_first_batch and batch_num == 1 + + snowflake.store_df_to_snowflake( + table_name=table_name, + dataframe=output_df, + database=database, + schema=schema, + overwrite=overwrite + ) + + worker_logger.info(f"Batch {batch_num}: Stored {len(output_df)} records to Snowflake ({table_name})") + else: + worker_logger.warning(f"Batch {batch_num}: No successful records to store") + + # Close Snowflake connection + snowflake.close_connection() + + # Calculate statistics + translations = output_df['TRANSLATION_PERFORMED'].sum() if 'TRANSLATION_PERFORMED' in output_df.columns else 0 + non_english = (~output_df['IS_ENGLISH']).sum() if 'IS_ENGLISH' in output_df.columns else 0 + requires_reply = output_df['REQUIRES_REPLY'].sum() if 'REQUIRES_REPLY' in output_df.columns else 0 + + return { + 'batch_num': batch_num, + 'success': True, + 'total_processed': initial_count, + 'total_stored': len(output_df), + 'failed_count': filtered_count, + 'translations': int(translations), + 'non_english': int(non_english), + 'requires_reply': int(requires_reply), + 'error': None + } + + except Exception as e: + error_msg = f"Batch {batch_num} failed: {str(e)}" + worker_logger.error(error_msg) + worker_logger.error(traceback.format_exc()) + + return { + 'batch_num': batch_num, + 'success': False, + 'total_processed': len(batch_comments), + 'total_stored': 0, + 'failed_count': len(batch_comments), + 'translations': 0, + 'non_english': 0, + 'requires_reply': 0, + 'error': 
error_msg + } + + +class CommentProcessor: + """ + Main processor class that orchestrates the entire workflow. + Supports multiple data sources (social media and Musora internal comments). + """ + + def __init__(self, config_path: str = None, data_sources_config_path: str = None): + """ + Initialize the comment processor. + + Args: + config_path: Path to configuration file (default: config_files/sentiment_config.json relative to script) + data_sources_config_path: Path to data sources config (default: config_files/data_sources_config.json) + """ + # Set default config path if not provided + if config_path is None: + config_path = os.path.join(SCRIPT_DIR, 'config_files', 'sentiment_config.json') + + if data_sources_config_path is None: + data_sources_config_path = os.path.join(SCRIPT_DIR, 'config_files', 'data_sources_config.json') + + # Load configuration + with open(config_path, 'r') as f: + self.config = json.load(f) + + # Load data sources configuration + with open(data_sources_config_path, 'r') as f: + self.data_sources_config = json.load(f) + + # Initialize Snowflake connection + self.snowflake = SnowFlakeConn() + + # Get OpenAI API key + self.api_key = os.getenv("OPENAI_API_KEY") + if not self.api_key: + raise ValueError("OPENAI_API_KEY not found in environment variables") + + # Initialize workflow + self.workflow = CommentProcessingWorkflow(self.config, self.api_key) + + logger.info("CommentProcessor initialized successfully") + + def get_enabled_data_sources(self) -> List[Dict[str, Any]]: + """ + Get list of enabled data sources from configuration. 
+ + Returns: + List of enabled data source configurations + """ + enabled_sources = [] + for source_key, source_config in self.data_sources_config['data_sources'].items(): + if source_config.get('enabled', True): + enabled_sources.append({ + 'key': source_key, + 'config': source_config + }) + return enabled_sources + + def fetch_comments(self, data_source_key: str, limit: int = None) -> pd.DataFrame: + """ + Fetch comments from Snowflake using the SQL query for a specific data source. + + Args: + data_source_key: Key identifying the data source (e.g., 'social_media', 'musora_comments') + limit: Optional limit on number of comments to fetch + + Returns: + DataFrame containing comment data + """ + data_source_config = self.data_sources_config['data_sources'][data_source_key] + source_name = data_source_config['name'] + + logger.info(f"Fetching comments from {source_name}...") + + # Read SQL query + sql_file = data_source_config['sql_query_file'] + sql_path = os.path.join(SCRIPT_DIR, sql_file) + with open(sql_path, 'r') as f: + query = f.read() + + # Add limit if specified + if limit: + query = query.rstrip(';') + f"\nLIMIT {limit};" + + # Execute query + df = self.snowflake.run_read_query(query, f"{source_name} comments") + + logger.info(f"Fetched {len(df)} comments from {source_name}") + + # Normalize column names to lowercase for consistent processing + df.columns = df.columns.str.lower() + + # Additional validation: filter out any empty comments that might have slipped through + if 'comment_text' in df.columns: + initial_count = len(df) + df = df[df['comment_text'].notna() & (df['comment_text'].str.strip() != '')] + filtered_count = initial_count - len(df) + if filtered_count > 0: + logger.info(f"Filtered out {filtered_count} empty comments in post-processing") + + logger.info(f"Final count: {len(df)} non-empty comments") + return df + + def calculate_num_workers(self) -> int: + """ + Calculate the number of parallel workers to use. 
        Uses CPU count - 2, with a maximum of 5 workers.

        Returns:
            Number of workers
        """
        num_cpus = cpu_count()
        # Leave two CPUs for the OS / main process, cap at 5, floor at 1.
        num_workers = max(1, min(5, num_cpus - 2))
        logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
        return num_workers

    def process_comments_parallel(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
        """
        Process comments through the agentic workflow using parallel processing.

        Args:
            df: DataFrame containing raw comment data
            data_source_config: Configuration for the data source being processed
            overwrite: Whether to overwrite existing Snowflake table

        Returns:
            Dictionary with aggregated statistics
        """
        # Convert DataFrame to list of dictionaries
        comments = df.to_dict('records')
        total_comments = len(comments)

        logger.info(f"Processing {total_comments} comments using parallel processing...")

        # Calculate number of workers
        num_workers = self.calculate_num_workers()

        # Calculate optimal batch size
        batch_size = calculate_optimal_batch_size(total_comments, num_workers)
        logger.info(f"Batch size: {batch_size} (min: 20, max: 100)")

        # Create batches; each tuple is the full argument set for
        # process_batch_worker (it must be picklable for multiprocessing).
        batches = []
        for i in range(0, total_comments, batch_size):
            batch = comments[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            # NOTE(review): only batch 1 honours `overwrite`, but batches run
            # concurrently — another batch may append before batch 1 truncates
            # the table. Verify the intended overwrite semantics.
            batches.append((batch_num, batch, self.config, self.api_key, overwrite, data_source_config))

        total_batches = len(batches)
        logger.info(f"Split into {total_batches} batches")

        # Process batches in parallel
        with Pool(processes=num_workers) as pool:
            results = pool.map(process_batch_worker, batches)

        # Aggregate statistics
        total_processed = sum(r['total_processed'] for r in results)
        total_stored = sum(r['total_stored'] for r in results)
        failed_count = sum(r['failed_count'] for r in results)
        translations = sum(r['translations'] for r in results)
        non_english = sum(r['non_english'] for r in results)
        requires_reply = sum(r['requires_reply'] for r in results)

        # Count failed batches
        failed_batches = [r for r in results if not r['success']]
        if failed_batches:
            logger.error(f"{len(failed_batches)} batch(es) failed:")
            for fb in failed_batches:
                logger.error(f"  Batch {fb['batch_num']}: {fb['error']}")

        return {
            'total_processed': total_processed,
            'total_stored': total_stored,
            'failed_count': failed_count,
            'translations': translations,
            'non_english': non_english,
            'requires_reply': requires_reply,
            'failed_batches': len(failed_batches)
        }

    def process_comments_sequential(self, df: pd.DataFrame, data_source_config: Dict[str, Any], overwrite: bool = False) -> dict:
        """
        Process comments through the agentic workflow sequentially (for debugging).

        Args:
            df: DataFrame containing raw comment data
            data_source_config: Configuration for the data source being processed
            overwrite: Whether to overwrite existing Snowflake table

        Returns:
            Dictionary with aggregated statistics
        """
        logger.info(f"Processing {len(df)} comments using sequential processing (debug mode)...")

        # Convert DataFrame to list of dictionaries
        comments = df.to_dict('records')

        # Process everything as a single batch in this process — same worker
        # function as the parallel path, so behavior is comparable.
        batch_data = (1, comments, self.config, self.api_key, overwrite, data_source_config)
        result = process_batch_worker(batch_data)

        return {
            'total_processed': result['total_processed'],
            'total_stored': result['total_stored'],
            'failed_count': result['failed_count'],
            'translations': result['translations'],
            'non_english': result['non_english'],
            'requires_reply': result['requires_reply'],
            'failed_batches': 0 if result['success'] else 1
        }

    def run(self, limit: int = None, overwrite: bool = False, sequential: bool = False, data_source_filter: str = None):
        """
        Run the complete processing pipeline for all enabled data sources.

        Args:
            limit: Optional limit on number of comments to process per data source
            overwrite: Whether to overwrite existing Snowflake table
            sequential: If True, use sequential processing instead of parallel (for debugging)
            data_source_filter: Optional filter to process only a specific data source
        """
        try:
            logger.info("=" * 80)
            logger.info("Starting Comment Processing Workflow")
            if sequential:
                logger.info("Mode: SEQUENTIAL (Debug Mode)")
            else:
                logger.info("Mode: PARALLEL")
            logger.info("=" * 80)

            # Get enabled data sources
            enabled_sources = self.get_enabled_data_sources()

            if data_source_filter:
                enabled_sources = [s for s in enabled_sources if s['key'] == data_source_filter]
                if not enabled_sources:
                    logger.error(f"Data source '{data_source_filter}' not found or not enabled")
                    return

            logger.info(f"Processing {len(enabled_sources)} data source(s)")

            # Process each data source
            for source_info in enabled_sources:
                source_key = source_info['key']
                source_config = source_info['config']
                source_name = source_config['name']

                logger.info("=" * 80)
                logger.info(f"Processing Data Source: {source_name}")
                logger.info("=" * 80)

                # Step 1: Fetch comments
                df_comments = self.fetch_comments(data_source_key=source_key, limit=limit)

                if df_comments.empty:
                    logger.warning(f"No comments to process from {source_name}")
                    continue

                # Step 2: Process comments through workflow (parallel or sequential)
                start_time = datetime.now()

                if sequential:
                    stats = self.process_comments_sequential(df_comments, source_config, overwrite=overwrite)
                else:
                    stats = self.process_comments_parallel(df_comments, source_config, overwrite=overwrite)

                end_time = datetime.now()
                processing_time = (end_time - start_time).total_seconds()

                # Summary statistics
                logger.info("=" * 80)
                logger.info(f"Processing Summary for {source_name}:")
                logger.info(f"  Processing Mode: {'Sequential' if sequential else 'Parallel'}")
                logger.info(f"  Output Table: {source_config['output_config']['table_name']}")
                logger.info(f"  Total comments processed: {stats['total_processed']}")
                logger.info(f"  Successfully stored: {stats['total_stored']}")
                logger.info(f"  Failed sentiment analysis (not stored): {stats['failed_count']}")
                if stats.get('failed_batches', 0) > 0:
                    logger.info(f"  Failed batches: {stats['failed_batches']}")
                logger.info(f"  Non-English comments: {stats['non_english']}")
                logger.info(f"  Translations performed: {stats['translations']}")
                logger.info(f"  Comments requiring reply: {stats['requires_reply']}")
                logger.info(f"  Processing time: {processing_time:.2f} seconds")
                # total_processed is non-zero here: the df.empty guard above
                # skips sources with no comments, so this division is safe.
                logger.info(f"  Average time per comment: {processing_time / stats['total_processed']:.2f} seconds")
                logger.info("=" * 80)

        except Exception as e:
            logger.error(f"Error in workflow execution: {str(e)}", exc_info=True)
            raise

        finally:
            # Close main Snowflake connection (workers have their own connections)
            self.snowflake.close_connection()
            logger.info("Snowflake connection closed")


def main():
    """
    Main entry point for the script.
+ """ + parser = argparse.ArgumentParser( + description="Process comments with language detection, translation, and sentiment analysis from multiple data sources" + ) + parser.add_argument( + '--limit', + type=int, + default=5000, + help='Limit number of comments to process per data source (default: 10000)' + ) + parser.add_argument( + '--overwrite', + action='store_true', + default=False, + help='Overwrite existing Snowflake table (default: False, appends new records)' + ) + parser.add_argument( + '--config', + type=str, + default=None, + help='Path to configuration file (default: config_files/sentiment_config.json relative to script)' + ) + parser.add_argument( + '--sequential', + action='store_true', + default=False, + help='Use sequential processing instead of parallel (for debugging)' + ) + parser.add_argument( + '--data-source', + type=str, + default=None, + help='Process only a specific data source (e.g., social_media, musora_comments). If not specified, all enabled sources are processed.' 
+ ) + + args = parser.parse_args() + + # Create logs directory if it doesn't exist + logs_dir = os.path.join(SCRIPT_DIR, 'logs') + os.makedirs(logs_dir, exist_ok=True) + + # Initialize and run processor + processor = CommentProcessor(config_path=args.config) + processor.run( + limit=args.limit, + overwrite=args.overwrite, + sequential=args.sequential, + data_source_filter=args.data_source + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/processing_comments/requirements.txt b/processing_comments/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0497e50ea8832353ffdc5eec709f03b68b453e82 --- /dev/null +++ b/processing_comments/requirements.txt @@ -0,0 +1,9 @@ +snowflake-snowpark-python>=1.0.0 +pandas>=1.3.0 +python-dotenv>=0.19.0 +openai>=1.0.0 +langchain>=0.1.0 +langchain-openai>=0.0.5 +langgraph>=0.0.20 +lingua-language-detector>=2.0.0 +pydantic>=2.0.0 \ No newline at end of file diff --git a/processing_comments/sql/create_ml_features_table.sql b/processing_comments/sql/create_ml_features_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..68084627903a1370377bb3ac17cefc4c4969b34a --- /dev/null +++ b/processing_comments/sql/create_ml_features_table.sql @@ -0,0 +1,127 @@ +-- Create table in ML_FEATURES schema to store comment sentiment analysis results +-- This table stores the output from the language detection, translation, and sentiment analysis workflow + +USE DATABASE SOCIAL_MEDIA_DB; +USE SCHEMA ML_FEATURES; + +CREATE TABLE IF NOT EXISTS COMMENT_SENTIMENT_FEATURES ( + -- Primary identifiers + COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Surrogate key from FACT_COMMENTS', + COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Platform comment ID', + ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text', + PLATFORM VARCHAR(16777216) COMMENT 'Social platform', + COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted', + AUTHOR_NAME 
VARCHAR(16777216) COMMENT 'Commenter name', + AUTHOR_ID VARCHAR(16777216) COMMENT 'Platform user ID', + + -- Parent comment information + PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply', + PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context', + + -- Content references + CONTENT_SK NUMBER(38,0) COMMENT 'Foreign key to content', + CONTENT_ID VARCHAR(16777216) COMMENT 'Platform content ID', + CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content description/message', + + -- Channel references + CHANNEL_SK NUMBER(38,0) COMMENT 'Foreign key to channel', + CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name', + CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name', + + -- Language detection features + DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)', + LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)', + IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English', + LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low', + DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default', + HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)', + + -- Translation features + TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)', + TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed', + TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level', + TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation', + + -- Sentiment analysis features + SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative', + INTENT VARCHAR(500) COMMENT 'Multi-label intents (comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo', + REQUIRES_REPLY BOOLEAN COMMENT 'True if comment 
requires a response (genuine questions/requests only)', + SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low', + ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization', + + -- Processing metadata + PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully', + PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing', + PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed', + WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow', + + -- Audit fields + CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time', + UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time' +) +COMMENT='ML Features table for language detection, translation, and sentiment analysis results from social media comments'; + +-- Create indexes for common queries +-- Note: Snowflake automatically optimizes queries, but we can define clustering keys +ALTER TABLE COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME); + +-- Create view for comments requiring reply +CREATE OR REPLACE VIEW VW_COMMENTS_REQUIRING_REPLY AS +SELECT + COMMENT_SK, + COMMENT_ID, + ORIGINAL_TEXT, + TRANSLATED_TEXT, + PARENT_COMMENT_ID, + PARENT_COMMENT_TEXT, + INTENT, + SENTIMENT_POLARITY, + SENTIMENT_CONFIDENCE, + CHANNEL_NAME, + AUTHOR_NAME, + COMMENT_TIMESTAMP, + PLATFORM, + CONTENT_DESCRIPTION +FROM COMMENT_SENTIMENT_FEATURES +WHERE REQUIRES_REPLY = TRUE + AND PROCESSING_SUCCESS = TRUE +ORDER BY COMMENT_TIMESTAMP DESC; + +-- Create view for sentiment distribution +CREATE OR REPLACE VIEW VW_SENTIMENT_DISTRIBUTION AS +SELECT + CHANNEL_NAME, + SENTIMENT_POLARITY, + INTENT, + COUNT(*) AS COMMENT_COUNT, + COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED, + COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS, + AVG(CASE WHEN SENTIMENT_CONFIDENCE = 
'high' THEN 3 + WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2 + WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1 + ELSE 0 END) AS AVG_CONFIDENCE_SCORE, + MAX(PROCESSED_AT) AS LAST_PROCESSED +FROM COMMENT_SENTIMENT_FEATURES +WHERE PROCESSING_SUCCESS = TRUE +GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT +ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC; + +-- Create view for non-English comments +CREATE OR REPLACE VIEW VW_NON_ENGLISH_COMMENTS AS +SELECT + COMMENT_SK, + COMMENT_ID, + ORIGINAL_TEXT, + DETECTED_LANGUAGE, + LANGUAGE_CODE, + TRANSLATED_TEXT, + TRANSLATION_CONFIDENCE, + SENTIMENT_POLARITY, + INTENT, + CHANNEL_NAME, + COMMENT_TIMESTAMP, + PLATFORM +FROM COMMENT_SENTIMENT_FEATURES +WHERE IS_ENGLISH = FALSE + AND PROCESSING_SUCCESS = TRUE +ORDER BY COMMENT_TIMESTAMP DESC; \ No newline at end of file diff --git a/processing_comments/sql/create_musora_ml_features_table.sql b/processing_comments/sql/create_musora_ml_features_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..e517b2b81580bfd5702bbffaf799aeb7c3304711 --- /dev/null +++ b/processing_comments/sql/create_musora_ml_features_table.sql @@ -0,0 +1,135 @@ +-- Create table in ML_FEATURES schema to store Musora comment sentiment analysis results +-- This table stores the output from the language detection, translation, and sentiment analysis workflow +-- Schema matches COMMENT_SENTIMENT_FEATURES with additional Musora-specific fields + +USE DATABASE SOCIAL_MEDIA_DB; +USE SCHEMA ML_FEATURES; + +CREATE TABLE IF NOT EXISTS MUSORA_COMMENT_SENTIMENT_FEATURES ( + -- Primary identifiers + COMMENT_SK NUMBER(38,0) NOT NULL COMMENT 'Generated surrogate key (hash of COMMENT_ID and PLATFORM)', + COMMENT_ID VARCHAR(16777216) NOT NULL COMMENT 'Musora comment ID', + ORIGINAL_TEXT VARCHAR(16777216) COMMENT 'Original comment text', + PLATFORM VARCHAR(16777216) COMMENT 'Musora platform/brand', + COMMENT_TIMESTAMP TIMESTAMP_NTZ(9) COMMENT 'When comment was posted', + AUTHOR_NAME VARCHAR(16777216) COMMENT 
'Commenter name', + AUTHOR_ID VARCHAR(16777216) COMMENT 'User ID', + + -- Parent comment information + PARENT_COMMENT_ID VARCHAR(16777216) COMMENT 'ID of parent comment if this is a reply', + PARENT_COMMENT_TEXT VARCHAR(16777216) COMMENT 'Text of parent comment for context', + + -- Content references + CONTENT_SK NUMBER(38,0) COMMENT 'Generated surrogate key for content', + CONTENT_ID VARCHAR(16777216) COMMENT 'Content ID', + CONTENT_DESCRIPTION VARCHAR(16777216) COMMENT 'Content profile/description', + + -- Channel references + CHANNEL_SK NUMBER(38,0) COMMENT 'Generated surrogate key for channel', + CHANNEL_NAME VARCHAR(16777216) COMMENT 'Brand/channel name', + CHANNEL_DISPLAY_NAME VARCHAR(16777216) COMMENT 'Channel display name', + + -- Musora-specific fields + PERMALINK_URL VARCHAR(16777216) COMMENT 'Web URL path of the content', + THUMBNAIL_URL VARCHAR(16777216) COMMENT 'Thumbnail URL of the content', + + -- Language detection features + DETECTED_LANGUAGE VARCHAR(100) COMMENT 'Detected language name (e.g., English, Spanish)', + LANGUAGE_CODE VARCHAR(10) COMMENT 'ISO 639-1 language code (e.g., en, es)', + IS_ENGLISH BOOLEAN COMMENT 'True if comment is in English', + LANGUAGE_CONFIDENCE VARCHAR(20) COMMENT 'Confidence level: high, medium, low', + DETECTION_METHOD VARCHAR(50) COMMENT 'Method used: library, llm, or default', + HAS_TEXT BOOLEAN COMMENT 'True if comment has textual content (not just emojis)', + + -- Translation features + TRANSLATED_TEXT VARCHAR(16777216) COMMENT 'English translation (or original if already English)', + TRANSLATION_PERFORMED BOOLEAN COMMENT 'True if translation was performed', + TRANSLATION_CONFIDENCE VARCHAR(20) COMMENT 'Translation confidence level', + TRANSLATION_NOTES VARCHAR(16777216) COMMENT 'Notes about translation', + + -- Sentiment analysis features + SENTIMENT_POLARITY VARCHAR(20) COMMENT 'Sentiment: very_positive, positive, neutral, negative, very_negative', + INTENT VARCHAR(500) COMMENT 'Multi-label intents 
(comma-separated): praise, question, request, feedback_negative, suggestion, humor_sarcasm, off_topic, spam_selfpromo', + REQUIRES_REPLY BOOLEAN COMMENT 'True if comment requires a response (genuine questions/requests only)', + SENTIMENT_CONFIDENCE VARCHAR(20) COMMENT 'Sentiment analysis confidence: high, medium, low', + ANALYSIS_NOTES VARCHAR(16777216) COMMENT 'Concise notes with key topics/highlights for summarization', + + -- Processing metadata + PROCESSING_SUCCESS BOOLEAN COMMENT 'True if processing completed successfully', + PROCESSING_ERRORS VARCHAR(16777216) COMMENT 'Any errors encountered during processing', + PROCESSED_AT TIMESTAMP_NTZ(9) COMMENT 'When this record was processed', + WORKFLOW_VERSION VARCHAR(20) COMMENT 'Version of the processing workflow', + + -- Audit fields + CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record creation time', + UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() COMMENT 'Record update time' +) +COMMENT='ML Features table for language detection, translation, and sentiment analysis results from Musora internal app comments'; + +-- Create indexes for common queries +-- Note: Snowflake automatically optimizes queries, but we can define clustering keys +ALTER TABLE MUSORA_COMMENT_SENTIMENT_FEATURES CLUSTER BY (COMMENT_TIMESTAMP, CHANNEL_NAME); + +-- Create view for Musora comments requiring reply +CREATE OR REPLACE VIEW VW_MUSORA_COMMENTS_REQUIRING_REPLY AS +SELECT + COMMENT_SK, + COMMENT_ID, + ORIGINAL_TEXT, + TRANSLATED_TEXT, + PARENT_COMMENT_ID, + PARENT_COMMENT_TEXT, + INTENT, + SENTIMENT_POLARITY, + SENTIMENT_CONFIDENCE, + CHANNEL_NAME, + AUTHOR_ID, + COMMENT_TIMESTAMP, + PLATFORM, + CONTENT_DESCRIPTION, + PERMALINK_URL, + THUMBNAIL_URL +FROM MUSORA_COMMENT_SENTIMENT_FEATURES +WHERE REQUIRES_REPLY = TRUE + AND PROCESSING_SUCCESS = TRUE +ORDER BY COMMENT_TIMESTAMP DESC; + +-- Create view for Musora sentiment distribution +CREATE OR REPLACE VIEW VW_MUSORA_SENTIMENT_DISTRIBUTION AS +SELECT + 
CHANNEL_NAME, + SENTIMENT_POLARITY, + INTENT, + COUNT(*) AS COMMENT_COUNT, + COUNT(CASE WHEN REQUIRES_REPLY THEN 1 END) AS REPLIES_NEEDED, + COUNT(CASE WHEN PARENT_COMMENT_ID IS NOT NULL THEN 1 END) AS REPLY_COMMENTS, + AVG(CASE WHEN SENTIMENT_CONFIDENCE = 'high' THEN 3 + WHEN SENTIMENT_CONFIDENCE = 'medium' THEN 2 + WHEN SENTIMENT_CONFIDENCE = 'low' THEN 1 + ELSE 0 END) AS AVG_CONFIDENCE_SCORE, + MAX(PROCESSED_AT) AS LAST_PROCESSED +FROM MUSORA_COMMENT_SENTIMENT_FEATURES +WHERE PROCESSING_SUCCESS = TRUE +GROUP BY CHANNEL_NAME, SENTIMENT_POLARITY, INTENT +ORDER BY CHANNEL_NAME, COMMENT_COUNT DESC; + +-- Create view for non-English Musora comments +CREATE OR REPLACE VIEW VW_MUSORA_NON_ENGLISH_COMMENTS AS +SELECT + COMMENT_SK, + COMMENT_ID, + ORIGINAL_TEXT, + DETECTED_LANGUAGE, + LANGUAGE_CODE, + TRANSLATED_TEXT, + TRANSLATION_CONFIDENCE, + SENTIMENT_POLARITY, + INTENT, + CHANNEL_NAME, + COMMENT_TIMESTAMP, + PLATFORM, + PERMALINK_URL +FROM MUSORA_COMMENT_SENTIMENT_FEATURES +WHERE IS_ENGLISH = FALSE + AND PROCESSING_SUCCESS = TRUE +ORDER BY COMMENT_TIMESTAMP DESC; diff --git a/processing_comments/sql/fetch_comments.sql b/processing_comments/sql/fetch_comments.sql new file mode 100644 index 0000000000000000000000000000000000000000..38ae7c123533a52605fe345209c9cec8d4d016fa --- /dev/null +++ b/processing_comments/sql/fetch_comments.sql @@ -0,0 +1,76 @@ +-- Query to fetch comments with all required information +-- Includes: comments, parent comments, timestamps, IDs, channel info, and content description +-- Excludes comments that have already been processed (present in ML_FEATURES table) + +SELECT + fc.COMMENT_SK, + fc.COMMENT_ID, + fc.PLATFORM, + fc.MESSAGE AS COMMENT_TEXT, + fc.CREATED_TIME AS COMMENT_TIMESTAMP, + fc.AUTHOR_NAME, + fc.AUTHOR_ID, + fc.LIKE_COUNT, + fc.PARENT_COMMENT_ID, + fc.REPLIES_COUNT, + fc.COMMENT_LENGTH, + fc.IS_ACTIVE AS COMMENT_IS_ACTIVE, + + -- Parent comment information (self-join to get parent comment text) + parent_fc.MESSAGE AS 
PARENT_COMMENT_TEXT, + + -- Content information (content description is in MESSAGE column) + dc.CONTENT_SK, + dc.CONTENT_ID, + dc.CONTENT_TYPE, + dc.MESSAGE AS CONTENT_DESCRIPTION, + dc.TITLE AS CONTENT_TITLE, + dc.PERMALINK_URL, + dc.CREATED_TIME AS CONTENT_TIMESTAMP, + dc.SHARES_COUNT, + dc.REACTIONS_TOTAL, + dc.COMMENTS_TOTAL, + dc.HASHTAGS, + + -- Channel information + dch.CHANNEL_SK, + dch.CHANNEL_NAME, + dch.CHANNEL_DISPLAY_NAME, + dch.PAGE_ID, + dch.CHANNEL_URL, + dch.IS_ACTIVE AS CHANNEL_IS_ACTIVE + +FROM + SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS fc + +-- Left join to get parent comment text if it exists +LEFT JOIN + SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS parent_fc + ON fc.PARENT_COMMENT_ID = parent_fc.COMMENT_ID + AND fc.PLATFORM = parent_fc.PLATFORM + +INNER JOIN + SOCIAL_MEDIA_DB.CORE.DIM_CONTENT dc + ON fc.CONTENT_SK = dc.CONTENT_SK + +INNER JOIN + SOCIAL_MEDIA_DB.CORE.DIM_CHANNEL dch + ON dc.CHANNEL_NAME = dch.CHANNEL_NAME + AND dc.PLATFORM = dch.PLATFORM + +LEFT JOIN + SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES csf + ON fc.COMMENT_SK = csf.COMMENT_SK + +WHERE + fc.IS_ACTIVE = TRUE + AND dc.IS_ACTIVE = TRUE + AND dch.IS_ACTIVE = TRUE + AND (fc.AUTHOR_NAME IS NULL OR fc.AUTHOR_NAME NOT IN ('Musora', 'Drumeo', 'Pianote', '@PianoteOfficial', '@DrumeoOfficial', '@MusoraOfficial')) + AND csf.COMMENT_SK IS NULL -- Exclude comments already processed + AND fc.MESSAGE IS NOT NULL -- Exclude NULL comments + AND TRIM(fc.MESSAGE) != '' -- Exclude empty or whitespace-only comments + AND LENGTH(TRIM(fc.MESSAGE)) > 0 -- Double-check for non-empty content + +ORDER BY + fc.CREATED_TIME DESC \ No newline at end of file diff --git a/processing_comments/sql/fetch_musora_comments.sql b/processing_comments/sql/fetch_musora_comments.sql new file mode 100644 index 0000000000000000000000000000000000000000..2d15b7bf3e66bf69861ad0073a00faefcef22a8f --- /dev/null +++ b/processing_comments/sql/fetch_musora_comments.sql @@ -0,0 +1,54 @@ +-- Query to fetch Musora internal app 
comments with all required information +-- Includes: comments, parent comments, timestamps, IDs, brand info, and content description +-- Excludes comments that have already been processed (present in ML_FEATURES table) + +SELECT + mc.COMMENT_ID, + mc.PLATFORM, + mc.BRAND, + mc.MESSAGE AS COMMENT_TEXT, -- Alias to match expected column name + mc.PARENT_COMMENT_ID, + -- Self-join to get parent comment text + parent.MESSAGE AS PARENT_COMMENT_TEXT, + mc.CONTENT_ID, + mc.USER_ID AS AUTHOR_ID, -- Map USER_ID to AUTHOR_ID + NULL AS AUTHOR_NAME, -- Musora comments don't have author names + mc.CREATED_ON AS COMMENT_TIMESTAMP, + mc.CONTENT_PROFILE AS CONTENT_DESCRIPTION, -- Map CONTENT_PROFILE to CONTENT_DESCRIPTION + NULL AS CONTENT_TITLE, -- Not available in Musora data + mc.WEB_URL_PATH AS PERMALINK_URL, + mc.THUMBNAIL_URL, + + -- For consistency with social media processing, use brand as channel info + mc.BRAND AS CHANNEL_NAME, + mc.BRAND AS CHANNEL_DISPLAY_NAME, + + -- Generate surrogate keys for consistency (using hash of comment_id and platform) + -- This ensures uniqueness across platforms + HASH(mc.COMMENT_ID, mc.PLATFORM) AS COMMENT_SK, + HASH(mc.CONTENT_ID, mc.PLATFORM) AS CONTENT_SK, + HASH(mc.BRAND, mc.PLATFORM) AS CHANNEL_SK + +FROM + SOCIAL_MEDIA_DB.CORE.MUSORA_COMMENTS mc + +-- Left join to get parent comment text if it exists +LEFT JOIN + SOCIAL_MEDIA_DB.CORE.MUSORA_COMMENTS parent + ON mc.PARENT_COMMENT_ID = parent.COMMENT_ID + AND mc.PLATFORM = parent.PLATFORM + +-- Exclude comments already processed +LEFT JOIN + SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES mcsf + ON mc.COMMENT_ID = mcsf.COMMENT_ID + AND mc.PLATFORM = mcsf.PLATFORM + +WHERE + mcsf.COMMENT_ID IS NULL -- Exclude comments already processed + AND mc.MESSAGE IS NOT NULL -- Exclude NULL comments + AND TRIM(mc.MESSAGE) != '' -- Exclude empty or whitespace-only comments + AND LENGTH(TRIM(mc.MESSAGE)) > 0 -- Double-check for non-empty content + +ORDER BY + mc.CREATED_ON DESC diff 
--git a/processing_comments/sql/init_musora_table.sql b/processing_comments/sql/init_musora_table.sql new file mode 100644 index 0000000000000000000000000000000000000000..4cf308baa60876064ed26ce42d70e3a092ec5ffa --- /dev/null +++ b/processing_comments/sql/init_musora_table.sql @@ -0,0 +1,70 @@ +-- Initialize empty MUSORA_COMMENT_SENTIMENT_FEATURES table +-- Run this ONCE before the first processing run for Musora comments +-- This creates just the table structure without any data + +USE DATABASE SOCIAL_MEDIA_DB; +USE SCHEMA ML_FEATURES; + +-- Create the table if it doesn't exist +CREATE TABLE IF NOT EXISTS MUSORA_COMMENT_SENTIMENT_FEATURES ( + -- Primary identifiers + COMMENT_SK NUMBER(38,0) NOT NULL, + COMMENT_ID VARCHAR(16777216) NOT NULL, + ORIGINAL_TEXT VARCHAR(16777216), + PLATFORM VARCHAR(16777216), + COMMENT_TIMESTAMP TIMESTAMP_NTZ(9), + AUTHOR_NAME VARCHAR(16777216), + AUTHOR_ID VARCHAR(16777216), + + -- Parent comment information + PARENT_COMMENT_ID VARCHAR(16777216), + PARENT_COMMENT_TEXT VARCHAR(16777216), + + -- Content references + CONTENT_SK NUMBER(38,0), + CONTENT_ID VARCHAR(16777216), + CONTENT_DESCRIPTION VARCHAR(16777216), + + -- Channel references + CHANNEL_SK NUMBER(38,0), + CHANNEL_NAME VARCHAR(16777216), + CHANNEL_DISPLAY_NAME VARCHAR(16777216), + + -- Musora-specific fields + PERMALINK_URL VARCHAR(16777216), + THUMBNAIL_URL VARCHAR(16777216), + + -- Language detection features + DETECTED_LANGUAGE VARCHAR(100), + LANGUAGE_CODE VARCHAR(10), + IS_ENGLISH BOOLEAN, + LANGUAGE_CONFIDENCE VARCHAR(20), + DETECTION_METHOD VARCHAR(50), + HAS_TEXT BOOLEAN, + + -- Translation features + TRANSLATED_TEXT VARCHAR(16777216), + TRANSLATION_PERFORMED BOOLEAN, + TRANSLATION_CONFIDENCE VARCHAR(20), + TRANSLATION_NOTES VARCHAR(16777216), + + -- Sentiment analysis features + SENTIMENT_POLARITY VARCHAR(20), + INTENT VARCHAR(500), + REQUIRES_REPLY BOOLEAN, + SENTIMENT_CONFIDENCE VARCHAR(20), + ANALYSIS_NOTES VARCHAR(16777216), + + -- Processing metadata + 
PROCESSING_SUCCESS BOOLEAN, + PROCESSING_ERRORS VARCHAR(16777216), + PROCESSED_AT TIMESTAMP_NTZ(9), + WORKFLOW_VERSION VARCHAR(20), + + -- Audit fields + CREATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP(), + UPDATED_AT TIMESTAMP_NTZ(9) DEFAULT CURRENT_TIMESTAMP() +); + +-- Confirm table was created +SELECT 'Table MUSORA_COMMENT_SENTIMENT_FEATURES created successfully' AS STATUS; diff --git a/processing_comments/workflow/__init__.py b/processing_comments/workflow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57deb10ecc10e12130f6cb4c3dd6c6109445923a --- /dev/null +++ b/processing_comments/workflow/__init__.py @@ -0,0 +1,7 @@ +""" +Workflow module for orchestrating agent-based comment processing. +""" + +from workflow.comment_processor import CommentProcessingWorkflow, CommentState + +__all__ = ["CommentProcessingWorkflow", "CommentState"] \ No newline at end of file diff --git a/processing_comments/workflow/comment_processor.py b/processing_comments/workflow/comment_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..487ab2ce07ca99d5f376c067003b41c57584f27c --- /dev/null +++ b/processing_comments/workflow/comment_processor.py @@ -0,0 +1,436 @@ +""" +Comment Processing Workflow using LangGraph +Orchestrates language detection and translation agents in a scalable workflow +""" + +from typing import Dict, Any, List, TypedDict, Annotated +import operator +import json +import os +from langgraph.graph import StateGraph, END +from agents.language_detection_agent import LanguageDetectionAgent +from agents.translation_agent import TranslationAgent +from agents.sentiment_analysis_agent import SentimentAnalysisAgent +import logging + +logger = logging.getLogger(__name__) + + +class CommentState(TypedDict): + """ + State definition for the comment processing workflow. + This can be extended to add more fields as the workflow grows. 
+ """ + # Input fields + comment_sk: int + comment_id: str + comment_text: str + comment_timestamp: Any + author_name: str + author_id: str + platform: str + + # Parent comment fields + parent_comment_id: str + parent_comment_text: str + + # Content fields + content_sk: int + content_id: str + content_description: str + content_title: str + + # Channel fields + channel_sk: int + channel_name: str + channel_display_name: str + + # Processing fields + language: str + language_code: str + is_english: bool + language_confidence: str + detection_method: str + has_text: bool + + # Translation fields + translated_text: str + translation_performed: bool + translation_confidence: str + translation_notes: str + + # Sentiment analysis fields + sentiment_polarity: str + intent: str + requires_reply: bool + sentiment_confidence: str + analysis_notes: str + + # Metadata + processing_errors: Annotated[List[str], operator.add] + success: bool + + +class CommentProcessingWorkflow: + """ + LangGraph-based workflow for processing social media comments. + Orchestrates agents in a flexible, extensible graph structure. + """ + + def __init__(self, config: Dict[str, Any], api_key: str): + """ + Initialize the workflow with agents and configuration. 
+ + Args: + config: Configuration dictionary + api_key: OpenAI API key + """ + self.config = config + self.api_key = api_key + + # Initialize agents + lang_detect_config = config["agents"]["language_detection"] + translation_config = config["agents"]["translation"] + sentiment_config = config["agents"]["sentiment_analysis"] + + # Load sentiment categories + sentiment_categories_path = config.get("sentiment_categories_config") + if sentiment_categories_path is None: + # Default to config_files/sentiment_analysis_config.json relative to this script's parent directory + workflow_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(workflow_dir) + sentiment_categories_path = os.path.join(parent_dir, 'config_files', 'sentiment_analysis_config.json') + + with open(sentiment_categories_path, 'r') as f: + sentiment_categories = json.load(f) + + self.language_agent = LanguageDetectionAgent(lang_detect_config, api_key) + self.translation_agent = TranslationAgent(translation_config, api_key) + self.sentiment_agent = SentimentAnalysisAgent(sentiment_config, api_key, sentiment_categories) + + # Build the workflow graph + self.workflow = self._build_workflow() + + logger.info("CommentProcessingWorkflow initialized successfully") + + def _build_workflow(self) -> StateGraph: + """ + Build the LangGraph workflow. + This defines the sequence of agent operations. 
+ + Returns: + Compiled StateGraph workflow + """ + # Create the graph + workflow = StateGraph(CommentState) + + # Add nodes (agents) + workflow.add_node("language_detection", self._language_detection_node) + workflow.add_node("translation", self._translation_node) + workflow.add_node("sentiment_analysis", self._sentiment_analysis_node) + + # Define edges (workflow sequence) + workflow.set_entry_point("language_detection") + + # After language detection, decide whether to translate + workflow.add_conditional_edges( + "language_detection", + self._should_translate, + { + "translate": "translation", + "skip_translation": "sentiment_analysis" + } + ) + + # After translation, proceed to sentiment analysis + workflow.add_edge("translation", "sentiment_analysis") + + # After sentiment analysis, end the workflow + workflow.add_edge("sentiment_analysis", END) + + # Compile the graph + return workflow.compile() + + def _language_detection_node(self, state: CommentState) -> CommentState: + """ + Node for language detection. 
+ + Args: + state: Current workflow state + + Returns: + Updated state with language detection results + """ + try: + # Prepare input for language detection agent + input_data = { + "comment_text": state["comment_text"] + } + + # Process with language detection agent + result = self.language_agent.process(input_data) + + # Update state with results + if result.get("success", False): + state["language"] = result.get("language", "English") + state["language_code"] = result.get("language_code", "en") + state["is_english"] = result.get("is_english", True) + state["language_confidence"] = result.get("confidence", "medium") + state["detection_method"] = result.get("detection_method", "unknown") + state["has_text"] = result.get("has_text", True) + state["success"] = True + else: + error_msg = f"Language detection failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Set defaults + state["language"] = "English" + state["language_code"] = "en" + state["is_english"] = True + state["language_confidence"] = "low" + state["detection_method"] = "default" + state["has_text"] = True + + logger.debug(f"Language detected: {state['language']}") + return state + + except Exception as e: + error_msg = f"Language detection node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Set safe defaults + state["language"] = "English" + state["language_code"] = "en" + state["is_english"] = True + state["language_confidence"] = "low" + state["detection_method"] = "error_default" + state["has_text"] = True + return state + + def _translation_node(self, state: CommentState) -> CommentState: + """ + Node for translation. 
+ + Args: + state: Current workflow state + + Returns: + Updated state with translation results + """ + try: + # Prepare input for translation agent + input_data = { + "comment_text": state["comment_text"], + "is_english": state["is_english"], + "language": state["language"], + "language_code": state["language_code"] + } + + # Process with translation agent + result = self.translation_agent.process(input_data) + + # Update state with results + if result.get("success", False): + state["translated_text"] = result.get("translated_text", state["comment_text"]) + state["translation_performed"] = result.get("translation_performed", False) + state["translation_confidence"] = result.get("translation_confidence", "N/A") + state["translation_notes"] = result.get("translation_notes", "") + else: + error_msg = f"Translation failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Set fallback values + state["translated_text"] = state["comment_text"] + state["translation_performed"] = False + state["translation_confidence"] = "N/A" + state["translation_notes"] = "Translation error" + + logger.debug(f"Translation performed: {state['translation_performed']}") + return state + + except Exception as e: + error_msg = f"Translation node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Set safe defaults + state["translated_text"] = state["comment_text"] + state["translation_performed"] = False + state["translation_confidence"] = "N/A" + state["translation_notes"] = "Error during translation" + return state + + def _sentiment_analysis_node(self, state: CommentState) -> CommentState: + """ + Node for sentiment analysis. 
+ + Args: + state: Current workflow state + + Returns: + Updated state with sentiment analysis results + """ + try: + # Prepare input for sentiment analysis agent + # Use translated text if available, otherwise use original + text_to_analyze = state.get("translated_text", state["comment_text"]) + + input_data = { + "comment_text": text_to_analyze, + "content_description": state["content_description"], + "parent_comment_text": state.get("parent_comment_text"), + "platform": state.get("platform"), + "content_title": state.get("content_title") + } + + # Process with sentiment analysis agent + result = self.sentiment_agent.process(input_data) + + # Update state with results + if result.get("success", False): + state["sentiment_polarity"] = result.get("sentiment_polarity") + state["intent"] = result.get("intent") + state["requires_reply"] = result.get("requires_reply", False) + state["sentiment_confidence"] = result.get("sentiment_confidence") + state["analysis_notes"] = result.get("analysis_notes", "") + state["success"] = True + else: + error_msg = f"Sentiment analysis failed: {result.get('error', 'Unknown error')}" + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Mark as unsuccessful - this comment will not be stored + state["success"] = False + state["sentiment_polarity"] = None + state["intent"] = None + state["requires_reply"] = False + state["sentiment_confidence"] = None + state["analysis_notes"] = "Sentiment analysis failed" + + logger.debug(f"Sentiment: {state['sentiment_polarity']}, Intent: {state['intent']}") + return state + + except Exception as e: + error_msg = f"Sentiment analysis node error: {str(e)}" + logger.error(error_msg) + state["processing_errors"] = state.get("processing_errors", []) + [error_msg] + # Mark as unsuccessful - this comment will not be stored + state["success"] = False + state["sentiment_polarity"] = None + state["intent"] = None + state["requires_reply"] = False + state["sentiment_confidence"] = 
None + state["analysis_notes"] = "Error during sentiment analysis" + return state + + def _should_translate(self, state: CommentState) -> str: + """ + Decision function to determine if translation is needed. + + Args: + state: Current workflow state + + Returns: + Edge name to follow + """ + # If already English or no text, skip translation + if state.get("is_english", True) or not state.get("has_text", True): + # Set default values for skipped translation + state["translated_text"] = state["comment_text"] + state["translation_performed"] = False + state["translation_confidence"] = "N/A" + state["translation_notes"] = "Translation not needed" + return "skip_translation" + else: + return "translate" + + def process_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process a single comment through the workflow. + + Args: + comment_data: Dictionary containing comment data + + Returns: + Dictionary with processed results + """ + try: + # Get comment text and validate it's not empty + comment_text = comment_data.get("comment_text", "") + + # Skip empty comments (None, empty string, or whitespace only) + if not comment_text or not str(comment_text).strip(): + logger.warning(f"Skipping empty comment: {comment_data.get('comment_id')}") + # Return all original data plus error info + return { + **comment_data, + "success": False, + "processing_errors": ["Comment text is empty or whitespace only"], + "comment_text": comment_text + } + + # Initialize state with comment data + initial_state = { + "comment_sk": comment_data.get("comment_sk"), + "comment_id": comment_data.get("comment_id"), + "comment_text": str(comment_text).strip(), # Use trimmed text + "comment_timestamp": comment_data.get("comment_timestamp"), + "author_name": comment_data.get("author_name"), + "author_id": comment_data.get("author_id"), + "platform": comment_data.get("platform"), + "parent_comment_id": comment_data.get("parent_comment_id"), + "parent_comment_text": 
comment_data.get("parent_comment_text"), + "content_sk": comment_data.get("content_sk"), + "content_id": comment_data.get("content_id"), + "content_description": comment_data.get("content_description"), + "content_title": comment_data.get("content_title"), + "channel_sk": comment_data.get("channel_sk"), + "channel_name": comment_data.get("channel_name"), + "channel_display_name": comment_data.get("channel_display_name"), + "processing_errors": [], + "success": True + } + + # Run the workflow + final_state = self.workflow.invoke(initial_state) + + # Merge final_state with original comment_data to preserve additional fields + # (like permalink_url, thumbnail_url for Musora comments) + result = dict(final_state) + + # Add any fields from comment_data that weren't in initial_state + for key, value in comment_data.items(): + if key not in result: + result[key] = value + logger.debug(f"Preserved additional field from source: {key}") + + return result + + except Exception as e: + logger.error(f"Workflow execution error: {str(e)}") + return { + **comment_data, + "success": False, + "processing_errors": [str(e)], + "language": "English", + "language_code": "en", + "is_english": True, + "translated_text": comment_data.get("comment_text", ""), + "translation_performed": False + } + + def process_batch(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Process a batch of comments. 
+ + Args: + comments: List of comment dictionaries + + Returns: + List of processed comment dictionaries + """ + results = [] + total = len(comments) + + for idx, comment in enumerate(comments, 1): + logger.info(f"Processing comment {idx}/{total}") + result = self.process_comment(comment) + results.append(result) + + logger.info(f"Batch processing complete: {total} comments processed") + return results \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 28d994e22f8dd432b51df193562052e315ad95f7..2ac9081aa3a574cb920f02aae6475bbb64fc5df2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,33 @@ -altair -pandas -streamlit \ No newline at end of file +# Brand Sentiment - Visualization & Processing Requirements +# Install with: pip install -r requirements.txt + +# Core visualization +streamlit==1.50.0 +plotly==6.3.1 + +# Data processing +pandas==2.3.2 +numpy==2.0.2 +python-dateutil==2.9.0.post0 + +# Snowflake connectivity +snowflake-snowpark-python==1.39.0 + +# Environment management +python-dotenv==1.1.1 + +# AI / LLM (visualization agents + processing pipeline) +openai==1.108.0 +langchain==0.3.27 +langchain-openai==0.3.34 +langgraph==0.6.8 + +# Language detection (processing pipeline) +lingua-language-detector==2.0.2 + +# HTML parsing (processing pipeline) +beautifulsoup4==4.14.3 + +# PDF report generation +fpdf2==2.8.4 +kaleido==1.2.0 diff --git a/src/streamlit_app.py b/src/streamlit_app.py deleted file mode 100644 index 99d0b84662681e7d21a08fcce44908344fa86f80..0000000000000000000000000000000000000000 --- a/src/streamlit_app.py +++ /dev/null @@ -1,40 +0,0 @@ -import altair as alt -import numpy as np -import pandas as pd -import streamlit as st - -""" -# Welcome to Streamlit! - -Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:. -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). 
- -In the meantime, below is an example of what you can do with just a few lines of code: -""" - -num_points = st.slider("Number of points in spiral", 1, 10000, 1100) -num_turns = st.slider("Number of turns in spiral", 1, 300, 31) - -indices = np.linspace(0, 1, num_points) -theta = 2 * np.pi * num_turns * indices -radius = indices - -x = radius * np.cos(theta) -y = radius * np.sin(theta) - -df = pd.DataFrame({ - "x": x, - "y": y, - "idx": indices, - "rand": np.random.randn(num_points), -}) - -st.altair_chart(alt.Chart(df, height=700, width=700) - .mark_point(filled=True) - .encode( - x=alt.X("x", axis=None), - y=alt.Y("y", axis=None), - color=alt.Color("idx", legend=None, scale=alt.Scale()), - size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])), - )) \ No newline at end of file diff --git a/visualization/README.md b/visualization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3e1be22dc2f61f5b03952eb063f1180579bcd575 --- /dev/null +++ b/visualization/README.md @@ -0,0 +1,309 @@ +# Musora Sentiment Analysis Dashboard + +A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter) and the **Musora internal app** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora). + +--- + +## Table of Contents + +1. [Project Structure](#project-structure) +2. [How Data Flows](#how-data-flows) +3. [Data Loading Strategy](#data-loading-strategy) +4. [Pages](#pages) +5. [Global Filters & Session State](#global-filters--session-state) +6. [Snowflake Queries](#snowflake-queries) +7. [Adding or Changing Things](#adding-or-changing-things) +8. [Running the App](#running-the-app) +9. 
[Configuration Reference](#configuration-reference) + +--- + +## Project Structure + +``` +visualization/ +├── app.py # Entry point — routing, sidebar, session state +├── config/ +│ └── viz_config.json # Colors, query strings, dashboard settings +├── data/ +│ └── data_loader.py # All Snowflake queries and caching logic +├── utils/ +│ ├── data_processor.py # Pandas aggregations (intent dist, content summary, etc.) +│ └── metrics.py # KPI calculations (sentiment score, urgency, etc.) +├── components/ +│ ├── dashboard.py # Dashboard page renderer +│ ├── sentiment_analysis.py # Sentiment Analysis page renderer +│ └── reply_required.py # Reply Required page renderer +├── visualizations/ +│ ├── sentiment_charts.py # Plotly sentiment chart functions +│ ├── distribution_charts.py # Plotly distribution / heatmap / scatter functions +│ ├── demographic_charts.py # Plotly demographic chart functions +│ └── content_cards.py # Streamlit card components (comment cards, content cards) +├── agents/ +│ └── content_summary_agent.py # AI analysis agent (OpenAI) for comment summarisation +├── img/ +│ └── musora.png # Sidebar logo +└── SnowFlakeConnection.py # Snowflake connection wrapper (Snowpark session) +``` + +--- + +## How Data Flows + +``` +Snowflake + │ + ▼ +data_loader.py ← Three separate loading modes (see below) + │ + ├── load_dashboard_data() ──► st.session_state['dashboard_df'] + │ └─► app.py sidebar (filter options, counts) + │ └─► dashboard.py (all charts) + │ + ├── load_sa_data() ──► st.session_state['sa_contents'] + │ (on-demand, button) st.session_state['sa_comments'] + │ └─► sentiment_analysis.py + │ + └── load_reply_required_data() ► st.session_state['rr_df'] + (on-demand, button) └─► reply_required.py +``` + +**Key principle:** Data is loaded as little as possible, as late as possible. + +- The **Dashboard** uses a lightweight query (no text columns, no content join) cached for 24 hours. 
+- The **Sentiment Analysis** and **Reply Required** pages never load data automatically — they wait for the user to click **Fetch Data**. +- All data is stored in `st.session_state` so page navigation and widget interactions do not re-trigger Snowflake queries. + +--- + +## Data Loading Strategy + +All loading logic lives in **`data/data_loader.py`** (`SentimentDataLoader` class). + +### `load_dashboard_data()` +- Uses `dashboard_query` from `viz_config.json`. +- Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`. +- No text columns, no `DIM_CONTENT` join — significantly faster than the full query. +- Also merges demographics data if `demographics_query` is configured. +- Cached for **24 hours** (`@st.cache_data(ttl=86400)`). +- Called once by `app.py` at startup; result stored in `st.session_state['dashboard_df']`. + +### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, date_range)` +- Runs **two** sequential Snowflake queries: + 1. **Content aggregation** — groups by `content_sk`, counts per sentiment, computes severity score, returns top N. + 2. **Sampled comments** — for the top N `content_sk`s only, fetches up to 50 comments per sentiment group per content (negative, positive, other), using Snowflake `QUALIFY ROW_NUMBER()`. `display_text` is computed in SQL (`CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END`). +- Returns a tuple `(contents_df, comments_df)`. +- Cached for **24 hours**. +- Called only when the user clicks **Fetch Data** on the Sentiment Analysis page. + +### `load_reply_required_data(platforms, brands, date_range)` +- Runs a single query filtering `REQUIRES_REPLY = TRUE`. +- Dynamically includes/excludes the social media table and musora table based on selected platforms. +- `display_text` computed in SQL. +- Cached for **24 hours**. 
+- Called only when the user clicks **Fetch Data** on the Reply Required page. + +### Important: SQL Column Qualification +Both the social media table (`COMMENT_SENTIMENT_FEATURES`) and the content dimension table (`DIM_CONTENT`) share column names. Any `WHERE` clause inside a query that joins these two tables **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`, `s.CHANNEL_NAME`) to avoid Snowflake `ambiguous column name` errors. The musora table (`MUSORA_COMMENT_SENTIMENT_FEATURES`) has no joins so unqualified column names are fine there. + +--- + +## Pages + +### Dashboard (`components/dashboard.py`) + +**Receives:** `filtered_df` — the lightweight dashboard dataframe (after optional global filter applied by `app.py`). + +**Does not need:** text, translations, content URLs. All charts work purely on aggregated columns (sentiment_polarity, brand, platform, intent, requires_reply, comment_timestamp). + +**Key sections:** +- Summary stats + health indicator +- Sentiment distribution (pie + gauge) +- Sentiment by brand and platform (stacked + percentage bar charts) +- Intent analysis +- Brand-Platform heatmap +- Reply requirements + urgency breakdown +- Demographics (age, timezone, experience level) — only rendered if `author_id` is present and demographics were merged + +**To add a new chart:** create the chart function in `visualizations/` and call it from `render_dashboard()`. The function receives `filtered_df`. + +--- + +### Sentiment Analysis (`components/sentiment_analysis.py`) + +**Receives:** `data_loader` instance only (no dataframe). + +**Flow:** +1. Reads `st.session_state['dashboard_df']` for filter option lists (platforms, brands, sentiments, intents). +2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`. +3. Shows filter controls (platform, brand, sentiment, intent, top_n, min_comments, sort_by). +4. 
On **Fetch Data** click: calls `data_loader.load_sa_data(...)` and stores results in `st.session_state['sa_contents']` and `['sa_comments']`. +5. Renders content cards, per-content sentiment + intent charts, AI analysis buttons, and sampled comment expanders. + +**Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch. + +**Comments:** Sampled (up to 50 negative + 50 positive + 50 neutral per content). These are already in memory after the fetch — no extra query is needed when the user expands a comment section. + +**AI Analysis:** Uses `ContentSummaryAgent` (see `agents/`). Results cached in `st.session_state['content_summaries']`. + +--- + +### Reply Required (`components/reply_required.py`) + +**Receives:** `data_loader` instance only. + +**Flow:** +1. Reads `st.session_state['dashboard_df']` for filter option lists. +2. Pre-populates platform, brand, and date from `st.session_state['global_filters']`. +3. On **Fetch Data** click: calls `data_loader.load_reply_required_data(...)` and stores result in `st.session_state['rr_df']`. +4. Shows urgency breakdown, in-page view filters (priority, platform, brand, intent — applied in Python, no new query), paginated comment cards, and a "Reply by Content" summary. + +**Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch. + +--- + +## Global Filters & Session State + +Global filters live in the sidebar (`app.py`) and are stored in `st.session_state['global_filters']` as a dict: + +```python +{ + 'platforms': ['facebook', 'instagram'], # list or [] + 'brands': ['drumeo'], + 'sentiments': [], + 'date_range': (date(2025, 1, 1), date(2025, 12, 31)), # or None +} +``` + +- **Dashboard:** `app.py` applies global filters to `dashboard_df` using `data_loader.apply_filters()` and passes the result to `render_dashboard()`. +- **Sentiment Analysis / Reply Required:** global filters are used to pre-populate their own filter widgets. 
The actual Snowflake query uses those values when the user clicks Fetch. The pages do **not** receive a pre-filtered dataframe. + +### Full session state key reference + +| Key | Set by | Used by | +|-----|--------|---------| +| `dashboard_df` | `app.py` on startup | sidebar (filter options), dashboard, SA + RR (filter option lists) | +| `global_filters` | sidebar "Apply Filters" button | app.py (dashboard filter), SA + RR (pre-populate widgets) | +| `filters_applied` | sidebar buttons | app.py (whether to apply filters) | +| `sa_contents` | SA fetch button | SA page rendering | +| `sa_comments` | SA fetch button | SA page rendering | +| `sa_fetch_key` | SA fetch button | SA page (detect stale data) | +| `rr_df` | RR fetch button | RR page rendering | +| `rr_fetch_key` | RR fetch button | RR page (detect stale data) | +| `sentiment_page` | SA page / fetch | SA pagination | +| `reply_page` | RR page / fetch | RR pagination | +| `content_summaries` | AI analysis buttons | SA AI analysis display | + +--- + +## Snowflake Queries + +All query strings are either stored in `config/viz_config.json` (static queries) or built dynamically in `data/data_loader.py` (page-specific queries). 
+ +### Static queries (in `viz_config.json`) + +| Key | Purpose | +|-----|---------| +| `query` | Full query with all columns (legacy, kept for compatibility) | +| `dashboard_query` | Lightweight query — no text, no DIM_CONTENT join | +| `demographics_query` | Joins `usora_users` with `preprocessed.users` to get age/timezone/experience | + +### Dynamic queries (built in `data_loader.py`) + +| Method | Description | +|--------|-------------| +| `_build_sa_content_query()` | Content aggregation for SA page; filters by platform + brand + date | +| `_build_sa_comments_query()` | Sampled comments for SA page; uses `QUALIFY ROW_NUMBER() <= 50` | +| `_build_rr_query()` | Reply-required comments; filters by platform/brand/date; conditionally includes social media and/or musora table | + +### Data source tables + +| Table | Platform | Notes | +|-------|----------|-------| +| `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT` for `PERMALINK_URL` | +| `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively; platform stored as `'musora'`, mapped to `'musora_app'` in queries | + +--- + +## Adding or Changing Things + +### Add a new chart to the Dashboard +1. Write the chart function in the appropriate `visualizations/` file. +2. Call it from `render_dashboard()` in `components/dashboard.py`, passing `filtered_df`. +3. The chart function receives a lightweight df — it has no text columns but has all the columns listed in `dashboard_query`. + +### Add a new filter to the Dashboard sidebar +1. Add the widget in `app.py` under the "Global Filters" section. +2. Store the selected value in the `global_filters` dict under `st.session_state`. +3. Pass it to `data_loader.apply_filters()`. 
+ +### Change what the Sentiment Analysis page queries +- Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`. +- If you add new columns to the content aggregation result, also update `_process_sa_content_stats()` so they are available in `contents_df`. +- If you add new columns to the comments result, update `_process_sa_comments()`. + +### Change what the Reply Required page queries +- Edit `_build_rr_query()` in `data_loader.py`. +- Remember: all column references inside the social media block (which has a `JOIN`) must be prefixed with `s.` to avoid Snowflake ambiguity errors. + +### Change the cache duration +- `@st.cache_data(ttl=86400)` is set on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, and `load_demographics_data`. +- Change `86400` (seconds) to the desired TTL, or set `ttl=None` for no expiry. +- Users can always force a refresh with the "Reload Data" button in the sidebar (which calls `st.cache_data.clear()` and deletes `st.session_state['dashboard_df']`). + +### Add a new page +1. Create `components/new_page.py` with a `render_new_page(data_loader)` function. +2. Import and add a radio option in `app.py`. +3. If the page needs its own Snowflake data, add a `load_new_page_data()` method to `SentimentDataLoader` following the same pattern as `load_sa_data`. + +### Add a new column to the Dashboard query +- Edit `dashboard_query` in `config/viz_config.json`. +- Both UNION branches must select the same columns in the same order. +- `_process_dashboard_dataframe()` in `data_loader.py` handles basic type casting — add processing there if needed. 
+ +--- + +## Running the App + +```bash +# From the project root +streamlit run visualization/app.py +``` + +**Required environment variables** (in `.env` at project root): + +``` +SNOWFLAKE_USER +SNOWFLAKE_PASSWORD +SNOWFLAKE_ACCOUNT +SNOWFLAKE_ROLE +SNOWFLAKE_DATABASE +SNOWFLAKE_WAREHOUSE +SNOWFLAKE_SCHEMA +``` + +--- + +## Configuration Reference + +`config/viz_config.json` controls: + +| Section | What it configures | +|---------|-------------------| +| `color_schemes.sentiment_polarity` | Hex colors for each sentiment level | +| `color_schemes.intent` | Hex colors for each intent label | +| `color_schemes.platform` | Hex colors for each platform | +| `color_schemes.brand` | Hex colors for each brand | +| `sentiment_order` | Display order for sentiment categories in charts | +| `intent_order` | Display order for intent categories | +| `negative_sentiments` | Which sentiment values count as "negative" | +| `dashboard.default_date_range_days` | Default date filter window (days) | +| `dashboard.max_comments_display` | Max comments shown per pagination page | +| `dashboard.chart_height` | Default Plotly chart height | +| `dashboard.top_n_contents` | Default top-N for content ranking | +| `snowflake.query` | Full query (legacy, all columns) | +| `snowflake.dashboard_query` | Lightweight dashboard query (no text columns) | +| `snowflake.demographics_query` | Demographics join query | +| `demographics.age_groups` | Age bucket definitions (label → [min, max]) | +| `demographics.experience_groups` | Experience bucket definitions | +| `demographics.top_timezones_count` | How many timezones to show in the geographic chart | \ No newline at end of file diff --git a/visualization/SnowFlakeConnection.py b/visualization/SnowFlakeConnection.py new file mode 100644 index 0000000000000000000000000000000000000000..a52c89b81b3b07fa3cb0396ee6b78416aadeaf46 --- /dev/null +++ b/visualization/SnowFlakeConnection.py @@ -0,0 +1,150 @@ +""" +This class create a connection to Snowflake, 
run queries (read and write) +""" +import json +import os +from snowflake.snowpark import Session +from dotenv import load_dotenv +import logging +logger = logging.getLogger() +load_dotenv() + +class SnowFlakeConn: + def __init__(self): + self.session = self.connect_to_snowflake() + + + # ========================================================= + def connect_to_snowflake(self): + # --- Snowflake connection via env vars --- + # Validate all required credentials exist + required_credentials = [ + "SNOWFLAKE_USER", + "SNOWFLAKE_PASSWORD", + "SNOWFLAKE_ACCOUNT", + "SNOWFLAKE_ROLE", + "SNOWFLAKE_DATABASE", + "SNOWFLAKE_WAREHOUSE", + "SNOWFLAKE_SCHEMA" + ] + + missing_credentials = [] + for cred in required_credentials: + if not self.get_credential(cred): + missing_credentials.append(cred) + + if missing_credentials: + error_msg = f"Missing required Snowflake credentials: {', '.join(missing_credentials)}" + logger.error(error_msg) + raise ValueError(error_msg) + + conn = dict( + user=self.get_credential("SNOWFLAKE_USER"), + password=self.get_credential("SNOWFLAKE_PASSWORD"), + account=self.get_credential("SNOWFLAKE_ACCOUNT"), + role=self.get_credential("SNOWFLAKE_ROLE"), + database=self.get_credential("SNOWFLAKE_DATABASE"), + warehouse=self.get_credential("SNOWFLAKE_WAREHOUSE"), + schema=self.get_credential("SNOWFLAKE_SCHEMA"), + ) + + try: + session = Session.builder.configs(conn).create() + logger.info("Successfully connected to Snowflake") + return session + except Exception as e: + logger.error(f"Failed to connect to Snowflake: {e}") + raise + + # ========================================================= + def get_credential(self, key): + return os.getenv(key) + + # ========================================================= + def run_read_query(self, query, data): + """ + Executes a SQL query on Snowflake that fetch the data + :return: Pandas dataframe containing the query results + """ + + # Connect to Snowflake + try: + dataframe = 
self.session.sql(query).to_pandas() + dataframe.columns = dataframe.columns.str.lower() + print(f"reading {data} table successfully") + return dataframe + except Exception as e: + error_msg = f"Error reading {data}: {e}" + print(error_msg) + logger.error(error_msg) + raise + + # ========================================================= + def store_df_to_snowflake(self, table_name, dataframe, database="SOCIAL_MEDIA_DB", schema="ML_FEATURES", overwrite=False): + """ + Executes a SQL query on Snowflake that write the preprocessed data on new tables + :param query: SQL query string to be executed + :return: None + """ + + try: + self.session.use_database(database) + self.session.use_schema(schema) + + dataframe = dataframe.reset_index(drop=True) + dataframe.columns = dataframe.columns.str.upper() + + self.session.write_pandas(df=dataframe, + table_name=table_name.strip().upper(), + auto_create_table=True, + overwrite=overwrite, + use_logical_type=True) + print(f"Data inserted into {table_name} successfully.") + + except Exception as e: + print(f"Error in creating/updating/inserting table: {e}") + + # ========================================================= + def execute_sql_file(self, file_path): + """ + Executes SQL queries from a file + :param file_path: Path to SQL file + :return: Query result or None for DDL/DML + """ + try: + with open(file_path, 'r', encoding='utf-8') as file: + sql_content = file.read() + + result = self.session.sql(sql_content).collect() + print(f"Successfully executed SQL from {file_path}") + return result + except Exception as e: + print(f"Error executing SQL file {file_path}: {e}") + return None + + # ========================================================= + def execute_query(self, query, description="query"): + """ + Executes a SQL query and returns results + :param query: SQL query string + :param description: Description of the query for logging + :return: Query results + """ + try: + result = self.session.sql(query).collect() + 
print(f"Successfully executed {description}") + return result + except Exception as e: + print(f"Error executing {description}: {e}") + return None + + + # ========================================================= + def get_data(self, data): + # get any sort of data based on requirement --> comments, contents, etc + pass + + # ========================================================= + def close_connection(self): + self.session.close() + diff --git a/visualization/agents/README.md b/visualization/agents/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5b9e17ac02504a843319ad009f99efc5eb765874 --- /dev/null +++ b/visualization/agents/README.md @@ -0,0 +1,320 @@ +# Visualization Agents + +## Overview +This folder contains AI-powered agents that enhance the sentiment analysis dashboard with intelligent, context-aware insights and analysis capabilities. + +## Architecture + +### Base Agent Pattern +All agents inherit from `BaseVisualizationAgent` which provides: +- Common interface (`process()`, `validate_input()`) +- Error handling +- Logging functionality +- Consistent configuration + +### LLM Helper +`utils/llm_helper.py` provides: +- OpenAI API integration +- Retry logic with exponential backoff +- JSON mode support +- Token usage tracking + +## Available Agents + +### 1. 
ContentSummaryAgent + +**Purpose**: Analyze and summarize comments for content pieces + +**Location**: `agents/content_summary_agent.py` + +**Input**: +```python +{ + 'content_sk': str, # Content identifier + 'content_description': str, # Content title/description + 'comments': DataFrame or list # Comments data +} +``` + +**Output**: +```python +{ + 'success': bool, + 'content_sk': str, + 'summary': { + 'executive_summary': str, # 2-3 sentence overview + 'main_themes': [ # Top themes discussed + { + 'theme': str, + 'sentiment': str, # positive/negative/mixed + 'description': str + } + ], + 'praise_points': [str], # What users love + 'key_complaints': [str], # Main concerns + 'frequently_asked_questions': [str], # Common questions + 'unexpected_insights': [str], # Surprising patterns + 'action_recommendations': [ # Suggested actions + { + 'priority': str, # high/medium/low + 'action': str + } + ] + }, + 'metadata': { + 'total_comments_analyzed': int, + 'model_used': str, + 'tokens_used': int + } +} +``` + +**Configuration**: +- Model: `gpt-5-nano` (configurable) +- Temperature: 0.3 (lower for focused summaries) +- Sampling: All negative comments + up to 50 positive/neutral (if >100 total) + +**Features**: +- **Smart sampling**: Prioritizes negative comments, samples others +- **Context preservation**: Includes sentiment and intent metadata +- **Token optimization**: Truncates long comments to 300 chars +- **Structured output**: JSON format with guaranteed fields +- **Error handling**: Graceful failures with retry capability + +## UI Integration + +### Poor Sentiment Contents Page + +**Location**: `components/poor_sentiment_contents.py` + +**User Flow**: +1. User views content cards on Poor Sentiment Contents page +2. Clicks "🔍 Generate AI Analysis" button +3. Agent processes comments (with spinner indicator) +4. Summary displays in expandable section +5. 
Result cached in session state + +**Display Sections**: +- **Executive Summary**: High-level overview (info box) +- **Main Themes**: Key topics with sentiment indicators +- **Praise Points** ✅ & **Key Complaints** ⚠️ (side-by-side) +- **FAQs** ❓ & **Unexpected Insights** 💡 (side-by-side) +- **Recommended Actions** 🎯 (priority-coded) +- **Analysis Metadata** ℹ️ (expandable details) + +**Session Caching**: +- Summaries stored in `st.session_state.content_summaries` +- Key: `content_sk` +- Persists during session, cleared on page reload +- Prevents redundant API calls + +## Usage Example + +```python +from agents.content_summary_agent import ContentSummaryAgent +import pandas as pd + +# Initialize agent +agent = ContentSummaryAgent(model="gpt-5-nano", temperature=0.3) + +# Prepare input +input_data = { + 'content_sk': '12345', + 'content_description': 'Advanced Drum Fills Tutorial', + 'comments': comments_df # DataFrame with comments +} + +# Generate summary +result = agent.process(input_data) + +if result['success']: + summary = result['summary'] + print(summary['executive_summary']) + + for theme in summary['main_themes']: + print(f"Theme: {theme['theme']} ({theme['sentiment']})") + print(f" {theme['description']}") +else: + print(f"Error: {result['error']}") +``` + +## Environment Setup + +### Required Environment Variables +Add to `.env` file (parent directory): +```bash +OPENAI_API_KEY=your_openai_api_key_here +``` + +### Dependencies +All dependencies already in `visualization/requirements.txt`: +- `streamlit>=1.28.0` +- `pandas>=2.0.0` +- `python-dotenv>=1.0.0` +- OpenAI library (inherited from parent project) + +## Error Handling + +### Agent-Level Errors +- **Invalid input**: Returns `{'success': False, 'error': 'Invalid input data'}` +- **LLM API failure**: Retries up to 3 times with exponential backoff +- **JSON parsing error**: Returns error with raw content +- **Exception**: Catches all exceptions, logs, returns error dict + +### UI-Level Errors +- 
Displays error message in red box +- Provides "🔄 Retry Analysis" button +- Clears cache and regenerates on retry +- Logs errors to agent logger + +## Performance Considerations + +### API Costs +- Model: `gpt-5-nano` (cost-effective) +- Sampling strategy: Reduces tokens by up to 50% for large comment sets +- Comment truncation: Max 300 chars per comment +- Session caching: Eliminates duplicate API calls + +### Response Time +- Average: 5-10 seconds for 50-100 comments +- Depends on: Comment count, OpenAI API latency +- User feedback: Spinner shows "Analyzing comments with AI..." + +### Scalability +- Handles up to 100 comments per analysis (after sampling) +- Parallel requests: Each content analyzed independently +- Session state: Memory usage scales with number of analyzed contents + +## Extending Agents + +### Adding New Agents + +1. **Create agent file**: +```python +# agents/new_agent.py +from agents.base_agent import BaseVisualizationAgent +from utils.llm_helper import LLMHelper + +class NewAgent(BaseVisualizationAgent): + def __init__(self, model="gpt-5-nano", temperature=0.7): + super().__init__(name="NewAgent", model=model, temperature=temperature) + self.llm_helper = LLMHelper(model=model, temperature=temperature) + + def validate_input(self, input_data): + # Validation logic + return True + + def process(self, input_data): + # Processing logic + pass +``` + +2. **Update `__init__.py`**: +```python +from .new_agent import NewAgent + +__all__ = ['ContentSummaryAgent', 'NewAgent'] +``` + +3. **Integrate in UI**: +- Import agent in component file +- Add UI controls (buttons, inputs) +- Display results +- Handle caching if needed + +### Best Practices + +1. **Input Validation**: Always validate required fields +2. **Error Handling**: Use `handle_error()` method +3. **Logging**: Use `log_processing()` for debugging +4. **Structured Output**: Return consistent dict format +5. **Caching**: Use session state for expensive operations +6. 
**Token Optimization**: Sample/truncate data for large inputs +7. **User Feedback**: Show spinners for async operations +8. **Graceful Degradation**: Provide fallbacks for failures + +## Testing + +### Manual Testing +1. Start dashboard: `streamlit run app.py` +2. Navigate to "⚠️ Poor Sentiment Contents" page +3. Click "🔍 Generate AI Analysis" for any content +4. Verify summary displays correctly +5. Check session caching (click button again) +6. Test error handling (disconnect network) + +### Unit Testing +```python +# tests/test_content_summary_agent.py +import pytest +from agents.content_summary_agent import ContentSummaryAgent + +def test_validate_input(): + agent = ContentSummaryAgent() + + # Valid input + valid_input = { + 'content_sk': '123', + 'content_description': 'Test', + 'comments': [] + } + assert agent.validate_input(valid_input) == True + + # Missing field + invalid_input = {'content_sk': '123'} + assert agent.validate_input(invalid_input) == False +``` + +## Future Enhancements + +### Planned Features +1. **Batch Analysis**: Analyze multiple contents at once +2. **Trend Detection**: Compare with historical summaries +3. **Export Summaries**: Download as PDF/CSV +4. **Custom Prompts**: User-defined analysis focus +5. 
**Multi-language Support**: Summaries in user's language + +### Additional Agents (Roadmap) +- **InsightsSummaryAgent**: Overall dataset insights +- **InteractiveChatbotAgent**: Conversational analysis +- **ComparativeContentAgent**: Content comparison +- **ReplySuggestionAgent**: Generate reply suggestions +- **TrendForecastingAgent**: Predict sentiment trends + +## Troubleshooting + +### Common Issues + +**Issue**: `OPENAI_API_KEY not found` +- **Solution**: Add key to `.env` file in parent directory + +**Issue**: Import error for `agents` module +- **Solution**: Ensure `__init__.py` exists in `visualization/agents/` + +**Issue**: LLM timeout errors +- **Solution**: Reduce comment count or increase retry limit + +**Issue**: JSON parsing errors +- **Solution**: Check LLM prompt format, ensure JSON mode enabled + +**Issue**: Cached summaries not showing +- **Solution**: Check `st.session_state.content_summaries` initialization + +## Support + +For issues or questions: +1. Check this README +2. Review agent logs in console +3. Inspect session state in Streamlit +4. Verify environment variables +5. 
class BaseVisualizationAgent(ABC):
    """
    Abstract base class for all visualization agents.

    Concrete agents must implement :meth:`process` and :meth:`validate_input`;
    this base class supplies the shared constructor, a namespaced logger, and
    consistent logging / error-reporting helpers.
    """

    def __init__(self, name: str, model: str = "gpt-5-nano", temperature: float = 0.7):
        """
        Initialize the base agent.

        Args:
            name: Agent name (also used to namespace the logger)
            model: LLM model to use
            temperature: LLM temperature
        """
        self.name = name
        self.model = model
        self.temperature = temperature
        # One logger per agent under the "visualization.agents" hierarchy
        self.logger = logging.getLogger(f"visualization.agents.{name}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run the agent on the given input and return its results.

        Args:
            input_data: Input data dictionary

        Returns:
            Results dictionary
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Check whether the given input is acceptable for :meth:`process`.

        Args:
            input_data: Input data dictionary

        Returns:
            True if valid, False otherwise
        """
        pass

    def log_processing(self, message: str, level: str = "info"):
        """
        Emit a log record tagged with this agent's name.

        Args:
            message: Log message
            level: Log level name (info, warning, error); unknown levels
                fall back to info
        """
        emit = getattr(self.logger, level.lower(), None)
        if emit is None:
            emit = self.logger.info
        emit(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        """
        Log an exception and convert it into a uniform error-response dict.

        Args:
            error: Exception that occurred
            context: Additional context information (omitted from the log
                line when empty)

        Returns:
            Error response dictionary with ``success``, ``error``,
            ``error_type`` and ``context`` keys
        """
        parts = [f"Error in {self.name}: {error}"]
        if context:
            parts.append(f"Context: {context}")
        self.log_processing(" | ".join(parts), level="error")

        return {
            'success': False,
            'error': str(error),
            'error_type': type(error).__name__,
            'context': context,
        }
class ContentSummaryAgent(BaseVisualizationAgent):
    """
    Agent that analyzes and summarizes comments for a piece of content.

    Extracts themes, praise points, complaints, FAQs, and insights from a
    set of comments, asking the LLM for a structured JSON summary.
    """

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
        """
        Initialize Content Summary Agent

        Args:
            model: LLM model to use
            temperature: Temperature for generation (lower for more focused summaries)
        """
        super().__init__(name="ContentSummaryAgent", model=model, temperature=temperature)
        self.llm_helper = LLMHelper(model=model, temperature=temperature)

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        required_fields = ['content_sk', 'content_description', 'comments']

        for field in required_fields:
            if field not in input_data:
                self.log_processing(f"Missing required field: {field}", level="error")
                return False

        if not isinstance(input_data['comments'], (list, pd.DataFrame)):
            self.log_processing("Comments must be a list or DataFrame", level="error")
            return False

        return True

    def _prepare_comments_context(self, comments: Any, sentiment_type: str = 'negative') -> str:
        """
        Prepare comments data for LLM analysis.

        Args:
            comments: Comments as DataFrame or list of dicts
            sentiment_type: Type of sentiment to analyze
                ('negative', 'positive', 'combined')

        Returns:
            Formatted string with one entry per comment (text truncated to
            300 chars, plus its sentiment and intent labels)
        """
        # Convert to DataFrame if needed
        if isinstance(comments, list):
            comments_df = pd.DataFrame(comments)
        else:
            comments_df = comments.copy()

        # Filter based on sentiment type
        if sentiment_type == 'negative':
            comments_df = comments_df[
                comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])
            ]
        elif sentiment_type == 'positive':
            comments_df = comments_df[
                comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])
            ]
        # else: combined - use all comments

        # Limit to a reasonable number of comments for the API call
        if len(comments_df) > 100:
            if sentiment_type == 'combined':
                # For combined: sample up to 50 comments from each polarity
                # bucket so the LLM sees a balanced mix. random_state pinned
                # for deterministic sampling (and stable session caching).
                negative_pool = comments_df[
                    comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])
                ]
                positive_pool = comments_df[
                    comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])
                ]
                comments_df = pd.concat([
                    negative_pool.sample(n=min(50, len(negative_pool)), random_state=42),
                    positive_pool.sample(n=min(50, len(positive_pool)), random_state=42),
                ])
            else:
                # Single sentiment type: plain random sample
                comments_df = comments_df.sample(n=min(100, len(comments_df)), random_state=42)

        # Format comments for analysis.
        # FIX: number entries sequentially via enumerate — the previous code
        # used the DataFrame index label (idx + 1), which is non-sequential
        # after filtering/sampling and raises TypeError for non-integer
        # (e.g. string) indexes.
        # FIX: coerce the comment text to str and guard NaN before slicing so
        # a missing display_text/original_text value cannot crash formatting.
        comments_text = []
        for position, (_, row) in enumerate(comments_df.iterrows(), start=1):
            raw_text = row.get('display_text', row.get('original_text', ''))
            text = '' if pd.isna(raw_text) else str(raw_text)
            sentiment = row.get('sentiment_polarity', 'unknown')
            intent = row.get('intent', 'unknown')

            comment_entry = f"""
Comment #{position}:
- Text: {text[:300]}{'...' if len(text) > 300 else ''}
- Sentiment: {sentiment}
- Intent: {intent}
"""
            comments_text.append(comment_entry)

        return "\n".join(comments_text)

    def _generate_summary_prompt(
        self,
        content_description: str,
        comments_context: str,
        total_comments: int,
        sentiment_type: str = 'negative'
    ) -> str:
        """
        Generate prompt for LLM

        Args:
            content_description: Description of the content
            comments_context: Formatted comments
            total_comments: Total number of comments
            sentiment_type: Type of sentiment being analyzed
                ('negative', 'positive', 'combined')

        Returns:
            Prompt string
        """
        # Customize prompt based on sentiment type
        if sentiment_type == 'negative':
            focus_instruction = "Focus on understanding negative feedback, complaints, and issues that need attention."
        elif sentiment_type == 'positive':
            focus_instruction = "Focus on understanding what users love, praise points, and successful elements that should be maintained or amplified."
        else:  # combined
            focus_instruction = "Provide a balanced analysis covering both positive feedback and areas for improvement."

        prompt = f"""Analyze the {sentiment_type} comments below for the following content and provide a brief executive summary.

**Content:** {content_description}

**Total Comments Analyzed:** {total_comments}

**Analysis Focus:** {focus_instruction}

**Comments to Analyze:**
{comments_context}

**Task:** Provide a concise executive summary in JSON format with the following structure:

{{
    "executive_summary": "2-3 sentence high-level overview focusing on {sentiment_type} sentiment",
    "main_themes": [
        {{
            "theme": "theme name",
            "sentiment": "positive/negative/mixed",
            "description": "brief description"
        }}
    ],
    "praise_points": ["point 1", "point 2", "point 3"],
    "key_complaints": ["complaint 1", "complaint 2", "complaint 3"],
    "frequently_asked_questions": ["question 1", "question 2"],
    "unexpected_insights": ["insight 1", "insight 2"],
    "action_recommendations": [
        {{
            "priority": "high/medium/low",
            "action": "recommended action"
        }}
    ]
}}

**Guidelines:**
- Be concise and actionable
- Focus on the most important insights from {sentiment_type} comments
- Limit each list to top 3-5 items
- If a section has no relevant items, use an empty list
- Executive summary should capture the overall patterns and key takeaways
"""
        return prompt

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process comments and generate summary

        Args:
            input_data: {
                'content_sk': content identifier,
                'content_description': content title/description,
                'comments': DataFrame or list of comment dicts,
                'sentiment_type': 'negative', 'positive', or 'combined'
                    (optional, defaults to 'negative')
            }

        Returns:
            {
                'success': bool,
                'content_sk': str,
                'sentiment_type': str,
                'summary': {
                    'executive_summary': str,
                    'main_themes': list,
                    'praise_points': list,
                    'key_complaints': list,
                    'frequently_asked_questions': list,
                    'unexpected_insights': list,
                    'action_recommendations': list
                },
                'metadata': {
                    'total_comments_analyzed': int,
                    'model_used': str,
                    'tokens_used': int
                }
            }
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    'success': False,
                    'error': 'Invalid input data',
                    'content_sk': input_data.get('content_sk', 'unknown')
                }

            content_sk = input_data['content_sk']
            content_description = input_data['content_description']
            comments = input_data['comments']
            # Default to negative for backward compatibility
            sentiment_type = input_data.get('sentiment_type', 'negative')

            self.log_processing(f"Starting {sentiment_type} analysis for content: {content_sk}")

            # Convert to DataFrame if needed
            if isinstance(comments, list):
                comments_df = pd.DataFrame(comments)
            else:
                comments_df = comments.copy()

            total_comments = len(comments_df)

            # No comments at all: return a successful, empty summary rather
            # than calling the LLM
            if total_comments == 0:
                return {
                    'success': True,
                    'content_sk': content_sk,
                    'sentiment_type': sentiment_type,
                    'summary': {
                        'executive_summary': 'No comments available for analysis.',
                        'main_themes': [],
                        'praise_points': [],
                        'key_complaints': [],
                        'frequently_asked_questions': [],
                        'unexpected_insights': [],
                        'action_recommendations': []
                    },
                    'metadata': {
                        'total_comments_analyzed': 0,
                        'model_used': self.model,
                        'tokens_used': 0
                    }
                }

            # Prepare comments context based on sentiment type
            comments_context = self._prepare_comments_context(comments_df, sentiment_type)

            # Count the comments that survive the sentiment filter
            if sentiment_type == 'negative':
                filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['negative', 'very_negative'])])
            elif sentiment_type == 'positive':
                filtered_count = len(comments_df[comments_df['sentiment_polarity'].isin(['positive', 'very_positive'])])
            else:
                filtered_count = total_comments

            # Nothing matched the requested polarity: empty summary, no LLM call
            if filtered_count == 0:
                return {
                    'success': True,
                    'content_sk': content_sk,
                    'sentiment_type': sentiment_type,
                    'summary': {
                        'executive_summary': f'No {sentiment_type} comments available for analysis.',
                        'main_themes': [],
                        'praise_points': [],
                        'key_complaints': [],
                        'frequently_asked_questions': [],
                        'unexpected_insights': [],
                        'action_recommendations': []
                    },
                    'metadata': {
                        'total_comments_analyzed': 0,
                        'model_used': self.model,
                        'tokens_used': 0
                    }
                }

            # Generate prompt
            prompt = self._generate_summary_prompt(
                content_description,
                comments_context,
                filtered_count,
                sentiment_type
            )

            # System message
            system_message = """You are an expert social media analyst specializing in
sentiment analysis and community insights. Provide concise, actionable summaries
that help content creators understand their audience feedback."""

            # Get LLM response
            self.log_processing(f"Calling LLM for {sentiment_type} summary generation")
            response = self.llm_helper.get_structured_completion(
                prompt=prompt,
                system_message=system_message,
                max_retries=3
            )

            if not response['success']:
                return self.handle_error(
                    Exception(response.get('error', 'LLM call failed')),
                    context=f"content_sk={content_sk}, sentiment_type={sentiment_type}"
                )

            # Extract summary
            summary = response['content']

            # Ensure all expected fields exist so the UI never hits a KeyError
            default_summary = {
                'executive_summary': '',
                'main_themes': [],
                'praise_points': [],
                'key_complaints': [],
                'frequently_asked_questions': [],
                'unexpected_insights': [],
                'action_recommendations': []
            }

            # Merge with defaults
            for key in default_summary:
                if key not in summary:
                    summary[key] = default_summary[key]

            self.log_processing(f"Successfully generated {sentiment_type} summary for content: {content_sk}")

            # NOTE(review): assumes a successful LLMHelper response always
            # carries 'model' and 'usage'['total_tokens'] — confirm against
            # utils/llm_helper.py
            return {
                'success': True,
                'content_sk': content_sk,
                'sentiment_type': sentiment_type,
                'summary': summary,
                'metadata': {
                    'total_comments_analyzed': filtered_count,
                    'model_used': response['model'],
                    'tokens_used': response['usage']['total_tokens']
                }
            }

        except Exception as e:
            return self.handle_error(
                e,
                context=f"content_sk={input_data.get('content_sk', 'unknown')}, sentiment_type={input_data.get('sentiment_type', 'negative')}"
            )
+if not check_authentication(): + render_login_page() + +# ── Single data-loader instance (cheap: just reads config) ──────────────────── +data_loader = SentimentDataLoader() + + +def _ensure_dashboard_data(): + """ + Load dashboard data once and store in session_state. + Subsequent calls within the same session (or until cache expires) are free. + """ + if 'dashboard_df' not in st.session_state or st.session_state['dashboard_df'] is None: + with st.spinner("Loading dashboard data…"): + df = data_loader.load_dashboard_data() + st.session_state['dashboard_df'] = df + return st.session_state['dashboard_df'] + + +def main(): + # ── Sidebar ─────────────────────────────────────────────────────────────── + with st.sidebar: + st.image("visualization/img/musora.png", use_container_width=True) + + # User info + logout + current_user = get_current_user() + if current_user: + st.caption(f"Logged in as **{current_user}**") + if st.button("🔓 Logout", use_container_width=True): + logout() + st.rerun() + + st.markdown("---") + st.title("Navigation") + + page = st.radio( + "Select Page", + ["📊 Dashboard", "🔍 Sentiment Analysis", "💬 Reply Required"], + index=0 + ) + + st.markdown("---") + st.markdown("### 🔍 Global Filters") + + # Load / retrieve dashboard data for filter options + dashboard_df = _ensure_dashboard_data() + + if dashboard_df.empty: + st.error("No data available. 
Please check your Snowflake connection.") + return + + filter_options = data_loader.get_filter_options(dashboard_df) + + # Restore previous filter values from session_state so widgets keep state + prev = st.session_state.get('global_filters', {}) + + selected_platforms = st.multiselect( + "Platforms", + options=filter_options['platforms'], + default=prev.get('platforms', []) + ) + selected_brands = st.multiselect( + "Brands", + options=filter_options['brands'], + default=prev.get('brands', []) + ) + selected_sentiments = st.multiselect( + "Sentiments", + options=filter_options['sentiments'], + default=prev.get('sentiments', []) + ) + + # Date range filter + if 'comment_timestamp' in dashboard_df.columns and not dashboard_df.empty: + min_date = dashboard_df['comment_timestamp'].min().date() + max_date = dashboard_df['comment_timestamp'].max().date() + + prev_range = prev.get('date_range') + default_range = ( + (prev_range[0], prev_range[1]) if prev_range and len(prev_range) == 2 + else (min_date, max_date) + ) + date_range = st.date_input( + "Date Range", + value=default_range, + min_value=min_date, + max_value=max_date + ) + else: + date_range = None + + # Apply / Reset + if st.button("🔍 Apply Filters", use_container_width=True): + st.session_state['global_filters'] = { + 'platforms': selected_platforms, + 'brands': selected_brands, + 'sentiments': selected_sentiments, + 'date_range': date_range if date_range and len(date_range) == 2 else None, + } + st.session_state['filters_applied'] = True + + if st.button("🔄 Reset Filters", use_container_width=True): + st.session_state['global_filters'] = {} + st.session_state['filters_applied'] = False + st.rerun() + + st.markdown("---") + + # Data management + st.markdown("### 🔄 Data Management") + if st.button("♻️ Reload Data", use_container_width=True): + st.cache_data.clear() + st.session_state.pop('dashboard_df', None) + st.rerun() + + # Data info + st.markdown("---") + st.markdown("### ℹ️ Data Info") + st.info(f"**Total 
Records:** {len(dashboard_df):,}") + if 'processed_at' in dashboard_df.columns and not dashboard_df.empty: + last_update = dashboard_df['processed_at'].max() + if hasattr(last_update, 'strftime'): + st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}") + + # ── Build filtered dashboard_df for the Dashboard page ─────────────────── + filters_applied = st.session_state.get('filters_applied', False) + global_filters = st.session_state.get('global_filters', {}) + + if filters_applied and global_filters: + filtered_df = data_loader.apply_filters( + dashboard_df, + platforms=global_filters.get('platforms') or None, + brands=global_filters.get('brands') or None, + sentiments=global_filters.get('sentiments') or None, + date_range=global_filters.get('date_range') or None, + ) + if filtered_df.empty: + st.warning("No data matches the selected filters. Please adjust your filters.") + return + st.info(f"Showing **{len(filtered_df):,}** records after applying filters") + else: + filtered_df = dashboard_df + + # ── Render selected page ────────────────────────────────────────────────── + if page == "📊 Dashboard": + render_dashboard(filtered_df) + + elif page == "🔍 Sentiment Analysis": + # SA page fetches its own data on demand; receives only data_loader + render_sentiment_analysis(data_loader) + + elif page == "💬 Reply Required": + # RR page fetches its own data on demand; receives only data_loader + render_reply_required(data_loader) + + # ── Footer ──────────────────────────────────────────────────────────────── + st.markdown("---") + st.markdown( + """ +
+

Musora Sentiment Analysis Dashboard v1.0

+

Powered by Streamlit | Data from Snowflake

+
+ """, + unsafe_allow_html=True + ) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + st.error(f"An error occurred: {str(e)}") + st.exception(e) \ No newline at end of file diff --git a/visualization/components/dashboard.py b/visualization/components/dashboard.py new file mode 100644 index 0000000000000000000000000000000000000000..b79a70c69e08852519ccb5ee6d785ee0d05171b0 --- /dev/null +++ b/visualization/components/dashboard.py @@ -0,0 +1,583 @@ +""" +Main Dashboard Page +Displays overall sentiment distributions by brand and platform +""" +import streamlit as st +import sys +from pathlib import Path + +# Add parent directory to path +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +from utils.data_processor import SentimentDataProcessor +from utils.metrics import SentimentMetrics +from utils.pdf_exporter import DashboardPDFExporter +from visualizations.sentiment_charts import SentimentCharts +from visualizations.distribution_charts import DistributionCharts +from visualizations.demographic_charts import DemographicCharts +from visualizations.content_cards import ContentCards + + +def render_dashboard(df): + """ + Render the main dashboard page + + Args: + df: Sentiment dataframe + """ + st.title("📊 Sentiment Analysis Dashboard") + + # ── PDF Report ──────────────────────────────────────────────────────────── + with st.expander("📄 Export PDF Report", expanded=False): + st.markdown( + "Generate a comprehensive PDF report of the current dashboard view. " + "The report includes all charts, metrics, and a data summary. " + "Active global filters are reflected in the report." 
+ ) + if st.button("Generate PDF Report", type="primary", use_container_width=True): + with st.spinner("Generating PDF report — this may take 30–60 seconds…"): + try: + # Build a human-readable description of active filters + global_filters = st.session_state.get("global_filters", {}) + filter_info = {} + if global_filters.get("platforms"): + filter_info["Platforms"] = global_filters["platforms"] + if global_filters.get("brands"): + filter_info["Brands"] = global_filters["brands"] + if global_filters.get("sentiments"): + filter_info["Sentiments"] = global_filters["sentiments"] + if global_filters.get("date_range"): + dr = global_filters["date_range"] + filter_info["Date Range"] = f"{dr[0]} to {dr[1]}" + + exporter = DashboardPDFExporter() + pdf_bytes = exporter.generate_report(df, filter_info or None) + + filename = ( + f"musora_sentiment_report_" + f"{__import__('datetime').datetime.now().strftime('%Y%m%d_%H%M')}.pdf" + ) + st.success("Report generated successfully!") + st.download_button( + label="Download PDF Report", + data=pdf_bytes, + file_name=filename, + mime="application/pdf", + use_container_width=True, + ) + except Exception as e: + st.error(f"Failed to generate report: {e}") + st.exception(e) + + st.markdown("---") + + # Performance tip + if len(df) > 10000: + st.info(f"💡 **Performance Tip**: Loaded {len(df):,} comments. 
Use the global filters in the sidebar to narrow down your analysis for faster performance.") + + st.markdown("---") + + # Initialize components + sentiment_charts = SentimentCharts() + distribution_charts = DistributionCharts() + processor = SentimentDataProcessor() + + # Display overall summary statistics + ContentCards.display_summary_stats(df) + + st.markdown("---") + + # Calculate overall metrics + overall_metrics = SentimentMetrics.calculate_overall_metrics(df) + + # Display health indicator + col1, col2, col3 = st.columns([1, 2, 1]) + with col2: + ContentCards.display_health_indicator(overall_metrics['negative_pct']) + + st.markdown("---") + + # Overall sentiment distribution + st.markdown("## 🎯 Overall Sentiment Distribution") + + col1, col2 = st.columns(2) + + with col1: + # Sentiment pie chart + sentiment_pie = sentiment_charts.create_sentiment_pie_chart(df, title="Overall Sentiment Distribution") + st.plotly_chart(sentiment_pie, use_container_width=True) + + with col2: + # Sentiment score gauge + sentiment_gauge = sentiment_charts.create_sentiment_score_gauge( + overall_metrics['avg_sentiment_score'], + title="Overall Sentiment Score" + ) + st.plotly_chart(sentiment_gauge, use_container_width=True) + + # Additional metrics + metric_col1, metric_col2 = st.columns(2) + with metric_col1: + st.metric("Positive %", f"{overall_metrics['positive_pct']:.1f}%") + with metric_col2: + st.metric("Reply Rate %", f"{overall_metrics['reply_required_pct']:.1f}%") + + st.markdown("---") + + # Sentiment by Brand + st.markdown("## 🏢 Sentiment Analysis by Brand") + + col1, col2 = st.columns(2) + + with col1: + # Stacked bar chart + brand_sentiment_bar = sentiment_charts.create_sentiment_bar_chart( + df, group_by='brand', title="Sentiment Distribution by Brand" + ) + st.plotly_chart(brand_sentiment_bar, use_container_width=True) + + with col2: + # Percentage bar chart + brand_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart( + df, group_by='brand', 
title="Sentiment Distribution by Brand (%)" + ) + st.plotly_chart(brand_sentiment_pct, use_container_width=True) + + # Brand metrics table + with st.expander("📈 Detailed Brand Metrics"): + brand_metrics = SentimentMetrics.calculate_brand_metrics(df) + + brand_data = [] + for brand, metrics in brand_metrics.items(): + brand_data.append({ + 'Brand': brand.title(), + 'Total Comments': metrics['total_comments'], + 'Replies Needed': metrics['total_reply_required'], + 'Negative %': f"{metrics['negative_pct']:.1f}%", + 'Positive %': f"{metrics['positive_pct']:.1f}%", + 'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}" + }) + + st.table(brand_data) + + st.markdown("---") + + # Sentiment by Platform + st.markdown("## 🌐 Sentiment Analysis by Platform") + + col1, col2 = st.columns(2) + + with col1: + # Stacked bar chart + platform_sentiment_bar = sentiment_charts.create_sentiment_bar_chart( + df, group_by='platform', title="Sentiment Distribution by Platform" + ) + st.plotly_chart(platform_sentiment_bar, use_container_width=True) + + with col2: + # Percentage bar chart + platform_sentiment_pct = sentiment_charts.create_sentiment_percentage_bar_chart( + df, group_by='platform', title="Sentiment Distribution by Platform (%)" + ) + st.plotly_chart(platform_sentiment_pct, use_container_width=True) + + # Platform metrics table + with st.expander("📈 Detailed Platform Metrics"): + platform_metrics = SentimentMetrics.calculate_platform_metrics(df) + + platform_data = [] + for platform, metrics in platform_metrics.items(): + platform_data.append({ + 'Platform': platform.title(), + 'Total Comments': metrics['total_comments'], + 'Replies Needed': metrics['total_reply_required'], + 'Negative %': f"{metrics['negative_pct']:.1f}%", + 'Positive %': f"{metrics['positive_pct']:.1f}%", + 'Avg Sentiment Score': f"{metrics['avg_sentiment_score']:.2f}" + }) + + st.table(platform_data) + + st.markdown("---") + + # Intent Analysis + st.markdown("## 🎭 Intent Analysis") + + col1, col2 = 
st.columns(2) + + with col1: + # Intent bar chart + intent_bar = distribution_charts.create_intent_bar_chart( + df, title="Intent Distribution", orientation='h' + ) + st.plotly_chart(intent_bar, use_container_width=True) + + with col2: + # Intent pie chart + intent_pie = distribution_charts.create_intent_pie_chart(df, title="Intent Distribution") + st.plotly_chart(intent_pie, use_container_width=True) + + st.markdown("---") + + # Brand-Platform Matrix + st.markdown("## 🔀 Cross-Dimensional Analysis") + + col1, col2 = st.columns(2) + + with col1: + # Heatmap showing comment distribution + brand_platform_matrix = distribution_charts.create_brand_platform_matrix( + df, title="Brand-Platform Comment Matrix" + ) + st.plotly_chart(brand_platform_matrix, use_container_width=True) + + with col2: + # Sentiment heatmap + sentiment_heatmap = sentiment_charts.create_sentiment_heatmap( + df, row_dimension='brand', col_dimension='platform', title="Negative Sentiment Heatmap" + ) + st.plotly_chart(sentiment_heatmap, use_container_width=True) + + st.markdown("---") + + # Platform and Brand Distribution + st.markdown("## 📊 Volume Analysis") + + col1, col2 = st.columns(2) + + with col1: + # Platform distribution + platform_dist = distribution_charts.create_platform_distribution(df, title="Comments by Platform") + st.plotly_chart(platform_dist, use_container_width=True) + + with col2: + # Brand distribution + brand_dist = distribution_charts.create_brand_distribution(df, title="Comments by Brand") + st.plotly_chart(brand_dist, use_container_width=True) + + st.markdown("---") + + # Reply Requirements + st.markdown("## ⚠️ Reply Requirements Analysis") + + col1, col2 = st.columns(2) + + with col1: + # Reply required by brand + reply_brand = distribution_charts.create_reply_required_chart( + df, group_by='brand', title="Comments Requiring Reply by Brand" + ) + st.plotly_chart(reply_brand, use_container_width=True) + + with col2: + # Reply required by platform + reply_platform = 
distribution_charts.create_reply_required_chart( + df, group_by='platform', title="Comments Requiring Reply by Platform" + ) + st.plotly_chart(reply_platform, use_container_width=True) + + # Response urgency metrics + urgency_metrics = SentimentMetrics.calculate_response_urgency(df) + + st.markdown("### 🚨 Response Urgency Breakdown") + urgency_col1, urgency_col2, urgency_col3, urgency_col4 = st.columns(4) + + with urgency_col1: + st.metric("🔴 Urgent", urgency_metrics['urgent_count'], help="Negative sentiment + requires reply") + + with urgency_col2: + st.metric("🟠 High Priority", urgency_metrics['high_priority_count'], help="Neutral with feedback/request") + + with urgency_col3: + st.metric("🟡 Medium Priority", urgency_metrics['medium_priority_count'], help="Positive requiring reply") + + with urgency_col4: + st.metric("🟢 Low Priority", urgency_metrics['low_priority_count'], help="Very positive requiring reply") + + st.markdown("---") + + st.markdown("---") + + # Demographics Analysis (for musora_app only) + # Check if we have musora_app data and demographic fields + has_musora_app = 'platform' in df.columns and 'musora_app' in df['platform'].values + has_demographics = ( + has_musora_app and + 'age_group' in df.columns and + 'timezone' in df.columns and + 'experience_level' in df.columns + ) + + if has_demographics: + # Filter for musora_app data only + df_musora = df[df['platform'] == 'musora_app'].copy() + + # Check if we have any demographic data (not all Unknown) + has_valid_demographics = ( + (df_musora['age_group'] != 'Unknown').any() or + (df_musora['timezone_region'] != 'Unknown').any() or + (df_musora['experience_group'] != 'Unknown').any() + ) + + if has_valid_demographics and len(df_musora) > 0: + st.markdown("## 👥 Demographics Analysis (Musora App)") + st.info(f"📊 Analyzing demographics for **{len(df_musora):,}** Musora App comments") + + # Initialize demographic charts + demographic_charts = DemographicCharts() + + # Get demographic summary + 
demo_summary = processor.get_demographics_summary(df_musora) + + # Display summary metrics + demo_col1, demo_col2, demo_col3, demo_col4 = st.columns(4) + + with demo_col1: + st.metric( + "Comments with Demographics", + f"{demo_summary['users_with_demographics']:,}", + f"{demo_summary['coverage_percentage']:.1f}% coverage" + ) + + with demo_col2: + if demo_summary['avg_age'] is not None: + st.metric("Average Age", f"{demo_summary['avg_age']:.1f} years") + else: + st.metric("Average Age", "N/A") + + with demo_col3: + st.metric("Most Common Region", demo_summary['most_common_region']) + + with demo_col4: + if demo_summary['avg_experience'] is not None: + st.metric("Avg Experience", f"{demo_summary['avg_experience']:.1f}/10") + else: + st.metric("Avg Experience", "N/A") + + st.markdown("---") + + # Age Analysis + st.markdown("### 🎂 Age Distribution") + + age_dist = processor.get_demographics_distribution(df_musora, 'age_group') + age_sentiment = processor.get_demographics_by_sentiment(df_musora, 'age_group') + + if not age_dist.empty: + col1, col2 = st.columns(2) + + with col1: + age_chart = demographic_charts.create_age_distribution_chart( + age_dist, + title="Comments by Age Group" + ) + st.plotly_chart(age_chart, use_container_width=True) + + with col2: + age_sent_chart = demographic_charts.create_age_sentiment_chart( + age_sentiment, + title="Sentiment Distribution by Age Group" + ) + st.plotly_chart(age_sent_chart, use_container_width=True) + + # Insights + with st.expander("💡 Age Insights"): + if len(age_dist) > 0: + top_age_group = age_dist.iloc[0]['age_group'] + top_age_count = age_dist.iloc[0]['count'] + top_age_pct = age_dist.iloc[0]['percentage'] + + st.write(f"**Most Active Age Group:** {top_age_group} ({top_age_count:,} comments, {top_age_pct:.1f}%)") + + # Find age group with most negative sentiment + if not age_sentiment.empty: + negative_sentiments = age_sentiment[ + age_sentiment['sentiment_polarity'].isin(['negative', 'very_negative']) + 
].groupby('age_group')['percentage'].sum().reset_index() + + if len(negative_sentiments) > 0: + negative_sentiments = negative_sentiments.sort_values('percentage', ascending=False) + most_negative_age = negative_sentiments.iloc[0]['age_group'] + most_negative_pct = negative_sentiments.iloc[0]['percentage'] + st.write(f"**Highest Negative Sentiment:** {most_negative_age} ({most_negative_pct:.1f}% negative)") + else: + st.info("No age data available for visualization") + + st.markdown("---") + + # Timezone Analysis + st.markdown("### 🌍 Geographic Distribution") + + # Get timezone data + top_timezones = processor.get_top_timezones(df_musora, top_n=15) + region_dist = processor.get_timezone_regions_distribution(df_musora) + region_sentiment = processor.get_demographics_by_sentiment(df_musora, 'timezone_region') + + if not top_timezones.empty or not region_dist.empty: + # Top timezones + if not top_timezones.empty: + st.markdown("#### Top 15 Timezones") + timezone_chart = demographic_charts.create_timezone_chart( + top_timezones, + title="Most Common Timezones", + top_n=15 + ) + st.plotly_chart(timezone_chart, use_container_width=True) + + # Regional distribution + if not region_dist.empty: + st.markdown("#### Regional Distribution") + col1, col2 = st.columns(2) + + with col1: + region_chart = demographic_charts.create_region_distribution_chart( + region_dist, + title="Comments by Region" + ) + st.plotly_chart(region_chart, use_container_width=True) + + with col2: + if not region_sentiment.empty: + region_sent_chart = demographic_charts.create_region_sentiment_chart( + region_sentiment, + title="Sentiment Distribution by Region" + ) + st.plotly_chart(region_sent_chart, use_container_width=True) + + # Insights + with st.expander("💡 Geographic Insights"): + if not top_timezones.empty: + top_tz = top_timezones.iloc[0]['timezone'] + top_tz_count = top_timezones.iloc[0]['count'] + top_tz_pct = top_timezones.iloc[0]['percentage'] + st.write(f"**Most Common Timezone:** 
{top_tz} ({top_tz_count:,} comments, {top_tz_pct:.1f}%)") + + if not region_dist.empty: + top_region = region_dist.iloc[0]['timezone_region'] + top_region_count = region_dist.iloc[0]['count'] + top_region_pct = region_dist.iloc[0]['percentage'] + st.write(f"**Most Active Region:** {top_region} ({top_region_count:,} comments, {top_region_pct:.1f}%)") + + # Find region with most negative sentiment + if not region_sentiment.empty: + negative_regions = region_sentiment[ + region_sentiment['sentiment_polarity'].isin(['negative', 'very_negative']) + ].groupby('timezone_region')['percentage'].sum().reset_index() + + if len(negative_regions) > 0: + negative_regions = negative_regions.sort_values('percentage', ascending=False) + most_negative_region = negative_regions.iloc[0]['timezone_region'] + most_negative_region_pct = negative_regions.iloc[0]['percentage'] + st.write(f"**Highest Negative Sentiment:** {most_negative_region} ({most_negative_region_pct:.1f}% negative)") + else: + st.info("No timezone/region data available for visualization") + + st.markdown("---") + + # Experience Level Analysis + st.markdown("### 🎯 Experience Level Distribution") + + # Get both detailed and grouped experience data + exp_dist_detailed = processor.get_experience_level_distribution(df_musora, use_groups=False) + exp_dist_grouped = processor.get_experience_level_distribution(df_musora, use_groups=True) + exp_sentiment_grouped = processor.get_demographics_by_sentiment(df_musora, 'experience_group') + + if not exp_dist_detailed.empty or not exp_dist_grouped.empty: + # Tabs for detailed vs grouped view + tab1, tab2 = st.tabs(["📊 Detailed (0-10)", "📊 Grouped (Beginner/Intermediate/Advanced)"]) + + with tab1: + if not exp_dist_detailed.empty: + exp_chart_detailed = demographic_charts.create_experience_distribution_chart( + exp_dist_detailed, + title="Comments by Experience Level (0-10 Scale)", + use_groups=False + ) + st.plotly_chart(exp_chart_detailed, use_container_width=True) + else: + 
st.info("No detailed experience level data available") + + with tab2: + if not exp_dist_grouped.empty: + col1, col2 = st.columns(2) + + with col1: + exp_chart_grouped = demographic_charts.create_experience_distribution_chart( + exp_dist_grouped, + title="Comments by Experience Group", + use_groups=True + ) + st.plotly_chart(exp_chart_grouped, use_container_width=True) + + with col2: + if not exp_sentiment_grouped.empty: + exp_sent_chart = demographic_charts.create_experience_sentiment_chart( + exp_sentiment_grouped, + title="Sentiment by Experience Group", + use_groups=True + ) + st.plotly_chart(exp_sent_chart, use_container_width=True) + else: + st.info("No grouped experience level data available") + + # Insights + with st.expander("💡 Experience Insights"): + if not exp_dist_grouped.empty: + top_exp_group = exp_dist_grouped.iloc[0]['experience_group'] + top_exp_count = exp_dist_grouped.iloc[0]['count'] + top_exp_pct = exp_dist_grouped.iloc[0]['percentage'] + st.write(f"**Most Active Group:** {top_exp_group} ({top_exp_count:,} comments, {top_exp_pct:.1f}%)") + + # Find experience group with most negative sentiment + if not exp_sentiment_grouped.empty: + negative_exp = exp_sentiment_grouped[ + exp_sentiment_grouped['sentiment_polarity'].isin(['negative', 'very_negative']) + ].groupby('experience_group')['percentage'].sum().reset_index() + + if len(negative_exp) > 0: + negative_exp = negative_exp.sort_values('percentage', ascending=False) + most_negative_exp = negative_exp.iloc[0]['experience_group'] + most_negative_exp_pct = negative_exp.iloc[0]['percentage'] + st.write(f"**Highest Negative Sentiment:** {most_negative_exp} ({most_negative_exp_pct:.1f}% negative)") + + if demo_summary['avg_experience'] is not None: + st.write(f"**Average Experience Level:** {demo_summary['avg_experience']:.2f}/10") + st.write(f"**Most Common Experience Group:** {demo_summary.get('most_common_experience', 'Unknown')}") + else: + st.info("No experience level data available for 
visualization") + + st.markdown("---") + + # Language Distribution (if available) + if 'detected_language' in df.columns: + st.markdown("## 🌍 Language Distribution") + + lang_dist = distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages") + st.plotly_chart(lang_dist, use_container_width=True) + + st.markdown("---") + + # Temporal trends (if timestamp available) + if 'comment_timestamp' in df.columns and not df.empty: + with st.expander("📈 Temporal Trends", expanded=False): + # Frequency selector + freq_col1, freq_col2 = st.columns([1, 3]) + + with freq_col1: + freq = st.selectbox( + "Time Granularity", + options=['D', 'W', 'M'], + format_func=lambda x: {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}[x], + index=1 # Default to Weekly + ) + + sentiment_timeline = sentiment_charts.create_sentiment_timeline(df, freq=freq, title="Sentiment Trends Over Time") + st.plotly_chart(sentiment_timeline, use_container_width=True) + + # Hierarchical sunburst + with st.expander("🌟 Hierarchical View", expanded=False): + st.markdown("**Interactive Brand > Platform > Sentiment Distribution**") + sunburst = distribution_charts.create_combined_distribution_sunburst( + df, title="Brand > Platform > Sentiment Distribution" + ) + st.plotly_chart(sunburst, use_container_width=True) \ No newline at end of file diff --git a/visualization/components/reply_required.py b/visualization/components/reply_required.py new file mode 100644 index 0000000000000000000000000000000000000000..c10f7fe6f43721aa895aa656e533bbc4198d8982 --- /dev/null +++ b/visualization/components/reply_required.py @@ -0,0 +1,365 @@ +""" +Reply Required Page +Displays comments that require replies with filtering and prioritisation. + +Data is fetched on-demand: user sets filters then clicks "Fetch Data". +Platform, brand and date are pre-populated from global sidebar filters. 
def render_reply_required(data_loader):
    """
    Render the "Comments Requiring Reply" page.

    Flow (on-demand fetch pattern):
      1. Build query filters (platforms / brands / date range) pre-populated
         from the global sidebar filters stored in ``st.session_state['global_filters']``.
      2. On "Fetch Data", run a targeted Snowflake query via
         ``data_loader.load_reply_required_data`` and cache the result in
         ``st.session_state['rr_df']`` keyed by ``st.session_state['rr_fetch_key']``.
      3. Offer in-page refinement filters, charts, a paginated comment list
         (page number in ``st.session_state['reply_page']``), CSV export, and a
         per-content top-10 summary.

    Session-state keys read/written: ``dashboard_df`` (read-only source of
    filter options), ``global_filters``, ``rr_df``, ``rr_fetch_key``,
    ``reply_page``.

    Args:
        data_loader: SentimentDataLoader instance used to query Snowflake.

    Returns:
        None. Renders Streamlit widgets as a side effect; may call
        ``st.rerun()`` after a fetch or page change.
    """
    st.title("⚠️ Comments Requiring Reply")
    st.markdown("Manage and prioritise comments that need responses")
    st.markdown("---")

    metrics = SentimentMetrics()

    # ── Get filter options from the lightweight dashboard df ──────────────────
    # The dashboard df is loaded elsewhere at app start; without it we cannot
    # populate the platform/brand pickers, so bail out early.
    dashboard_df = st.session_state.get('dashboard_df')
    if dashboard_df is None or dashboard_df.empty:
        st.warning("Dashboard data not loaded yet. Please wait for the app to initialise.")
        return

    available_platforms = sorted(dashboard_df['platform'].dropna().unique().tolist())
    available_brands = sorted(dashboard_df['brand'].dropna().unique().tolist())

    # ── Pre-populate from global sidebar filters ───────────────────────────────
    global_filters = st.session_state.get('global_filters', {})
    global_platforms = global_filters.get('platforms', [])
    global_brands = global_filters.get('brands', [])
    global_date = global_filters.get('date_range')

    st.markdown("### 🔍 Query Filters")
    st.info(
        "⚡ **Performance**: Set your filters then click **Fetch Data** to run a targeted Snowflake query. "
        "Global sidebar filters are pre-populated below."
    )

    filter_col1, filter_col2, filter_col3 = st.columns(3)

    with filter_col1:
        # Only keep global defaults that actually exist in the loaded data,
        # otherwise st.multiselect raises on unknown default values.
        selected_platforms = st.multiselect(
            "Platforms",
            options=available_platforms,
            default=[p for p in global_platforms if p in available_platforms],
            help="Leave empty to include all platforms"
        )

    with filter_col2:
        selected_brands = st.multiselect(
            "Brands",
            options=available_brands,
            default=[b for b in global_brands if b in available_brands],
            help="Leave empty to include all brands"
        )

    with filter_col3:
        # Date range — default from global filter or show no filter.
        # Fallback: span of timestamps present in the dashboard df.
        if global_date and len(global_date) == 2:
            default_date = (global_date[0], global_date[1])
        elif 'comment_timestamp' in dashboard_df.columns and not dashboard_df.empty:
            max_d = dashboard_df['comment_timestamp'].max().date()
            min_d = dashboard_df['comment_timestamp'].min().date()
            default_date = (min_d, max_d)
        else:
            default_date = None

        if default_date:
            date_range = st.date_input(
                "Date Range",
                value=default_date,
                help="Filter by comment timestamp"
            )
        else:
            date_range = None

    st.markdown("---")

    # ── Fetch button ───────────────────────────────────────────────────────────
    # fetch_key is a hashable fingerprint of the current filters; cached data
    # is only reused when the stored key matches (i.e. filters are unchanged).
    # NOTE: st.date_input may return a single date mid-selection, hence the
    # len(date_range) == 2 guard.
    fetch_key = (
        tuple(sorted(selected_platforms)),
        tuple(sorted(selected_brands)),
        str(date_range) if date_range and len(date_range) == 2 else ''
    )

    has_data = (
        'rr_df' in st.session_state
        and st.session_state.get('rr_fetch_key') == fetch_key
        and not st.session_state['rr_df'].empty
    )

    fetch_col, status_col = st.columns([1, 3])
    with fetch_col:
        fetch_clicked = st.button("🚀 Fetch Data", use_container_width=True, type="primary")
    with status_col:
        if has_data:
            st.success(f"✅ Showing **{len(st.session_state['rr_df']):,}** comments requiring reply")
        elif not fetch_clicked:
            st.info("👆 Click **Fetch Data** to load reply-required comments from Snowflake.")

    if fetch_clicked:
        with st.spinner("Fetching reply-required comments from Snowflake…"):
            # Empty filter lists are passed as None — presumably the loader
            # treats None as "no filter" (verify against SentimentDataLoader).
            df = data_loader.load_reply_required_data(
                platforms=selected_platforms or None,
                brands=selected_brands or None,
                date_range=date_range if date_range and len(date_range) == 2 else None,
            )
            st.session_state['rr_df'] = df
            st.session_state['rr_fetch_key'] = fetch_key
            # Reset pagination so a fresh fetch starts at page 1.
            st.session_state['reply_page'] = 1
            st.rerun()

    # Nothing cached and no fetch requested yet — stop rendering here.
    if not has_data and not fetch_clicked:
        return

    # ── Work with fetched data ─────────────────────────────────────────────────
    reply_comments = st.session_state.get('rr_df', pd.DataFrame())

    if reply_comments.empty:
        st.success("🎉 No comments currently require replies with these filters.")
        return

    st.markdown("---")

    # ── Summary stats ──────────────────────────────────────────────────────────
    st.markdown("### 📊 Summary")
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Total Replies Needed", len(reply_comments))
    with col2:
        urgency = metrics.calculate_response_urgency(reply_comments)
        st.metric("🔴 Urgent", urgency['urgent_count'], help="Negative sentiment")
    with col3:
        # content_sk may be absent depending on the query — guard the lookup.
        unique_contents = reply_comments['content_sk'].nunique() if 'content_sk' in reply_comments.columns else 0
        st.metric("Affected Contents", unique_contents)
    with col4:
        neg_cnt = reply_comments['sentiment_polarity'].isin(['negative', 'very_negative']).sum()
        neg_pct = neg_cnt / len(reply_comments) * 100 if len(reply_comments) > 0 else 0
        st.metric("Negative %", f"{neg_pct:.1f}%")

    st.markdown("---")

    # ── Urgency breakdown ──────────────────────────────────────────────────────
    st.markdown("### 🚨 Response Urgency Breakdown")
    urgency_metrics = metrics.calculate_response_urgency(reply_comments)
    uc1, uc2, uc3, uc4 = st.columns(4)
    uc1.metric("🔴 Urgent", urgency_metrics['urgent_count'], help="Negative — immediate action")
    uc2.metric("🟠 High Priority", urgency_metrics['high_priority_count'], help="Neutral + feedback/request — 24h")
    uc3.metric("🟡 Medium", urgency_metrics['medium_priority_count'], help="Positive — 48h")
    uc4.metric("🟢 Low", urgency_metrics['low_priority_count'], help="Very positive — when convenient")

    st.markdown("---")

    # ── In-page filters (applied to already-fetched data) ─────────────────────
    # These refine the cached rr_df locally; no new Snowflake query is issued.
    st.markdown("### 🔍 Refine View")
    rf1, rf2, rf3, rf4 = st.columns(4)

    with rf1:
        priority_options = ['All', '🔴 Urgent', '🟠 High', '🟡 Medium', '🟢 Low']
        selected_priority = st.selectbox("Priority", priority_options, index=0)

    with rf2:
        platform_options = ['All'] + sorted(reply_comments['platform'].unique().tolist())
        view_platform = st.selectbox("Platform", platform_options, index=0)

    with rf3:
        brand_options = ['All'] + sorted(reply_comments['brand'].unique().tolist())
        view_brand = st.selectbox("Brand", brand_options, index=0)

    with rf4:
        # 'intent' appears to be a comma-separated string per row — split and
        # explode to enumerate individual intents (confirm upstream format).
        intent_list = (
            reply_comments['intent'].str.split(',').explode().str.strip()
            .dropna().unique().tolist()
        )
        intent_options = ['All'] + sorted(intent_list)
        selected_intent = st.selectbox("Intent", intent_options, index=0)

    filtered_comments = reply_comments

    # Priority buckets mirror calculate_response_urgency's categories:
    # urgent=negative/very_negative, high=neutral+feedback/request,
    # medium=positive, low=very_positive.
    if selected_priority != 'All':
        if selected_priority == '🔴 Urgent':
            filtered_comments = filtered_comments[
                filtered_comments['sentiment_polarity'].isin(['negative', 'very_negative'])
            ]
        elif selected_priority == '🟠 High':
            filtered_comments = filtered_comments[
                (filtered_comments['sentiment_polarity'] == 'neutral') &
                (filtered_comments['intent'].str.contains('feedback_negative|request', na=False))
            ]
        elif selected_priority == '🟡 Medium':
            filtered_comments = filtered_comments[filtered_comments['sentiment_polarity'] == 'positive']
        elif selected_priority == '🟢 Low':
            filtered_comments = filtered_comments[filtered_comments['sentiment_polarity'] == 'very_positive']

    if view_platform != 'All':
        filtered_comments = filtered_comments[filtered_comments['platform'] == view_platform]

    if view_brand != 'All':
        filtered_comments = filtered_comments[filtered_comments['brand'] == view_brand]

    if selected_intent != 'All':
        # Substring match: selecting 'request' will also match rows whose
        # intent string contains e.g. 'feature_request' — by design or not,
        # this mirrors the priority filter above.
        filtered_comments = filtered_comments[
            filtered_comments['intent'].str.contains(selected_intent, na=False)
        ]

    st.markdown(f"**Showing {len(filtered_comments):,} comments after filtering**")
    st.markdown("---")

    # ── Charts ─────────────────────────────────────────────────────────────────
    if not filtered_comments.empty:
        st.markdown("### 📈 Analysis")
        viz_col1, viz_col2 = st.columns(2)
        with viz_col1:
            sentiment_charts = SentimentCharts()
            st.plotly_chart(
                sentiment_charts.create_sentiment_pie_chart(filtered_comments, title="Sentiment Distribution"),
                use_container_width=True
            )
        with viz_col2:
            distribution_charts = DistributionCharts()
            st.plotly_chart(
                distribution_charts.create_intent_bar_chart(
                    filtered_comments, title="Intent Distribution", orientation='h'
                ),
                use_container_width=True
            )
        st.markdown("---")

    # ── Paginated comment list ─────────────────────────────────────────────────
    st.markdown("### 💬 Comments Requiring Reply")

    items_per_page = 10
    # Ceiling division, floored at 1 page so the UI always has a valid page.
    total_pages = max(1, (len(filtered_comments) - 1) // items_per_page + 1)

    if 'reply_page' not in st.session_state:
        st.session_state.reply_page = 1

    # Clamp page if filters reduced the total
    st.session_state.reply_page = min(st.session_state.reply_page, total_pages)

    if total_pages > 1:
        pc1, pc2, pc3 = st.columns([1, 2, 1])
        with pc1:
            if st.button("⬅️ Previous", key="prev_top",
                         disabled=st.session_state.reply_page <= 1):
                st.session_state.reply_page -= 1
                st.rerun()
        with pc2:
            # NOTE(review): the original wrapped this indicator in HTML markup
            # (hence unsafe_allow_html=True) that was lost in extraction —
            # restore the centering markup from version control.
            st.markdown(f"Page {st.session_state.reply_page} of {total_pages}",
                        unsafe_allow_html=True)
        with pc3:
            if st.button("Next ➡️", key="next_top",
                         disabled=st.session_state.reply_page >= total_pages):
                st.session_state.reply_page += 1
                st.rerun()
        st.markdown("---")

    start_idx = (st.session_state.reply_page - 1) * items_per_page
    paginated = filtered_comments.iloc[start_idx: start_idx + items_per_page]

    if paginated.empty:
        st.info("No comments match the selected filters.")
    else:
        # idx is the 1-based global position of the comment across all pages.
        for idx, (_, comment) in enumerate(paginated.iterrows(), start=start_idx + 1):
            sp = comment['sentiment_polarity']
            # Same priority scheme as the urgency breakdown above.
            if sp in ['negative', 'very_negative']:
                priority_emoji = "🔴"
            elif sp == 'neutral' and any(i in comment['intent'] for i in ['feedback_negative', 'request']):
                priority_emoji = "🟠"
            elif sp == 'positive':
                priority_emoji = "🟡"
            else:
                priority_emoji = "🟢"

            st.markdown(f"#### {priority_emoji} Comment #{idx}")
            ContentCards.display_comment_card(comment, show_original=True)

    # Bottom pagination controls duplicate the top ones (distinct widget keys).
    if total_pages > 1:
        st.markdown("---")
        pb1, pb2, pb3 = st.columns([1, 2, 1])
        with pb1:
            if st.button("⬅️ Previous", key="prev_bottom",
                         disabled=st.session_state.reply_page <= 1):
                st.session_state.reply_page -= 1
                st.rerun()
        with pb2:
            # NOTE(review): original HTML centering markup lost in extraction —
            # see the matching indicator above; restore from version control.
            st.markdown(f"Page {st.session_state.reply_page} of {total_pages}",
                        unsafe_allow_html=True)
        with pb3:
            if st.button("Next ➡️", key="next_bottom",
                         disabled=st.session_state.reply_page >= total_pages):
                st.session_state.reply_page += 1
                st.rerun()

    st.markdown("---")

    # ── Export ─────────────────────────────────────────────────────────────────
    st.markdown("### 💾 Export Data")
    col1, col2 = st.columns([1, 3])
    with col1:
        export_columns = [
            'comment_id', 'author_name', 'platform', 'brand', 'comment_timestamp',
            'display_text', 'original_text', 'detected_language', 'sentiment_polarity',
            'intent', 'sentiment_confidence', 'content_description', 'permalink_url'
        ]
        # Export only the columns actually present in the fetched frame.
        available_cols = [c for c in export_columns if c in filtered_comments.columns]
        csv = filtered_comments[available_cols].to_csv(index=False)
        st.download_button(
            label="📥 Download as CSV",
            data=csv,
            file_name="comments_requiring_reply.csv",
            mime="text/csv"
        )
    with col2:
        st.info("Download the filtered comments for team collaboration or CRM import.")

    st.markdown("---")

    # ── Reply requirements by content (top 10) ─────────────────────────────────
    st.markdown("### 📋 Reply Requirements by Content")

    if 'content_sk' in filtered_comments.columns:
        # Named aggregation: count replies per content (fall back to counting
        # sentiment_polarity rows when comment_sk is missing) and carry the
        # first description/permalink per content.
        content_reply_summary = (
            filtered_comments
            .groupby('content_sk', as_index=False)
            .agg(
                replies_needed=('comment_sk', 'count') if 'comment_sk' in filtered_comments.columns
                else ('sentiment_polarity', 'count'),
                content_description=('content_description', 'first'),
                permalink_url=('permalink_url', 'first')
            )
            .sort_values('replies_needed', ascending=False)
            .head(10)
        )

        for i, (_, content) in enumerate(content_reply_summary.iterrows(), 1):
            with st.expander(f"📝 Content #{i} — {content['replies_needed']} replies needed"):
                st.markdown(f"**Description:** {content['content_description']}")
                if pd.notna(content.get('permalink_url')):
                    st.markdown(f"**Link:** [View Content]({content['permalink_url']})")

                # Show up to 3 sample comments for this content.
                top_comments = filtered_comments[
                    filtered_comments['content_sk'] == content['content_sk']
                ].head(3)
                st.markdown(f"**Top {len(top_comments)} comments:**")
                for _, c in top_comments.iterrows():
                    ContentCards.display_comment_card(c, show_original=True)
+ + Args: + data_loader: SentimentDataLoader instance + """ + st.title("🔍 Sentiment Analysis") + st.markdown("Analyze content performance based on sentiment patterns and user feedback") + st.markdown("---") + + sentiment_charts = SentimentCharts() + distribution_charts = DistributionCharts() + summary_agent = ContentSummaryAgent(model="gpt-5-nano", temperature=1) + + if 'content_summaries' not in st.session_state: + st.session_state.content_summaries = {} + + # ── Get filter options from the already-loaded (lightweight) dashboard df ─ + dashboard_df = st.session_state.get('dashboard_df') + if dashboard_df is None or dashboard_df.empty: + st.warning("Dashboard data not loaded yet. Please wait for the app to initialise.") + return + + available_platforms = sorted(dashboard_df['platform'].dropna().unique().tolist()) + available_brands = sorted(dashboard_df['brand'].dropna().unique().tolist()) + + # ── Pre-populate from global sidebar filters ─────────────────────────────── + global_filters = st.session_state.get('global_filters', {}) + global_platforms = global_filters.get('platforms', []) + global_brands = global_filters.get('brands', []) + global_date_range = global_filters.get('date_range') + + # ── Platform & Brand selection ───────────────────────────────────────────── + st.markdown("### 🎯 Select Platform and Brand") + st.info( + "⚡ **Performance**: Choose a platform and brand, set optional filters, " + "then click **Fetch Data** to run a targeted Snowflake query." 
+ ) + + filter_col1, filter_col2 = st.columns(2) + + with filter_col1: + default_platform_idx = 0 + if global_platforms and global_platforms[0] in available_platforms: + default_platform_idx = available_platforms.index(global_platforms[0]) + 1 # +1 for blank + selected_platform = st.selectbox( + "Platform *", + options=[''] + available_platforms, + index=default_platform_idx, + help="Select the platform to analyse" + ) + + with filter_col2: + default_brand_idx = 0 + if global_brands and global_brands[0] in available_brands: + default_brand_idx = available_brands.index(global_brands[0]) + 1 + selected_brand = st.selectbox( + "Brand *", + options=[''] + available_brands, + index=default_brand_idx, + help="Select the brand to analyse" + ) + + if not selected_platform or not selected_brand: + st.warning("⚠️ Please select both **Platform** and **Brand** to continue.") + st.markdown("---") + + # Quick summary from dashboard data + st.markdown("### 📊 Available Data Summary") + col1, col2, col3 = st.columns(3) + with col1: + st.metric("Total Comments", f"{len(dashboard_df):,}") + with col2: + st.metric("Platforms", len(available_platforms)) + with st.expander("View Platforms"): + for p in available_platforms: + cnt = (dashboard_df['platform'] == p).sum() + st.write(f"- **{p}**: {cnt:,} comments") + with col3: + st.metric("Brands", len(available_brands)) + with st.expander("View Brands"): + for b in available_brands: + cnt = (dashboard_df['brand'] == b).sum() + st.write(f"- **{b}**: {cnt:,} comments") + return + + st.markdown("---") + + # ── Content filters ──────────────────────────────────────────────────────── + st.markdown("### 🔍 Content Filters") + + # Build available sentiment / intent options from dashboard_df filtered to + # selected platform+brand (fast — no text columns involved) + mask = (dashboard_df['platform'] == selected_platform) & (dashboard_df['brand'] == selected_brand) + preview_df = dashboard_df[mask] + + filter_col1, filter_col2, filter_col3, 
filter_col4 = st.columns(4) + + with filter_col1: + sentiment_options = sorted(preview_df['sentiment_polarity'].unique().tolist()) + selected_sentiments = st.multiselect( + "Sentiment", + options=sentiment_options, + default=[], + help="Filter by dominant sentiment. Leave empty for all." + ) + + with filter_col2: + intent_list = ( + preview_df['intent'] + .str.split(',').explode().str.strip() + .dropna().unique().tolist() + ) + selected_intents = st.multiselect( + "Intent", + options=sorted(i for i in intent_list if i), + default=[], + help="Filter contents that have comments with these intents" + ) + + with filter_col3: + top_n = st.selectbox( + "Top N Contents", + options=[5, 10, 15, 20, 25], + index=1, + help="Number of contents to display" + ) + + with filter_col4: + filter_active = bool(selected_sentiments or selected_intents) + st.metric( + "Filters Active", + "✓ Yes" if filter_active else "✗ No", + help="Sentiment or intent filters applied" if filter_active else "Showing all sentiments" + ) + + st.markdown("---") + + # ── Advanced ranking controls ────────────────────────────────────────────── + with st.expander("⚙️ Advanced Ranking Controls", expanded=False): + adv_col1, adv_col2 = st.columns(2) + with adv_col1: + min_comments = st.slider( + "Minimum Comments Required", + min_value=1, max_value=50, value=10, step=1, + help="Exclude contents with fewer comments than this threshold." + ) + with adv_col2: + sort_by = st.selectbox( + "Sort By", + options=[ + ('severity_score', '🎯 Severity Score (Balanced) — Recommended'), + ('sentiment_percentage', '📊 Sentiment Percentage'), + ('sentiment_count', '🔢 Sentiment Count (Absolute)'), + ('total_comments', '💬 Total Comments (Volume)'), + ], + format_func=lambda x: x[1], + index=0 + ) + sort_by_value = sort_by[0] + + sentiment_label = "selected sentiments" if selected_sentiments else "negative sentiments" + info_map = { + 'severity_score': f"📘 **Severity Score** = Sentiment % × √(Total Comments). 
Balances {sentiment_label} % with volume.", + 'sentiment_percentage': f"📘 Ranks by highest % of {sentiment_label}. May include low-volume contents.", + 'sentiment_count': f"📘 Ranks by absolute number of {sentiment_label} comments.", + 'total_comments': "📘 Ranks by total comment volume, regardless of sentiment.", + } + st.info(info_map.get(sort_by_value, "")) + + # Date range for the query (inherit from global filters if set) + if global_date_range and len(global_date_range) == 2: + query_date_range = global_date_range + else: + query_date_range = None + + # ── Fetch button ─────────────────────────────────────────────────────────── + fetch_key = ( + selected_platform, selected_brand, top_n, min_comments, sort_by_value, + tuple(sorted(selected_sentiments)), tuple(sorted(selected_intents)), + str(query_date_range) + ) + + fetch_col, info_col = st.columns([1, 3]) + with fetch_col: + fetch_clicked = st.button("🚀 Fetch Data", use_container_width=True, type="primary") + + # Auto-fetch if the key hasn't changed and we already have data + has_data = ( + 'sa_contents' in st.session_state + and st.session_state.get('sa_fetch_key') == fetch_key + and not st.session_state['sa_contents'].empty + ) + + with info_col: + if has_data: + n_contents = len(st.session_state['sa_contents']) + n_comments = len(st.session_state.get('sa_comments', [])) + st.success(f"✅ Showing **{n_contents}** contents with **{n_comments:,}** sampled comments") + elif fetch_clicked: + pass # spinner shown below + else: + st.info("👆 Click **Fetch Data** to run a targeted Snowflake query with the settings above.") + + if fetch_clicked: + with st.spinner("Fetching data from Snowflake…"): + contents_df, comments_df = data_loader.load_sa_data( + platform=selected_platform, + brand=selected_brand, + top_n=top_n, + min_comments=min_comments, + sort_by=sort_by_value, + sentiments=selected_sentiments or None, + intents=selected_intents or None, + date_range=query_date_range, + ) + st.session_state['sa_contents'] = 
contents_df + st.session_state['sa_comments'] = comments_df + st.session_state['sa_fetch_key'] = fetch_key + st.session_state['sa_platform'] = selected_platform + st.session_state['sa_brand'] = selected_brand + # Reset pagination on new fetch + st.session_state['sentiment_page'] = 1 + st.rerun() + + # ── Nothing fetched yet ──────────────────────────────────────────────────── + if not has_data and not fetch_clicked: + return + + filtered_contents = st.session_state.get('sa_contents', pd.DataFrame()) + comments_df = st.session_state.get('sa_comments', pd.DataFrame()) + + if filtered_contents.empty: + st.warning("No content data found with the selected filters. Try adjusting and re-fetching.") + return + + # ── Summary stats ────────────────────────────────────────────────────────── + st.markdown("### 📊 Summary") + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric("Contents Analysed", len(filtered_contents)) + with col2: + if 'selected_sentiment_percentage' in filtered_contents.columns: + avg_pct = filtered_contents['selected_sentiment_percentage'].mean() + label = "Selected Sentiment %" if selected_sentiments else "Avg Negative %" + st.metric(label, f"{avg_pct:.1f}%") + else: + st.metric("Avg Negative %", f"{filtered_contents['negative_percentage'].mean():.1f}%") + with col3: + st.metric("Total Comments", int(filtered_contents['total_comments'].sum())) + with col4: + st.metric("Total Replies Needed", int(filtered_contents['reply_required_count'].sum())) + + st.markdown("---") + + # ── Engagement scatter ───────────────────────────────────────────────────── + st.markdown("### 📈 Content Engagement Analysis") + scatter = distribution_charts.create_engagement_scatter( + filtered_contents, title="Content Engagement vs. 
Sentiment" + ) + st.plotly_chart(scatter, use_container_width=True, key="engagement_scatter_chart") + + st.markdown("---") + + # ── Paginated content cards ──────────────────────────────────────────────── + st.markdown("### 🔍 Detailed Content Analysis") + + if 'sentiment_page' not in st.session_state: + st.session_state.sentiment_page = 1 + + items_per_page = 5 + total_contents = len(filtered_contents) + total_pages = (total_contents + items_per_page - 1) // items_per_page + + if total_contents > items_per_page: + st.info(f"📄 Page {st.session_state.sentiment_page} of {total_pages} ({total_contents} total contents)") + col_prev, col_info, col_next = st.columns([1, 2, 1]) + with col_prev: + if st.button("⬅️ Previous", key="prev_top", + disabled=st.session_state.sentiment_page == 1): + st.session_state.sentiment_page -= 1 + st.rerun() + with col_info: + st.markdown( + f"
" + f"Page {st.session_state.sentiment_page} / {total_pages}
", + unsafe_allow_html=True + ) + with col_next: + if st.button("Next ➡️", key="next_top", + disabled=st.session_state.sentiment_page >= total_pages): + st.session_state.sentiment_page += 1 + st.rerun() + st.markdown("---") + + start_idx = (st.session_state.sentiment_page - 1) * items_per_page + end_idx = min(start_idx + items_per_page, total_contents) + paginated = filtered_contents.iloc[start_idx:end_idx] + + for idx, (_, content_row) in enumerate(paginated.iterrows(), start_idx + 1): + ContentCards.display_content_card(content_row, rank=idx) + + # Comments from the sampled set (pre-fetched, no extra Snowflake call) + if not comments_df.empty and 'content_sk' in comments_df.columns: + content_comments = comments_df[comments_df['content_sk'] == content_row['content_sk']] + else: + content_comments = pd.DataFrame() + + if content_comments.empty: + st.info("No sampled comment details available for this content.") + else: + viz_col1, viz_col2 = st.columns(2) + with viz_col1: + pie = sentiment_charts.create_sentiment_pie_chart( + content_comments, title="Sentiment Distribution (sample)" + ) + st.plotly_chart(pie, use_container_width=True, + key=f"sentiment_pie_{content_row['content_sk']}") + with viz_col2: + bar = distribution_charts.create_intent_bar_chart( + content_comments, title="Intent Distribution (sample)", orientation='h' + ) + st.plotly_chart(bar, use_container_width=True, + key=f"intent_bar_{content_row['content_sk']}") + + # AI Analysis + st.markdown("#### 🤖 AI-Powered Analysis") + content_sk = content_row['content_sk'] + st.markdown("**Select analysis type:**") + btn_col1, btn_col2, btn_col3 = st.columns(3) + + with btn_col1: + gen_neg = st.button("📉 Negative Summary", key=f"ai_negative_{content_sk}", + use_container_width=True) + with btn_col2: + gen_combined = st.button("📊 Combined Summary", key=f"ai_combined_{content_sk}", + use_container_width=True) + with btn_col3: + gen_pos = st.button("📈 Positive Summary", key=f"ai_positive_{content_sk}", + 
use_container_width=True) + + summary_type = None + if gen_neg: + summary_type = 'negative' + elif gen_pos: + summary_type = 'positive' + elif gen_combined: + summary_type = 'combined' + + key_neg = f"{content_sk}_negative" + key_pos = f"{content_sk}_positive" + key_com = f"{content_sk}_combined" + + if summary_type or any(k in st.session_state.content_summaries for k in (key_neg, key_pos, key_com)): + if summary_type: + summary_key = f"{content_sk}_{summary_type}" + with st.spinner(f"Analysing {summary_type} comments with AI…"): + result = summary_agent.process({ + 'content_sk': content_sk, + 'content_description': content_row['content_description'], + 'comments': content_comments, + 'sentiment_type': summary_type + }) + st.session_state.content_summaries[summary_key] = result + + for label, key in [('Negative', key_neg), ('Combined', key_com), ('Positive', key_pos)]: + if key not in st.session_state.content_summaries: + continue + result = st.session_state.content_summaries[key] + if result['success']: + summary = result['summary'] + with st.expander(f"📊 AI Analysis Report — {label}", expanded=True): + st.markdown("### Executive Summary") + st.info(summary['executive_summary']) + if summary['main_themes']: + st.markdown("### 🎯 Main Themes") + for theme in summary['main_themes']: + emoji = {'positive': '😊', 'negative': '😟', 'mixed': '🤔'}.get( + theme.get('sentiment', 'mixed'), '🤔') + st.markdown(f"**{emoji} {theme.get('theme')}** ({theme.get('sentiment','mixed').title()})\n- {theme.get('description','')}") + col_p, col_c = st.columns(2) + with col_p: + st.markdown("### ✅ Praise Points") + for pt in summary.get('praise_points', []): + st.markdown(f"- {pt}") + with col_c: + st.markdown("### ⚠️ Key Complaints") + for pt in summary.get('key_complaints', []): + st.markdown(f"- {pt}") + col_f, col_i = st.columns(2) + with col_f: + st.markdown("### ❓ FAQs") + for q in summary.get('frequently_asked_questions', []): + st.markdown(f"- {q}") + with col_i: + st.markdown("### 
💡 Insights") + for ins in summary.get('unexpected_insights', []): + st.markdown(f"- {ins}") + if summary.get('action_recommendations'): + st.markdown("### 🎯 Recommended Actions") + for action in summary['action_recommendations']: + priority = action.get('priority', 'medium').upper() + emoji = {'HIGH': '🔴', 'MEDIUM': '🟡', 'LOW': '🟢'}.get(priority, '🟡') + st.markdown(f"{emoji} **[{priority}]** {action.get('action','')}") + with st.expander("ℹ️ Analysis Metadata"): + meta = result.get('metadata', {}) + mc1, mc2, mc3 = st.columns(3) + mc1.metric("Comments Analysed", meta.get('total_comments_analyzed', 0)) + mc2.metric("Model Used", meta.get('model_used', 'N/A')) + mc3.metric("Tokens Used", meta.get('tokens_used', 0)) + else: + st.error(f"❌ AI analysis failed: {result.get('error','Unknown error')}") + if st.button("🔄 Retry", key=f"retry_{key}"): + del st.session_state.content_summaries[key] + st.rerun() + + # Comment expansion (text already loaded from fetch) + st.markdown("#### 💬 View Comments by Sentiment") + + if not content_comments.empty: + neg_comments = content_comments[ + content_comments['sentiment_polarity'].isin(['negative', 'very_negative']) + ] + pos_comments = content_comments[ + content_comments['sentiment_polarity'].isin(['positive', 'very_positive']) + ] + + col_neg, col_pos = st.columns(2) + with col_neg: + with st.expander(f"📉 Negative Comments ({len(neg_comments)} sampled)", expanded=False): + if not neg_comments.empty: + for _, comment in neg_comments.iterrows(): + ContentCards.display_comment_card(comment, show_original=True) + else: + st.info("No negative comments in sample.") + with col_pos: + with st.expander(f"📈 Positive Comments ({len(pos_comments)} sampled)", expanded=False): + if not pos_comments.empty: + for _, comment in pos_comments.iterrows(): + ContentCards.display_comment_card(comment, show_original=True) + else: + st.info("No positive comments in sample.") + else: + st.info("No comments available for this content in the current 
sample.") + + st.markdown("---") + + # ── Bottom pagination ────────────────────────────────────────────────────── + if total_contents > items_per_page: + col_prev_b, col_info_b, col_next_b = st.columns([1, 2, 1]) + with col_prev_b: + if st.button("⬅️ Previous", key="prev_bottom", + disabled=st.session_state.sentiment_page == 1): + st.session_state.sentiment_page -= 1 + st.rerun() + with col_info_b: + st.markdown( + f"
" + f"Page {st.session_state.sentiment_page} / {total_pages}
", + unsafe_allow_html=True + ) + with col_next_b: + if st.button("Next ➡️", key="next_bottom", + disabled=st.session_state.sentiment_page >= total_pages): + st.session_state.sentiment_page += 1 + st.rerun() + + st.markdown("---") + + # ── Insights & recommendations (using sampled comments) ─────────────────── + st.markdown("### 💡 Insights & Recommendations") + + from utils.data_processor import SentimentDataProcessor + processor = SentimentDataProcessor() + + all_sampled = comments_df[ + comments_df['content_sk'].isin(filtered_contents['content_sk']) + ] if not comments_df.empty else pd.DataFrame() + + insight_col1, insight_col2 = st.columns(2) + with insight_col1: + st.markdown("#### 🎯 Common Intent Patterns") + if not all_sampled.empty: + intent_dist = processor.get_intent_distribution(all_sampled) + for _, row in intent_dist.sort_values('count', ascending=False).head(5).iterrows(): + st.markdown(f"- **{row['intent']}**: {row['count']} ({row['percentage']:.1f}%)") + + with insight_col2: + st.markdown("#### 🌐 Platform Breakdown") + if not all_sampled.empty: + for platform, count in all_sampled['platform'].value_counts().items(): + pct = count / len(all_sampled) * 100 + st.markdown(f"- **{platform.title()}**: {count} comments ({pct:.1f}%)") + + st.markdown("---") + + # ── Action items ─────────────────────────────────────────────────────────── + st.markdown("### ✅ Recommended Actions") + action_items = [] + + total_replies = int(filtered_contents['reply_required_count'].sum()) + if total_replies > 0: + action_items.append(f"🔴 **High Priority**: {total_replies} comments require immediate response") + + critical = filtered_contents[filtered_contents['negative_percentage'] > 50] + if not critical.empty: + action_items.append( + f"🚨 **Critical**: {len(critical)} content(s) have >50% negative sentiment — investigate root causes" + ) + + if not all_sampled.empty: + feedback_cnt = all_sampled['intent'].str.contains('feedback_negative', na=False).sum() + if feedback_cnt: 
+ action_items.append(f"💬 **Feedback**: {feedback_cnt} negative-feedback comments — consider product improvements") + + question_cnt = all_sampled['intent'].str.contains('question', na=False).sum() + if question_cnt: + action_items.append(f"❓ **Questions**: {question_cnt} questions — improve FAQ or support docs") + + if action_items: + for item in action_items: + st.markdown(item) + else: + st.success("No critical action items at this time.") + + st.markdown("---") + + # ── Export ───────────────────────────────────────────────────────────────── + st.markdown("### 💾 Export Data") + col1, col2 = st.columns([1, 3]) + with col1: + base_cols = ['content_sk', 'content_description', 'permalink_url', + 'total_comments', 'reply_required_count', 'dominant_sentiment'] + for extra in ['selected_sentiment_count', 'selected_sentiment_percentage', + 'negative_count', 'negative_percentage']: + if extra in filtered_contents.columns: + base_cols.append(extra) + export_cols = [c for c in base_cols if c in filtered_contents.columns] + csv = filtered_contents[export_cols].to_csv(index=False) + st.download_button( + label="📥 Download as CSV", + data=csv, + file_name=f"sentiment_analysis_top{top_n}.csv", + mime="text/csv" + ) + with col2: + st.info("Download the data for further analysis or reporting.") \ No newline at end of file diff --git a/visualization/config/viz_config.json b/visualization/config/viz_config.json new file mode 100644 index 0000000000000000000000000000000000000000..945892cd8157d09176df4419daebe387154c9b8e --- /dev/null +++ b/visualization/config/viz_config.json @@ -0,0 +1,88 @@ +{ + "color_schemes": { + "sentiment_polarity": { + "very_positive": "#00C851", + "positive": "#7CB342", + "neutral": "#FFB300", + "negative": "#FF6F00", + "very_negative": "#D32F2F" + }, + "intent": { + "praise": "#4CAF50", + "question": "#2196F3", + "request": "#9C27B0", + "feedback_negative": "#FF5722", + "suggestion": "#00BCD4", + "humor_sarcasm": "#FFC107", + "off_topic": "#9E9E9E", + 
"spam_selfpromo": "#795548" + }, + "platform": { + "facebook": "#1877F2", + "instagram": "#E4405F", + "youtube": "#FF0000", + "twitter": "#1DA1F2", + "musora_app": "#1982C4", + "default": "#607D8B" + }, + "brand": { + "drumeo": "#FF6B35", + "pianote": "#6A4C93", + "musora": "#1982C4", + "default": "#8AC926" + } + }, + "sentiment_order": [ + "very_positive", + "positive", + "neutral", + "negative", + "very_negative" + ], + "intent_order": [ + "praise", + "question", + "request", + "feedback_negative", + "suggestion", + "humor_sarcasm", + "off_topic", + "spam_selfpromo" + ], + "negative_sentiments": [ + "negative", + "very_negative" + ], + "dashboard": { + "default_date_range_days": 30, + "max_comments_display": 100, + "chart_height": 400, + "top_n_contents": 10 + }, + "page_config": { + "page_title": "Musora Sentiment Analysis Dashboard", + "page_icon": "📊", + "layout": "wide", + "initial_sidebar_state": "expanded" + }, + "snowflake": { + "query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = 
c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES", + "dashboard_query": "SELECT s.COMMENT_SK, s.CONTENT_SK, LOWER(s.PLATFORM) AS PLATFORM, LOWER(s.CHANNEL_NAME) AS BRAND, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.DETECTED_LANGUAGE, s.COMMENT_TIMESTAMP, s.PROCESSED_AT, s.AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s UNION ALL SELECT COMMENT_SK, CONTENT_SK, CASE WHEN LOWER(PLATFORM) = 'musora' THEN 'musora_app' ELSE LOWER(PLATFORM) END AS PLATFORM, LOWER(CHANNEL_NAME) AS BRAND, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, DETECTED_LANGUAGE, COMMENT_TIMESTAMP, PROCESSED_AT, AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES", + "demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id" + }, + "demographics": { + "age_groups": { + "18-24": [18, 24], + "25-34": [25, 34], + "35-44": [35, 44], + "45-54": [45, 54], + "55+": [55, 150] + }, + "experience_groups": { + "Beginner (0-3)": [0, 3], + "Intermediate (4-7)": 
[4, 7], + "Advanced (8-10)": [8, 10] + }, + "top_timezones_count": 15 + } +} diff --git a/visualization/data/data_loader.py b/visualization/data/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..d4cfcd2dd11ba3a2fc7a6d4cc9d786dc19f00142 --- /dev/null +++ b/visualization/data/data_loader.py @@ -0,0 +1,810 @@ +""" +Data loader module for Sentiment Analysis Visualization +Handles Snowflake connection and data loading with caching +""" +import sys +import os +import re +import pandas as pd +import numpy as np +import streamlit as st +from pathlib import Path +import json +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta + +# Add parent directory to path to import SnowFlakeConnection +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +from visualization.SnowFlakeConnection import SnowFlakeConn + + +class SentimentDataLoader: + """ + Loads sentiment analysis data from Snowflake with caching. + + Three data loading modes: + - load_dashboard_data() : lightweight (no text), cached 24h + - load_sa_data(...) 
: top-N content stats + sampled comments, on-demand + - load_reply_required_data() : reply-queue comments with text, on-demand + """ + + def __init__(self, config_path=None): + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r') as f: + self.config = json.load(f) + + self.query = self.config['snowflake']['query'] + self.dashboard_query = self.config['snowflake'].get('dashboard_query', self.query) + self.demographics_query = self.config['snowflake'].get('demographics_query', None) + + # ───────────────────────────────────────────────────────────── + # Dashboard data (lightweight, 24-hour cache) + # ───────────────────────────────────────────────────────────── + + @st.cache_data(ttl=86400) + def load_dashboard_data(_self): + """ + Load lightweight dashboard data from Snowflake (no text columns). + Includes demographics merge if demographics_query is configured. + + Returns: + pd.DataFrame + """ + try: + conn = SnowFlakeConn() + df = conn.run_read_query(_self.dashboard_query, "dashboard data") + conn.close_connection() + + if df is None or df.empty: + st.error("No dashboard data returned from Snowflake") + return pd.DataFrame() + + df = _self._process_dashboard_dataframe(df) + + if _self.demographics_query: + demographics_df = _self.load_demographics_data() + df = _self.merge_demographics_with_comments(df, demographics_df) + + return df + + except Exception as e: + st.error(f"Error loading dashboard data from Snowflake: {e}") + return pd.DataFrame() + + def _process_dashboard_dataframe(self, df): + """Process lightweight dashboard dataframe (no text columns).""" + df.columns = df.columns.str.lower() + + if 'comment_timestamp' in df.columns: + df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce') + + if 'processed_at' in df.columns: + df['processed_at'] = pd.to_datetime(df['processed_at'], errors='coerce') + + df['sentiment_polarity'] = 
df['sentiment_polarity'].fillna('unknown') + df['intent'] = df['intent'].fillna('unknown') + df['platform'] = df['platform'].fillna('unknown').str.lower() + df['brand'] = df['brand'].fillna('unknown').str.lower() + + if 'requires_reply' in df.columns: + df['requires_reply'] = df['requires_reply'].astype(bool) + + return df + + # ───────────────────────────────────────────────────────────── + # Full data (legacy / kept for compatibility, 24-hour cache) + # ───────────────────────────────────────────────────────────── + + @st.cache_data(ttl=86400) + def load_data(_self, reload=False): + """ + Load full sentiment data (with text). Kept for compatibility. + Prefer load_dashboard_data() for dashboard views. + """ + try: + conn = SnowFlakeConn() + df = conn.run_read_query(_self.query, "sentiment features") + conn.close_connection() + + if df is None or df.empty: + st.error("No data returned from Snowflake") + return pd.DataFrame() + + df = _self._process_dataframe(df) + + if _self.demographics_query: + demographics_df = _self.load_demographics_data() + df = _self.merge_demographics_with_comments(df, demographics_df) + + return df + + except Exception as e: + st.error(f"Error loading data from Snowflake: {e}") + return pd.DataFrame() + + def _process_dataframe(self, df): + """Process full dataframe including vectorized display_text computation.""" + df.columns = df.columns.str.lower() + + if 'comment_timestamp' in df.columns: + df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce') + + if 'processed_at' in df.columns: + df['processed_at'] = pd.to_datetime(df['processed_at'], errors='coerce') + + df['sentiment_polarity'] = df['sentiment_polarity'].fillna('unknown') + df['intent'] = df['intent'].fillna('unknown') + df['platform'] = df['platform'].fillna('unknown').str.lower() + df['brand'] = df['brand'].fillna('unknown').str.lower() + + if 'requires_reply' in df.columns: + df['requires_reply'] = df['requires_reply'].astype(bool) + + # Vectorized 
display_text: use translated_text when non-English and available + if 'translated_text' in df.columns and 'is_english' in df.columns: + mask_translate = (df['is_english'] == False) & df['translated_text'].notna() + df['display_text'] = df.get('original_text', pd.Series('', index=df.index)).fillna('') + df.loc[mask_translate, 'display_text'] = df.loc[mask_translate, 'translated_text'] + elif 'original_text' in df.columns: + df['display_text'] = df['original_text'].fillna('') + else: + df['display_text'] = '' + + # Vectorized short version + text = df['display_text'].astype(str) + df['display_text_short'] = text.where(text.str.len() <= 100, text.str[:100] + '...') + + return df + + # ───────────────────────────────────────────────────────────── + # Sentiment Analysis page data (on-demand, 24-hour cache) + # ───────────────────────────────────────────────────────────── + + def load_sa_data(self, platform, brand, top_n=10, min_comments=10, + sort_by='severity_score', sentiments=None, intents=None, + date_range=None): + """ + Load Sentiment Analysis page data: + 1. Content aggregation stats for top-N contents + 2. 
Sampled comments (up to 50 neg + 50 pos + 50 other per content) + + Args: + platform: Selected platform string + brand: Selected brand string + top_n: Max number of contents to return + min_comments: Minimum comment threshold for inclusion + sort_by: 'severity_score' | 'sentiment_percentage' | 'sentiment_count' | 'total_comments' + sentiments: List of sentiments to filter by (dominant_sentiment) + intents: List of intents to filter by + date_range: Tuple (start_date, end_date) or None + + Returns: + tuple: (contents_df, comments_df) + """ + sentiments_key = tuple(sorted(sentiments)) if sentiments else () + intents_key = tuple(sorted(intents)) if intents else () + date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else () + + return self._fetch_sa_data( + platform, brand, top_n, min_comments, sort_by, + sentiments_key, intents_key, date_key + ) + + @st.cache_data(ttl=86400) + def _fetch_sa_data(_self, platform, brand, top_n, min_comments, sort_by, + sentiments, intents, date_range): + """Cached SA data fetch — returns (contents_df, comments_df).""" + try: + conn = SnowFlakeConn() + + # Step 1: content-level aggregation query + content_query = _self._build_sa_content_query( + platform, brand, min_comments, sort_by, date_range + ) + contents_df = conn.run_read_query(content_query, "SA content aggregation") + + if contents_df is None or contents_df.empty: + conn.close_connection() + return pd.DataFrame(), pd.DataFrame() + + # columns already lowercased by run_read_query + contents_df = _self._process_sa_content_stats(contents_df) + + # Python-side sentiment filter (dominant_sentiment) + if sentiments: + contents_df = contents_df[contents_df['dominant_sentiment'].isin(sentiments)] + + # Limit to top_n after Python-side filtering + contents_df = contents_df.head(top_n) + + if contents_df.empty: + conn.close_connection() + return pd.DataFrame(), pd.DataFrame() + + # Step 2: sampled comments with text for those content_sks + 
content_sk_list = contents_df['content_sk'].tolist() + comments_query = _self._build_sa_comments_query( + platform, brand, content_sk_list, date_range + ) + comments_df = conn.run_read_query(comments_query, "SA sampled comments") + conn.close_connection() + + if comments_df is not None and not comments_df.empty: + comments_df = _self._process_sa_comments(comments_df) + + # Python-side intent filter — keep only content_sks that have + # at least one comment matching any selected intent + if intents: + pattern = '|'.join(re.escape(i) for i in intents) + valid_sks = comments_df[ + comments_df['intent'].str.contains(pattern, na=False, case=False) + ]['content_sk'].unique() + contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)] + comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)] + else: + comments_df = pd.DataFrame() + + return contents_df, comments_df + + except Exception as e: + st.error(f"Error loading SA data: {e}") + return pd.DataFrame(), pd.DataFrame() + + def _build_sa_content_query(self, platform, brand, min_comments, sort_by, date_range): + """Build dynamic SQL for content-level aggregation (no text columns).""" + # Build table-qualified date clauses to avoid ambiguity when a JOIN is present + social_date_clause = self._build_date_clause(date_range, table_alias='s') + musora_date_clause = self._build_date_clause(date_range) + + safe_brand = self._sanitize_value(brand.lower()) + safe_platform = self._sanitize_value(platform.lower()) + + sort_exprs = { + 'severity_score': ( + "(SUM(CASE WHEN SENTIMENT_POLARITY IN ('negative','very_negative') THEN 1 ELSE 0 END)" + " * 100.0 / COUNT(*)) * SQRT(COUNT(*))" + ), + 'sentiment_percentage': ( + "SUM(CASE WHEN SENTIMENT_POLARITY IN ('negative','very_negative') THEN 1 ELSE 0 END)" + " * 100.0 / COUNT(*)" + ), + 'sentiment_count': ( + "SUM(CASE WHEN SENTIMENT_POLARITY IN ('negative','very_negative') THEN 1 ELSE 0 END)" + ), + 'total_comments': "COUNT(*)", + } + sort_expr = 
sort_exprs.get(sort_by, sort_exprs['severity_score']) + + parts = [] + + if platform != 'musora_app': + parts.append(f""" + SELECT + s.COMMENT_SK, s.CONTENT_SK, s.CONTENT_DESCRIPTION, + c.PERMALINK_URL, CAST(NULL AS VARCHAR) AS THUMBNAIL_URL, + s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.COMMENT_TIMESTAMP + FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s + LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK + WHERE LOWER(s.CHANNEL_NAME) = '{safe_brand}' + AND LOWER(s.PLATFORM) = '{safe_platform}' + {social_date_clause} + """) + + if platform == 'musora_app': + parts.append(f""" + SELECT + COMMENT_SK, CONTENT_SK, CONTENT_DESCRIPTION, + PERMALINK_URL, THUMBNAIL_URL, + SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, COMMENT_TIMESTAMP + FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES + WHERE LOWER(CHANNEL_NAME) = '{safe_brand}' + {musora_date_clause} + """) + + combined = " UNION ALL ".join(parts) + + return f""" + WITH combined AS ({combined}) + SELECT + CONTENT_SK, + MAX(CONTENT_DESCRIPTION) AS CONTENT_DESCRIPTION, + MAX(PERMALINK_URL) AS PERMALINK_URL, + MAX(THUMBNAIL_URL) AS THUMBNAIL_URL, + COUNT(*) AS TOTAL_COMMENTS, + SUM(CASE WHEN REQUIRES_REPLY THEN 1 ELSE 0 END) AS REPLY_REQUIRED_COUNT, + SUM(CASE WHEN SENTIMENT_POLARITY = 'very_negative' THEN 1 ELSE 0 END) AS VERY_NEGATIVE_COUNT, + SUM(CASE WHEN SENTIMENT_POLARITY = 'negative' THEN 1 ELSE 0 END) AS NEGATIVE_COUNT_RAW, + SUM(CASE WHEN SENTIMENT_POLARITY = 'neutral' THEN 1 ELSE 0 END) AS NEUTRAL_COUNT, + SUM(CASE WHEN SENTIMENT_POLARITY = 'positive' THEN 1 ELSE 0 END) AS POSITIVE_COUNT_RAW, + SUM(CASE WHEN SENTIMENT_POLARITY = 'very_positive' THEN 1 ELSE 0 END) AS VERY_POSITIVE_COUNT + FROM combined + GROUP BY CONTENT_SK + HAVING COUNT(*) >= {int(min_comments)} + ORDER BY {sort_expr} DESC + """ + + def _process_sa_content_stats(self, df): + """ + Derive all columns expected by the existing SA page UI from the + raw content-aggregation result. 
+ """ + df['negative_count'] = df['very_negative_count'] + df['negative_count_raw'] + df['positive_count'] = df['positive_count_raw'] + df['very_positive_count'] + + df['negative_percentage'] = ( + df['negative_count'] / df['total_comments'] * 100 + ).round(2) + df['positive_percentage'] = ( + df['positive_count'] / df['total_comments'] * 100 + ).round(2) + + df['severity_score'] = ( + df['negative_percentage'] * (df['total_comments'] ** 0.5) + ).round(2) + + # These mirror the columns produced by get_sentiment_filtered_contents() + df['dynamic_severity_score'] = df['severity_score'] + df['selected_sentiment_count'] = df['negative_count'] + df['selected_sentiment_percentage'] = df['negative_percentage'] + + # Dominant sentiment = sentiment with the highest count + sentiment_cols = pd.DataFrame({ + 'very_negative': df['very_negative_count'], + 'negative': df['negative_count_raw'], + 'neutral': df['neutral_count'], + 'positive': df['positive_count_raw'], + 'very_positive': df['very_positive_count'], + }) + df['dominant_sentiment'] = sentiment_cols.idxmax(axis=1) + + return df + + def _build_sa_comments_query(self, platform, brand, content_sk_list, date_range): + """ + Build SQL for sampled comments for a list of content_sks. + Samples up to 50 per (content_sk, sentiment_group) — neg, pos, other. + display_text is computed in SQL (no need to fetch both original + translated). + """ + # Qualified date clauses: social media query has a JOIN so needs s. 
prefix + social_date_clause = self._build_date_clause(date_range, table_alias='s') + musora_date_clause = self._build_date_clause(date_range) + safe_brand = self._sanitize_value(brand.lower()) + content_sks_str = ", ".join(f"'{self._sanitize_value(str(sk))}'" for sk in content_sk_list) + + parts = [] + + if platform != 'musora_app': + parts.append(f""" + SELECT + s.COMMENT_SK, s.COMMENT_ID, s.CONTENT_SK, s.CONTENT_DESCRIPTION, + CASE WHEN s.IS_ENGLISH = FALSE AND s.TRANSLATED_TEXT IS NOT NULL + THEN s.TRANSLATED_TEXT ELSE s.ORIGINAL_TEXT END AS DISPLAY_TEXT, + s.ORIGINAL_TEXT, + LOWER(s.PLATFORM) AS PLATFORM, + LOWER(s.CHANNEL_NAME) AS BRAND, + s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, + s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT, + s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.IS_ENGLISH, + c.PERMALINK_URL + FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s + LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK + WHERE s.CONTENT_SK IN ({content_sks_str}) + AND LOWER(s.CHANNEL_NAME) = '{safe_brand}' + {social_date_clause} + """) + + if platform == 'musora_app': + parts.append(f""" + SELECT + COMMENT_SK, COMMENT_ID, CONTENT_SK, CONTENT_DESCRIPTION, + CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL + THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END AS DISPLAY_TEXT, + ORIGINAL_TEXT, + 'musora_app' AS PLATFORM, + LOWER(CHANNEL_NAME) AS BRAND, + COMMENT_TIMESTAMP, AUTHOR_NAME, + DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT, + REQUIRES_REPLY, SENTIMENT_CONFIDENCE, IS_ENGLISH, + PERMALINK_URL + FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES + WHERE CONTENT_SK IN ({content_sks_str}) + AND LOWER(CHANNEL_NAME) = '{safe_brand}' + {musora_date_clause} + """) + + combined = " UNION ALL ".join(parts) + + return f""" + WITH combined AS ({combined}) + SELECT * + FROM combined + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY CONTENT_SK, + CASE + WHEN SENTIMENT_POLARITY IN ('negative', 'very_negative') THEN 'neg' + WHEN 
SENTIMENT_POLARITY IN ('positive', 'very_positive') THEN 'pos' + ELSE 'other' + END + ORDER BY + CASE SENTIMENT_POLARITY + WHEN 'very_negative' THEN 1 WHEN 'negative' THEN 2 + WHEN 'very_positive' THEN 1 WHEN 'positive' THEN 2 + ELSE 3 + END, + RANDOM() + ) <= 50 + """ + + def _process_sa_comments(self, df): + """Process sampled comments dataframe for the SA page.""" + if 'comment_timestamp' in df.columns: + df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce') + + df['sentiment_polarity'] = df['sentiment_polarity'].fillna('unknown') + df['intent'] = df['intent'].fillna('unknown') + df['platform'] = df['platform'].fillna('unknown').str.lower() + + if 'requires_reply' in df.columns: + df['requires_reply'] = df['requires_reply'].astype(bool) + + # display_text already computed in SQL; create short version vectorized + if 'display_text' in df.columns: + text = df['display_text'].astype(str) + df['display_text_short'] = text.where( + text.str.len() <= 100, text.str[:100] + '...' + ) + + return df + + # ───────────────────────────────────────────────────────────── + # Reply Required page data (on-demand, 24-hour cache) + # ───────────────────────────────────────────────────────────── + + def load_reply_required_data(self, platforms=None, brands=None, date_range=None): + """ + Load comments requiring reply, filtered by platform/brand/date. 
+ + Args: + platforms: List of platform strings (or None for all) + brands: List of brand strings (or None for all) + date_range: Tuple (start_date, end_date) or None + + Returns: + pd.DataFrame + """ + platforms_key = tuple(sorted(platforms)) if platforms else () + brands_key = tuple(sorted(brands)) if brands else () + date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else () + + return self._fetch_rr_data(platforms_key, brands_key, date_key) + + @st.cache_data(ttl=86400) + def _fetch_rr_data(_self, platforms, brands, date_range): + """Cached Reply Required data fetch.""" + try: + query = _self._build_rr_query(platforms, brands, date_range) + if not query: + return pd.DataFrame() + + conn = SnowFlakeConn() + df = conn.run_read_query(query, "reply required comments") + conn.close_connection() + + if df is None or df.empty: + return pd.DataFrame() + + # columns already lowercased by run_read_query + if 'comment_timestamp' in df.columns: + df['comment_timestamp'] = pd.to_datetime(df['comment_timestamp'], errors='coerce') + + df['sentiment_polarity'] = df['sentiment_polarity'].fillna('unknown') + df['intent'] = df['intent'].fillna('unknown') + df['platform'] = df['platform'].fillna('unknown').str.lower() + df['brand'] = df['brand'].fillna('unknown').str.lower() + + if 'requires_reply' in df.columns: + df['requires_reply'] = df['requires_reply'].astype(bool) + + if 'comment_timestamp' in df.columns: + df = df.sort_values('comment_timestamp', ascending=False) + + # display_text already computed in SQL; create short version + if 'display_text' in df.columns: + text = df['display_text'].astype(str) + df['display_text_short'] = text.where( + text.str.len() <= 100, text.str[:100] + '...' 
+ ) + + return df + + except Exception as e: + st.error(f"Error loading reply required data: {e}") + return pd.DataFrame() + + def _build_rr_query(self, platforms, brands, date_range): + """Build dynamic SQL for the Reply Required page.""" + # Build separate qualified clauses for the social media table (has a JOIN so needs s. prefix) + # and unqualified clauses for the musora table (no JOIN, no ambiguity). + + # Date filters + social_date_clause = "" + musora_date_clause = "" + if date_range and len(date_range) == 2: + social_date_clause = ( + f"AND s.COMMENT_TIMESTAMP >= '{date_range[0]}'" + f" AND s.COMMENT_TIMESTAMP <= '{date_range[1]}'" + ) + musora_date_clause = ( + f"AND COMMENT_TIMESTAMP >= '{date_range[0]}'" + f" AND COMMENT_TIMESTAMP <= '{date_range[1]}'" + ) + + # Brand filters + social_brand_clause = "" + musora_brand_clause = "" + if brands: + brands_str = "', '".join(self._sanitize_value(b.lower()) for b in brands) + social_brand_clause = f"AND LOWER(s.CHANNEL_NAME) IN ('{brands_str}')" + musora_brand_clause = f"AND LOWER(CHANNEL_NAME) IN ('{brands_str}')" + + # Determine which source tables to include + include_social = True + include_musora = True + social_platform_clause = "" + + if platforms: + non_musora = [p for p in platforms if p != 'musora_app'] + include_musora = 'musora_app' in platforms + include_social = len(non_musora) > 0 + if non_musora: + plat_str = "', '".join(self._sanitize_value(p.lower()) for p in non_musora) + social_platform_clause = f"AND LOWER(s.PLATFORM) IN ('{plat_str}')" + + if not include_social and not include_musora: + return None + + parts = [] + + if include_social: + parts.append(f""" + SELECT + s.COMMENT_SK, s.COMMENT_ID, s.CONTENT_SK, s.CONTENT_DESCRIPTION, + CASE WHEN s.IS_ENGLISH = FALSE AND s.TRANSLATED_TEXT IS NOT NULL + THEN s.TRANSLATED_TEXT ELSE s.ORIGINAL_TEXT END AS DISPLAY_TEXT, + s.ORIGINAL_TEXT, + LOWER(s.PLATFORM) AS PLATFORM, + LOWER(s.CHANNEL_NAME) AS BRAND, + s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, + 
s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT, + s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.IS_ENGLISH, + c.PERMALINK_URL + FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s + LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK + WHERE s.REQUIRES_REPLY = TRUE + {social_platform_clause} + {social_brand_clause} + {social_date_clause} + """) + + if include_musora: + parts.append(f""" + SELECT + COMMENT_SK, COMMENT_ID, CONTENT_SK, CONTENT_DESCRIPTION, + CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL + THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END AS DISPLAY_TEXT, + ORIGINAL_TEXT, + 'musora_app' AS PLATFORM, + LOWER(CHANNEL_NAME) AS BRAND, + COMMENT_TIMESTAMP, AUTHOR_NAME, + DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT, + REQUIRES_REPLY, SENTIMENT_CONFIDENCE, IS_ENGLISH, + PERMALINK_URL + FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES + WHERE REQUIRES_REPLY = TRUE + {musora_brand_clause} + {musora_date_clause} + """) + + combined = " UNION ALL ".join(parts) + return f""" + WITH combined AS ({combined}) + SELECT * FROM combined + ORDER BY COMMENT_TIMESTAMP DESC + """ + + # ───────────────────────────────────────────────────────────── + # Demographics (24-hour cache) + # ───────────────────────────────────────────────────────────── + + @st.cache_data(ttl=86400) + def load_demographics_data(_self): + """Load user demographic data from Snowflake.""" + if not _self.demographics_query: + return pd.DataFrame() + + try: + conn = SnowFlakeConn() + query_with_cast = _self.demographics_query.replace( + "u.birthday as BIRTHDAY", + "TO_VARCHAR(u.birthday, 'YYYY-MM-DD HH24:MI:SS.FF6 TZHTZM') as BIRTHDAY" + ) + df = conn.run_read_query(query_with_cast, "user demographics") + conn.close_connection() + + if df is None or df.empty: + return pd.DataFrame() + + return _self._process_demographics_dataframe(df) + + except Exception as e: + st.warning(f"Could not load demographic data: {str(e)}") + return 
pd.DataFrame() + + def _process_demographics_dataframe(self, df): + """Process and enrich demographic dataframe.""" + df.columns = df.columns.str.lower() + + if 'birthday' in df.columns: + df['birthday'] = df['birthday'].astype(str) + df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce', utc=True) + df['birthday'] = df['birthday'].dt.tz_localize(None) + df['age'] = df['birthday'].apply(self._calculate_age) + df['age_group'] = df['age'].apply(self._categorize_age) + + if 'timezone' in df.columns: + df['timezone_region'] = df['timezone'].apply(self._extract_timezone_region) + + if 'experience_level' in df.columns: + df['experience_group'] = df['experience_level'].apply(self._categorize_experience) + + if 'user_id' in df.columns: + df = df[df['user_id'].notna()] + + return df + + @staticmethod + def _calculate_age(birthday): + if pd.isna(birthday): + return None + try: + age = relativedelta(datetime.now(), birthday).years + return age if 0 <= age <= 120 else None + except Exception: + return None + + def _categorize_age(self, age): + if pd.isna(age) or age is None: + return 'Unknown' + for group_name, (min_age, max_age) in self.config.get('demographics', {}).get('age_groups', {}).items(): + if min_age <= age <= max_age: + return group_name + return 'Unknown' + + @staticmethod + def _extract_timezone_region(timezone): + if pd.isna(timezone) or not isinstance(timezone, str): + return 'Unknown' + parts = timezone.split('/') + return parts[0] if parts else 'Unknown' + + def _categorize_experience(self, experience_level): + if pd.isna(experience_level): + return 'Unknown' + try: + exp_level = float(experience_level) + except Exception: + return 'Unknown' + for group_name, (min_exp, max_exp) in self.config.get('demographics', {}).get('experience_groups', {}).items(): + if min_exp <= exp_level <= max_exp: + return group_name + return 'Unknown' + + def merge_demographics_with_comments(self, comments_df, demographics_df): + """Merge demographic data with comment 
data for musora_app platform only.""" + if demographics_df.empty: + for col, val in [('age', None), ('age_group', 'Unknown'), + ('timezone', None), ('timezone_region', 'Unknown'), + ('experience_level', None), ('experience_group', 'Unknown')]: + comments_df[col] = val + return comments_df + + if 'author_id' in comments_df.columns and 'user_id' in demographics_df.columns: + comments_df = comments_df.copy() + comments_df['author_id_str'] = comments_df['author_id'].astype(str) + demographics_df['user_id_str'] = demographics_df['user_id'].astype(str) + + merged_df = comments_df.merge( + demographics_df[['user_id_str', 'age', 'age_group', 'timezone', + 'timezone_region', 'experience_level', 'experience_group']], + left_on='author_id_str', + right_on='user_id_str', + how='left' + ) + merged_df.drop(columns=['author_id_str', 'user_id_str'], errors='ignore', inplace=True) + + for col in ['age_group', 'timezone_region', 'experience_group']: + if col in merged_df.columns: + merged_df[col] = merged_df[col].fillna('Unknown') + + return merged_df + + return comments_df + + # ───────────────────────────────────────────────────────────── + # Filter helpers + # ───────────────────────────────────────────────────────────── + + @staticmethod + def get_filter_options(df): + """Get unique values for sidebar filters.""" + return { + 'platforms': sorted(df['platform'].unique().tolist()), + 'brands': sorted(df['brand'].unique().tolist()), + 'sentiments': sorted(df['sentiment_polarity'].unique().tolist()), + 'languages': sorted(df['detected_language'].dropna().unique().tolist()) + if 'detected_language' in df.columns else [] + } + + @staticmethod + def apply_filters(df, platforms=None, brands=None, sentiments=None, + date_range=None, languages=None): + """Apply sidebar filters to a dataframe (no copy — boolean indexing only).""" + filtered_df = df + + if platforms: + filtered_df = filtered_df[filtered_df['platform'].isin(platforms)] + + if brands: + filtered_df = 
filtered_df[filtered_df['brand'].isin(brands)] + + if sentiments: + filtered_df = filtered_df[filtered_df['sentiment_polarity'].isin(sentiments)] + + if languages: + filtered_df = filtered_df[filtered_df['detected_language'].isin(languages)] + + if date_range and len(date_range) == 2 and 'comment_timestamp' in filtered_df.columns: + start_date, end_date = date_range + filtered_df = filtered_df[ + (filtered_df['comment_timestamp'] >= pd.Timestamp(start_date)) & + (filtered_df['comment_timestamp'] <= pd.Timestamp(end_date)) + ] + + return filtered_df + + @staticmethod + def get_date_range(df, default_days=30): + """Get default date range from dataframe.""" + if 'comment_timestamp' in df.columns and not df.empty: + max_date = df['comment_timestamp'].max() + min_date = max_date - timedelta(days=default_days) + return (min_date, max_date) + return (datetime.now() - timedelta(days=default_days), datetime.now()) + + # ───────────────────────────────────────────────────────────── + # Internal helpers + # ───────────────────────────────────────────────────────────── + + @staticmethod + def _sanitize_value(value): + """Remove characters that could break SQL string literals.""" + return re.sub(r"['\";\\]", '', str(value)) + + @staticmethod + def _build_date_clause(date_range, table_alias=None): + """ + Build a SQL AND COMMENT_TIMESTAMP ... clause, or empty string. + + Args: + date_range: tuple of (start, end) or None + table_alias: optional table alias prefix (e.g. 
's') to avoid + ambiguous column errors when a JOIN is present + """ + if date_range and len(date_range) == 2: + col = f"{table_alias}.COMMENT_TIMESTAMP" if table_alias else "COMMENT_TIMESTAMP" + return f"AND {col} >= '{date_range[0]}' AND {col} <= '{date_range[1]}'" + return "" \ No newline at end of file diff --git a/visualization/img/musora.png b/visualization/img/musora.png new file mode 100644 index 0000000000000000000000000000000000000000..941dd210827ae56cb1d5bf08948b04c125d97e79 Binary files /dev/null and b/visualization/img/musora.png differ diff --git a/visualization/requirements.txt b/visualization/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ac9081aa3a574cb920f02aae6475bbb64fc5df2 --- /dev/null +++ b/visualization/requirements.txt @@ -0,0 +1,33 @@ +# Brand Sentiment - Visualization & Processing Requirements +# Install with: pip install -r requirements.txt + +# Core visualization +streamlit==1.50.0 +plotly==6.3.1 + +# Data processing +pandas==2.3.2 +numpy==2.0.2 +python-dateutil==2.9.0.post0 + +# Snowflake connectivity +snowflake-snowpark-python==1.39.0 + +# Environment management +python-dotenv==1.1.1 + +# AI / LLM (visualization agents + processing pipeline) +openai==1.108.0 +langchain==0.3.27 +langchain-openai==0.3.34 +langgraph==0.6.8 + +# Language detection (processing pipeline) +lingua-language-detector==2.0.2 + +# HTML parsing (processing pipeline) +beautifulsoup4==4.14.3 + +# PDF report generation +fpdf2==2.8.4 +kaleido==1.2.0 diff --git a/visualization/utils/auth.py b/visualization/utils/auth.py new file mode 100644 index 0000000000000000000000000000000000000000..05f0437f3d558040691f32b7bddeafa82e7e4227 --- /dev/null +++ b/visualization/utils/auth.py @@ -0,0 +1,121 @@ +""" +Authentication module for the Musora Sentiment Analysis Dashboard. + +Handles user authentication and access control. +Works both locally (loading .env) and on Hugging Face / cloud (using secrets). 
+""" + +import os +import streamlit as st +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file. +# On cloud deployments, env vars are set as secrets and os.getenv works directly. +# Locally, we load from .env at the project root. +_env_path = Path(__file__).resolve().parent.parent.parent / '.env' +if _env_path.exists(): + load_dotenv(_env_path) + +# Authorized emails — team members only. +AUTHORIZED_EMAILS = { + "danial@musora.com", + "caleb@musora.com", + "gabriel@musora.com", + "jmilligan@musora.com", + "dave@musora.com", +} + + +def get_valid_token() -> str: + """ + Get the valid access token from environment. + + Returns: + str: Valid access token (empty string if not set). + """ + return os.getenv("APP_TOKEN", "") + + +def verify_login(email: str, token: str) -> bool: + """ + Verify user login credentials. + + Args: + email: User email address. + token: Access token. + + Returns: + bool: True if credentials are valid, False otherwise. + """ + valid_token = get_valid_token() + email_normalized = email.lower().strip() + return (email_normalized in AUTHORIZED_EMAILS) and (token == valid_token) + + +def check_authentication() -> bool: + """ + Check if the user is authenticated in the current session. + + Returns: + bool: True if authenticated, False otherwise. + """ + return st.session_state.get("authenticated", False) + + +def get_current_user() -> str: + """ + Get the currently logged-in user's email. + + Returns: + str: User email or empty string if not authenticated. + """ + return st.session_state.get("user_email", "") + + +def logout() -> None: + """ + Log out the current user by clearing auth-related session state. + """ + for key in ("authenticated", "user_email"): + st.session_state.pop(key, None) + + +def render_login_page() -> None: + """ + Render the login page UI and halt execution until authenticated. + + Call this at the top of app.py before any other page logic. 
+ Uses st.stop() to prevent the rest of the app from running. + """ + st.title("Musora Sentiment Analysis Dashboard") + + st.markdown(""" + Welcome to the **Musora Sentiment Analysis Dashboard**. + + This tool is restricted to authorized Musora team members. + Please enter your credentials below to access the dashboard. + """) + + with st.form("login_form"): + email = st.text_input( + "Email Address", + placeholder="your.name@musora.com", + ) + token = st.text_input( + "Access Token", + type="password", + placeholder="Enter your access token", + ) + submitted = st.form_submit_button("Login", use_container_width=True) + + if submitted: + if verify_login(email, token): + st.session_state["authenticated"] = True + st.session_state["user_email"] = email.lower().strip() + st.success("Login successful! Redirecting…") + st.rerun() + else: + st.error("Invalid email or access token. Please try again.") + + st.stop() diff --git a/visualization/utils/data_processor.py b/visualization/utils/data_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..0d08264258a47dbc1e68f9d91acdf6f2061c2080 --- /dev/null +++ b/visualization/utils/data_processor.py @@ -0,0 +1,604 @@ +""" +Data processing utilities for sentiment analysis +Handles aggregation, grouping, and transformation operations +""" +import pandas as pd +import numpy as np +from typing import List, Dict, Tuple + + +class SentimentDataProcessor: + """ + Processes sentiment data for visualization + """ + + @staticmethod + def aggregate_by_dimensions(df, group_by_cols, agg_cols=None): + """ + Aggregate data by specified dimensions + + Args: + df: Sentiment dataframe + group_by_cols: List of columns to group by + agg_cols: Dictionary of columns and aggregation functions + + Returns: + pd.DataFrame: Aggregated dataframe + """ + if agg_cols is None: + agg_cols = { + 'comment_sk': 'count', + 'requires_reply': 'sum' + } + + return df.groupby(group_by_cols, as_index=False).agg(agg_cols) + + @staticmethod + 
def get_sentiment_distribution(df, group_by=None): + """ + Calculate sentiment distribution + + Args: + df: Sentiment dataframe + group_by: Optional column(s) to group by + + Returns: + pd.DataFrame: Sentiment distribution + """ + if group_by: + # Group by specified columns and sentiment + if isinstance(group_by, str): + group_by = [group_by] + + sentiment_counts = df.groupby( + group_by + ['sentiment_polarity'], + as_index=False + ).size().rename(columns={'size': 'count'}) + + # Calculate percentages within each group + sentiment_counts['percentage'] = sentiment_counts.groupby(group_by)['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + + else: + # Overall sentiment distribution + sentiment_counts = df['sentiment_polarity'].value_counts().reset_index() + sentiment_counts.columns = ['sentiment_polarity', 'count'] + sentiment_counts['percentage'] = ( + sentiment_counts['count'] / sentiment_counts['count'].sum() * 100 + ).round(2) + + return sentiment_counts + + @staticmethod + def get_intent_distribution(df, group_by=None): + """ + Calculate intent distribution (handles multi-label) + + Args: + df: Sentiment dataframe + group_by: Optional column(s) to group by + + Returns: + pd.DataFrame: Intent distribution + """ + # Explode intents (split comma-separated values) + df_exploded = df.copy() + df_exploded['intent'] = df_exploded['intent'].str.split(',') + df_exploded = df_exploded.explode('intent') + df_exploded['intent'] = df_exploded['intent'].str.strip() + + if group_by: + # Group by specified columns and intent + if isinstance(group_by, str): + group_by = [group_by] + + intent_counts = df_exploded.groupby( + group_by + ['intent'], + as_index=False + ).size().rename(columns={'size': 'count'}) + + # Calculate percentages within each group + intent_counts['percentage'] = intent_counts.groupby(group_by)['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + + else: + # Overall intent distribution + intent_counts = 
df_exploded['intent'].value_counts().reset_index() + intent_counts.columns = ['intent', 'count'] + intent_counts['percentage'] = ( + intent_counts['count'] / intent_counts['count'].sum() * 100 + ).round(2) + + return intent_counts + + @staticmethod + def get_content_summary(df): + """ + Get summary statistics for each content + + Args: + df: Sentiment dataframe + + Returns: + pd.DataFrame: Content summary with statistics + """ + # Group by content (dropna=False to include records with NULL permalink_url, e.g., YouTube) + content_summary = df.groupby(['content_sk', 'content_description', 'permalink_url'], dropna=False).agg({ + 'comment_sk': 'count', + 'requires_reply': 'sum', + 'sentiment_polarity': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'unknown' + }).reset_index() + + content_summary.columns = [ + 'content_sk', 'content_description', 'permalink_url', + 'total_comments', 'reply_required_count', 'dominant_sentiment' + ] + + # Calculate negative sentiment percentage for each content + negative_sentiments = ['negative', 'very_negative'] + content_negative = df[df['sentiment_polarity'].isin(negative_sentiments)].groupby( + 'content_sk' + ).size().reset_index(name='negative_count') + + content_summary = content_summary.merge(content_negative, on='content_sk', how='left') + content_summary['negative_count'] = content_summary['negative_count'].fillna(0) + content_summary['negative_percentage'] = ( + content_summary['negative_count'] / content_summary['total_comments'] * 100 + ).round(2) + + # Calculate severity score (balances percentage and volume) + # Formula: negative_percentage * sqrt(total_comments) + # This gives weight to both high negative % and high comment volume + content_summary['severity_score'] = ( + content_summary['negative_percentage'] * + (content_summary['total_comments'] ** 0.5) + ).round(2) + + return content_summary + + @staticmethod + def get_top_poor_sentiment_contents(df, top_n=10, min_comments=1, sort_by='severity_score'): + """ + Get 
contents with highest poor sentiment based on selected criteria + + Args: + df: Sentiment dataframe + top_n: Number of top contents to return + min_comments: Minimum number of comments a content must have to be included + sort_by: Sorting criteria - 'severity_score', 'negative_percentage', 'negative_count', 'total_comments' + + Returns: + pd.DataFrame: Top contents with poor sentiment + """ + content_summary = SentimentDataProcessor.get_content_summary(df) + + # Filter by minimum comments + content_summary = content_summary[content_summary['total_comments'] >= min_comments] + + # Determine sort columns based on sort_by parameter + if sort_by == 'severity_score': + # Sort by severity score (balanced), then by negative percentage as tie-breaker + sort_columns = ['severity_score', 'negative_percentage'] + elif sort_by == 'negative_percentage': + # Sort by negative percentage, then by total comments + sort_columns = ['negative_percentage', 'total_comments'] + elif sort_by == 'negative_count': + # Sort by absolute negative count, then by negative percentage + sort_columns = ['negative_count', 'negative_percentage'] + elif sort_by == 'total_comments': + # Sort by total comments volume + sort_columns = ['total_comments', 'negative_count'] + else: + # Default to severity score + sort_columns = ['severity_score', 'negative_percentage'] + + # Sort and get top N + top_poor = content_summary.sort_values( + by=sort_columns, + ascending=[False, False] + ).head(top_n) + + return top_poor + + @staticmethod + def get_comments_requiring_reply(df): + """ + Get all comments that require reply + + Args: + df: Sentiment dataframe + + Returns: + pd.DataFrame: Comments requiring reply + """ + reply_df = df[df['requires_reply'] == True].copy() + + # Sort by timestamp (most recent first) + if 'comment_timestamp' in reply_df.columns: + reply_df = reply_df.sort_values('comment_timestamp', ascending=False) + + return reply_df + + @staticmethod + def get_platform_brand_summary(df): + """ + Get 
summary statistics by platform and brand + + Args: + df: Sentiment dataframe + + Returns: + pd.DataFrame: Platform and brand summary + """ + summary = df.groupby(['platform', 'brand']).agg({ + 'comment_sk': 'count', + 'requires_reply': 'sum' + }).reset_index() + + summary.columns = ['platform', 'brand', 'total_comments', 'reply_required'] + + # Add sentiment distribution + sentiment_dist = SentimentDataProcessor.get_sentiment_distribution( + df, group_by=['platform', 'brand'] + ) + + # Pivot sentiment distribution + sentiment_pivot = sentiment_dist.pivot_table( + index=['platform', 'brand'], + columns='sentiment_polarity', + values='count', + fill_value=0 + ).reset_index() + + # Merge with summary + summary = summary.merge(sentiment_pivot, on=['platform', 'brand'], how='left') + + return summary + + @staticmethod + def get_temporal_trends(df, freq='D'): + """ + Get temporal trends of sentiment over time + + Args: + df: Sentiment dataframe + freq: Frequency for aggregation ('D'=daily, 'W'=weekly, 'M'=monthly) + + Returns: + pd.DataFrame: Temporal sentiment trends + """ + if 'comment_timestamp' not in df.columns: + return pd.DataFrame() + + df_temporal = df.copy() + df_temporal['date'] = pd.to_datetime(df_temporal['comment_timestamp']).dt.to_period(freq) + + # Aggregate by date and sentiment + trends = df_temporal.groupby(['date', 'sentiment_polarity']).size().reset_index(name='count') + trends['date'] = trends['date'].dt.to_timestamp() + + return trends + + @staticmethod + def calculate_sentiment_score(df): + """ + Calculate weighted sentiment score + + Args: + df: Sentiment dataframe + + Returns: + float: Average sentiment score (-2 to +2) + """ + sentiment_weights = { + 'very_negative': -2, + 'negative': -1, + 'neutral': 0, + 'positive': 1, + 'very_positive': 2 + } + + df['sentiment_score'] = df['sentiment_polarity'].map(sentiment_weights) + return df['sentiment_score'].mean() + + @staticmethod + def get_language_distribution(df): + """ + Get distribution of 
detected languages + + Args: + df: Sentiment dataframe + + Returns: + pd.DataFrame: Language distribution + """ + if 'detected_language' not in df.columns: + return pd.DataFrame() + + lang_dist = df['detected_language'].value_counts().reset_index() + lang_dist.columns = ['language', 'count'] + lang_dist['percentage'] = (lang_dist['count'] / lang_dist['count'].sum() * 100).round(2) + + return lang_dist + + @staticmethod + def get_sentiment_filtered_contents(df, selected_sentiments=None, selected_intents=None, + top_n=10, min_comments=1, sort_by='severity_score'): + """ + Get contents filtered by selected sentiments and intents with dynamic sorting + + Args: + df: Sentiment dataframe + selected_sentiments: List of sentiments to filter by (filters by dominant sentiment) + selected_intents: List of intents to filter by (content must have at least one comment with these intents) + top_n: Number of top contents to return + min_comments: Minimum number of comments a content must have + sort_by: Sorting criteria - 'severity_score', 'sentiment_percentage', 'sentiment_count', 'total_comments' + + Returns: + pd.DataFrame: Filtered and sorted contents + """ + content_summary = SentimentDataProcessor.get_content_summary(df) + + # Filter by minimum comments + content_summary = content_summary[content_summary['total_comments'] >= min_comments] + + # If no sentiments selected, default to all sentiments + if not selected_sentiments: + selected_sentiments = df['sentiment_polarity'].unique().tolist() + + # Filter by dominant sentiment + content_summary = content_summary[content_summary['dominant_sentiment'].isin(selected_sentiments)] + + # Filter by intents if specified + if selected_intents: + # Get content_sks that have at least one comment with the selected intents + content_sks_with_intent = set() + for intent in selected_intents: + matching_contents = df[df['intent'].str.contains(intent, na=False, case=False)]['content_sk'].unique() + 
class SentimentDataProcessor:
    """
    Demographic slicing and aggregation helpers for the sentiment dataframe.

    All methods are static and operate on a dataframe that carries (at least
    some of) the columns: platform, sentiment_polarity, age, age_group,
    timezone, timezone_region, experience_level, experience_group.
    """

    @staticmethod
    def get_demographics_distribution(df, demographic_field, filter_platform='musora_app'):
        """
        Get distribution of a demographic field (only for specified platform)

        Args:
            df: Sentiment dataframe with demographic fields
            demographic_field: Field to analyze ('age_group', 'timezone',
                'timezone_region', 'experience_level', 'experience_group')
            filter_platform: Platform to filter (default: 'musora_app')

        Returns:
            pd.DataFrame: Distribution with count and percentage
        """
        # Filter for specified platform only
        if filter_platform and 'platform' in df.columns:
            df_filtered = df[df['platform'] == filter_platform].copy()
        else:
            df_filtered = df.copy()

        if df_filtered.empty or demographic_field not in df_filtered.columns:
            return pd.DataFrame()

        # Remove 'Unknown' and null values
        df_filtered = df_filtered[
            (df_filtered[demographic_field].notna()) &
            (df_filtered[demographic_field] != 'Unknown')
        ]

        if df_filtered.empty:
            return pd.DataFrame()

        # Count distribution; overwrite the reset_index column names so the
        # result shape is stable across pandas versions.
        distribution = df_filtered[demographic_field].value_counts().reset_index()
        distribution.columns = [demographic_field, 'count']

        # Calculate percentage
        distribution['percentage'] = (
            distribution['count'] / distribution['count'].sum() * 100
        ).round(2)

        # Sort by count descending
        distribution = distribution.sort_values('count', ascending=False)

        return distribution

    @staticmethod
    def get_demographics_by_sentiment(df, demographic_field, filter_platform='musora_app'):
        """
        Get sentiment distribution for each demographic group

        Args:
            df: Sentiment dataframe with demographic fields
            demographic_field: Field to analyze
            filter_platform: Platform to filter (default: 'musora_app')

        Returns:
            pd.DataFrame: Sentiment distribution per demographic group
        """
        # Filter for specified platform only
        if filter_platform and 'platform' in df.columns:
            df_filtered = df[df['platform'] == filter_platform].copy()
        else:
            df_filtered = df.copy()

        if df_filtered.empty or demographic_field not in df_filtered.columns:
            return pd.DataFrame()

        # Remove 'Unknown' and null values
        df_filtered = df_filtered[
            (df_filtered[demographic_field].notna()) &
            (df_filtered[demographic_field] != 'Unknown')
        ]

        if df_filtered.empty:
            return pd.DataFrame()

        # Group by demographic field and sentiment
        sentiment_by_demo = df_filtered.groupby(
            [demographic_field, 'sentiment_polarity'],
            as_index=False
        ).size().rename(columns={'size': 'count'})

        # Calculate percentage within each demographic group
        sentiment_by_demo['percentage'] = sentiment_by_demo.groupby(demographic_field)['count'].transform(
            lambda x: (x / x.sum() * 100).round(2)
        )

        return sentiment_by_demo

    @staticmethod
    def get_top_timezones(df, top_n=15, filter_platform='musora_app'):
        """
        Get top N timezones with most comments

        Args:
            df: Sentiment dataframe with timezone field
            top_n: Number of top timezones to return
            filter_platform: Platform to filter (default: 'musora_app')

        Returns:
            pd.DataFrame: Top timezones with counts
        """
        return SentimentDataProcessor.get_demographics_distribution(
            df, 'timezone', filter_platform
        ).head(top_n)

    @staticmethod
    def get_timezone_regions_distribution(df, filter_platform='musora_app'):
        """
        Get distribution of timezone regions

        Args:
            df: Sentiment dataframe with timezone_region field
            filter_platform: Platform to filter (default: 'musora_app')

        Returns:
            pd.DataFrame: Region distribution with counts
        """
        return SentimentDataProcessor.get_demographics_distribution(
            df, 'timezone_region', filter_platform
        )

    @staticmethod
    def get_experience_level_distribution(df, filter_platform='musora_app', use_groups=False):
        """
        Get distribution of experience levels

        Args:
            df: Sentiment dataframe with experience fields
            filter_platform: Platform to filter (default: 'musora_app')
            use_groups: If True, use grouped experience levels, otherwise use raw values

        Returns:
            pd.DataFrame: Experience distribution
        """
        field = 'experience_group' if use_groups else 'experience_level'
        return SentimentDataProcessor.get_demographics_distribution(
            df, field, filter_platform
        )

    @staticmethod
    def get_demographics_summary(df, filter_platform='musora_app'):
        """
        Get summary statistics for demographic data

        Args:
            df: Sentiment dataframe with demographic fields
            filter_platform: Platform to filter (default: 'musora_app')

        Returns:
            dict: Summary statistics
        """
        # Filter for specified platform only
        if filter_platform and 'platform' in df.columns:
            df_filtered = df[df['platform'] == filter_platform].copy()
        else:
            df_filtered = df.copy()

        if df_filtered.empty:
            return {
                'total_comments': 0,
                'users_with_demographics': 0,
                'avg_age': None,
                'most_common_age_group': 'Unknown',
                'most_common_region': 'Unknown',
                'avg_experience': None
            }

        # BUG FIX: the original indexed 'age'/'timezone'/'experience_level'
        # unconditionally when building this mask, raising KeyError whenever
        # one of those columns was missing — even though the stats below
        # already guard with `in df_with_demo.columns`. Only OR together the
        # columns that actually exist.
        demo_cols = [c for c in ('age', 'timezone', 'experience_level')
                     if c in df_filtered.columns]
        if demo_cols:
            mask = df_filtered[demo_cols[0]].notna()
            for col in demo_cols[1:]:
                mask |= df_filtered[col].notna()
            df_with_demo = df_filtered[mask].copy()
        else:
            # No demographic columns at all -> empty frame with same schema.
            df_with_demo = df_filtered.iloc[0:0].copy()

        summary = {
            'total_comments': len(df_filtered),
            'users_with_demographics': len(df_with_demo),
            'coverage_percentage': round(len(df_with_demo) / len(df_filtered) * 100, 2) if len(df_filtered) > 0 else 0
        }

        # Age statistics (each stat guarded by its own column's presence)
        if 'age' in df_with_demo.columns:
            valid_ages = df_with_demo['age'].dropna()
            summary['avg_age'] = round(valid_ages.mean(), 1) if len(valid_ages) > 0 else None

        if 'age_group' in df_with_demo.columns:
            age_groups = df_with_demo['age_group'].value_counts()
            summary['most_common_age_group'] = age_groups.index[0] if len(age_groups) > 0 else 'Unknown'

        # Timezone statistics
        if 'timezone_region' in df_with_demo.columns:
            regions = df_with_demo[df_with_demo['timezone_region'] != 'Unknown']['timezone_region'].value_counts()
            summary['most_common_region'] = regions.index[0] if len(regions) > 0 else 'Unknown'

        # Experience statistics
        if 'experience_level' in df_with_demo.columns:
            valid_exp = df_with_demo['experience_level'].dropna()
            summary['avg_experience'] = round(valid_exp.mean(), 2) if len(valid_exp) > 0 else None

        if 'experience_group' in df_with_demo.columns:
            exp_groups = df_with_demo['experience_group'].value_counts()
            summary['most_common_experience'] = exp_groups.index[0] if len(exp_groups) > 0 else 'Unknown'

        return summary
class LLMHelper:
    """
    Thin wrapper around the OpenAI chat-completions API.

    Adds retry-with-exponential-backoff, optional forced-JSON responses, and
    a uniform {'success': bool, ...} result envelope.
    """

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
        """
        Initialize LLM helper

        Args:
            model: Model name to use
            temperature: Temperature for generation

        Raises:
            ValueError: when OPENAI_API_KEY is not present in the environment.
        """
        self.model = model
        self.temperature = temperature
        self.api_key = os.getenv('OPENAI_API_KEY')

        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        self.client = OpenAI(api_key=self.api_key)

    def get_completion(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_retries: int = 3,
        json_mode: bool = False
    ) -> Dict[str, Any]:
        """
        Get completion from LLM with retry logic

        Args:
            prompt: User prompt
            system_message: Optional system message
            max_retries: Maximum number of retries
            json_mode: Whether to force JSON response

        Returns:
            Dictionary with response data: on success it carries 'content',
            'model' and token 'usage'; on failure 'error' details.
        """
        # Assemble the chat transcript; the system message (if any) comes first.
        messages = (
            [{"role": "system", "content": system_message}] if system_message else []
        )
        messages.append({"role": "user", "content": prompt})

        for attempt in range(max_retries):
            try:
                request = {
                    "model": self.model,
                    "messages": messages,
                    "temperature": self.temperature,
                    "reasoning_effort": "low",
                    "n": 1,
                }
                if json_mode:
                    # Force the model to emit a single JSON object.
                    request["response_format"] = {"type": "json_object"}

                response = self.client.chat.completions.create(**request)
                content = response.choices[0].message.content

                if json_mode:
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError as e:
                        # A malformed-JSON reply is reported, not retried.
                        return {
                            'success': False,
                            'error': f"Failed to parse JSON response: {str(e)}",
                            'raw_content': content
                        }

                usage = response.usage
                return {
                    'success': True,
                    'content': content,
                    'model': response.model,
                    'usage': {
                        'prompt_tokens': usage.prompt_tokens,
                        'completion_tokens': usage.completion_tokens,
                        'total_tokens': usage.total_tokens
                    }
                }

            except Exception as e:
                if attempt >= max_retries - 1:
                    # Out of retries: surface the last error.
                    return {
                        'success': False,
                        'error': str(e),
                        'error_type': type(e).__name__
                    }
                # Exponential backoff before the next attempt.
                time.sleep(2 ** attempt)

        # Only reachable when max_retries <= 0.
        return {
            'success': False,
            'error': f"Failed after {max_retries} attempts"
        }

    def get_structured_completion(
        self,
        prompt: str,
        system_message: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Get structured JSON completion

        Args:
            prompt: User prompt
            system_message: System message
            max_retries: Maximum retries

        Returns:
            Structured response dictionary
        """
        return self.get_completion(
            prompt=prompt,
            system_message=system_message,
            max_retries=max_retries,
            json_mode=True
        )
class SentimentMetrics:
    """
    Calculates various metrics for sentiment analysis
    """

    # Single source of truth for polarity -> numeric weight. The original
    # duplicated this mapping in three separate methods.
    SENTIMENT_WEIGHTS = {
        'very_negative': -2,
        'negative': -1,
        'neutral': 0,
        'positive': 1,
        'very_positive': 2
    }

    @staticmethod
    def calculate_overall_metrics(df):
        """
        Calculate overall summary metrics

        Args:
            df: Sentiment dataframe (needs 'sentiment_polarity'; uses
                'requires_reply' when present)

        Returns:
            dict: Overall metrics (counts, percentages, average score,
                sentiment distribution)
        """
        total_comments = len(df)
        total_reply_required = df['requires_reply'].sum() if 'requires_reply' in df.columns else 0

        # Sentiment distribution (percentage of each polarity)
        sentiment_dist = df['sentiment_polarity'].value_counts(normalize=True) * 100

        # Average weighted sentiment score (vectorized).
        # Guard: .mean() of an empty series is NaN; report neutral 0.0 instead
        # so downstream formatting never sees NaN.
        scores = df['sentiment_polarity'].map(SentimentMetrics.SENTIMENT_WEIGHTS)
        avg_sentiment_score = scores.mean() if total_comments > 0 else 0.0

        # Negative sentiment percentage
        negative_sentiments = ['negative', 'very_negative']
        negative_pct = (df['sentiment_polarity'].isin(negative_sentiments).sum() / total_comments * 100) if total_comments > 0 else 0

        # Positive sentiment percentage
        positive_sentiments = ['positive', 'very_positive']
        positive_pct = (df['sentiment_polarity'].isin(positive_sentiments).sum() / total_comments * 100) if total_comments > 0 else 0

        return {
            'total_comments': total_comments,
            'total_reply_required': int(total_reply_required),
            'reply_required_pct': (total_reply_required / total_comments * 100) if total_comments > 0 else 0,
            'avg_sentiment_score': avg_sentiment_score,
            'negative_pct': negative_pct,
            'positive_pct': positive_pct,
            'sentiment_distribution': sentiment_dist.to_dict()
        }

    @staticmethod
    def calculate_brand_metrics(df):
        """
        Calculate metrics by brand

        Args:
            df: Sentiment dataframe

        Returns:
            dict: Metrics by brand
        """
        brand_metrics = {}

        for brand in df['brand'].unique():
            brand_df = df[df['brand'] == brand]
            brand_metrics[brand] = SentimentMetrics.calculate_overall_metrics(brand_df)

        return brand_metrics

    @staticmethod
    def calculate_platform_metrics(df):
        """
        Calculate metrics by platform

        Args:
            df: Sentiment dataframe

        Returns:
            dict: Metrics by platform
        """
        platform_metrics = {}

        for platform in df['platform'].unique():
            platform_df = df[df['platform'] == platform]
            platform_metrics[platform] = SentimentMetrics.calculate_overall_metrics(platform_df)

        return platform_metrics

    @staticmethod
    def calculate_content_engagement_score(content_df):
        """
        Calculate engagement score for a content piece

        Score components:
          1. Comment volume (max 30 points, saturating at 100 comments)
          2. Sentiment positivity (max 40 points)
          3. Intent diversity (max 20 points, saturating at 8 unique intents)
          4. Reply requirement rate (max 10 points)

        Args:
            content_df: DataFrame for a single content

        Returns:
            float: Engagement score (0-100)
        """
        if len(content_df) == 0:
            return 0

        comment_count = len(content_df)
        comment_score = min(comment_count / 100 * 30, 30)  # Max 30 points for 100+ comments

        # Sentiment score (max 40 points) — vectorized
        avg_sentiment = content_df['sentiment_polarity'].map(SentimentMetrics.SENTIMENT_WEIGHTS).mean()
        sentiment_score = ((avg_sentiment + 2) / 4) * 40  # Normalize [-2, 2] to 0-40

        # Intent diversity score (max 20 points); 'intent' is a comma-separated list
        unique_intents = content_df['intent'].str.split(',').explode().str.strip().nunique()
        intent_score = min(unique_intents / 8 * 20, 20)  # Max 20 points for 8 unique intents

        # Interaction requirement (max 10 points)
        reply_rate = content_df['requires_reply'].sum() / len(content_df) if len(content_df) > 0 else 0
        interaction_score = reply_rate * 10

        total_score = comment_score + sentiment_score + intent_score + interaction_score
        return round(total_score, 2)

    @staticmethod
    def get_sentiment_health_status(negative_pct):
        """
        Determine health status based on negative sentiment percentage

        Args:
            negative_pct: Percentage of negative sentiments

        Returns:
            tuple: (status, color)
        """
        if negative_pct < 10:
            return ("Excellent", "green")
        elif negative_pct < 20:
            return ("Good", "lightgreen")
        elif negative_pct < 30:
            return ("Fair", "orange")
        elif negative_pct < 50:
            return ("Poor", "darkorange")
        else:
            return ("Critical", "red")

    @staticmethod
    def calculate_intent_priority_score(intent_counts):
        """
        Calculate priority score for different intents

        Args:
            intent_counts: Dictionary of intent counts

        Returns:
            dict: Priority scores for each intent (count * urgency weight)
        """
        # Priority weights (higher = more urgent); unknown intents default to 1.
        priority_weights = {
            'feedback_negative': 5,
            'request': 4,
            'question': 4,
            'suggestion': 3,
            'praise': 2,
            'humor_sarcasm': 1,
            'off_topic': 1,
            'spam_selfpromo': 0
        }

        priority_scores = {}
        for intent, count in intent_counts.items():
            weight = priority_weights.get(intent, 1)
            priority_scores[intent] = count * weight

        return priority_scores

    @staticmethod
    def calculate_response_urgency(df):
        """
        Calculate response urgency metrics

        Args:
            df: Sentiment dataframe (needs 'requires_reply',
                'sentiment_polarity' and 'intent')

        Returns:
            dict: Urgency bucket counts
        """
        reply_required_df = df[df['requires_reply'] == True]

        if len(reply_required_df) == 0:
            return {
                'urgent_count': 0,
                'high_priority_count': 0,
                'medium_priority_count': 0,
                'low_priority_count': 0
            }

        # Classify urgency based on sentiment and intent
        urgent = reply_required_df[
            reply_required_df['sentiment_polarity'].isin(['very_negative', 'negative'])
        ]
        high_priority = reply_required_df[
            (reply_required_df['sentiment_polarity'] == 'neutral') &
            (reply_required_df['intent'].str.contains('feedback_negative|request', na=False))
        ]
        medium_priority = reply_required_df[
            reply_required_df['sentiment_polarity'] == 'positive'
        ]
        low_priority = reply_required_df[
            reply_required_df['sentiment_polarity'] == 'very_positive'
        ]

        return {
            'urgent_count': len(urgent),
            'high_priority_count': len(high_priority),
            'medium_priority_count': len(medium_priority),
            'low_priority_count': len(low_priority)
        }

    @staticmethod
    def calculate_trend_indicator(df, current_period, previous_period, metric='sentiment_score'):
        """
        Calculate trend indicator comparing two periods

        Args:
            df: Sentiment dataframe
            current_period: Tuple of (start_date, end_date) for current period
            previous_period: Tuple of (start_date, end_date) for previous period
            metric: 'sentiment_score' or anything else (falls back to volume)

        Returns:
            dict: Trend information ('trend', 'change' and, when computable,
                'current_value'/'previous_value')
        """
        if 'comment_timestamp' not in df.columns:
            return {'trend': 'stable', 'change': 0}

        # Filter data for each (inclusive) period
        current_df = df[
            (df['comment_timestamp'] >= pd.Timestamp(current_period[0])) &
            (df['comment_timestamp'] <= pd.Timestamp(current_period[1]))
        ]
        previous_df = df[
            (df['comment_timestamp'] >= pd.Timestamp(previous_period[0])) &
            (df['comment_timestamp'] <= pd.Timestamp(previous_period[1]))
        ]

        if len(current_df) == 0 or len(previous_df) == 0:
            return {'trend': 'stable', 'change': 0}

        # Calculate metric for each period
        if metric == 'sentiment_score':
            current_value = current_df['sentiment_polarity'].map(SentimentMetrics.SENTIMENT_WEIGHTS).mean()
            previous_value = previous_df['sentiment_polarity'].map(SentimentMetrics.SENTIMENT_WEIGHTS).mean()
        else:
            current_value = len(current_df)
            previous_value = len(previous_df)

        # Relative change in percent (0 when the baseline is zero)
        change = ((current_value - previous_value) / previous_value * 100) if previous_value != 0 else 0

        # Determine trend; changes under 5% count as stable
        if abs(change) < 5:
            trend = 'stable'
        elif change > 0:
            trend = 'improving' if metric == 'sentiment_score' else 'increasing'
        else:
            trend = 'declining' if metric == 'sentiment_score' else 'decreasing'

        return {
            'trend': trend,
            'change': round(change, 2),
            'current_value': round(current_value, 2),
            'previous_value': round(previous_value, 2)
        }
" + "All findings are based on comments processed through the AI sentiment analysis pipeline." + ), + "sentiment": ( + "Every comment is assigned one of five sentiment levels: " + "Very Positive, Positive, Neutral, Negative, or Very Negative. " + "The pie chart shows how those levels split across all analyzed comments. " + "The Sentiment Score (0-100) converts the average rating to a percentage scale: " + "50 = perfectly neutral, above 60 = primarily positive." + ), + "brand": ( + "Sentiment broken down by Musora brand (Drumeo, Pianote, Guitareo, Singeo, etc.). " + "Shows both the count and percentage of each sentiment level per brand, " + "helping identify which brands receive the most positive or negative feedback." + ), + "platform": ( + "Sentiment broken down by platform (Facebook, Instagram, YouTube, Twitter, Musora App). " + "Helps compare audience sentiment across channels." + ), + "intent": ( + "Beyond positive/negative, the AI identifies the intent behind each comment: " + "praise, questions, requests, feedback, suggestions, humor, off-topic, or spam. " + "Understanding intent helps prioritize community management." + ), + "cross_dimensional": ( + "Cross-dimensional analysis reveals patterns across both brand and platform simultaneously. " + "The heatmaps show comment volume and negative sentiment concentration by combination." + ), + "volume": ( + "Volume analysis shows the distribution of comments across platforms and brands, " + "indicating where the most community engagement is happening." + ), + "reply_requirements": ( + "Comments flagged as requiring a reply, broken down by brand and platform. " + "The urgency breakdown helps prioritize community management resources." + ), + "demographics": ( + "Demographics data is available for Musora App comments and is derived from user profiles. " + "Note: These charts reflect only users who have filled in their profile information - " + "they do not represent all community members." 
+ ), + "language": ( + "Language distribution shows what languages comments are written in. " + "Non-English comments are automatically translated for analysis." + ), +} + +# --------------------------------------------------------------------------- +# Musora brand colours +# --------------------------------------------------------------------------- +_PRIMARY_HEX = "#1982C4" +_PRIMARY_RGB = (25, 130, 196) + + +# --------------------------------------------------------------------------- +# PDF document class +# --------------------------------------------------------------------------- + +class MusoraPDF(FPDF): + """Custom FPDF subclass with Musora branding and layout helpers.""" + + PRIMARY = _PRIMARY_RGB + WHITE = (255, 255, 255) + GRAY = (180, 180, 180) + LIGHT_GRAY = (240, 240, 240) + + def __init__(self): + super().__init__(orientation="P", unit="mm", format="A4") + self.set_auto_page_break(auto=True, margin=20) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + @staticmethod + def _sanitize(text: str) -> str: + """Strip characters outside Latin-1 (required by the Helvetica font).""" + if not isinstance(text, str): + text = str(text) + return text.encode("latin-1", errors="ignore").decode("latin-1") + + # ------------------------------------------------------------------ + # FPDF overrides + # ------------------------------------------------------------------ + + def header(self): + if self.page_no() > 1: + self.set_font("Helvetica", "B", 8) + self.set_text_color(*self.GRAY) + self.cell(0, 6, "Musora Sentiment Analysis Report", align="L") + self.cell( + 0, 6, f"Page {self.page_no()}", align="R", + new_x="LMARGIN", new_y="NEXT", + ) + self.set_draw_color(*self.PRIMARY) + self.set_line_width(0.5) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(4) + + def footer(self): + self.set_y(-15) + self.set_font("Helvetica", "I", 7) + 
self.set_text_color(*self.GRAY) + self.cell( + 0, 10, + f"Generated on {datetime.now().strftime('%Y-%m-%d %H:%M')} | Confidential", + align="C", + ) + + # ------------------------------------------------------------------ + # Layout primitives + # ------------------------------------------------------------------ + + def check_page_break(self, needed_mm: float) -> None: + """Add a page break if less than *needed_mm* mm remain on the page.""" + if self.get_y() + needed_mm > self.h - 20: + self.add_page() + + def section_header(self, title: str) -> None: + """Bold, brand-coloured section heading with an underline rule.""" + title = self._sanitize(title) + self.check_page_break(20) + self.ln(4) + self.set_font("Helvetica", "B", 14) + self.set_text_color(*self.PRIMARY) + self.cell(0, 10, title, new_x="LMARGIN", new_y="NEXT") + self.set_draw_color(*self.PRIMARY) + self.set_line_width(0.3) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(3) + self.set_text_color(0, 0, 0) + + def subsection_header(self, title: str) -> None: + """Lighter subsection heading.""" + title = self._sanitize(title) + self.check_page_break(15) + self.ln(2) + self.set_font("Helvetica", "B", 11) + self.set_text_color(60, 60, 60) + self.cell(0, 8, title, new_x="LMARGIN", new_y="NEXT") + self.ln(1) + self.set_text_color(0, 0, 0) + + def section_description(self, text: str) -> None: + """Italicised description block beneath a section header.""" + text = self._sanitize(text) + self.set_font("Helvetica", "I", 9) + self.set_text_color(80, 80, 80) + self.multi_cell(0, 5, text) + self.ln(4) + self.set_text_color(0, 0, 0) + + def body_text(self, text: str) -> None: + """Standard paragraph text.""" + text = self._sanitize(text) + self.set_font("Helvetica", "", 9) + self.set_text_color(50, 50, 50) + self.multi_cell(0, 5, text) + self.ln(2) + self.set_text_color(0, 0, 0) + + def callout_box( + self, + text: str, + bg_color: tuple = (240, 248, 255), + border_color: tuple = None, + ) -> None: + 
"""Lightly-coloured info/callout box with a left accent bar.""" + if border_color is None: + border_color = self.PRIMARY + text = self._sanitize(text) + self.check_page_break(20) + x, w = 10, 180 + approx_lines = max(2, len(text) // 90 + text.count("\n") + 1) + h = approx_lines * 5 + 6 + y = self.get_y() + self.set_fill_color(*bg_color) + self.rect(x, y, w, h, style="F") + self.set_fill_color(*border_color) + self.rect(x, y, 3, h, style="F") + self.set_font("Helvetica", "", 8.5) + self.set_text_color(40, 40, 40) + self.set_xy(x + 5, y + 3) + self.multi_cell(w - 7, 4.5, text) + self.set_y(y + h + 3) + self.set_text_color(0, 0, 0) + + def metric_row(self, metrics: list) -> None: + """ + Horizontal row of metric tiles. + + Args: + metrics: list of (label, value) tuples. + """ + self.check_page_break(18) + n = len(metrics) + if n == 0: + return + box_w = (190 - (n - 1) * 3) / n + x0 = 10 + y = self.get_y() + for i, (label, value) in enumerate(metrics): + x = x0 + i * (box_w + 3) + self.set_fill_color(245, 245, 245) + self.rect(x, y, box_w, 14, style="F") + self.set_xy(x, y + 1) + self.set_font("Helvetica", "B", 10) + self.set_text_color(*self.PRIMARY) + self.cell(box_w, 6, self._sanitize(str(value)), align="C") + self.set_xy(x, y + 7) + self.set_font("Helvetica", "", 7) + self.set_text_color(100, 100, 100) + self.cell(box_w, 5, self._sanitize(str(label)), align="C") + self.set_text_color(0, 0, 0) + self.set_y(y + 16) + + def add_table( + self, + headers: list, + rows: list, + col_widths: list = None, + ) -> None: + """ + Styled data table with alternating row shading. + + Args: + headers: Column header strings. + rows: List of row tuples/lists. + col_widths: Optional column widths in mm. 
+ """ + self.check_page_break(10 + len(rows) * 6) + n = len(headers) + if col_widths is None: + col_widths = [190 / n] * n + # Header + self.set_font("Helvetica", "B", 8) + self.set_fill_color(*self.PRIMARY) + self.set_text_color(*self.WHITE) + for i, hdr in enumerate(headers): + self.cell(col_widths[i], 7, self._sanitize(hdr), border=1, fill=True, align="C") + self.ln() + # Rows + self.set_font("Helvetica", "", 8) + self.set_text_color(0, 0, 0) + for row_idx, row in enumerate(rows): + self.set_fill_color(250, 250, 250) if row_idx % 2 == 0 else self.set_fill_color(*self.WHITE) + for i, cell_val in enumerate(row): + self.cell(col_widths[i], 6, self._sanitize(str(cell_val)), border=1, fill=True, align="C") + self.ln() + self.ln(2) + + +# --------------------------------------------------------------------------- +# Main exporter +# --------------------------------------------------------------------------- + +class DashboardPDFExporter: + """ + Generates a comprehensive PDF report from the Musora Sentiment dashboard. + + Usage:: + + exporter = DashboardPDFExporter() + pdf_bytes = exporter.generate_report(filtered_df, filter_info) + + The *filter_info* dict (optional) maps human-readable filter names to their + selected values and is shown on the cover page. + """ + + # Kaleido scale factor: 3× ≈ 300 DPI at A4 print size. + RENDER_SCALE = 3 + + def __init__(self): + self.sentiment_charts = SentimentCharts() + self.distribution_charts = DistributionCharts() + self.demographic_charts = DemographicCharts() + self.processor = SentimentDataProcessor() + self._temp_files: list[str] = [] + + # ------------------------------------------------------------------ + # Public entry point + # ------------------------------------------------------------------ + + def generate_report(self, df, filter_info: dict = None) -> bytes: + """ + Build and return the full PDF report. + + Args: + df: Filtered dashboard DataFrame. 
class DashboardPDFExporter:
    """
    Generates a comprehensive PDF report from the Musora Sentiment dashboard.

    Rendering helpers convert Plotly figures to temporary PNGs (via kaleido)
    and place them into a MusoraPDF document; section builders assemble the
    cover page and executive summary.
    """

    def generate_report(self, df, filter_info: dict = None) -> bytes:
        """
        Build and return the full PDF report.

        Args:
            df: Filtered dashboard DataFrame.
            filter_info: Optional dict of active filter descriptions shown on
                the cover page, e.g. {"Platforms": ["facebook"],
                "Brands": ["drumeo"]}.

        Returns:
            bytes: Raw PDF file contents ready for st.download_button.
        """
        self.pdf = MusoraPDF()
        try:
            self._add_cover_page(df, filter_info)
            self._add_executive_summary(df)
            self._add_sentiment_section(df)
            self._add_brand_section(df)
            self._add_platform_section(df)
            self._add_intent_section(df)
            self._add_cross_dimensional_section(df)
            self._add_volume_section(df)
            self._add_reply_requirements_section(df)
            if self._has_demographics(df):
                self._add_demographics_section(df)
            if "detected_language" in df.columns:
                self._add_language_section(df)
            self._add_data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            # Temp PNGs are removed even when a section raises.
            self._cleanup_temp_files()

    # ------------------------------------------------------------------
    # Chart rendering helpers
    # ------------------------------------------------------------------

    def _prepare_fig_for_pdf(self, fig, is_side_by_side: bool = False) -> None:
        """Apply white background, readable fonts, and automargin to a Plotly figure."""
        font_size = 13 if is_side_by_side else 14
        margins = (
            dict(l=60, r=40, t=60, b=60)
            if is_side_by_side
            else dict(l=80, r=40, t=60, b=80)
        )
        fig.update_layout(
            paper_bgcolor="white",
            plot_bgcolor="white",
            font=dict(color="black", size=font_size),
            title_font_size=font_size + 4,
            margin=margins,
        )
        fig.update_xaxes(automargin=True)
        fig.update_yaxes(automargin=True)
        if fig.layout.showlegend is not False:
            fig.update_layout(legend_font_size=font_size - 2)

    def _fig_to_temp_path(
        self, fig, width: int = 800, height: int = 400, is_side_by_side: bool = False
    ) -> str:
        """Render a Plotly figure to a temporary high-DPI PNG and return the path."""
        self._prepare_fig_for_pdf(fig, is_side_by_side=is_side_by_side)
        png = pio.to_image(
            fig,
            format="png",
            width=width,
            height=height,
            scale=self.RENDER_SCALE,
            engine="kaleido",
        )
        handle = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        handle.write(png)
        handle.close()
        # Track for cleanup at the end of generate_report.
        self._temp_files.append(handle.name)
        return handle.name

    def _add_chart(self, fig, width: int = 180, img_width: int = 800, img_height: int = 400) -> None:
        """Render one figure full-width on the current PDF page."""
        try:
            png_path = self._fig_to_temp_path(fig, img_width, img_height)
            height_mm = width * (img_height / img_width)
            self.pdf.check_page_break(height_mm + 5)
            self.pdf.image(png_path, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width: int = 92) -> None:
        """Render two figures side-by-side."""
        try:
            left = self._fig_to_temp_path(fig1, 700, 450, is_side_by_side=True)
            right = self._fig_to_temp_path(fig2, 700, 450, is_side_by_side=True)
            height_mm = width * (450 / 700)
            self.pdf.check_page_break(height_mm + 5)
            top = self.pdf.get_y()
            self.pdf.image(left, x=10, y=top, w=width)
            self.pdf.image(right, x=10 + width + 4, y=top, w=width)
            self.pdf.set_y(top + height_mm + 3)
        except Exception:
            logger.exception("Side-by-side chart render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    def _cleanup_temp_files(self) -> None:
        """Delete every tracked temp PNG, ignoring files already gone."""
        while self._temp_files:
            path = self._temp_files.pop()
            try:
                os.unlink(path)
            except OSError:
                pass

    # ------------------------------------------------------------------
    # Data helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _has_demographics(df) -> bool:
        """True when df has Musora-app rows plus every demographic column."""
        required = ("platform", "age_group", "timezone", "experience_level")
        if any(col not in df.columns for col in required):
            return False
        return "musora_app" in df["platform"].values

    @staticmethod
    def _filter_summary(filter_info: dict) -> str:
        """One-line, human-readable summary of the active dashboard filters."""
        if not filter_info:
            return "No filters applied - showing all data."
        parts = []
        for key, value in filter_info.items():
            if not value:
                continue
            shown = value if isinstance(value, str) else ", ".join(str(v) for v in value)
            parts.append(f"{key}: {shown}")
        return "; ".join(parts) if parts else "No filters applied."

    @staticmethod
    def _date_range_str(df) -> str:
        """Format the min/max comment timestamps as 'Mon DD, YYYY to Mon DD, YYYY'."""
        if "comment_timestamp" not in df.columns or df.empty:
            return "N/A"
        stamps = df["comment_timestamp"].dropna()
        if stamps.empty:
            return "N/A"
        start = stamps.min().strftime('%b %d, %Y')
        end = stamps.max().strftime('%b %d, %Y')
        return f"{start} to {end}"

    # ------------------------------------------------------------------
    # Report sections
    # ------------------------------------------------------------------

    def _add_cover_page(self, df, filter_info: dict) -> None:
        """Title page: brand rule, report name, generation time, active filters."""
        pdf = self.pdf
        pdf.add_page()
        pdf.ln(40)

        r, g, b = MusoraPDF.PRIMARY
        pdf.set_fill_color(r, g, b)
        pdf.rect(0, 60, 210, 4, style="F")

        pdf.ln(20)
        pdf.set_font("Helvetica", "B", 28)
        pdf.set_text_color(r, g, b)
        pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")

        pdf.set_font("Helvetica", "", 16)
        pdf.set_text_color(80, 80, 80)
        pdf.cell(
            0, 10, "Sentiment Analysis Report",
            align="C", new_x="LMARGIN", new_y="NEXT",
        )

        pdf.ln(10)
        pdf.set_draw_color(r, g, b)
        pdf.set_line_width(0.5)
        pdf.line(60, pdf.get_y(), 150, pdf.get_y())
        pdf.ln(10)

        pdf.set_font("Helvetica", "", 12)
        pdf.set_text_color(100, 100, 100)
        pdf.cell(
            0, 8,
            f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
            align="C", new_x="LMARGIN", new_y="NEXT",
        )

        pdf.ln(5)
        pdf.set_font("Helvetica", "", 10)
        pdf.cell(
            0, 7,
            f"Total Comments Analyzed: {len(df):,}",
            align="C", new_x="LMARGIN", new_y="NEXT",
        )

        date_str = self._date_range_str(df)
        if date_str != "N/A":
            pdf.ln(3)
            pdf.set_font("Helvetica", "I", 9)
            pdf.set_text_color(120, 120, 120)
            pdf.cell(
                0, 6,
                MusoraPDF._sanitize(f"Data period: {date_str}"),
                align="C", new_x="LMARGIN", new_y="NEXT",
            )

        if filter_info:
            pdf.ln(8)
            pdf.set_font("Helvetica", "B", 9)
            pdf.set_text_color(80, 80, 80)
            pdf.cell(0, 6, "Active Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
            pdf.set_font("Helvetica", "", 9)
            for key, value in filter_info.items():
                if not value:
                    continue
                shown = value if isinstance(value, str) else ", ".join(str(v) for v in value)
                pdf.cell(
                    0, 5,
                    MusoraPDF._sanitize(f"{key}: {shown}"),
                    align="C", new_x="LMARGIN", new_y="NEXT",
                )

        pdf.ln(20)
        pdf.set_font("Helvetica", "I", 8)
        pdf.set_text_color(150, 150, 150)
        pdf.cell(
            0, 6, "Confidential - For Internal Use Only",
            align="C", new_x="LMARGIN", new_y="NEXT",
        )
        pdf.cell(
            0, 6, "Data Source: Snowflake | Musora Sentiment Pipeline",
            align="C", new_x="LMARGIN", new_y="NEXT",
        )

    def _add_executive_summary(self, df) -> None:
        """Executive summary: narrative, health label, metric tiles, key findings."""
        pdf = self.pdf
        pdf.add_page()
        pdf.section_header("Executive Summary")
        pdf.section_description(_DESCRIPTIONS["executive_summary"])

        metrics = SentimentMetrics.calculate_overall_metrics(df)
        # Map the [-2, 2] average onto a 0-100 scale (50 = neutral).
        normalized_score = ((metrics["avg_sentiment_score"] + 2) / 4) * 100

        # Health label from the share of negative comments.
        neg_pct = metrics["negative_pct"]
        if neg_pct < 20:
            health = "Healthy"
        elif neg_pct < 35:
            health = "Moderate"
        else:
            health = "Needs Attention"

        # Opening narrative
        brands = sorted(df["brand"].dropna().unique().tolist()) if "brand" in df.columns else []
        platforms = sorted(df["platform"].dropna().unique().tolist()) if "platform" in df.columns else []
        brands_str = ", ".join(str(b).title() for b in brands[:6]) if brands else "all brands"
        platforms_str = ", ".join(str(p).title() for p in platforms[:6]) if platforms else "all platforms"

        pdf.body_text(
            f"This report analyzes {metrics['total_comments']:,} comments across {brands_str} "
            f"on {platforms_str}. "
            f"Overall sentiment is {metrics['positive_pct']:.1f}% positive and "
            f"{metrics['negative_pct']:.1f}% negative, "
            f"with {metrics['reply_required_pct']:.1f}% of comments requiring a reply."
        )

        # Health status line
        r, g, b = MusoraPDF.PRIMARY
        pdf.set_font("Helvetica", "B", 11)
        pdf.set_text_color(r, g, b)
        pdf.cell(
            0, 8, f"Overall Sentiment Health: {health}",
            new_x="LMARGIN", new_y="NEXT",
        )
        pdf.ln(2)
        pdf.set_text_color(0, 0, 0)

        # Metric tiles — two rows
        pdf.metric_row([
            ("Total Comments", f"{metrics['total_comments']:,}"),
            ("Positive %", f"{metrics['positive_pct']:.1f}%"),
            ("Negative %", f"{metrics['negative_pct']:.1f}%"),
            ("Sentiment Score", f"{normalized_score:.0f}/100"),
        ])
        pdf.metric_row([
            ("Reply Required", f"{metrics['total_reply_required']:,}"),
            ("Reply Rate %", f"{metrics['reply_required_pct']:.1f}%"),
            ("Brands Analyzed", str(len(brands))),
            ("Platforms Analyzed", str(len(platforms))),
        ])

        # Score explanation
        pdf.ln(2)
        pdf.callout_box(
            "How to read the Sentiment Score:\n"
            "Each comment is rated Very Positive (+2), Positive (+1), Neutral (0), "
            "Negative (-1), or Very Negative (-2). "
            "The Score (0-100) converts the average: 50 = perfectly neutral, "
            "above 60 = primarily positive, below 40 = primarily negative.",
        )

        # Key findings
        pdf.subsection_header("Key Findings")
        for finding in self._generate_key_findings(df, metrics):
            pdf.body_text(f" * {finding}")
+ ) + else: + findings.append( + f"Sentiment is balanced: {metrics['positive_pct']:.1f}% positive, " + f"{metrics['negative_pct']:.1f}% negative." + ) + + # Top brand by volume + if "brand" in df.columns and not df.empty: + top_brand = df["brand"].value_counts().index[0] + top_count = df["brand"].value_counts().iloc[0] + findings.append( + f"Most discussed brand: {str(top_brand).title()} " + f"({top_count:,} comments, {top_count / len(df) * 100:.1f}% of total)." + ) + + # Reply urgency + if metrics["reply_required_pct"] > 10: + findings.append( + f"{metrics['total_reply_required']:,} comments " + f"({metrics['reply_required_pct']:.1f}%) require a reply." + ) + + # Top platform by volume + if "platform" in df.columns and not df.empty: + top_platform = df["platform"].value_counts().index[0] + plat_count = df["platform"].value_counts().iloc[0] + findings.append( + f"Most active platform: {str(top_platform).title()} " + f"({plat_count:,} comments)." + ) + + return findings[:4] + + def _add_sentiment_section(self, df) -> None: + self.pdf.add_page() + self.pdf.section_header("Sentiment Distribution") + self.pdf.section_description(_DESCRIPTIONS["sentiment"]) + + metrics = SentimentMetrics.calculate_overall_metrics(df) + normalized_score = ((metrics["avg_sentiment_score"] + 2) / 4) * 100 + + pie = self.sentiment_charts.create_sentiment_pie_chart(df, title="Sentiment Distribution") + gauge = self.sentiment_charts.create_sentiment_score_gauge( + metrics["avg_sentiment_score"], title="Overall Sentiment Score" + ) + self._add_two_charts(pie, gauge) + + self.pdf.body_text( + f"Across {metrics['total_comments']:,} analyzed comments: " + f"{metrics['positive_pct']:.1f}% positive, " + f"{100 - metrics['positive_pct'] - metrics['negative_pct']:.1f}% neutral, " + f"{metrics['negative_pct']:.1f}% negative. " + f"Sentiment Score: {normalized_score:.0f}/100 " + f"(raw average: {metrics['avg_sentiment_score']:.2f} on a -2 to +2 scale)." 
+ ) + + def _add_brand_section(self, df) -> None: + if "brand" not in df.columns or df["brand"].nunique() == 0: + return + + self.pdf.add_page() + self.pdf.section_header("Sentiment by Brand") + self.pdf.section_description(_DESCRIPTIONS["brand"]) + + bar = self.sentiment_charts.create_sentiment_bar_chart( + df, group_by="brand", title="Sentiment Distribution by Brand" + ) + pct = self.sentiment_charts.create_sentiment_percentage_bar_chart( + df, group_by="brand", title="Sentiment by Brand (%)" + ) + self._add_two_charts(bar, pct) + + # Summary table + brand_metrics = SentimentMetrics.calculate_brand_metrics(df) + rows = [] + for brand, m in sorted(brand_metrics.items()): + score = ((m["avg_sentiment_score"] + 2) / 4) * 100 + rows.append(( + str(brand).title(), + f"{m['total_comments']:,}", + f"{m['positive_pct']:.1f}%", + f"{m['negative_pct']:.1f}%", + f"{m['reply_required_pct']:.1f}%", + f"{score:.0f}/100", + )) + self.pdf.subsection_header("Brand Metrics Summary") + self.pdf.add_table( + headers=["Brand", "Comments", "Positive %", "Negative %", "Reply Rate", "Score"], + rows=rows, + col_widths=[38, 32, 30, 30, 30, 30], + ) + + def _add_platform_section(self, df) -> None: + if "platform" not in df.columns or df["platform"].nunique() == 0: + return + + self.pdf.add_page() + self.pdf.section_header("Sentiment by Platform") + self.pdf.section_description(_DESCRIPTIONS["platform"]) + + bar = self.sentiment_charts.create_sentiment_bar_chart( + df, group_by="platform", title="Sentiment Distribution by Platform" + ) + pct = self.sentiment_charts.create_sentiment_percentage_bar_chart( + df, group_by="platform", title="Sentiment by Platform (%)" + ) + self._add_two_charts(bar, pct) + + # Summary table + platform_metrics = SentimentMetrics.calculate_platform_metrics(df) + rows = [] + for platform, m in sorted(platform_metrics.items()): + score = ((m["avg_sentiment_score"] + 2) / 4) * 100 + rows.append(( + str(platform).title(), + f"{m['total_comments']:,}", + 
f"{m['positive_pct']:.1f}%", + f"{m['negative_pct']:.1f}%", + f"{m['reply_required_pct']:.1f}%", + f"{score:.0f}/100", + )) + self.pdf.subsection_header("Platform Metrics Summary") + self.pdf.add_table( + headers=["Platform", "Comments", "Positive %", "Negative %", "Reply Rate", "Score"], + rows=rows, + col_widths=[38, 32, 30, 30, 30, 30], + ) + + def _add_intent_section(self, df) -> None: + if "intent" not in df.columns: + return + + self.pdf.add_page() + self.pdf.section_header("Intent Analysis") + self.pdf.section_description(_DESCRIPTIONS["intent"]) + + intent_bar = self.distribution_charts.create_intent_bar_chart( + df, title="Intent Distribution", orientation="h" + ) + intent_pie = self.distribution_charts.create_intent_pie_chart( + df, title="Intent Distribution" + ) + self._add_two_charts(intent_bar, intent_pie) + + def _add_cross_dimensional_section(self, df) -> None: + if "brand" not in df.columns or "platform" not in df.columns: + return + + self.pdf.add_page() + self.pdf.section_header("Cross-Dimensional Analysis") + self.pdf.section_description(_DESCRIPTIONS["cross_dimensional"]) + + matrix = self.distribution_charts.create_brand_platform_matrix( + df, title="Brand-Platform Comment Matrix" + ) + heatmap = self.sentiment_charts.create_sentiment_heatmap( + df, + row_dimension="brand", + col_dimension="platform", + title="Negative Sentiment Heatmap", + ) + self._add_two_charts(matrix, heatmap) + + def _add_volume_section(self, df) -> None: + has_platform = "platform" in df.columns + has_brand = "brand" in df.columns + if not has_platform and not has_brand: + return + + self.pdf.add_page() + self.pdf.section_header("Volume Analysis") + self.pdf.section_description(_DESCRIPTIONS["volume"]) + + if has_platform and has_brand: + platform_dist = self.distribution_charts.create_platform_distribution( + df, title="Comments by Platform" + ) + brand_dist = self.distribution_charts.create_brand_distribution( + df, title="Comments by Brand" + ) + 
self._add_two_charts(platform_dist, brand_dist) + elif has_platform: + self._add_chart( + self.distribution_charts.create_platform_distribution(df, title="Comments by Platform") + ) + else: + self._add_chart( + self.distribution_charts.create_brand_distribution(df, title="Comments by Brand") + ) + + def _add_reply_requirements_section(self, df) -> None: + if "requires_reply" not in df.columns: + return + + self.pdf.add_page() + self.pdf.section_header("Reply Requirements Analysis") + self.pdf.section_description(_DESCRIPTIONS["reply_requirements"]) + + urgency = SentimentMetrics.calculate_response_urgency(df) + self.pdf.metric_row([ + ("Urgent", str(urgency["urgent_count"])), + ("High Priority", str(urgency["high_priority_count"])), + ("Medium Priority", str(urgency["medium_priority_count"])), + ("Low Priority", str(urgency["low_priority_count"])), + ]) + self.pdf.ln(3) + + has_brand = "brand" in df.columns + has_platform = "platform" in df.columns + if has_brand and has_platform: + reply_brand = self.distribution_charts.create_reply_required_chart( + df, group_by="brand", title="Comments Requiring Reply by Brand" + ) + reply_platform = self.distribution_charts.create_reply_required_chart( + df, group_by="platform", title="Comments Requiring Reply by Platform" + ) + self._add_two_charts(reply_brand, reply_platform) + elif has_brand: + self._add_chart( + self.distribution_charts.create_reply_required_chart( + df, group_by="brand", title="Comments Requiring Reply by Brand" + ) + ) + + def _add_demographics_section(self, df) -> None: + df_musora = df[df["platform"] == "musora_app"].copy() + if df_musora.empty: + return + + self.pdf.add_page() + self.pdf.section_header("Demographics Analysis (Musora App)") + self.pdf.section_description(_DESCRIPTIONS["demographics"]) + self.pdf.body_text(f"Analyzing demographics for {len(df_musora):,} Musora App comments.") + + # Age + age_dist = self.processor.get_demographics_distribution(df_musora, "age_group") + if not 
age_dist.empty: + self.pdf.subsection_header("Age Distribution") + self._add_chart( + self.demographic_charts.create_age_distribution_chart( + age_dist, title="Comments by Age Group" + ), + img_height=350, + ) + + # Region + region_dist = self.processor.get_timezone_regions_distribution(df_musora) + if not region_dist.empty: + self.pdf.subsection_header("Geographic Distribution") + self._add_chart( + self.demographic_charts.create_region_distribution_chart( + region_dist, title="Comments by Region" + ), + img_height=350, + ) + + # Experience + exp_dist = self.processor.get_experience_level_distribution(df_musora, use_groups=True) + if not exp_dist.empty: + self.pdf.subsection_header("Experience Level Distribution") + self._add_chart( + self.demographic_charts.create_experience_distribution_chart( + exp_dist, title="Comments by Experience Group", use_groups=True + ), + img_height=350, + ) + + def _add_language_section(self, df) -> None: + self.pdf.add_page() + self.pdf.section_header("Language Distribution") + self.pdf.section_description(_DESCRIPTIONS["language"]) + self._add_chart( + self.distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages") + ) + + def _add_data_summary(self, df, filter_info: dict) -> None: + self.pdf.add_page() + self.pdf.section_header("Data Summary") + + self.pdf.body_text( + f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + ) + self.pdf.body_text(f"Total records in report: {len(df):,}") + + date_str = self._date_range_str(df) + if date_str != "N/A": + self.pdf.body_text(f"Data range: {date_str}") + + self.pdf.body_text(f"Active filters: {self._filter_summary(filter_info)}") + + if "brand" in df.columns: + brands = sorted(str(b).title() for b in df["brand"].dropna().unique()) + self.pdf.body_text(f"Brands included: {', '.join(brands)}") + + if "platform" in df.columns: + platforms = sorted(str(p).title() for p in df["platform"].dropna().unique()) + self.pdf.body_text(f"Platforms 
included: {', '.join(platforms)}") + + self.pdf.ln(5) + self.pdf.callout_box( + "Data source: Snowflake - SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES " + "and SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES.\n" + "This report is confidential and intended for internal Musora team use only.", + bg_color=(245, 245, 245), + ) \ No newline at end of file diff --git a/visualization/visualizations/content_cards.py b/visualization/visualizations/content_cards.py new file mode 100644 index 0000000000000000000000000000000000000000..e175cd46a76b1fb8f72f0e49e00f9b956865b795 --- /dev/null +++ b/visualization/visualizations/content_cards.py @@ -0,0 +1,309 @@ +""" +Content display components for sentiment visualization +Creates formatted cards and displays for content and comments +""" +import streamlit as st +import pandas as pd +from datetime import datetime + + +class ContentCards: + """ + Creates content display components + """ + + @staticmethod + def display_content_card(content_row, rank=None): + """ + Display a formatted content card + + Args: + content_row: Series containing content information + rank: Optional rank number to display + """ + with st.container(): + # Create columns for layout + col1, col2 = st.columns([3, 1]) + + with col1: + # Title with rank + if rank: + st.markdown(f"### 🔢 #{rank} - Content") + else: + st.markdown("### 📝 Content") + + # Content description + description = content_row.get('content_description', 'No description available') + if pd.notna(description) and description: + st.markdown(f"**Description:** {description[:200]}..." 
if len(str(description)) > 200 else f"**Description:** {description}") + else: + st.markdown("**Description:** *No description available*") + + # Permalink + if 'permalink_url' in content_row and pd.notna(content_row['permalink_url']): + st.markdown(f"🔗 [View Content]({content_row['permalink_url']})") + + with col2: + # Display thumbnail if available (Musora content) + if 'thumbnail_url' in content_row and pd.notna(content_row['thumbnail_url']): + try: + st.image(content_row['thumbnail_url'], use_container_width=True) + except Exception as e: + # If image fails to load, show a placeholder + st.markdown("*🖼️ Thumbnail unavailable*") + + # Statistics + st.metric("Total Comments", int(content_row.get('total_comments', 0))) + + if 'negative_percentage' in content_row: + neg_pct = content_row['negative_percentage'] + st.metric( + "Negative %", + f"{neg_pct:.1f}%", + delta=None, + delta_color="inverse" + ) + + if 'reply_required_count' in content_row: + st.metric("Replies Needed", int(content_row['reply_required_count'])) + + # Additional details in expander + with st.expander("📊 View Detailed Statistics"): + detail_col1, detail_col2, detail_col3 = st.columns(3) + + with detail_col1: + st.write("**Content ID:**", content_row.get('content_sk', 'N/A')) + if 'dominant_sentiment' in content_row: + st.write("**Dominant Sentiment:**", content_row['dominant_sentiment'].title()) + + with detail_col2: + if 'negative_count' in content_row: + st.write("**Negative Count:**", int(content_row['negative_count'])) + + with detail_col3: + if 'total_comments' in content_row: + positive_count = int(content_row['total_comments']) - int(content_row.get('negative_count', 0)) + st.write("**Positive/Neutral:**", positive_count) + + st.markdown("---") + + @staticmethod + def display_comment_card(comment_row, show_original=False): + """ + Display a formatted comment card + + Args: + comment_row: Series containing comment information + show_original: Whether to show original text for translated 
comments + """ + with st.container(): + # Header with metadata + col1, col2, col3 = st.columns([2, 1, 1]) + + with col1: + author = comment_row.get('author_name', 'Unknown') + st.markdown(f"**👤 {author}**") + + with col2: + if 'comment_timestamp' in comment_row and pd.notna(comment_row['comment_timestamp']): + timestamp = pd.to_datetime(comment_row['comment_timestamp']) + st.markdown(f"*📅 {timestamp.strftime('%Y-%m-%d %H:%M')}*") + + with col3: + platform = comment_row.get('platform', 'unknown') + st.markdown(f"*🌐 {platform.title()}*") + + # Comment text + display_text = comment_row.get('display_text', comment_row.get('original_text', 'No text available')) + st.markdown(f"💬 {display_text}") + + # Sentiment and intent badges + badge_col1, badge_col2, badge_col3 = st.columns([2, 2, 1]) + + with badge_col1: + sentiment = comment_row.get('sentiment_polarity', 'unknown') + sentiment_emoji = { + 'very_positive': '😄', + 'positive': '🙂', + 'neutral': '😐', + 'negative': '🙁', + 'very_negative': '😠' + }.get(sentiment, '❓') + st.markdown(f"**Sentiment:** {sentiment_emoji} {sentiment.replace('_', ' ').title()}") + + with badge_col2: + intent = comment_row.get('intent', 'unknown') + st.markdown(f"**Intent:** {intent}") + + with badge_col3: + if comment_row.get('requires_reply', False): + st.markdown("**⚠️ Reply Required**") + + # Show original text if translated + if show_original and comment_row.get('is_english') == False: + with st.expander("🌍 View Original Text"): + original_text = comment_row.get('original_text', 'Not available') + detected_lang = comment_row.get('detected_language', 'Unknown') + st.markdown(f"**Language:** {detected_lang}") + st.markdown(f"**Original:** {original_text}") + + # Additional details in expander + with st.expander("ℹ️ More Details"): + detail_col1, detail_col2 = st.columns(2) + + with detail_col1: + st.write("**Comment ID:**", comment_row.get('comment_id', 'N/A')) + st.write("**Channel:**", comment_row.get('channel_name', 'N/A')) + 
st.write("**Confidence:**", comment_row.get('sentiment_confidence', 'N/A')) + + with detail_col2: + if 'content_description' in comment_row and pd.notna(comment_row['content_description']): + content_desc = comment_row['content_description'] + st.write("**Content:**", content_desc[:50] + "..." if len(str(content_desc)) > 50 else content_desc) + if 'permalink_url' in comment_row and pd.notna(comment_row['permalink_url']): + st.markdown(f"[View Content]({comment_row['permalink_url']})") + + st.markdown("---") + + @staticmethod + def display_metric_cards(metrics_dict): + """ + Display a row of metric cards + + Args: + metrics_dict: Dictionary of metrics {label: value} + """ + cols = st.columns(len(metrics_dict)) + + for idx, (label, value) in enumerate(metrics_dict.items()): + with cols[idx]: + if isinstance(value, dict) and 'value' in value: + # Advanced metric with delta + st.metric( + label, + value['value'], + delta=value.get('delta'), + delta_color=value.get('delta_color', 'normal') + ) + else: + # Simple metric + st.metric(label, value) + + @staticmethod + def display_summary_stats(df): + """ + Display summary statistics in a formatted layout + + Args: + df: Sentiment dataframe + """ + st.markdown("### 📊 Summary Statistics") + + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric("Total Comments", len(df)) + + with col2: + unique_contents = df['content_sk'].nunique() if 'content_sk' in df.columns else 0 + st.metric("Unique Contents", unique_contents) + + with col3: + reply_required = df['requires_reply'].sum() if 'requires_reply' in df.columns else 0 + st.metric("Replies Needed", int(reply_required)) + + with col4: + negative_sentiments = ['negative', 'very_negative'] + negative_count = df['sentiment_polarity'].isin(negative_sentiments).sum() + negative_pct = (negative_count / len(df) * 100) if len(df) > 0 else 0 + st.metric("Negative %", f"{negative_pct:.1f}%") + + @staticmethod + def display_filter_summary(applied_filters): + """ + Display 
summary of applied filters + + Args: + applied_filters: Dictionary of applied filters + """ + if not any(applied_filters.values()): + return + + st.markdown("### 🔍 Applied Filters") + + filter_text = [] + for filter_name, filter_value in applied_filters.items(): + if filter_value and len(filter_value) > 0: + filter_text.append(f"**{filter_name.title()}:** {', '.join(map(str, filter_value))}") + + if filter_text: + st.info(" | ".join(filter_text)) + + @staticmethod + def display_health_indicator(negative_pct): + """ + Display sentiment health indicator + + Args: + negative_pct: Percentage of negative sentiments + """ + if negative_pct < 10: + status = "Excellent" + color = "green" + emoji = "✅" + elif negative_pct < 20: + status = "Good" + color = "lightgreen" + emoji = "👍" + elif negative_pct < 30: + status = "Fair" + color = "orange" + emoji = "⚠️" + elif negative_pct < 50: + status = "Poor" + color = "darkorange" + emoji = "⚡" + else: + status = "Critical" + color = "red" + emoji = "🚨" + + st.markdown( + f""" +
+

{emoji} Sentiment Health: {status}

+

Negative Sentiment: {negative_pct:.1f}%

+
+ """, + unsafe_allow_html=True + ) + + @staticmethod + def display_pagination_controls(total_items, items_per_page, current_page): + """ + Display pagination controls + + Args: + total_items: Total number of items + items_per_page: Number of items per page + current_page: Current page number + + Returns: + int: New current page + """ + total_pages = (total_items - 1) // items_per_page + 1 + + col1, col2, col3 = st.columns([1, 2, 1]) + + with col1: + if st.button("⬅️ Previous", disabled=(current_page <= 1)): + current_page -= 1 + + with col2: + st.markdown(f"
Page {current_page} of {total_pages}
", unsafe_allow_html=True) + + with col3: + if st.button("Next ➡️", disabled=(current_page >= total_pages)): + current_page += 1 + + return current_page \ No newline at end of file diff --git a/visualization/visualizations/demographic_charts.py b/visualization/visualizations/demographic_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..2f5f202de2dc808f3477eec50ec82ab46ba5cc34 --- /dev/null +++ b/visualization/visualizations/demographic_charts.py @@ -0,0 +1,470 @@ +""" +Demographic visualization charts for sentiment analysis +Handles age, timezone, and experience level visualizations +""" +import plotly.graph_objects as go +import plotly.express as px +import json +from pathlib import Path + + +class DemographicCharts: + """ + Creates demographic-related visualizations for musora_app data + """ + + def __init__(self): + """Initialize with configuration""" + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + with open(config_path, 'r') as f: + self.config = json.load(f) + + self.sentiment_colors = self.config['color_schemes']['sentiment_polarity'] + self.sentiment_order = self.config['sentiment_order'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_age_distribution_chart(self, age_dist_df, title="Age Distribution"): + """ + Create bar chart for age group distribution + + Args: + age_dist_df: DataFrame with age_group, count, percentage columns + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if age_dist_df.empty: + return self._create_empty_chart(title, "No demographic data available") + + # Define custom age group order + age_order = ['18-24', '25-34', '35-44', '45-54', '55+'] + + # Sort by custom order + age_dist_df['age_group'] = pd.Categorical( + age_dist_df['age_group'], + categories=age_order, + ordered=True + ) + age_dist_df = age_dist_df.sort_values('age_group') + + fig = go.Figure() + + fig.add_trace(go.Bar( + x=age_dist_df['age_group'], + 
y=age_dist_df['count'], + text=age_dist_df.apply(lambda row: f"{row['count']}
({row['percentage']:.1f}%)", axis=1), + textposition='auto', + marker=dict( + color='#4A90E2', + line=dict(color='#2E5C8A', width=1) + ), + hovertemplate='%{x}
Comments: %{y}
Percentage: %{customdata:.1f}%', + customdata=age_dist_df['percentage'] + )) + + fig.update_layout( + title=title, + xaxis_title="Age Group", + yaxis_title="Number of Comments", + height=self.chart_height, + showlegend=False, + hovermode='x' + ) + + return fig + + def create_age_sentiment_chart(self, age_sentiment_df, title="Sentiment by Age Group"): + """ + Create stacked bar chart showing sentiment distribution for each age group + + Args: + age_sentiment_df: DataFrame with age_group, sentiment_polarity, count, percentage + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if age_sentiment_df.empty: + return self._create_empty_chart(title, "No demographic data available") + + # Define custom age group order + age_order = ['18-24', '25-34', '35-44', '45-54', '55+'] + + fig = go.Figure() + + # Create a trace for each sentiment + for sentiment in self.sentiment_order: + sentiment_data = age_sentiment_df[age_sentiment_df['sentiment_polarity'] == sentiment] + + if not sentiment_data.empty: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_data['age_group'], + y=sentiment_data['percentage'], + marker=dict(color=self.sentiment_colors.get(sentiment, '#999999')), + hovertemplate='%{fullData.name}
Age: %{x}
Percentage: %{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis=dict( + title="Age Group", + categoryorder='array', + categoryarray=age_order + ), + yaxis=dict( + title="Percentage (%)", + range=[0, 100] + ), + barmode='stack', + height=self.chart_height, + hovermode='x unified', + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ) + ) + + return fig + + def create_timezone_chart(self, timezone_df, title="Top Timezones", top_n=15): + """ + Create horizontal bar chart for top timezones + + Args: + timezone_df: DataFrame with timezone, count, percentage columns + title: Chart title + top_n: Number of top timezones to display + + Returns: + plotly.graph_objects.Figure + """ + if timezone_df.empty: + return self._create_empty_chart(title, "No timezone data available") + + # Take top N and reverse for better display (highest at top) + display_df = timezone_df.head(top_n).iloc[::-1] + + fig = go.Figure() + + fig.add_trace(go.Bar( + y=display_df['timezone'], + x=display_df['count'], + orientation='h', + text=display_df.apply(lambda row: f"{row['count']} ({row['percentage']:.1f}%)", axis=1), + textposition='auto', + marker=dict( + color='#50C878', + line=dict(color='#2E7D4E', width=1) + ), + hovertemplate='%{y}
Comments: %{x}
Percentage: %{customdata:.1f}%', + customdata=display_df['percentage'] + )) + + fig.update_layout( + title=title, + xaxis_title="Number of Comments", + yaxis_title="Timezone", + height=max(self.chart_height, top_n * 25), # Dynamic height based on number of timezones + showlegend=False, + hovermode='y' + ) + + return fig + + def create_region_distribution_chart(self, region_df, title="Distribution by Region"): + """ + Create pie chart for timezone region distribution + + Args: + region_df: DataFrame with timezone_region, count, percentage columns + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if region_df.empty: + return self._create_empty_chart(title, "No region data available") + + # Define color palette for regions + colors = px.colors.qualitative.Set3 + + fig = go.Figure() + + fig.add_trace(go.Pie( + labels=region_df['timezone_region'], + values=region_df['count'], + textinfo='label+percent', + hovertemplate='%{label}
Comments: %{value}
Percentage: %{percent}', + marker=dict(colors=colors) + )) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict( + orientation="v", + yanchor="middle", + y=0.5, + xanchor="left", + x=1 + ) + ) + + return fig + + def create_region_sentiment_chart(self, region_sentiment_df, title="Sentiment by Region"): + """ + Create grouped bar chart showing sentiment distribution for each region + + Args: + region_sentiment_df: DataFrame with timezone_region, sentiment_polarity, count, percentage + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if region_sentiment_df.empty: + return self._create_empty_chart(title, "No region sentiment data available") + + fig = go.Figure() + + # Create a trace for each sentiment + for sentiment in self.sentiment_order: + sentiment_data = region_sentiment_df[region_sentiment_df['sentiment_polarity'] == sentiment] + + if not sentiment_data.empty: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_data['timezone_region'], + y=sentiment_data['percentage'], + marker=dict(color=self.sentiment_colors.get(sentiment, '#999999')), + hovertemplate='%{fullData.name}
Region: %{x}
Percentage: %{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis_title="Region", + yaxis=dict( + title="Percentage (%)", + range=[0, 100] + ), + barmode='stack', + height=self.chart_height, + hovermode='x unified', + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ) + ) + + return fig + + def create_experience_distribution_chart(self, exp_df, title="Experience Level Distribution", use_groups=False): + """ + Create bar chart for experience level distribution + + Args: + exp_df: DataFrame with experience_level/experience_group, count, percentage columns + title: Chart title + use_groups: If True, display grouped experience levels + + Returns: + plotly.graph_objects.Figure + """ + if exp_df.empty: + return self._create_empty_chart(title, "No experience data available") + + field = 'experience_group' if use_groups else 'experience_level' + + # Define custom order for grouped experience + if use_groups: + exp_order = ['Beginner (0-3)', 'Intermediate (4-7)', 'Advanced (8-10)'] + exp_df[field] = pd.Categorical( + exp_df[field], + categories=exp_order, + ordered=True + ) + exp_df = exp_df.sort_values(field) + else: + # Sort by experience level numerically + exp_df = exp_df.sort_values(field) + + fig = go.Figure() + + fig.add_trace(go.Bar( + x=exp_df[field], + y=exp_df['count'], + text=exp_df.apply(lambda row: f"{row['count']}
({row['percentage']:.1f}%)", axis=1), + textposition='auto', + marker=dict( + color='#9B59B6', + line=dict(color='#6C3483', width=1) + ), + hovertemplate='%{x}
Comments: %{y}
Percentage: %{customdata:.1f}%', + customdata=exp_df['percentage'] + )) + + fig.update_layout( + title=title, + xaxis_title="Experience Level" if not use_groups else "Experience Group", + yaxis_title="Number of Comments", + height=self.chart_height, + showlegend=False, + hovermode='x' + ) + + return fig + + def create_experience_sentiment_chart(self, exp_sentiment_df, title="Sentiment by Experience Level", use_groups=False): + """ + Create stacked bar chart showing sentiment distribution for each experience level + + Args: + exp_sentiment_df: DataFrame with experience_level/experience_group, sentiment_polarity, count, percentage + title: Chart title + use_groups: If True, use grouped experience levels + + Returns: + plotly.graph_objects.Figure + """ + if exp_sentiment_df.empty: + return self._create_empty_chart(title, "No experience sentiment data available") + + field = 'experience_group' if use_groups else 'experience_level' + + fig = go.Figure() + + # Create a trace for each sentiment + for sentiment in self.sentiment_order: + sentiment_data = exp_sentiment_df[exp_sentiment_df['sentiment_polarity'] == sentiment] + + if not sentiment_data.empty: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_data[field], + y=sentiment_data['percentage'], + marker=dict(color=self.sentiment_colors.get(sentiment, '#999999')), + hovertemplate='%{fullData.name}
Experience: %{x}
Percentage: %{y:.1f}%' + )) + + # Define custom order for grouped experience + if use_groups: + exp_order = ['Beginner (0-3)', 'Intermediate (4-7)', 'Advanced (8-10)'] + xaxis_config = dict( + title="Experience Group", + categoryorder='array', + categoryarray=exp_order + ) + else: + xaxis_config = dict(title="Experience Level") + + fig.update_layout( + title=title, + xaxis=xaxis_config, + yaxis=dict( + title="Percentage (%)", + range=[0, 100] + ), + barmode='stack', + height=self.chart_height, + hovermode='x unified', + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1 + ) + ) + + return fig + + def create_demographics_heatmap(self, df, row_field, col_field, title="Demographics Heatmap"): + """ + Create heatmap for cross-demographic analysis + + Args: + df: DataFrame with demographic fields and sentiment + row_field: Field for rows (e.g., 'age_group') + col_field: Field for columns (e.g., 'experience_group') + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No data available for heatmap") + + # Create pivot table + pivot = df.pivot_table( + index=row_field, + columns=col_field, + values='count', + aggfunc='sum', + fill_value=0 + ) + + fig = go.Figure(data=go.Heatmap( + z=pivot.values, + x=pivot.columns, + y=pivot.index, + colorscale='Blues', + text=pivot.values, + texttemplate='%{text}', + textfont={"size": 10}, + hovertemplate='%{y} × %{x}
Comments: %{z}' + )) + + fig.update_layout( + title=title, + xaxis_title=col_field.replace('_', ' ').title(), + yaxis_title=row_field.replace('_', ' ').title(), + height=self.chart_height + ) + + return fig + + def _create_empty_chart(self, title, message): + """ + Create an empty chart with a message + + Args: + title: Chart title + message: Message to display + + Returns: + plotly.graph_objects.Figure + """ + fig = go.Figure() + + fig.add_annotation( + text=message, + xref="paper", + yref="paper", + x=0.5, + y=0.5, + showarrow=False, + font=dict(size=14, color="gray") + ) + + fig.update_layout( + title=title, + height=self.chart_height, + xaxis=dict(visible=False), + yaxis=dict(visible=False) + ) + + return fig + + +# Import pandas for use in methods (needed for Categorical) +import pandas as pd diff --git a/visualization/visualizations/distribution_charts.py b/visualization/visualizations/distribution_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..4ca4858dc98c0ceaaae39c58d8115edcad8d5845 --- /dev/null +++ b/visualization/visualizations/distribution_charts.py @@ -0,0 +1,394 @@ +""" +Distribution visualization components using Plotly +Creates charts for intent, language, and other distributions +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + + +class DistributionCharts: + """ + Creates distribution visualizations + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r') as f: + self.config = json.load(f) + + self.intent_colors = self.config['color_schemes']['intent'] + self.platform_colors = self.config['color_schemes']['platform'] + self.brand_colors = self.config['color_schemes']['brand'] + 
self.intent_order = self.config['intent_order'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_intent_bar_chart(self, df, title="Intent Distribution", orientation='h'): + """ + Create horizontal bar chart for intent distribution (handles multi-label) + + Args: + df: Sentiment dataframe + title: Chart title + orientation: 'h' for horizontal, 'v' for vertical + + Returns: + plotly.graph_objects.Figure + """ + # Explode intents + df_exploded = df.copy() + df_exploded['intent'] = df_exploded['intent'].str.split(',') + df_exploded = df_exploded.explode('intent') + df_exploded['intent'] = df_exploded['intent'].str.strip() + + # Count intents + intent_counts = df_exploded['intent'].value_counts() + + # Order by intent_order + ordered_intents = [i for i in self.intent_order if i in intent_counts.index] + intent_counts = intent_counts[ordered_intents] + + colors = [self.intent_colors.get(i, '#CCCCCC') for i in intent_counts.index] + + if orientation == 'h': + fig = go.Figure(data=[go.Bar( + y=intent_counts.index, + x=intent_counts.values, + orientation='h', + marker=dict(color=colors), + text=intent_counts.values, + textposition='auto', + hovertemplate='%{y}
Count: %{x}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Number of Comments", + yaxis_title="Intent", + height=self.chart_height, + yaxis={'categoryorder': 'total ascending'} + ) + else: + fig = go.Figure(data=[go.Bar( + x=intent_counts.index, + y=intent_counts.values, + marker=dict(color=colors), + text=intent_counts.values, + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Intent", + yaxis_title="Number of Comments", + height=self.chart_height + ) + + return fig + + def create_intent_pie_chart(self, df, title="Intent Distribution"): + """ + Create pie chart for intent distribution + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Explode intents + df_exploded = df.copy() + df_exploded['intent'] = df_exploded['intent'].str.split(',') + df_exploded = df_exploded.explode('intent') + df_exploded['intent'] = df_exploded['intent'].str.strip() + + intent_counts = df_exploded['intent'].value_counts() + + # Order by intent_order + ordered_intents = [i for i in self.intent_order if i in intent_counts.index] + intent_counts = intent_counts[ordered_intents] + + colors = [self.intent_colors.get(i, '#CCCCCC') for i in intent_counts.index] + + fig = go.Figure(data=[go.Pie( + labels=intent_counts.index, + values=intent_counts.values, + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05) + ) + + return fig + + def create_platform_distribution(self, df, title="Comments by Platform"): + """ + Create bar chart for platform distribution + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + platform_counts = df['platform'].value_counts() + + colors = [self.platform_colors.get(p, self.platform_colors['default']) for p in platform_counts.index] + + fig = go.Figure(data=[go.Bar( + x=platform_counts.index, + y=platform_counts.values, + marker=dict(color=colors), + text=platform_counts.values, + textposition='auto', + hovertemplate='%{x}
Comments: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Platform", + yaxis_title="Number of Comments", + height=self.chart_height + ) + + return fig + + def create_brand_distribution(self, df, title="Comments by Brand"): + """ + Create bar chart for brand distribution + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + brand_counts = df['brand'].value_counts() + + colors = [self.brand_colors.get(b, self.brand_colors['default']) for b in brand_counts.index] + + fig = go.Figure(data=[go.Bar( + x=brand_counts.index, + y=brand_counts.values, + marker=dict(color=colors), + text=brand_counts.values, + textposition='auto', + hovertemplate='%{x}
Comments: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Brand", + yaxis_title="Number of Comments", + height=self.chart_height + ) + + return fig + + def create_language_distribution(self, df, top_n=10, title="Language Distribution"): + """ + Create bar chart for language distribution + + Args: + df: Sentiment dataframe + top_n: Number of top languages to show + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if 'detected_language' not in df.columns: + return go.Figure().add_annotation( + text="No language data available", + xref="paper", yref="paper", + x=0.5, y=0.5, showarrow=False + ) + + lang_counts = df['detected_language'].value_counts().head(top_n) + + fig = go.Figure(data=[go.Bar( + x=lang_counts.index, + y=lang_counts.values, + marker=dict(color='#2196F3'), + text=lang_counts.values, + textposition='auto', + hovertemplate='%{x}
Comments: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Language", + yaxis_title="Number of Comments", + height=self.chart_height + ) + + return fig + + def create_combined_distribution_sunburst(self, df, title="Hierarchical Distribution"): + """ + Create sunburst chart showing hierarchical distribution + (Brand > Platform > Sentiment) + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Prepare data for sunburst + sunburst_data = df.groupby(['brand', 'platform', 'sentiment_polarity']).size().reset_index(name='count') + + fig = px.sunburst( + sunburst_data, + path=['brand', 'platform', 'sentiment_polarity'], + values='count', + title=title, + height=500 + ) + + fig.update_layout( + margin=dict(t=50, l=0, r=0, b=0) + ) + + return fig + + def create_brand_platform_matrix(self, df, title="Brand-Platform Comment Matrix"): + """ + Create heatmap showing comment distribution across brands and platforms + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table + matrix_data = pd.crosstab(df['brand'], df['platform']) + + fig = go.Figure(data=go.Heatmap( + z=matrix_data.values, + x=matrix_data.columns, + y=matrix_data.index, + colorscale='Blues', + text=matrix_data.values, + texttemplate='%{text}', + textfont={"size": 14}, + hovertemplate='%{y} - %{x}
Comments: %{z}', + colorbar=dict(title="Comments") + )) + + fig.update_layout( + title=title, + xaxis_title="Platform", + yaxis_title="Brand", + height=self.chart_height + ) + + return fig + + def create_reply_required_chart(self, df, group_by='brand', title="Comments Requiring Reply"): + """ + Create stacked bar chart showing reply requirements + + Args: + df: Sentiment dataframe + group_by: Column to group by + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create aggregation + reply_data = df.groupby([group_by, 'requires_reply']).size().reset_index(name='count') + reply_pivot = reply_data.pivot(index=group_by, columns='requires_reply', values='count').fillna(0) + + fig = go.Figure() + + if False in reply_pivot.columns: + fig.add_trace(go.Bar( + name='No Reply Needed', + x=reply_pivot.index, + y=reply_pivot[False], + marker_color='#81C784', + hovertemplate='%{x}
No Reply: %{y}' + )) + + if True in reply_pivot.columns: + fig.add_trace(go.Bar( + name='Reply Required', + x=reply_pivot.index, + y=reply_pivot[True], + marker_color='#FF7043', + hovertemplate='%{x}
Reply Required: %{y}' + )) + + fig.update_layout( + title=title, + xaxis_title=group_by.capitalize(), + yaxis_title="Number of Comments", + barmode='stack', + height=self.chart_height, + legend=dict(title="Reply Status", orientation="v", yanchor="top", y=1, xanchor="left", x=1.02) + ) + + return fig + + def create_engagement_scatter(self, content_summary_df, title="Content Engagement Analysis"): + """ + Create scatter plot showing content engagement + + Args: + content_summary_df: DataFrame with content summary statistics + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + fig = px.scatter( + content_summary_df, + x='total_comments', + y='negative_percentage', + size='reply_required_count', + color='negative_percentage', + hover_data=['content_description'], + title=title, + labels={ + 'total_comments': 'Total Comments', + 'negative_percentage': 'Negative Sentiment %', + 'reply_required_count': 'Replies Required' + }, + color_continuous_scale='RdYlGn_r', + height=self.chart_height + ) + + fig.update_layout( + xaxis_title="Total Comments", + yaxis_title="Negative Sentiment %", + coloraxis_colorbar=dict(title="Negative %") + ) + + return fig \ No newline at end of file diff --git a/visualization/visualizations/sentiment_charts.py b/visualization/visualizations/sentiment_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..7881f1bbc2222528db302be375954b4f27701c8b --- /dev/null +++ b/visualization/visualizations/sentiment_charts.py @@ -0,0 +1,297 @@ +""" +Sentiment visualization components using Plotly +Creates interactive charts for sentiment analysis +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + + +class SentimentCharts: + """ + Creates sentiment-related visualizations + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to 
configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r') as f: + self.config = json.load(f) + + self.sentiment_colors = self.config['color_schemes']['sentiment_polarity'] + self.sentiment_order = self.config['sentiment_order'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_sentiment_pie_chart(self, df, title="Sentiment Distribution"): + """ + Create pie chart for sentiment distribution + + Args: + df: Sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + sentiment_counts = df['sentiment_polarity'].value_counts() + + # Order by sentiment_order + ordered_sentiments = [s for s in self.sentiment_order if s in sentiment_counts.index] + sentiment_counts = sentiment_counts[ordered_sentiments] + + colors = [self.sentiment_colors.get(s, '#CCCCCC') for s in sentiment_counts.index] + + fig = go.Figure(data=[go.Pie( + labels=sentiment_counts.index, + values=sentiment_counts.values, + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05) + ) + + return fig + + def create_sentiment_bar_chart(self, df, group_by, title="Sentiment Distribution"): + """ + Create stacked bar chart for sentiment distribution by group + + Args: + df: Sentiment dataframe + group_by: Column to group by + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table + sentiment_pivot = pd.crosstab(df[group_by], df['sentiment_polarity']) + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in sentiment_pivot.columns] + sentiment_pivot = sentiment_pivot[ordered_columns] + + fig = go.Figure() + + for sentiment in sentiment_pivot.columns: + fig.add_trace(go.Bar( + name=sentiment, + x=sentiment_pivot.index, + y=sentiment_pivot[sentiment], + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{x}
%{y} comments' + )) + + fig.update_layout( + title=title, + xaxis_title=group_by.capitalize(), + yaxis_title="Number of Comments", + barmode='stack', + height=self.chart_height, + legend=dict(title="Sentiment", orientation="v", yanchor="top", y=1, xanchor="left", x=1.02) + ) + + return fig + + def create_sentiment_percentage_bar_chart(self, df, group_by, title="Sentiment Distribution (%)"): + """ + Create 100% stacked bar chart for sentiment distribution + + Args: + df: Sentiment dataframe + group_by: Column to group by + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table with percentages + sentiment_pivot = pd.crosstab(df[group_by], df['sentiment_polarity'], normalize='index') * 100 + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in sentiment_pivot.columns] + sentiment_pivot = sentiment_pivot[ordered_columns] + + fig = go.Figure() + + for sentiment in sentiment_pivot.columns: + fig.add_trace(go.Bar( + name=sentiment, + x=sentiment_pivot.index, + y=sentiment_pivot[sentiment], + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{x}
%{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis_title=group_by.capitalize(), + yaxis_title="Percentage (%)", + barmode='stack', + height=self.chart_height, + yaxis=dict(range=[0, 100]), + legend=dict(title="Sentiment", orientation="v", yanchor="top", y=1, xanchor="left", x=1.02) + ) + + return fig + + def create_sentiment_heatmap(self, df, row_dimension, col_dimension, title="Sentiment Heatmap"): + """ + Create heatmap showing sentiment distribution across two dimensions + + Args: + df: Sentiment dataframe + row_dimension: Row dimension + col_dimension: Column dimension + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table for negative sentiment percentage + negative_sentiments = self.config['negative_sentiments'] + df_negative = df[df['sentiment_polarity'].isin(negative_sentiments)] + + heatmap_data = pd.crosstab( + df[row_dimension], + df[col_dimension], + values=(df['sentiment_polarity'].isin(negative_sentiments)).astype(int), + aggfunc='mean' + ) * 100 + + fig = go.Figure(data=go.Heatmap( + z=heatmap_data.values, + x=heatmap_data.columns, + y=heatmap_data.index, + colorscale='RdYlGn_r', + text=heatmap_data.values.round(1), + texttemplate='%{text}%', + textfont={"size": 12}, + hovertemplate='%{y} - %{x}
Negative: %{z:.1f}%', + colorbar=dict(title="Negative %") + )) + + fig.update_layout( + title=title, + xaxis_title=col_dimension.capitalize(), + yaxis_title=row_dimension.capitalize(), + height=self.chart_height + ) + + return fig + + def create_sentiment_timeline(self, df, freq='D', title="Sentiment Over Time"): + """ + Create line chart showing sentiment trends over time + + Args: + df: Sentiment dataframe with comment_timestamp + freq: Frequency for aggregation ('D', 'W', 'M') + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + if 'comment_timestamp' not in df.columns: + return go.Figure().add_annotation( + text="No timestamp data available", + xref="paper", yref="paper", + x=0.5, y=0.5, showarrow=False + ) + + df_temp = df.copy() + df_temp['date'] = pd.to_datetime(df_temp['comment_timestamp']).dt.to_period(freq).dt.to_timestamp() + + # Aggregate by date and sentiment + timeline_data = df_temp.groupby(['date', 'sentiment_polarity']).size().reset_index(name='count') + + fig = go.Figure() + + for sentiment in self.sentiment_order: + sentiment_data = timeline_data[timeline_data['sentiment_polarity'] == sentiment] + if not sentiment_data.empty: + fig.add_trace(go.Scatter( + x=sentiment_data['date'], + y=sentiment_data['count'], + name=sentiment, + mode='lines+markers', + line=dict(color=self.sentiment_colors.get(sentiment, '#CCCCCC'), width=2), + marker=dict(size=6), + hovertemplate='%{x}
Count: %{y}' + )) + + fig.update_layout( + title=title, + xaxis_title="Date", + yaxis_title="Number of Comments", + height=self.chart_height, + legend=dict(title="Sentiment", orientation="v", yanchor="top", y=1, xanchor="left", x=1.02), + hovermode='x unified' + ) + + return fig + + def create_sentiment_score_gauge(self, avg_score, title="Overall Sentiment Score"): + """ + Create gauge chart for average sentiment score + + Args: + avg_score: Average sentiment score (-2 to +2) + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Normalize score to 0-100 scale + normalized_score = ((avg_score + 2) / 4) * 100 + + fig = go.Figure(go.Indicator( + mode="gauge+number+delta", + value=normalized_score, + domain={'x': [0, 1], 'y': [0, 1]}, + title={'text': title, 'font': {'size': 20}}, + number={'suffix': '', 'font': {'size': 40}}, + gauge={ + 'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "darkblue"}, + 'bar': {'color': "darkblue"}, + 'bgcolor': "white", + 'borderwidth': 2, + 'bordercolor': "gray", + 'steps': [ + {'range': [0, 20], 'color': '#D32F2F'}, + {'range': [20, 40], 'color': '#FF6F00'}, + {'range': [40, 60], 'color': '#FFB300'}, + {'range': [60, 80], 'color': '#7CB342'}, + {'range': [80, 100], 'color': '#00C851'} + ], + 'threshold': { + 'line': {'color': "black", 'width': 4}, + 'thickness': 0.75, + 'value': normalized_score + } + } + )) + + fig.update_layout( + height=300, + margin=dict(l=20, r=20, t=60, b=20) + ) + + return fig \ No newline at end of file diff --git a/visualization_brand_sentiment/README.md b/visualization_brand_sentiment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1171361b43282850db760825400260c8268d5045 --- /dev/null +++ b/visualization_brand_sentiment/README.md @@ -0,0 +1,518 @@ +# Brand Sentiment Visualization Dashboard + +An interactive Streamlit dashboard for visualizing Sabian Cymbals brand sentiment from **two data sources**: Musora Forum posts and social media comments 
(YouTube). Provides comprehensive brand health monitoring, sentiment analysis, product insights, competitive intelligence, user demographics, AI-powered brand insights, and professional PDF report generation. + +## Overview + +The dashboard connects to Snowflake, loads pre-processed brand sentiment data (produced by the `processing_brand_sentiment` pipeline), and presents it through interactive charts, filterable views, and an AI-powered insight agent. It supports analyzing forum posts and social media comments independently or combined. A PDF export module generates high-quality print-ready reports with all visualizations and descriptive analysis. + +## Data Sources + +| Source | Snowflake Table | Description | +|--------|----------------|-------------| +| **Forum Posts** | `SABIAN_BRAND_ANALYSIS` | ML-processed forum posts from Musora drumming community | +| **Social Media Comments** | `SABIAN_BRAND_ANALYSIS_COMMENTS` | ML-processed comments from YouTube channels (Drumeo, etc.) | +| **User Demographics** | `musora_users` | Musora user profiles (age, experience, gear ownership) | +| **Raw Forum Posts** | `forum_posts` | Raw forum post content used for competitor mention counts outside processed data | + +### Shared Analysis Columns (Both Sources) + +| Column | Description | +|--------|-------------| +| `sentiment_level` | very_positive, positive, neutral, negative, very_negative | +| `author_role` | current_owner, past_owner, potential_buyer, never_owned, unknown | +| `sabian_mention_context` | primary_focus, significant_mention, casual_mention, comparison_context | +| `emotion_type` | frustration, disappointment, anger, satisfaction, excitement, curiosity, indifference | +| `products_mentioned` | JSON array of Sabian products (HHX, AAX, Artisan, etc.) | +| `competitors_mentioned` | JSON array (Zildjian, Meinl, Paiste, etc.) 
| +| `intents` | JSON array of user intents | +| `pain_points` | JSON array of pain points | +| `delight_factors` | JSON array of delight factors | +| `purchase_stage` | researching, deciding, recently_purchased, long_term_owner, selling_replacing | + +### Source-Specific Columns + +| Forum Posts | Social Media Comments | +|-------------|----------------------| +| `post_id`, `thread_id`, `thread_title` | `comment_id`, `comment_sk` | +| `post_created_at` | `comment_timestamp` | +| `post_author_id` | `author_name`, `author_id` | +| `cleaned_content`, `original_content` | `original_text` | +| `thread_context_summary` | `platform`, `channel_display_name`, `content_title` | + +## Architecture + +``` +visualization_brand_sentiment/ +├── app.py # Main Streamlit entry point +├── requirements.txt # Python dependencies +├── README.md # This file +├── img/ # Static assets +│ └── musora.png # Musora logo +├── config/ +│ └── viz_config.json # Colors, queries, brand settings, dashboard config +├── components/ +│ ├── __init__.py +│ ├── dashboard.py # Main dashboard page (forums + social media sections) +│ └── sentiment_analysis.py # Deep-dive analysis with source/platform filters +├── data/ +│ ├── __init__.py +│ └── data_loader.py # Snowflake connection, caching, filtering +├── visualizations/ +│ ├── __init__.py +│ ├── brand_charts.py # Product & competitor charts +│ ├── content_cards.py # Post/comment cards, summary stats, AI insights +│ ├── demographic_charts.py # Experience, ownership, age, timezone charts +│ ├── distribution_charts.py # Intent, context, stage, sunburst charts +│ └── sentiment_charts.py # Sentiment pie, bar, timeline, gauge, emotion charts +├── utils/ +│ ├── __init__.py +│ ├── data_processor.py # Data aggregation & distribution calculations +│ ├── llm_helper.py # OpenAI API wrapper with retry logic +│ ├── metrics.py # KPI & metric calculations +│ └── pdf_exporter.py # High-quality PDF report generation +└── agents/ + ├── __init__.py + ├── base_agent.py # 
Abstract base agent class + └── brand_insight_agent.py # AI-powered brand analysis agent +``` + +### Data Flow + +``` +Snowflake + ├── Forum Posts ──> load_posts_data() ──> _process_posts_dataframe() + ├── Comments ──> load_comments_data() ──> _process_comments_dataframe() + ├── Users ──> load_users_data() ──> _process_users_dataframe() + │ │ + │ merge_posts_with_users() + │ │ + │ ┌─────────────────────────┤ + │ │ │ + │ Sidebar Main Content + │ (Global Filters) ┌────────┴────────┐ + │ │ │ │ + │ apply_filters() Dashboard Sentiment Analysis + │ │ (forums + (source selector, + │ └─────> comments platform/channel + │ sections) filters, AI insights, + │ │ post/comment explorer) + │ │ + │ PDF Export + │ (DashboardPDFExporter) + │ + └── Raw Forum Posts ──> load_competitor_forum_mentions(date_range) + │ + ┌──────────────┘ + │ + get_overall_brand_mentions(posts_df, comments_df, additional_mentions_df) + │ + Brand mention chart (Sabian + all competitors) +``` + +### Overall Brand Mentions Data Flow + +The "Overall Brand Mentions" section compares Sabian's total mention count against competitors by combining three data sources: + +1. **Sabian count**: Sum of filtered forum posts + filtered social media comments +2. **Competitor mentions (processed)**: Competitor names extracted by ML from within Sabian-related posts/comments (already date-filtered via `apply_filters`) +3. **Competitor mentions (additional)**: Competitor name matches from raw `forum_posts` that were NOT processed as Sabian-related (date-filtered at the SQL level via `load_competitor_forum_mentions(date_range)`) + +All three sources respect the sidebar date range filter, ensuring consistent time-window comparisons. The competitor mention SQL query uses `POST_CREATED_AT` on the raw `forum_posts` table to match the same date window applied to the processed data. 
+ +## Pages + +### Dashboard + +The main overview page with two major sections: + +**Forum Posts Section:** +- Summary statistics (total posts, unique threads/authors, sentiment %) +- Brand health indicator (Excellent/Good/Fair/Needs Attention/Critical) +- Sentiment distribution (pie chart + gauge) +- Author role analysis (distribution + sentiment breakdown by role) +- Potential buyer insights (count, sentiment, purchase stage) +- Sabian products analysis (top products mentioned, sentiment breakdown, detailed metrics table) +- Overall brand mentions (Sabian vs competitors across all sources, date-filtered) +- Competitive analysis (competitors mentioned, sentiment heatmap, brand switching flow) +- Intents & feedback (intent distribution, pain points vs delight factors) +- Mention context & purchase journey +- User demographics (drumming experience, cymbal ownership, gear brands, age, timezone) +- Emotion analysis +- Temporal trends (daily/weekly/monthly) +- Processing status & hierarchical sunburst view +- PDF report export + +**Social Media Comments Section:** +- Separate section with distinct header styling +- Comment summary stats (total comments, platforms, channels, sentiment %) +- Brand health indicator +- Platform breakdown (pie chart by platform, sentiment by platform, per-platform metrics) +- Comment sentiment distribution (pie + gauge) +- Author role analysis +- Products & competitors in comments +- Intents & feedback +- Temporal trends +- Processing status + +### Sentiment Analysis + +A deep-dive analysis page with multi-source support: + +**Data Source Selection:** +- Radio button: **Forum Posts** / **Social Media Comments** / **All Sources** +- Dynamically adapts the entire page based on selection + +**Filters (14 dimensions):** +- **Platform** and **Channel** filters (appear dynamically for social media sources) +- Sentiment Level, Author Role, Products Mentioned, Competitors Mentioned +- Intents, Purchase Stage, Pain Points, Delight Factors +- Mention 
Context, Processing Status, Date Range + +**Summary Statistics:** +- Adapts metrics to source type (threads for forums, platforms for comments, split counts for all) + +**Visualizations:** +- Sentiment pie chart, intent distribution, products & competitors bar charts + +**AI-Powered Brand Insights:** +- Uses `BrandInsightAgent` with OpenAI (gpt-5-nano) +- Samples up to 50 entries with diverse sentiment distribution +- Works with both forum posts and social media comments +- Generates: executive summary, sentiment analysis, product insights, competitive position, customer journey, key themes, actionable recommendations, notable quotes + +**Post/Comment Explorer:** +- Paginated browsing with source-appropriate cards +- Forum posts: `display_post_card()` (thread title, author role, date, sentiment, content, tags) +- Social media comments: `display_comment_card()` (content title, author name, platform badge, channel, content, tags) +- Sort options adapt to data source +- Full detail expandable view + +**Export:** +- CSV download with source-appropriate columns +- List columns (products, competitors, etc.) 
converted to comma-separated strings + +## Key Components + +### Data Loader (`data/data_loader.py`) + +| Method | Description | +|--------|-------------| +| `load_posts_data()` | Load forum posts (cached 5 min) | +| `load_comments_data()` | Load social media comments (cached 5 min) | +| `load_users_data()` | Load user demographics (cached 10 min) | +| `load_competitor_forum_mentions(date_range)` | Load competitor mention counts from raw forum posts, filtered by date range (cached 5 min, keyed on date_range) | +| `merge_posts_with_users()` | Left-join posts with user demographics | +| `get_filter_options(df)` | Extract unique values for filter dropdowns | +| `get_comment_filter_options(df)` | Extends filter options with platforms & channels | +| `apply_filters(df, ...)` | Apply any combination of 14 filter dimensions | + +**Competitor Mention Query (`_build_competitor_mention_query`):** +- Builds a `UNION ALL` SQL query across all configured competitors +- Matches competitor names and aliases against `POST_CONTENT` using word-boundary `LIKE` patterns +- Excludes posts already in the processed `sabian_brand_analysis` table +- Applies `POST_CREATED_AT` date filter when a date range is provided, keeping counts aligned with the filtered Sabian data + +**Data Processing:** +- Column name normalization (lowercase) +- Datetime parsing for all date columns +- JSON array parsing for multi-value columns (products, competitors, intents, etc.) 
+- `data_source` column: `'forums'` for posts, `'comments'` for social media +- `display_text` unification (cleaned_content for posts, original_text for comments) +- Brand switching categorization +- Age/experience calculation and grouping +- Cymbal ownership detection +- Timezone region extraction + +### Metrics (`utils/metrics.py`) + +| Method | Description | +|--------|-------------| +| `calculate_overall_metrics(df)` | Total posts, sentiment %, score, unique threads/authors | +| `calculate_product_metrics(df)` | Per-product sentiment stats | +| `calculate_competitor_metrics(df)` | Per-competitor sentiment stats | +| `calculate_brand_switching_metrics(df)` | Switching to/from Sabian counts and ratio | +| `calculate_potential_buyer_metrics(df)` | Buyer segment analysis | +| `calculate_pain_delight_metrics(df)` | Pain/delight ratio and top items | +| `calculate_intent_metrics(df)` | Intent distribution analysis | +| `calculate_trend_metrics(df)` | Period-over-period sentiment comparison | +| `calculate_demographics_metrics(df)` | User demographics summary | +| `get_sentiment_health_status(pct)` | Health status based on negative % | + +**Health Status Levels:** + +| Negative % | Status | Color | +|-----------|--------|-------| +| < 10% | Excellent | Green | +| 10-20% | Good | Light Green | +| 20-30% | Fair | Amber | +| 30-50% | Needs Attention | Orange | +| > 50% | Critical | Red | + +### Data Processor (`utils/data_processor.py`) + +All static methods for distribution and aggregation calculations: + +- `get_sentiment_distribution()`, `get_author_role_distribution()`, `get_author_role_by_sentiment()` +- `get_products_distribution()`, `get_product_sentiment_breakdown()` +- `get_competitors_distribution()`, `get_competitor_sentiment_breakdown()` +- `get_overall_brand_mentions(posts_df, comments_df, additional_mentions_df)` - combines Sabian counts from filtered posts/comments with competitor additional mentions into a unified brand comparison DataFrame +- 
`get_intents_distribution()`, `get_pain_points_distribution()`, `get_delight_factors_distribution()` +- `get_purchase_stage_distribution()`, `get_mention_context_distribution()` +- `get_emotion_distribution()`, `get_processing_status_distribution()` +- `get_temporal_trends()`, `get_cymbal_ownership_analysis()`, `get_sentiment_by_cymbal_ownership()` +- `get_demographics_distribution()`, `get_gear_brand_analysis()` + +### PDF Exporter (`utils/pdf_exporter.py`) + +Generates comprehensive, high-quality PDF reports from dashboard data and visualizations. + +**Classes:** + +| Class | Description | +|-------|-------------| +| `SabianPDF` | Custom `FPDF` subclass with Sabian branding (header, footer, section headers, metric rows, data tables) | +| `DashboardPDFExporter` | Main report generator that orchestrates all visualization modules into a structured PDF | + +**Key Features:** + +- **High-DPI rendering**: Charts are rendered as PNG at 3x scale (`RENDER_SCALE = 3`) via Plotly/kaleido, producing ~300 DPI output at print size. This class attribute can be adjusted for different quality/size trade-offs. +- **Text cutoff prevention**: `_prepare_fig_for_pdf()` applies `automargin=True` on all axes, generous padding, and increased font sizes before rendering. Handles all chart types (bar, pie, gauge, funnel, sunburst) safely. +- **Structured page layout**: Each analytical section starts on its own page with a section header and italicized description explaining what the visualizations show and how to interpret them. Descriptions are stored in the module-level `SECTION_DESCRIPTIONS` dictionary for easy editing. +- **Section descriptions**: The `SECTION_DESCRIPTIONS` dict maps section keys to 2-4 sentence paragraphs rendered via `SabianPDF.section_description()`. This provides context for non-technical readers without cluttering chart code. + +**Report Sections (in order):** + +1. Cover page (branding, date, data counts, active filters) +2. 
Executive summary (health status, KPIs, key findings) +3. Sentiment distribution (pie + gauge, breakdown text) +4. Author role analysis (role distribution + sentiment by role, buyer insights) +5. Sabian products analysis (top products bar, sentiment breakdown, detailed metrics table) +6. Overall brand mentions (brand comparison chart, mention breakdown table) +7. Competitive analysis (competitors bar, sentiment heatmap, brand switching) +8. Intents & feedback (intent bar, pain/delight comparison, ratio metrics) +9. Mention context & purchase journey (context + stage charts) +10. User demographics (experience, cymbal ownership, age, timezone) +11. Emotion analysis (emotion distribution bar) +12. Social media comments (if available: metrics, platform breakdown, sentiment, products, competitors) +13. Appendix: data summary (volumes, date ranges, filters, methodology) + +**Chart Rendering Pipeline:** + +``` +Plotly Figure + │ + ├── _prepare_fig_for_pdf(fig, is_side_by_side) + │ Sets white background, font sizes, margins, automargin + │ + ├── pio.to_image(fig, format='png', scale=RENDER_SCALE) + │ Renders at 3x resolution (e.g., 800x400 base → 2400x1200 actual) + │ + ├── Temporary PNG file + │ + └── pdf.image(path, x, w) + Embeds into PDF at specified width (aspect ratio preserved) +``` + +### AI Agent (`agents/brand_insight_agent.py`) + +- Inherits from `BaseVisualizationAgent` +- Uses `LLMHelper` for OpenAI API calls (gpt-5-nano, JSON mode) +- **Diverse sampling**: 1/3 negative, 1/3 positive, 1/3 neutral (up to 50 entries) +- **Multi-source support**: Extracts text from `cleaned_content`, `original_content`, `original_text`, or `display_text` +- **Context-aware**: Includes thread title or content title, platform, and channel in LLM context +- **Structured output**: Executive summary, sentiment analysis, product/competitive insights, themes, recommendations, quotes + +### Visualization Modules + +| Module | Charts | +|--------|--------| +| `sentiment_charts.py` | Pie, 
bar, percentage bar, heatmap, timeline, gauge, emotion distribution | +| `brand_charts.py` | Products horizontal bar, product sentiment breakdown, overall brand mentions, competitors bar, competitive heatmap, switching flow | +| `distribution_charts.py` | Author role, mention context, intent bar, pain/delight comparison, purchase stage, processing status, sunburst | +| `demographic_charts.py` | Experience distribution, cymbal ownership, ownership sentiment, gear brands, age distribution, timezone | +| `content_cards.py` | Summary stats, health indicator, post card, comment card, comment summary stats, filter summary, AI insights display, pagination | + +## Configuration (`config/viz_config.json`) + +### Brand Settings +```json +{ + "name": "Sabian", + "primary_color": "#C8102E", + "secondary_color": "#1E1E1E", + "accent_color": "#FFD700" +} +``` + +### Color Schemes +Configured for: sentiment levels (5), author roles (5), mention contexts (4), intents (8), purchase stages (5), feedback aspects (8), competitors (6), products (10), emotions (7), brand switching (3), processing status (4), platforms (forums + youtube), data sources (forums + comments). 
+ +### Competitor Aliases +Each competitor has a list of aliases used for word-boundary matching in the raw forum posts query: +- Zildjian: zildjian, z custom, a custom, k custom, k zildjian, a zildjian +- Meinl: meinl, byzance +- Paiste: paiste, formula 602 +- Dream Cymbals: dream cymbals +- Istanbul Agop: istanbul agop, agop, istanbul mehmet +- Bosphorus: bosphorus + +### Snowflake Queries +- **Posts**: `SELECT * FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS WHERE IS_RELEVANT = TRUE AND PROCESSING_SUCCESS = TRUE` +- **Comments**: `SELECT * FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS WHERE IS_RELEVANT = TRUE AND PROCESSING_SUCCESS = TRUE` +- **Users**: JOIN `usora_users` for authors found in the posts table +- **Competitor Mentions**: Dynamically built `UNION ALL` query against `social_media_db.core.forum_posts` with optional `POST_CREATED_AT` date filtering + +### Dashboard Settings +- `max_posts_display`: 100 +- `chart_height`: 400 +- `top_n_products`: 10 +- `top_n_competitors`: 10 + +### Demographics Config +- Experience groups: Beginner (0-5yr), Intermediate (5-15yr), Advanced (15-25yr), Expert (25+yr) +- Age groups: 18-24, 25-34, 35-44, 45-54, 55+ + +## Setup + +### Prerequisites +- Python 3.8+ +- Snowflake account with access to `SOCIAL_MEDIA_DB` +- OpenAI API key (for AI insights) +- Processed data from `processing_brand_sentiment` pipeline + +### Environment Variables + +Required in `.env` (project root): +```env +SNOWFLAKE_USER=your_user +SNOWFLAKE_PASSWORD=your_password +SNOWFLAKE_ACCOUNT=your_account +SNOWFLAKE_ROLE=your_role +SNOWFLAKE_DATABASE=SOCIAL_MEDIA_DB +SNOWFLAKE_WAREHOUSE=your_warehouse +SNOWFLAKE_SCHEMA=ML_FEATURES + +OPENAI_API_KEY=your_openai_key +``` + +### Installation + +```bash +cd visualization_brand_sentiment +pip install -r requirements.txt +``` + +### Dependencies + +| Package | Version | Purpose | +|---------|---------|---------| +| streamlit | >= 1.28.0 | Web framework | +| plotly | >= 5.17.0 | 
Interactive charts | +| fpdf2 | >= 2.7.0 | PDF report generation | +| kaleido | >= 0.2.1 | Plotly chart image export (PNG) | +| pandas | >= 2.0.0 | Data processing | +| numpy | >= 1.24.0 | Numerical operations | +| python-dateutil | >= 2.8.0 | Date parsing | +| snowflake-snowpark-python | >= 1.8.0 | Snowflake connectivity | +| python-dotenv | >= 1.0.0 | Environment management | +| openai | >= 1.0.0 | AI agent (gpt-5-nano) | + +## Usage + +### Running the Dashboard + +```bash +streamlit run visualization_brand_sentiment/app.py +``` + +Opens at `http://localhost:8501`. + +### Navigation + +**Sidebar:** +- Page selection: Dashboard or Sentiment Analysis +- Global filters: Sentiment, Author Role, Date Range, Products, Competitors, Intents, Context, Processing Status +- Apply / Reset filter buttons +- Reload Data button (clears cache) +- Data info panel (post count, comment count, last update, demographics) + +**Dashboard Page:** +- Scroll through forum posts analysis sections +- Social media comments section appears below with its own analytics +- PDF export button at the bottom generates a downloadable report + +**Sentiment Analysis Page:** +1. Select data source (Forum Posts / Social Media Comments / All Sources) +2. When social media is selected, Platform and Channel filters appear +3. Apply standard filters (sentiment, products, competitors, etc.) +4. Review summary statistics and visualizations +5. Generate AI insights (samples up to 50 entries) +6. Browse individual posts/comments with paginated explorer +7. Export filtered data as CSV + +### PDF Report Export + +1. Apply desired filters in the sidebar (including date range) +2. Scroll to the bottom of the Dashboard page +3. Click "Generate PDF Report" +4. Click "Download PDF Report" once generation completes + +The exported PDF includes all dashboard sections with high-resolution charts (~300 DPI), descriptive section introductions, metric boxes, data tables, and a methodology appendix. 
Each section starts on its own page for clean printing. + +## Performance + +### Caching +- Forum posts: 5-minute TTL (`@st.cache_data(ttl=300)`) +- Comments: 5-minute TTL +- Competitor forum mentions: 5-minute TTL, cache keyed on `date_range` (different date selections cache independently) +- User demographics: 10-minute TTL (`@st.cache_data(ttl=600)`) +- Clear all caches via "Reload Data" button + +### AI Agent +- Samples max 50 entries per analysis (cost control) +- Uses gpt-5-nano (fast, cost-effective) +- Retry logic with exponential backoff (3 attempts) +- Session state caching for generated insights + +### PDF Generation +- Chart rendering uses kaleido engine at 3x scale +- Typical report generation: 15-20 charts, ~30 seconds +- Output PDF size: ~5-8 MB (high-resolution charts) +- Temporary PNG files are cleaned up automatically after generation + +## Troubleshooting + +### No Data Displayed +- Verify Snowflake credentials in `.env` +- Check that `SABIAN_BRAND_ANALYSIS` and `SABIAN_BRAND_ANALYSIS_COMMENTS` tables exist and have data +- Ensure `IS_RELEVANT = TRUE` and `PROCESSING_SUCCESS = TRUE` rows exist +- Try clicking "Reload Data" to clear cache + +### AI Insights Show "Insufficient Data" +- The agent needs text content to analyze. 
Ensure comments have `original_text` populated +- Check that at least some entries pass the text extraction (not all empty) + +### Comments Section Empty +- Verify the comments query returns data +- Check platform values are normalized to lowercase +- Ensure `comment_timestamp` is parseable + +### PDF Export Fails +- Install required dependencies: `pip install fpdf2 kaleido` +- Ensure kaleido can render Plotly charts (may need `pip install --upgrade kaleido`) +- Check that chart data is not empty (empty sections are skipped gracefully) + +### Competitor Mention Counts Don't Change With Date Filter +- Verify the `POST_CREATED_AT` column exists in `social_media_db.core.forum_posts` +- Check the Snowflake query logs for date filter clauses +- Ensure the date range widget has both start and end dates selected (single-date selection is ignored) + +### Slow Performance +- Apply filters to reduce dataset size +- Use date range to limit temporal scope +- Clear browser cache if charts render slowly + +## License + +Internal use only - Musora brand sentiment analysis project. 
diff --git a/visualization_brand_sentiment/agents/__init__.py b/visualization_brand_sentiment/agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f11067edf49491eac8ed719178ba6604b3938cd --- /dev/null +++ b/visualization_brand_sentiment/agents/__init__.py @@ -0,0 +1,7 @@ +""" +Agent modules for Brand Sentiment Visualization +""" +from .base_agent import BaseVisualizationAgent +from .brand_insight_agent import BrandInsightAgent + +__all__ = ['BaseVisualizationAgent', 'BrandInsightAgent'] diff --git a/visualization_brand_sentiment/agents/base_agent.py b/visualization_brand_sentiment/agents/base_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e1eafdcb42b28a5ece7b96b3a54bf9a3948e90fb --- /dev/null +++ b/visualization_brand_sentiment/agents/base_agent.py @@ -0,0 +1,88 @@ +""" +Base Agent class for brand sentiment visualization agents +Provides common functionality and interface for all agents +""" +from abc import ABC, abstractmethod +from typing import Dict, Any +import logging + + +class BaseVisualizationAgent(ABC): + """ + Abstract base class for all visualization agents + """ + + def __init__(self, name: str, model: str = "gpt-5-nano", temperature: float = 1): + """ + Initialize base agent + + Args: + name: Agent name + model: LLM model to use + temperature: LLM temperature + """ + self.name = name + self.model = model + self.temperature = temperature + self.logger = logging.getLogger(f"brand_sentiment.agents.{name}") + + @abstractmethod + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process input data and return results + + Args: + input_data: Input data dictionary + + Returns: + Results dictionary + """ + pass + + @abstractmethod + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate input data + + Args: + input_data: Input data dictionary + + Returns: + True if valid, False otherwise + """ + pass + + def log_processing(self, message: str, level: 
str = "info"): + """ + Log processing information + + Args: + message: Log message + level: Log level (info, warning, error) + """ + log_func = getattr(self.logger, level.lower(), self.logger.info) + log_func(f"[{self.name}] {message}") + + def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]: + """ + Handle errors consistently + + Args: + error: Exception that occurred + context: Additional context information + + Returns: + Error response dictionary + """ + error_msg = f"Error in {self.name}: {str(error)}" + if context: + error_msg += f" | Context: {context}" + + self.log_processing(error_msg, level="error") + + return { + 'success': False, + 'error': str(error), + 'error_type': type(error).__name__, + 'context': context + } diff --git a/visualization_brand_sentiment/agents/brand_insight_agent.py b/visualization_brand_sentiment/agents/brand_insight_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..0da4939324dcf13cb3db50612bdcc42d728f4d77 --- /dev/null +++ b/visualization_brand_sentiment/agents/brand_insight_agent.py @@ -0,0 +1,359 @@ +""" +Brand Insight Agent +Analyzes filtered posts and generates brand-focused insights and summaries +""" +import pandas as pd +from typing import Dict, Any, List +import sys +from pathlib import Path + +# Add parent directory to path +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +from agents.base_agent import BaseVisualizationAgent +from utils.llm_helper import LLMHelper + + +class BrandInsightAgent(BaseVisualizationAgent): + """ + Agent that analyzes brand sentiment posts and extracts + brand-focused insights, patterns, and actionable recommendations + """ + + def __init__(self, model: str = "gpt-5-nano", temperature: float = 1): + """ + Initialize Brand Insight Agent + + Args: + model: LLM model to use + temperature: Temperature for generation + """ + super().__init__(name="BrandInsightAgent", model=model, temperature=temperature) + 
self.llm_helper = LLMHelper(model=model, temperature=temperature) + + def validate_input(self, input_data: Dict[str, Any]) -> bool: + """ + Validate input data + + Args: + input_data: Input dictionary + + Returns: + True if valid, False otherwise + """ + required_fields = ['posts'] + + for field in required_fields: + if field not in input_data: + self.log_processing(f"Missing required field: {field}", level="error") + return False + + if not isinstance(input_data['posts'], (list, pd.DataFrame)): + self.log_processing("Posts must be a list or DataFrame", level="error") + return False + + return True + + def _prepare_posts_context(self, posts: Any, max_posts: int = 50) -> str: + """ + Prepare posts data for LLM analysis + + Args: + posts: Posts as DataFrame or list of dicts + max_posts: Maximum number of posts to include + + Returns: + Formatted string with post data + """ + # Convert to DataFrame if needed + if isinstance(posts, list): + posts_df = pd.DataFrame(posts) + else: + posts_df = posts.copy() + + total_posts = len(posts_df) + + # Sample if too many posts + if len(posts_df) > max_posts: + # Prioritize diverse sampling: include negative, positive, and mixed + negative_posts = posts_df[posts_df['sentiment_level'].isin(['negative', 'very_negative'])] + positive_posts = posts_df[posts_df['sentiment_level'].isin(['positive', 'very_positive'])] + neutral_posts = posts_df[posts_df['sentiment_level'] == 'neutral'] + + samples = [] + n_per_group = max_posts // 3 + + if len(negative_posts) > 0: + samples.append(negative_posts.sample(n=min(n_per_group, len(negative_posts)), random_state=42)) + if len(positive_posts) > 0: + samples.append(positive_posts.sample(n=min(n_per_group, len(positive_posts)), random_state=42)) + if len(neutral_posts) > 0: + samples.append(neutral_posts.sample(n=min(n_per_group, len(neutral_posts)), random_state=42)) + + if samples: + posts_df = pd.concat(samples).drop_duplicates() + + # Format posts for analysis + posts_text = [] + for idx, row 
in posts_df.iterrows(): + # Get cleaned content or original (supports both forum posts and social media comments) + text = (row.get('cleaned_content', '') or row.get('original_content', '') + or row.get('original_text', '') or row.get('display_text', '')) + if not text: + continue + + # Truncate long text + text = text[:500] + '...' if len(str(text)) > 500 else text + + # Get relevant fields + sentiment = row.get('sentiment_level', 'unknown') + author_role = row.get('author_role', 'unknown') + products = row.get('products_mentioned', []) + competitors = row.get('competitors_mentioned', []) + intents = row.get('intents', []) + pain_points = row.get('pain_points', []) + delights = row.get('delight_factors', []) + purchase_stage = row.get('purchase_stage', '') + title = row.get('thread_title', '') or row.get('content_title', '') + platform = row.get('platform', '') + channel = row.get('channel_display_name', '') + + post_entry = f""" +--- +Entry #{len(posts_text) + 1}: +Title: {title[:100] if title else 'N/A'} +{'Platform: ' + platform if platform else ''}{' Channel: ' + channel if channel else ''} +Author Role: {author_role} +Sentiment: {sentiment} +Products Mentioned: {', '.join(products) if isinstance(products, list) else 'None'} +Competitors Mentioned: {', '.join(competitors) if isinstance(competitors, list) else 'None'} +Intents: {', '.join(intents) if isinstance(intents, list) else 'None'} +Pain Points: {', '.join(pain_points) if isinstance(pain_points, list) else 'None'} +Delight Factors: {', '.join(delights) if isinstance(delights, list) else 'None'} +Purchase Stage: {purchase_stage if purchase_stage else 'N/A'} +Content: {text} +""" + posts_text.append(post_entry) + + return f"Total posts in dataset: {total_posts}\nAnalyzing sample of {len(posts_text)} posts:\n" + "\n".join(posts_text) + + def _generate_analysis_prompt(self, posts_context: str, filter_description: str = "") -> str: + """ + Generate prompt for LLM brand analysis + + Args: + posts_context: 
Formatted posts + filter_description: Description of applied filters + + Returns: + Prompt string + """ + filter_info = f"\n**Applied Filters:** {filter_description}" if filter_description else "" + + prompt = f"""You are a brand analyst for Sabian Cymbals. Analyze the following entries from Musora's drumming community (forum posts and/or social media comments) and provide insights specifically about Sabian brand perception. + +**Important:** Focus ONLY on what users say about Sabian cymbals and their products. Ignore off-topic discussions. +{filter_info} + +{posts_context} + +**Task:** Provide a comprehensive brand analysis in JSON format: + +{{ + "executive_summary": "2-3 sentence overview of overall Sabian brand sentiment and key findings", + + "sentiment_analysis": {{ + "overall_tone": "positive/negative/mixed/neutral", + "sentiment_drivers": ["key reasons for the sentiment"], + "concerning_patterns": ["any worrying trends or issues"] + }}, + + "product_insights": {{ + "most_discussed_products": ["product names with brief sentiment"], + "product_strengths": ["what users love about specific products"], + "product_concerns": ["issues or complaints about specific products"] + }}, + + "competitive_position": {{ + "comparison_summary": "how Sabian compares to competitors in user discussions", + "advantages_vs_competitors": ["where Sabian wins"], + "disadvantages_vs_competitors": ["where competitors are preferred"], + "brand_switching_insights": "any patterns in users switching to/from Sabian" + }}, + + "customer_journey_insights": {{ + "potential_buyer_sentiment": "how potential buyers perceive Sabian", + "owner_satisfaction": "satisfaction level of current owners", + "common_purchase_drivers": ["what drives people to buy Sabian"], + "common_barriers": ["what prevents people from buying Sabian"] + }}, + + "key_themes": [ + {{ + "theme": "theme name", + "sentiment": "positive/negative/mixed", + "description": "brief explanation", + "frequency": "how common this theme 
is" + }} + ], + + "actionable_recommendations": [ + {{ + "priority": "high/medium/low", + "category": "product/marketing/customer_service/pricing", + "recommendation": "specific actionable suggestion", + "rationale": "why this matters based on the data" + }} + ], + + "notable_quotes": [ + {{ + "quote": "impactful direct quote from a post", + "context": "brief context", + "sentiment": "positive/negative/neutral" + }} + ] +}} + +**Guidelines:** +- Be specific and actionable in recommendations +- Focus on brand-relevant insights only +- Cite specific products when relevant +- Include both positive and negative insights for balance +- Limit each list to 3-5 most important items +- Quotes should be short and impactful (max 100 characters) +""" + return prompt + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process posts and generate brand insights + + Args: + input_data: { + 'posts': DataFrame or list of post dicts, + 'filter_description': Optional description of applied filters + } + + Returns: + { + 'success': bool, + 'insights': { + 'executive_summary': str, + 'sentiment_analysis': dict, + 'product_insights': dict, + 'competitive_position': dict, + 'customer_journey_insights': dict, + 'key_themes': list, + 'actionable_recommendations': list, + 'notable_quotes': list + }, + 'metadata': { + 'total_posts_analyzed': int, + 'model_used': str, + 'tokens_used': int + } + } + """ + try: + # Validate input + if not self.validate_input(input_data): + return { + 'success': False, + 'error': 'Invalid input data' + } + + posts = input_data['posts'] + filter_description = input_data.get('filter_description', '') + + self.log_processing(f"Starting brand insight analysis") + + # Convert to DataFrame if needed + if isinstance(posts, list): + posts_df = pd.DataFrame(posts) + else: + posts_df = posts.copy() + + total_posts = len(posts_df) + + if total_posts == 0: + return { + 'success': True, + 'insights': { + 'executive_summary': 'No posts available for 
analysis.', + 'sentiment_analysis': {}, + 'product_insights': {}, + 'competitive_position': {}, + 'customer_journey_insights': {}, + 'key_themes': [], + 'actionable_recommendations': [], + 'notable_quotes': [] + }, + 'metadata': { + 'total_posts_analyzed': 0, + 'model_used': self.model, + 'tokens_used': 0 + } + } + + # Prepare posts context + posts_context = self._prepare_posts_context(posts_df) + + # Generate prompt + prompt = self._generate_analysis_prompt(posts_context, filter_description) + + # System message + system_message = """You are a senior brand analyst specializing in the musical instrument industry, +particularly drums and cymbals. You provide data-driven insights that help brands understand +their market position and improve customer satisfaction. Always focus on actionable insights +and maintain objectivity in your analysis.""" + + # Get LLM response + self.log_processing("Calling LLM for brand insight generation") + response = self.llm_helper.get_structured_completion( + prompt=prompt, + system_message=system_message, + max_retries=3 + ) + + if not response['success']: + return self.handle_error( + Exception(response.get('error', 'LLM call failed')), + context=f"total_posts={total_posts}" + ) + + # Extract insights + insights = response['content'] + + # Ensure all expected fields exist + default_insights = { + 'executive_summary': '', + 'sentiment_analysis': {}, + 'product_insights': {}, + 'competitive_position': {}, + 'customer_journey_insights': {}, + 'key_themes': [], + 'actionable_recommendations': [], + 'notable_quotes': [] + } + + # Merge with defaults + for key in default_insights: + if key not in insights: + insights[key] = default_insights[key] + + self.log_processing(f"Successfully generated brand insights for {total_posts} posts") + + return { + 'success': True, + 'insights': insights, + 'metadata': { + 'total_posts_analyzed': total_posts, + 'model_used': response['model'], + 'tokens_used': response['usage']['total_tokens'] + } + } + + 
except Exception as e: + return self.handle_error(e, context="brand_insight_analysis") diff --git a/visualization_brand_sentiment/app.py b/visualization_brand_sentiment/app.py new file mode 100644 index 0000000000000000000000000000000000000000..ac2c58366923e554ee01246a60964ed6f4ffba64 --- /dev/null +++ b/visualization_brand_sentiment/app.py @@ -0,0 +1,384 @@ +""" +Sabian Brand Sentiment Analysis Dashboard +Main Streamlit Application + +Run with: streamlit run visualization_brand_sentiment/app.py +""" +import streamlit as st +import sys +import pandas as pd +from pathlib import Path +import json + +# Add parent directory to path +parent_dir = Path(__file__).resolve().parent +sys.path.append(str(parent_dir)) +sys.path.append(str(parent_dir.parent)) + +from utils.auth import check_authentication, verify_login, logout +from data.data_loader import BrandSentimentDataLoader +from components.dashboard import render_dashboard +from components.sentiment_analysis import render_sentiment_analysis + + +# Load configuration +config_path = parent_dir / "config" / "viz_config.json" +with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + +# Page configuration +st.set_page_config( + page_title=config['page_config']['page_title'], + page_icon=config['page_config']['page_icon'], + layout=config['page_config']['layout'], + initial_sidebar_state=config['page_config']['initial_sidebar_state'] +) + +# Custom CSS for Sabian branding +st.markdown(f""" + +""", unsafe_allow_html=True) + +# --- Authentication Gate --- +if not check_authentication(): + st.title("Sabian Brand Sentiment Dashboard - Login") + + st.markdown(""" + Welcome to the **Sabian Brand Sentiment Analysis Dashboard**. + + This tool is restricted to authorized Musora team members. + Please enter your credentials below to access the dashboard. 
+ """) + + with st.form("login_form"): + email = st.text_input("Email Address", placeholder="your.name@musora.com") + token = st.text_input("Access Token", type="password", placeholder="Enter your access token") + submit = st.form_submit_button("Login", use_container_width=True) + + if submit: + if verify_login(email, token): + st.session_state.authenticated = True + st.session_state.user_email = email + st.success("Login successful! Redirecting...") + st.rerun() + else: + st.error("Invalid email or access token. Please try again.") + + st.stop() + + +def main(): + """Main application function.""" + data_loader = BrandSentimentDataLoader() + + # ── Load raw data once per session ────────────────────────────────────────── + # @st.cache_data on the load_* methods handles Snowflake-level caching. + # We additionally store the merged result in session_state so that every + # filter interaction (which reruns the whole script) reads from memory instead + # of re-executing the expensive DataFrame merge or the filter-options scan. + # The spinner is only shown on this first-load path; subsequent reruns caused + # by filter widget interactions are silent and fast. + if 'raw_df' not in st.session_state: + with st.spinner("Loading data from Snowflake..."): + posts_df = data_loader.load_posts_data() + users_df = data_loader.load_users_data() + st.session_state.raw_df = ( + data_loader.merge_posts_with_users(posts_df, users_df) + if not posts_df.empty else posts_df + ) + st.session_state.raw_comments_df = data_loader.load_comments_data() + + # Pre-compute filter options from the full unfiltered dataset once. 
+ _src = (st.session_state.raw_df + if not st.session_state.raw_df.empty + else st.session_state.raw_comments_df) + st.session_state.filter_options = ( + data_loader.get_filter_options(_src) if not _src.empty + else {'sentiments': [], 'author_roles': [], 'mention_contexts': [], + 'processing_statuses': [], 'products': [], 'competitors': [], + 'intents': [], 'pain_points': [], 'delight_factors': [], + 'purchase_stages': []} + ) + + # Initial competitor/YT load with no date filter. + st.session_state.additional_mentions_df = ( + data_loader.load_competitor_forum_mentions(date_range=None) + ) + st.session_state.yt_total_comments = ( + data_loader.load_youtube_comment_count(date_range=None) + ) + + raw_df = st.session_state.raw_df + raw_comments_df = st.session_state.raw_comments_df + filter_options = st.session_state.filter_options + + # ── Sidebar ────────────────────────────────────────────────────────────────── + with st.sidebar: + # Logo + logo_path = parent_dir.parent / "visualization" / "img" / "musora.png" + if logo_path.exists(): + st.image(str(logo_path), use_container_width=True) + + # Brand header + st.markdown(f""" +
+

{config['brand']['name']}

+

Brand Sentiment Analysis

+
+ """, unsafe_allow_html=True) + + # Logout button + if st.button("Logout", use_container_width=True): + logout() + st.rerun() + + st.title("Navigation") + + # Page selection + page = st.radio( + "Select Page", + ["Dashboard", "Sentiment Analysis"], + index=0, + format_func=lambda x: f"{'📊' if x == 'Dashboard' else '🔍'} {x}" + ) + + st.markdown("---") + + # Global Filters section + st.markdown("### 🔍 Global Filters") + + if 'filters_applied' not in st.session_state: + st.session_state.filters_applied = False + + # Sentiment filter + selected_sentiments = st.multiselect( + "Sentiment Level", + options=filter_options['sentiments'], + default=[] + ) + + # Author role filter + selected_author_roles = st.multiselect( + "Author Role", + options=filter_options['author_roles'], + default=[] + ) + + # Date range filter + date_range = None + if (not raw_df.empty + and 'post_created_at' in raw_df.columns + and not raw_df['post_created_at'].isna().all()): + valid_dates = raw_df[raw_df['post_created_at'].notna()]['post_created_at'] + min_date = valid_dates.min().date() + max_date = valid_dates.max().date() + + st.markdown("**Date Range (Forums)**") + date_range = st.date_input( + "Select dates", + value=(min_date, max_date), + min_value=min_date, + max_value=max_date, + label_visibility="collapsed" + ) + + # More filters + with st.expander("More Filters"): + selected_products = st.multiselect( + "Products", + options=filter_options['products'], + default=[] + ) + selected_competitors = st.multiselect( + "Competitors", + options=filter_options['competitors'], + default=[] + ) + selected_intents = st.multiselect( + "Intents", + options=filter_options['intents'], + default=[] + ) + selected_contexts = st.multiselect( + "Mention Context", + options=filter_options['mention_contexts'], + default=[] + ) + selected_statuses = st.multiselect( + "Processing Status", + options=filter_options['processing_statuses'], + default=[] + ) + + # Apply / Reset + col1, col2 = st.columns(2) + with 
col1: + if st.button("Apply", use_container_width=True, type="primary"): + st.session_state.filters_applied = True + # Competitor/YT data is date-filtered at the SQL level. + # Re-query Snowflake only when Apply is explicitly clicked — + # not on every date-picker interaction — to avoid unnecessary + # queries while the user is still adjusting the date range. + effective_dr = ( + date_range if date_range and len(date_range) == 2 else None + ) + st.session_state.additional_mentions_df = ( + data_loader.load_competitor_forum_mentions(date_range=effective_dr) + ) + st.session_state.yt_total_comments = ( + data_loader.load_youtube_comment_count(date_range=effective_dr) + ) + with col2: + if st.button("Reset", use_container_width=True): + st.session_state.filters_applied = False + st.session_state.additional_mentions_df = ( + data_loader.load_competitor_forum_mentions(date_range=None) + ) + st.session_state.yt_total_comments = ( + data_loader.load_youtube_comment_count(date_range=None) + ) + st.rerun() + + st.markdown("---") + + # Data Management + st.markdown("### 🔄 Data Management") + + if st.button("Reload Data", use_container_width=True): + st.cache_data.clear() + # Also evict session_state raw data so the next rerun re-fetches + # from Snowflake and re-runs the merge and filter-options scan. 
+ for key in ['raw_df', 'raw_comments_df', 'filter_options', + 'additional_mentions_df', 'yt_total_comments', + 'filters_applied']: + st.session_state.pop(key, None) + st.rerun() + + # Data info + st.markdown("---") + st.markdown("### ℹ️ Data Info") + st.info(f"**Forum Posts:** {len(raw_df):,}") + st.info(f"**Social Media Comments:** {len(raw_comments_df):,}") + + if (not raw_df.empty + and 'processed_at' in raw_df.columns + and not raw_df['processed_at'].isna().all()): + last_update = raw_df['processed_at'].max() + st.info(f"**Last Updated:**\n{last_update.strftime('%Y-%m-%d %H:%M')}") + + if not raw_df.empty and 'drums_experience_years' in raw_df.columns: + demo_count = raw_df['drums_experience_years'].notna().sum() + st.info(f"**Users with Demographics:** {demo_count:,}") + + # ── Retrieve competitor / YT data set by Apply / Reset / first load ────────── + additional_mentions_df = st.session_state.get( + 'additional_mentions_df', pd.DataFrame() + ) + yt_total_comments = st.session_state.get('yt_total_comments') + + if raw_df.empty and raw_comments_df.empty: + st.error("No data available. 
Please check your Snowflake connection.") + return + + # ── Apply filters in-memory (pure pandas — no Snowflake involved) ──────────── + filter_kwargs = dict( + sentiments=selected_sentiments if selected_sentiments else None, + author_roles=selected_author_roles if selected_author_roles else None, + mention_contexts=selected_contexts if selected_contexts else None, + products=selected_products if selected_products else None, + competitors=selected_competitors if selected_competitors else None, + intents=selected_intents if selected_intents else None, + processing_statuses=selected_statuses if selected_statuses else None, + date_range=date_range if date_range and len(date_range) == 2 else None + ) + + if st.session_state.filters_applied: + df = (data_loader.apply_filters(raw_df, **filter_kwargs) + if not raw_df.empty else raw_df) + comments_df = (data_loader.apply_filters(raw_comments_df, **filter_kwargs) + if not raw_comments_df.empty else raw_comments_df) + + if df.empty and comments_df.empty: + st.warning("No data matches the selected filters. Please adjust your filters.") + return + else: + total_filtered = len(df) + len(comments_df) + st.info( + f"Showing {total_filtered:,} items after applying filters " + f"({len(df):,} posts, {len(comments_df):,} comments)" + ) + else: + df = raw_df + comments_df = raw_comments_df + + # ── Main content ───────────────────────────────────────────────────────────── + st.markdown(f""" +
+

{config['brand']['name']} Brand Sentiment Dashboard

+

{config['brand']['description']}

+
+ """, unsafe_allow_html=True) + + if page == "Dashboard": + render_dashboard(df, comments_df, additional_mentions_df, + yt_total_comments=yt_total_comments) + elif page == "Sentiment Analysis": + render_sentiment_analysis(df, comments_df) + + # Footer + st.markdown("---") + st.markdown( + f""" +
+

{config['brand']['name']} Brand Sentiment Dashboard v2.0

+

Powered by Streamlit | Data from Snowflake | Platforms: Musora Forums, YouTube

+

+ Sabian Cymbals Analysis +

+
+ """, + unsafe_allow_html=True + ) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + st.error(f"An error occurred: {str(e)}") + st.exception(e) diff --git a/visualization_brand_sentiment/components/__init__.py b/visualization_brand_sentiment/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c08b8f3a9c0bf9f94c1dd25371400a2a701a2886 --- /dev/null +++ b/visualization_brand_sentiment/components/__init__.py @@ -0,0 +1,7 @@ +""" +Page components for Brand Sentiment Dashboard +""" +from .dashboard import render_dashboard +from .sentiment_analysis import render_sentiment_analysis + +__all__ = ['render_dashboard', 'render_sentiment_analysis'] diff --git a/visualization_brand_sentiment/components/dashboard.py b/visualization_brand_sentiment/components/dashboard.py new file mode 100644 index 0000000000000000000000000000000000000000..676eef41bf4977cd324d9c688630c627de303fbe --- /dev/null +++ b/visualization_brand_sentiment/components/dashboard.py @@ -0,0 +1,887 @@ +""" +Main Dashboard Page for Brand Sentiment Analysis +Displays overall brand health, sentiment distributions, product analysis, and demographics +""" +import streamlit as st +import sys +import logging +import pandas as pd +from pathlib import Path +from datetime import datetime + +logger = logging.getLogger(__name__) + +# Add parent directory to path +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +from utils.data_processor import BrandDataProcessor +from utils.metrics import BrandMetrics +from visualizations.sentiment_charts import SentimentCharts +from visualizations.distribution_charts import DistributionCharts +from visualizations.brand_charts import BrandCharts +from visualizations.demographic_charts import DemographicCharts +from visualizations.content_cards import ContentCards + + +def _hash_df(df: pd.DataFrame) -> str: + """Fast, list-safe cache key for brand-sentiment DataFrames. 
+ + pandas hash_pandas_object() raises TypeError on columns that hold Python + lists (products_mentioned, competitors_mentioned, intents, pain_points, + delight_factors, etc.), which causes st.cache_data to fall back to pickling + the entire DataFrame — taking minutes for a large filtered result set and + blocking PDF generation entirely. + + Instead, we build a lightweight string key from: + - row count (changes when date filter reduces the dataset) + - sorted column names (schema guard) + - min/max of the date column (changes when date range filter changes) + + This uniquely identifies any filtered view of the brand-sentiment data + without touching the unhashable list columns at all. + """ + if df is None or (hasattr(df, 'empty') and df.empty): + return "empty" + try: + date_info = "" + for col in ('post_created_at', 'comment_timestamp'): + if col in df.columns: + valid = df[col].dropna() + if not valid.empty: + date_info = f"_{valid.min()}_{valid.max()}" + break + return f"{len(df)}_{sorted(df.columns.tolist())}{date_info}" + except Exception: + return f"{len(df)}_{sorted(df.columns.tolist())}" + + +@st.cache_data(ttl=300, show_spinner=False, hash_funcs={pd.DataFrame: _hash_df}) +def _generate_pdf_cached(posts_df, comments_df, additional_mentions_df): + """Generate PDF report bytes, cached for 5 min per unique filter state. + + Using @st.cache_data means Streamlit reruns (triggered by any widget + interaction after clicking Generate) will return the cached bytes instantly + instead of re-rendering all ~15 charts through kaleido again. + + Cache invalidates automatically when the user changes filters because + _hash_df encodes row count + date range, both of which change with filters. 
+ """ + import json + from utils.pdf_exporter import DashboardPDFExporter + + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + + exporter = DashboardPDFExporter(config) + return exporter.generate_report( + posts_df=posts_df, + comments_df=comments_df, + additional_mentions_df=additional_mentions_df, + ) + + +def render_dashboard(df, comments_df=None, additional_mentions_df=None, + yt_total_comments=None): + """ + Render the main dashboard page + + Args: + df: Brand sentiment dataframe (merged with user demographics) - forum posts + comments_df: Social media comments dataframe (optional) + additional_mentions_df: Competitor mentions from raw forum posts (optional) + yt_total_comments: Total YouTube comment count from FACT_COMMENTS (optional). + Represents the full raw dataset scanned, not just the analyzed sample. + """ + import pandas as pd + if comments_df is None: + comments_df = pd.DataFrame() + if additional_mentions_df is None: + additional_mentions_df = pd.DataFrame() + + st.title("🥁 Sabian Brand Sentiment Dashboard") + st.markdown("*Comprehensive brand health analysis from Musora Forums & Social Media*") + + if len(df) > 5000: + st.info(f"💡 Analyzing **{len(df):,}** relevant posts. 
def render_dashboard(df, comments_df=None, additional_mentions_df=None,
                     yt_total_comments=None):
    """
    Render the main dashboard page

    Renders, in order: summary stats and health indicator, overall sentiment,
    author roles, Sabian products, overall brand mentions, competitive
    analysis, intents/feedback, mention context & purchase stage, optional
    demographics, emotions, temporal trends, processing status, a hierarchical
    sunburst, the social-media comments section, and the PDF export section.

    Args:
        df: Brand sentiment dataframe (merged with user demographics) - forum posts
        comments_df: Social media comments dataframe (optional)
        additional_mentions_df: Competitor mentions from raw forum posts (optional)
        yt_total_comments: Total YouTube comment count from FACT_COMMENTS (optional).
            Represents the full raw dataset scanned, not just the analyzed sample.
    """
    # Local import so the module can be imported without pandas at top level.
    import pandas as pd
    # Normalize optional frames to empty DataFrames so downstream `.empty`
    # checks and the comments-section guard work uniformly.
    if comments_df is None:
        comments_df = pd.DataFrame()
    if additional_mentions_df is None:
        additional_mentions_df = pd.DataFrame()

    st.title("🥁 Sabian Brand Sentiment Dashboard")
    st.markdown("*Comprehensive brand health analysis from Musora Forums & Social Media*")

    if len(df) > 5000:
        st.info(f"💡 Analyzing **{len(df):,}** relevant posts. Use filters in the sidebar to narrow your analysis.")

    st.markdown("---")

    # Initialize components
    sentiment_charts = SentimentCharts()
    distribution_charts = DistributionCharts()
    brand_charts = BrandCharts()
    demographic_charts = DemographicCharts()
    processor = BrandDataProcessor()

    # Calculate overall metrics
    overall_metrics = BrandMetrics.calculate_overall_metrics(df)

    # ============== SUMMARY SECTION ==============
    ContentCards.display_summary_stats(df, overall_metrics)

    st.markdown("---")

    # Health Indicator (centered via the wide middle column)
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        status_info = BrandMetrics.get_sentiment_health_status(overall_metrics['negative_pct'])
        ContentCards.display_health_indicator(overall_metrics['negative_pct'], status_info)

    st.markdown("---")

    # ============== OVERALL SENTIMENT ==============
    st.markdown("## 🎯 Overall Sentiment Distribution")

    col1, col2 = st.columns(2)

    with col1:
        sentiment_pie = sentiment_charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        st.plotly_chart(sentiment_pie, use_container_width=True, key="sentiment_pie_main")

    with col2:
        sentiment_gauge = sentiment_charts.create_sentiment_score_gauge(
            overall_metrics['avg_sentiment_score'],
            title="Brand Sentiment Score"
        )
        st.plotly_chart(sentiment_gauge, use_container_width=True, key="sentiment_gauge_main")

    # Additional metrics
    metric_col1, metric_col2, metric_col3 = st.columns(3)
    with metric_col1:
        st.metric("Positive %", f"{overall_metrics['positive_pct']:.1f}%")
    with metric_col2:
        st.metric("Neutral %", f"{overall_metrics['neutral_pct']:.1f}%")
    with metric_col3:
        st.metric("Negative %", f"{overall_metrics['negative_pct']:.1f}%")

    st.markdown("---")

    # ============== AUTHOR ROLE ANALYSIS ==============
    st.markdown("## 👤 Author Role Analysis")
    st.markdown("*Who is discussing Sabian cymbals?*")

    author_dist = processor.get_author_role_distribution(df)
    # NOTE(review): role_sentiment is computed but never used below — confirm
    # get_author_role_by_sentiment has no needed side effects before removing.
    role_sentiment = processor.get_author_role_by_sentiment(df)

    col1, col2 = st.columns(2)

    with col1:
        author_pie = distribution_charts.create_author_role_chart(author_dist, title="Author Role Distribution")
        st.plotly_chart(author_pie, use_container_width=True, key="author_role_pie")

    with col2:
        author_sentiment_bar = sentiment_charts.create_sentiment_percentage_bar_chart(
            df, group_by='author_role', title="Sentiment by Author Role"
        )
        st.plotly_chart(author_sentiment_bar, use_container_width=True, key="author_sentiment_bar")

    # Potential buyer insights
    buyer_metrics = BrandMetrics.calculate_potential_buyer_metrics(df)
    if buyer_metrics['total_potential_buyers'] > 0:
        st.markdown("### 🛒 Potential Buyer Insights")
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Potential Buyers", buyer_metrics['total_potential_buyers'])
        with col2:
            st.metric("Positive Sentiment %", f"{buyer_metrics['positive_sentiment_pct']:.1f}%")
        with col3:
            st.metric("Researching", buyer_metrics['researching_count'])
        with col4:
            st.metric("Deciding", buyer_metrics['deciding_count'])

    st.markdown("---")

    # ============== SABIAN PRODUCTS ANALYSIS ==============
    st.markdown("## 🥁 Sabian Products Analysis")

    products_dist = processor.get_products_distribution(df)
    product_sentiment = processor.get_product_sentiment_breakdown(df)

    if not products_dist.empty:
        col1, col2 = st.columns(2)

        with col1:
            products_bar = brand_charts.create_products_horizontal_bar(
                products_dist.head(10), title="Top 10 Sabian Products Mentioned"
            )
            st.plotly_chart(products_bar, use_container_width=True, key="products_bar")

        with col2:
            if not product_sentiment.empty:
                product_sent_chart = brand_charts.create_product_sentiment_breakdown(
                    product_sentiment, title="Product Sentiment Breakdown"
                )
                st.plotly_chart(product_sent_chart, use_container_width=True, key="product_sentiment")

        # Product metrics table (top 15 rows by post volume)
        with st.expander("📈 Detailed Product Metrics"):
            product_metrics = BrandMetrics.calculate_product_metrics(df)

            product_data = []
            for product, metrics in sorted(product_metrics.items(), key=lambda x: x[1]['total_posts'], reverse=True):
                product_data.append({
                    'Product': product,
                    'Total Posts': metrics['total_posts'],
                    'Avg Score': f"{metrics['avg_sentiment_score']:.2f}",
                    'Positive %': f"{metrics['positive_pct']:.1f}%",
                    'Negative %': f"{metrics['negative_pct']:.1f}%"
                })

            if product_data:
                st.table(product_data[:15])
    else:
        st.info("No product mention data available")

    st.markdown("---")

    # ============== OVERALL BRAND MENTIONS ==============
    st.markdown("## 📊 Overall Brand Mentions")
    st.markdown("*Total brand mentions across all forum posts and social media comments*")

    brand_mentions = processor.get_overall_brand_mentions(df, comments_df, additional_mentions_df)

    if not brand_mentions.empty:
        brand_mentions_chart = brand_charts.create_overall_brand_mentions_chart(
            brand_mentions, title="Brand Mention Distribution"
        )
        st.plotly_chart(brand_mentions_chart, use_container_width=True, key="overall_brand_mentions")

        # Summary metrics — only shown when a 'Sabian' row exists in the result
        sabian_row = brand_mentions[brand_mentions['brand'] == 'Sabian']
        if not sabian_row.empty:
            sabian_mentions = sabian_row.iloc[0]['total_mentions']
            sabian_pct = sabian_row.iloc[0]['percentage']
            total_mentions = brand_mentions['total_mentions'].sum()

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Brand Mentions", f"{total_mentions:,}")
            with col2:
                st.metric("Sabian Mentions", f"{sabian_mentions:,}")
            with col3:
                st.metric("Sabian Market Share", f"{sabian_pct:.1f}%")

            st.caption(
                "Sabian count reflects all processed posts and comments. "
                "Competitor counts include mentions from processed Sabian posts "
                "plus additional mentions from forum posts that do not reference Sabian."
            )
    else:
        st.info("No brand mention data available")

    st.markdown("---")

    # ============== COMPETITIVE ANALYSIS ==============
    st.markdown("## 🆚 Competitive Analysis")

    competitors_dist = processor.get_competitors_distribution(df)
    competitor_sentiment = processor.get_competitor_sentiment_breakdown(df)

    if not competitors_dist.empty:
        col1, col2 = st.columns(2)

        with col1:
            competitors_bar = brand_charts.create_competitors_bar_chart(
                competitors_dist.head(10), title="Competitors Mentioned"
            )
            st.plotly_chart(competitors_bar, use_container_width=True, key="competitors_bar")

        with col2:
            if not competitor_sentiment.empty:
                comp_heatmap = brand_charts.create_competitive_heatmap(
                    competitor_sentiment, title="Sabian Sentiment When Competitors Mentioned"
                )
                st.plotly_chart(comp_heatmap, use_container_width=True, key="competitor_heatmap")

        # Brand switching analysis — rendered only when any switching observed
        switching_metrics = BrandMetrics.calculate_brand_switching_metrics(df)
        if switching_metrics['switching_to_sabian'] > 0 or switching_metrics['switching_from_sabian'] > 0:
            st.markdown("### 🔄 Brand Switching Analysis")

            switching_chart = brand_charts.create_switching_flow_chart(
                switching_metrics['switching_to_sabian'],
                switching_metrics['switching_from_sabian'],
                title="Brand Switching Flow"
            )
            st.plotly_chart(switching_chart, use_container_width=True, key="switching_flow")

            if switching_metrics['net_switching'] > 0:
                st.success(f"✅ **Net Positive:** {switching_metrics['net_switching']} more users switching TO Sabian")
            elif switching_metrics['net_switching'] < 0:
                st.warning(f"⚠️ **Net Negative:** {abs(switching_metrics['net_switching'])} more users switching FROM Sabian")
    else:
        st.info("No competitor mention data available")

    st.markdown("---")

    # ============== INTENTS & FEEDBACK ==============
    st.markdown("## 💡 Intents & Feedback Analysis")

    col1, col2 = st.columns(2)

    with col1:
        intents_dist = processor.get_intents_distribution(df)
        if not intents_dist.empty:
            intents_bar = distribution_charts.create_intent_bar_chart(
                intents_dist, title="User Intents", orientation='h'
            )
            st.plotly_chart(intents_bar, use_container_width=True, key="intents_bar")

    with col2:
        # Pain points vs Delight factors
        pain_dist = processor.get_pain_points_distribution(df)
        delight_dist = processor.get_delight_factors_distribution(df)

        if not pain_dist.empty or not delight_dist.empty:
            pain_delight_chart = distribution_charts.create_pain_delight_comparison_chart(
                pain_dist.head(8), delight_dist.head(8), title="Pain Points vs Delight Factors"
            )
            st.plotly_chart(pain_delight_chart, use_container_width=True, key="pain_delight")

    # Pain/Delight metrics
    pd_metrics = BrandMetrics.calculate_pain_delight_metrics(df)
    if pd_metrics['total_pain_points'] > 0 or pd_metrics['total_delight_factors'] > 0:
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Pain Points", pd_metrics['total_pain_points'])
        with col2:
            st.metric("Total Delight Factors", pd_metrics['total_delight_factors'])
        with col3:
            ratio = pd_metrics['pain_to_delight_ratio']
            # inf means zero delight factors — hide the meaningless ratio
            if ratio != float('inf'):
                st.metric("Pain/Delight Ratio", f"{ratio:.2f}")

    st.markdown("---")

    # ============== MENTION CONTEXT & PURCHASE STAGE ==============
    st.markdown("## 🎯 Mention Context & Purchase Journey")

    col1, col2 = st.columns(2)

    with col1:
        context_dist = processor.get_mention_context_distribution(df)
        if not context_dist.empty:
            context_chart = distribution_charts.create_mention_context_chart(
                context_dist, title="How Sabian is Mentioned"
            )
            st.plotly_chart(context_chart, use_container_width=True, key="mention_context")

    with col2:
        stage_dist = processor.get_purchase_stage_distribution(df)
        if not stage_dist.empty:
            stage_chart = distribution_charts.create_purchase_stage_chart(
                stage_dist, title="Purchase Journey Stage"
            )
            st.plotly_chart(stage_chart, use_container_width=True, key="purchase_stage")

    st.markdown("---")

    # ============== DEMOGRAPHICS ANALYSIS ==============
    # Check if we have demographic data (any of the merged profile columns)
    has_demographics = (
        'drums_experience_years' in df.columns or
        'age_group' in df.columns or
        'cymbal_brands_list' in df.columns
    )

    if has_demographics:
        st.markdown("## 👥 User Demographics Analysis")
        st.markdown("*Based on user profiles from Musora Forums*")

        # Experience Level
        if 'experience_group' in df.columns:
            exp_dist = processor.get_demographics_distribution(df, 'experience_group')

            if not exp_dist.empty:
                st.markdown("### 🎸 Drumming Experience")
                col1, col2 = st.columns(2)

                with col1:
                    exp_chart = demographic_charts.create_experience_distribution_chart(
                        exp_dist, title="Experience Level Distribution"
                    )
                    st.plotly_chart(exp_chart, use_container_width=True, key="exp_dist")

                with col2:
                    # Sentiment by experience
                    # NOTE(review): exp_sentiment duplicates the exp_dist call
                    # above and is only used as an emptiness guard — consider
                    # reusing exp_dist here.
                    exp_sentiment = processor.get_demographics_distribution(df, 'experience_group')
                    if not exp_sentiment.empty:
                        exp_order = [
                            'Beginner (0-5 years)', 'Intermediate (5-15 years)',
                            'Advanced (15-25 years)', 'Expert (25+ years)'
                        ]
                        exp_sent_chart = sentiment_charts.create_sentiment_bar_chart(
                            df[df['experience_group'].notna() & (df['experience_group'] != 'Unknown')],
                            group_by='experience_group',
                            title="Sentiment by Experience Level",
                            category_order=exp_order
                        )
                        st.plotly_chart(exp_sent_chart, use_container_width=True, key="exp_sentiment")

        # Cymbal Ownership
        if 'cymbal_brands_list' in df.columns:
            ownership_data = processor.get_cymbal_ownership_analysis(df)

            if ownership_data:
                st.markdown("### 🎶 Cymbal Ownership")

                col1, col2 = st.columns(2)

                with col1:
                    ownership_chart = demographic_charts.create_cymbal_ownership_chart(
                        ownership_data, title="Cymbal Brands Owned by Forum Users"
                    )
                    st.plotly_chart(ownership_chart, use_container_width=True, key="cymbal_ownership")

                with col2:
                    # Sentiment by Sabian ownership
                    ownership_sentiment = processor.get_sentiment_by_cymbal_ownership(df)
                    if not ownership_sentiment.empty:
                        ownership_sent_chart = demographic_charts.create_ownership_sentiment_chart(
                            ownership_sentiment, title="Sentiment: Sabian Owners vs Non-Owners"
                        )
                        st.plotly_chart(ownership_sent_chart, use_container_width=True, key="ownership_sentiment")

                # Ownership metrics
                sabian_rate = ownership_data.get('sabian_ownership_rate', 0)
                total_users = ownership_data.get('total_users_with_cymbal_data', 0)
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Users with Cymbal Data", total_users)
                with col2:
                    st.metric("Sabian Ownership Rate", f"{sabian_rate:.1f}%")

        # Gear Analysis — one column per available gear category
        gear_columns = {
            'drums_gear_stick_brands': 'Stick Brands',
            'drums_gear_set_brands': 'Drum Set Brands',
            'drums_gear_hardware_brands': 'Hardware Brands'
        }

        available_gear = [col for col in gear_columns.keys() if col in df.columns]

        if available_gear:
            st.markdown("### 🥢 Other Gear Preferences")

            gear_cols = st.columns(len(available_gear))

            for idx, gear_col in enumerate(available_gear):
                with gear_cols[idx]:
                    gear_dist = processor.get_gear_brand_analysis(df, gear_col)
                    if not gear_dist.empty:
                        gear_chart = demographic_charts.create_gear_brand_chart(
                            gear_dist.head(8),
                            gear_columns[gear_col],
                            title=gear_columns[gear_col]
                        )
                        st.plotly_chart(gear_chart, use_container_width=True, key=f"gear_{gear_col}")

        # Age Distribution
        if 'age_group' in df.columns:
            age_dist = processor.get_demographics_distribution(df, 'age_group')
            if not age_dist.empty:
                st.markdown("### 🎂 Age Distribution")
                age_chart = demographic_charts.create_age_distribution_chart(
                    age_dist, title="Age Group Distribution"
                )
                st.plotly_chart(age_chart, use_container_width=True, key="age_dist")

        # Geographic Distribution
        if 'timezone_region' in df.columns:
            tz_dist = processor.get_demographics_distribution(df, 'timezone_region')
            if not tz_dist.empty:
                st.markdown("### 🌍 Geographic Distribution")
                tz_chart = demographic_charts.create_timezone_distribution_chart(
                    tz_dist, title="Users by Timezone Region"
                )
                st.plotly_chart(tz_chart, use_container_width=True, key="timezone_dist")

        st.markdown("---")

    # ============== EMOTION ANALYSIS ==============
    emotion_dist = processor.get_emotion_distribution(df)
    if not emotion_dist.empty:
        st.markdown("## 😊 Emotion Analysis")
        emotion_chart = sentiment_charts.create_emotion_distribution_chart(
            df, title="Emotion Distribution"
        )
        st.plotly_chart(emotion_chart, use_container_width=True, key="emotion_dist")
        st.markdown("---")

    # ============== TEMPORAL TRENDS ==============
    if 'post_created_at' in df.columns and not df['post_created_at'].isna().all():
        with st.expander("📈 Temporal Trends", expanded=False):
            freq_col, _ = st.columns([1, 3])

            with freq_col:
                # Pandas resample frequency codes; default is weekly (index=1)
                freq = st.selectbox(
                    "Time Granularity",
                    options=['D', 'W', 'M'],
                    format_func=lambda x: {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}[x],
                    index=1
                )

            timeline_chart = sentiment_charts.create_sentiment_timeline(
                df, freq=freq, title="Sentiment Trends Over Time"
            )
            st.plotly_chart(timeline_chart, use_container_width=True, key="timeline")

            # Trend metrics
            trend_metrics = BrandMetrics.calculate_trend_metrics(df)
            if trend_metrics.get('trend_available'):
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Current Period Posts", trend_metrics['current_period_posts'])
                with col2:
                    st.metric("Sentiment Score Change", f"{trend_metrics['sentiment_score_change']:+.2f}")
                with col3:
                    trend_emoji = {'improving': '📈', 'declining': '📉', 'stable': '➡️'}.get(trend_metrics['sentiment_trend'], '➡️')
                    st.metric("Trend", f"{trend_emoji} {trend_metrics['sentiment_trend'].title()}")

    # ============== PROCESSING STATUS ==============
    with st.expander("⚙️ Processing Status", expanded=False):
        status_dist = processor.get_processing_status_distribution(df)
        if not status_dist.empty:
            col1, col2 = st.columns(2)

            with col1:
                status_chart = distribution_charts.create_processing_status_chart(
                    status_dist, title="Processing Status Distribution"
                )
                st.plotly_chart(status_chart, use_container_width=True, key="processing_status")

            with col2:
                # Sarcasm detection stats
                sarcasm_count = overall_metrics['sarcasm_count']
                st.metric("Posts with Sarcasm Detected", sarcasm_count)

                # Processing breakdown
                for _, row in status_dist.iterrows():
                    status = row['processing_status'].replace('_', ' ').title()
                    count = row['count']
                    pct = row['percentage']
                    st.write(f"**{status}:** {count} ({pct:.1f}%)")

    # ============== HIERARCHICAL VIEW ==============
    with st.expander("🌟 Hierarchical View", expanded=False):
        st.markdown("**Interactive: Author Role > Mention Context > Sentiment**")
        sunburst = distribution_charts.create_combined_sunburst(
            df, title="Hierarchical Distribution"
        )
        st.plotly_chart(sunburst, use_container_width=True, key="sunburst")

    # ============== SOCIAL MEDIA COMMENTS SECTION ==============
    if not comments_df.empty:
        _render_comments_section(comments_df, sentiment_charts, distribution_charts,
                                 brand_charts, processor,
                                 yt_total_comments=yt_total_comments)

    # ============== PDF EXPORT ==============
    _render_pdf_export_section(df, comments_df, additional_mentions_df)
" + "The report includes all charts, tables, metrics, and explanatory text." + ) + with col2: + generate = st.button( + "📄 Generate PDF Report", + type="primary", + use_container_width=True, + key="generate_pdf_btn" + ) + + if generate: + try: + with st.spinner("Generating comprehensive PDF report... This may take a moment."): + pdf_bytes = _generate_pdf_cached(df, comments_df, additional_mentions_df) + st.session_state['dashboard_pdf'] = pdf_bytes + st.session_state['dashboard_pdf_ready'] = True + except ImportError: + st.error( + "PDF export requires additional dependencies. " + "Please install them with: `pip install fpdf2 kaleido`" + ) + except Exception as e: + logger.exception("PDF generation failed: %s", e) + st.error( + f"Error generating PDF report — {type(e).__name__}: {e}. " + "Check the application logs for the full traceback." + ) + + if st.session_state.get('dashboard_pdf_ready'): + date_str = datetime.now().strftime('%Y%m%d') + st.download_button( + label="📥 Download PDF Report", + data=st.session_state['dashboard_pdf'], + file_name=f"sabian_sentiment_report_{date_str}.pdf", + mime="application/pdf", + type="primary", + use_container_width=True, + key="download_pdf_btn" + ) + st.success("PDF report generated successfully! Click above to download.") + + +def _render_comments_section(comments_df, sentiment_charts, distribution_charts, + brand_charts, processor, yt_total_comments=None): + """ + Render the social media comments section on the dashboard + + Args: + comments_df: Social media comments dataframe + sentiment_charts: SentimentCharts instance + distribution_charts: DistributionCharts instance + brand_charts: BrandCharts instance + processor: BrandDataProcessor instance + yt_total_comments: Total YouTube comment count from FACT_COMMENTS (optional) + """ + st.markdown("---") + st.markdown("---") + st.markdown(""" +
+

📺 Social Media Comments

+

Sentiment analysis from social media platforms

+
+ """, unsafe_allow_html=True) + + # Summary metrics + comments_metrics = BrandMetrics.calculate_overall_metrics(comments_df) + ContentCards.display_comment_summary_stats(comments_df, comments_metrics, + total_raw_count=yt_total_comments) + + st.markdown("---") + + # Health Indicator + col1, col2, col3 = st.columns([1, 2, 1]) + with col2: + status_info = BrandMetrics.get_sentiment_health_status(comments_metrics['negative_pct']) + ContentCards.display_health_indicator(comments_metrics['negative_pct'], status_info) + + st.markdown("---") + + # ============== PLATFORM BREAKDOWN ============== + if 'platform' in comments_df.columns: + st.markdown("### 🌐 Platform Breakdown") + platform_counts = comments_df['platform'].value_counts().reset_index() + platform_counts.columns = ['platform', 'count'] + + col1, col2 = st.columns(2) + + with col1: + import plotly.graph_objects as go + platform_colors_config = {} + try: + import json + from pathlib import Path + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + with open(config_path, 'r', encoding='utf-8') as f: + viz_config = json.load(f) + platform_colors_config = viz_config.get('color_schemes', {}).get('platform', {}) + except Exception: + pass + + colors = [platform_colors_config.get(p, '#9E9E9E') for p in platform_counts['platform']] + fig = go.Figure(data=[go.Pie( + labels=[p.replace('_', ' ').title() for p in platform_counts['platform']], + values=platform_counts['count'], + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + fig.update_layout(title="Comments by Platform", height=400, showlegend=True) + st.plotly_chart(fig, use_container_width=True, key="comments_platform_pie") + + with col2: + # Sentiment by platform + sentiment_by_platform = sentiment_charts.create_sentiment_percentage_bar_chart( + comments_df, group_by='platform', title="Sentiment by Platform" + ) + st.plotly_chart(sentiment_by_platform, use_container_width=True, key="comments_sentiment_platform") + + # Platform metrics + for platform in comments_df['platform'].unique(): + platform_data = comments_df[comments_df['platform'] == platform] + platform_metrics = BrandMetrics.calculate_overall_metrics(platform_data) + col1, col2, col3, col4 = st.columns(4) + with col1: + st.metric(f"{platform.replace('_', ' ').title()} Comments", f"{platform_metrics['total_posts']:,}") + with col2: + st.metric("Avg Score", f"{platform_metrics['avg_sentiment_score']:.2f}") + with col3: + st.metric("Positive %", f"{platform_metrics['positive_pct']:.1f}%") + with col4: + st.metric("Negative %", f"{platform_metrics['negative_pct']:.1f}%") + + st.markdown("---") + + # ============== COMMENTS SENTIMENT DISTRIBUTION ============== + st.markdown("### 🎯 Comments Sentiment Distribution") + + col1, col2 = st.columns(2) + + with col1: + sentiment_pie = sentiment_charts.create_sentiment_pie_chart( + comments_df, title="Comment Sentiment Distribution" + ) + st.plotly_chart(sentiment_pie, use_container_width=True, key="comments_sentiment_pie") + + with col2: + sentiment_gauge = sentiment_charts.create_sentiment_score_gauge( + comments_metrics['avg_sentiment_score'], + title="Comments Sentiment Score" + ) + st.plotly_chart(sentiment_gauge, use_container_width=True, key="comments_sentiment_gauge") + + st.markdown("---") + + # ============== COMMENTS AUTHOR ROLE ANALYSIS ============== + st.markdown("### 👤 Comments Author Role Analysis") + + author_dist = processor.get_author_role_distribution(comments_df) + + col1, col2 = 
st.columns(2) + + with col1: + author_pie = distribution_charts.create_author_role_chart( + author_dist, title="Author Roles in Comments" + ) + st.plotly_chart(author_pie, use_container_width=True, key="comments_author_role_pie") + + with col2: + author_sentiment_bar = sentiment_charts.create_sentiment_percentage_bar_chart( + comments_df, group_by='author_role', title="Comment Sentiment by Author Role" + ) + st.plotly_chart(author_sentiment_bar, use_container_width=True, key="comments_author_sentiment_bar") + + st.markdown("---") + + # ============== COMMENTS PRODUCTS ANALYSIS ============== + st.markdown("### 🥁 Products in Comments") + + products_dist = processor.get_products_distribution(comments_df) + product_sentiment = processor.get_product_sentiment_breakdown(comments_df) + + if not products_dist.empty: + col1, col2 = st.columns(2) + + with col1: + products_bar = brand_charts.create_products_horizontal_bar( + products_dist.head(10), title="Top Products Mentioned in Comments" + ) + st.plotly_chart(products_bar, use_container_width=True, key="comments_products_bar") + + with col2: + if not product_sentiment.empty: + product_sent_chart = brand_charts.create_product_sentiment_breakdown( + product_sentiment, title="Product Sentiment in Comments" + ) + st.plotly_chart(product_sent_chart, use_container_width=True, key="comments_product_sentiment") + else: + st.info("No product mention data available in comments") + + st.markdown("---") + + # ============== COMMENTS COMPETITIVE ANALYSIS ============== + st.markdown("### 🆚 Competitors in Comments") + + competitors_dist = processor.get_competitors_distribution(comments_df) + + if not competitors_dist.empty: + col1, col2 = st.columns(2) + + with col1: + competitors_bar = brand_charts.create_competitors_bar_chart( + competitors_dist.head(10), title="Competitors Mentioned in Comments" + ) + st.plotly_chart(competitors_bar, use_container_width=True, key="comments_competitors_bar") + + with col2: + competitor_sentiment = 
processor.get_competitor_sentiment_breakdown(comments_df) + if not competitor_sentiment.empty: + comp_heatmap = brand_charts.create_competitive_heatmap( + competitor_sentiment, title="Sentiment When Competitors Mentioned" + ) + st.plotly_chart(comp_heatmap, use_container_width=True, key="comments_competitor_heatmap") + else: + st.info("No competitor mention data available in comments") + + st.markdown("---") + + # ============== COMMENTS INTENTS & FEEDBACK ============== + st.markdown("### 💡 Comments Intents & Feedback") + + col1, col2 = st.columns(2) + + with col1: + intents_dist = processor.get_intents_distribution(comments_df) + if not intents_dist.empty: + intents_bar = distribution_charts.create_intent_bar_chart( + intents_dist, title="Comment Intents", orientation='h' + ) + st.plotly_chart(intents_bar, use_container_width=True, key="comments_intents_bar") + + with col2: + pain_dist = processor.get_pain_points_distribution(comments_df) + delight_dist = processor.get_delight_factors_distribution(comments_df) + + if not pain_dist.empty or not delight_dist.empty: + pain_delight_chart = distribution_charts.create_pain_delight_comparison_chart( + pain_dist.head(8), delight_dist.head(8), title="Pain Points vs Delight Factors" + ) + st.plotly_chart(pain_delight_chart, use_container_width=True, key="comments_pain_delight") + + st.markdown("---") + + # ============== COMMENTS TEMPORAL TRENDS ============== + date_col = None + if 'comment_timestamp' in comments_df.columns and not comments_df['comment_timestamp'].isna().all(): + date_col = 'comment_timestamp' + + if date_col: + with st.expander("📈 Comments Temporal Trends", expanded=False): + freq_col, _ = st.columns([1, 3]) + + with freq_col: + freq = st.selectbox( + "Time Granularity", + options=['D', 'W', 'M'], + format_func=lambda x: {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}[x], + index=1, + key="comments_freq" + ) + + timeline_chart = sentiment_charts.create_sentiment_timeline( + comments_df, freq=freq, 
def render_sentiment_analysis(df, comments_df=None):
    """
    Render the sentiment analysis page.

    Provides: data-source selection (forum posts / social-media comments / both),
    twelve-plus multiselect filters plus a date range, summary metrics, charts,
    optional AI-generated insights, a paginated post/comment explorer, and CSV
    export of the filtered data.

    Args:
        df: Brand sentiment dataframe (forum posts)
        comments_df: Social media comments dataframe (optional)
    """
    if comments_df is None:
        comments_df = pd.DataFrame()

    st.title("🔍 Sentiment Analysis")
    st.markdown("*Deep dive into posts with advanced filtering and AI-powered insights*")
    st.markdown("---")

    # Initialize components
    processor = BrandDataProcessor()
    sentiment_charts = SentimentCharts()
    distribution_charts = DistributionCharts()
    brand_charts = BrandCharts()

    # Initialize AI agent (best-effort: the rest of the page works without it)
    try:
        insight_agent = BrandInsightAgent(model="gpt-5-nano", temperature=1)
        agent_available = True
    except Exception as e:
        agent_available = False
        st.warning(f"AI Agent unavailable: {str(e)}")

    # Initialize session state
    if 'brand_insights' not in st.session_state:
        st.session_state.brand_insights = None
    if 'analysis_page' not in st.session_state:
        st.session_state.analysis_page = 1
    if 'sa_prev_source' not in st.session_state:
        st.session_state.sa_prev_source = None

    # ============== DATA SOURCE SELECTION ==============
    st.markdown("### 📡 Data Source")

    source_options = ["Forum Posts"]
    if not comments_df.empty:
        source_options.extend(["Social Media Comments", "All Sources"])

    selected_source = st.radio(
        "Select data source to analyze",
        source_options,
        index=0,
        horizontal=True,
        key="sa_source_selector"
    )

    # Reset pagination when the source changes so we never land past the last page
    if st.session_state.sa_prev_source != selected_source:
        st.session_state.analysis_page = 1
        st.session_state.sa_prev_source = selected_source

    # Build working dataframe based on selection
    if selected_source == "Forum Posts":
        working_df = df.copy()
        if 'data_source' not in working_df.columns:
            working_df['data_source'] = 'forums'
    elif selected_source == "Social Media Comments":
        working_df = comments_df.copy()
    else:  # All Sources
        forums_copy = df.copy()
        if 'data_source' not in forums_copy.columns:
            forums_copy['data_source'] = 'forums'
        working_df = pd.concat([forums_copy, comments_df], ignore_index=True)
        # Unify date column: fill post_created_at with comment_timestamp where missing,
        # then normalise the entire column to tz-naive so that min/max comparisons
        # and the Streamlit date-range widget never raise a tz mismatch error.
        #
        # Key insight: after pd.concat the column is object-dtype and may hold a mix
        # of tz-naive Timestamps (forums) and tz-aware Timestamps (comments).
        # Calling pd.to_datetime() directly on such a column raises:
        #   ValueError: Cannot mix tz-aware with tz-naive values
        # The safe pattern is: strip tz element-by-element first, then re-parse.
        if 'comment_timestamp' in working_df.columns:
            mask = working_df['post_created_at'].isna() & working_df['comment_timestamp'].notna()
            if mask.any():
                working_df.loc[mask, 'post_created_at'] = working_df.loc[mask, 'comment_timestamp']

        if 'post_created_at' in working_df.columns:
            # Step 1: strip tz-info from every element without calling pd.to_datetime
            working_df['post_created_at'] = working_df['post_created_at'].apply(
                lambda x: x.replace(tzinfo=None)
                if pd.notna(x) and hasattr(x, 'tzinfo') and x.tzinfo is not None
                else x
            )
            # Step 2: re-parse as uniform tz-naive datetime64[ns]
            working_df['post_created_at'] = pd.to_datetime(
                working_df['post_created_at'], errors='coerce'
            )

    if working_df.empty:
        st.warning("No data available for the selected source. Try a different source.")
        return

    st.markdown("---")

    # ============== FILTERS SECTION ==============
    st.markdown("### 🔍 Filter Posts")

    data_loader = BrandSentimentDataLoader()

    # Platform and channel filters (dynamic - only when social media data is in scope)
    selected_platforms = []
    selected_channels = []

    if selected_source in ["Social Media Comments", "All Sources"] and not comments_df.empty:
        comment_filter_options = data_loader.get_comment_filter_options(comments_df)

        platform_col, channel_col = st.columns(2)

        with platform_col:
            selected_platforms = st.multiselect(
                "📺 Platform",
                options=comment_filter_options.get('platforms', []),
                default=[],
                help="Filter by social media platform (e.g. YouTube)",
                key="sa_platform_filter"
            )

        with channel_col:
            # Narrow channel choices to the chosen platforms so the two widgets agree
            if selected_platforms:
                available_channels = sorted(
                    comments_df[comments_df['platform'].isin(selected_platforms)]['channel_display_name']
                    .dropna().unique().tolist()
                )
            else:
                available_channels = comment_filter_options.get('channels', [])

            selected_channels = st.multiselect(
                "📡 Channel",
                options=available_channels,
                default=[],
                help="Filter by content channel (e.g. Drumeo)",
                key="sa_channel_filter"
            )

    # Get filter options from working dataframe
    filter_options = data_loader.get_filter_options(working_df)

    # Row 1: Main filters
    filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)

    with filter_col1:
        selected_sentiments = st.multiselect(
            "Sentiment Level",
            options=filter_options['sentiments'],
            default=[],
            help="Filter by sentiment level",
            key="sa_sentiment_filter"
        )

    with filter_col2:
        selected_author_roles = st.multiselect(
            "Author Role",
            options=filter_options['author_roles'],
            default=[],
            help="Filter by author's relationship to Sabian",
            key="sa_author_role_filter"
        )

    with filter_col3:
        selected_products = st.multiselect(
            "Products Mentioned",
            options=filter_options['products'],
            default=[],
            help="Filter by Sabian products mentioned",
            key="sa_product_filter"
        )

    with filter_col4:
        selected_competitors = st.multiselect(
            "Competitors Mentioned",
            options=filter_options['competitors'],
            default=[],
            help="Filter posts mentioning competitors",
            key="sa_competitor_filter"
        )

    # Row 2: Additional filters
    filter_col5, filter_col6, filter_col7, filter_col8 = st.columns(4)

    with filter_col5:
        selected_intents = st.multiselect(
            "Intents",
            options=filter_options['intents'],
            default=[],
            help="Filter by user intent",
            key="sa_intent_filter"
        )

    with filter_col6:
        selected_purchase_stages = st.multiselect(
            "Purchase Stage",
            options=filter_options['purchase_stages'],
            default=[],
            help="Filter by purchase journey stage",
            key="sa_purchase_stage_filter"
        )

    with filter_col7:
        selected_pain_points = st.multiselect(
            "Pain Points",
            options=filter_options['pain_points'],
            default=[],
            help="Filter by mentioned pain points",
            key="sa_pain_points_filter"
        )

    with filter_col8:
        selected_delight_factors = st.multiselect(
            "Delight Factors",
            options=filter_options['delight_factors'],
            default=[],
            help="Filter by mentioned delight factors",
            key="sa_delight_filter"
        )

    # Row 3: Context, status, and date filters
    filter_col9, filter_col10, filter_col11, _ = st.columns(4)

    with filter_col9:
        selected_contexts = st.multiselect(
            "Mention Context",
            options=filter_options['mention_contexts'],
            default=[],
            help="Filter by how Sabian is mentioned",
            key="sa_context_filter"
        )

    with filter_col10:
        selected_statuses = st.multiselect(
            "Processing Status",
            options=filter_options['processing_statuses'],
            default=[],
            help="Filter by processing status (for validation review)",
            key="sa_status_filter"
        )

    with filter_col11:
        date_range = None
        # Determine date column based on source
        if selected_source == "Social Media Comments":
            date_col_name = 'comment_timestamp'
        else:
            # Forum Posts or All Sources (unified into post_created_at)
            date_col_name = 'post_created_at'

        if date_col_name in working_df.columns and not working_df[date_col_name].isna().all():
            valid_dates = working_df[working_df[date_col_name].notna()][date_col_name]
            min_date = valid_dates.min().date()
            max_date = valid_dates.max().date()

            date_range = st.date_input(
                "Date Range",
                value=(min_date, max_date),
                min_value=min_date,
                max_value=max_date,
                key="sa_date_range"
            )

    # Apply filters (None means "no constraint" for each dimension)
    filtered_df = data_loader.apply_filters(
        working_df,
        sentiments=selected_sentiments if selected_sentiments else None,
        author_roles=selected_author_roles if selected_author_roles else None,
        mention_contexts=selected_contexts if selected_contexts else None,
        products=selected_products if selected_products else None,
        competitors=selected_competitors if selected_competitors else None,
        intents=selected_intents if selected_intents else None,
        pain_points=selected_pain_points if selected_pain_points else None,
        delight_factors=selected_delight_factors if selected_delight_factors else None,
        purchase_stages=selected_purchase_stages if selected_purchase_stages else None,
        date_range=date_range if date_range and len(date_range) == 2 else None,
        processing_statuses=selected_statuses if selected_statuses else None,
        platforms=selected_platforms if selected_platforms else None,
        channels=selected_channels if selected_channels else None
    )

    # Check filter status
    filters_active = any([
        selected_sentiments, selected_author_roles, selected_products,
        selected_competitors, selected_intents, selected_purchase_stages,
        selected_pain_points, selected_delight_factors, selected_contexts,
        selected_statuses, selected_platforms, selected_channels
    ])

    st.markdown("---")

    # Determine label for items based on source
    if selected_source == "All Sources":
        item_label = "items"
    elif selected_source == "Social Media Comments":
        item_label = "comments"
    else:
        item_label = "posts"

    # Filter summary
    if filters_active:
        st.success(f"✅ **Filters Applied:** Showing **{len(filtered_df):,}** of {len(working_df):,} {item_label}")

        # Show active filters — must mirror every dimension counted in filters_active
        active_filters = {}
        if selected_platforms:
            active_filters['Platforms'] = selected_platforms
        if selected_channels:
            active_filters['Channels'] = selected_channels
        if selected_sentiments:
            active_filters['Sentiments'] = selected_sentiments
        if selected_author_roles:
            active_filters['Author Roles'] = selected_author_roles
        if selected_products:
            active_filters['Products'] = selected_products
        if selected_competitors:
            active_filters['Competitors'] = selected_competitors
        if selected_intents:
            active_filters['Intents'] = selected_intents
        if selected_purchase_stages:
            active_filters['Purchase Stages'] = selected_purchase_stages
        # FIX: pain points / delight factors were applied (and counted in
        # filters_active) but never shown in the summary card — add them so
        # the displayed summary matches the filters actually in effect.
        if selected_pain_points:
            active_filters['Pain Points'] = selected_pain_points
        if selected_delight_factors:
            active_filters['Delight Factors'] = selected_delight_factors
        if selected_contexts:
            active_filters['Mention Contexts'] = selected_contexts
        if selected_statuses:
            active_filters['Processing Status'] = selected_statuses

        ContentCards.display_filter_summary(active_filters)
    else:
        st.info(f"📊 Showing all **{len(filtered_df):,}** {item_label}. Use filters above to narrow your analysis.")

    if filtered_df.empty:
        st.warning("No data matches the selected filters. Try adjusting your criteria.")
        return

    # ============== SUMMARY STATISTICS ==============
    st.markdown("### 📊 Filtered Data Summary")

    filtered_metrics = BrandMetrics.calculate_overall_metrics(filtered_df)

    if selected_source == "Social Media Comments":
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            st.metric("Total Comments", f"{filtered_metrics['total_posts']:,}")
        with col2:
            platforms_count = filtered_df['platform'].nunique() if 'platform' in filtered_df.columns else 0
            st.metric("Platforms", platforms_count)
        with col3:
            st.metric("Avg Sentiment Score", f"{filtered_metrics['avg_sentiment_score']:.2f}")
        with col4:
            st.metric("Positive %", f"{filtered_metrics['positive_pct']:.1f}%")
        with col5:
            st.metric("Negative %", f"{filtered_metrics['negative_pct']:.1f}%")

    elif selected_source == "All Sources":
        col1, col2, col3, col4, col5, col6 = st.columns(6)
        with col1:
            st.metric("Total Items", f"{filtered_metrics['total_posts']:,}")
        with col2:
            forum_count = len(filtered_df[filtered_df['data_source'] == 'forums'])
            st.metric("Forum Posts", f"{forum_count:,}")
        with col3:
            comment_count = len(filtered_df[filtered_df['data_source'] == 'comments'])
            st.metric("Comments", f"{comment_count:,}")
        with col4:
            st.metric("Avg Sentiment", f"{filtered_metrics['avg_sentiment_score']:.2f}")
        with col5:
            st.metric("Positive %", f"{filtered_metrics['positive_pct']:.1f}%")
        with col6:
            st.metric("Negative %", f"{filtered_metrics['negative_pct']:.1f}%")

    else:  # Forum Posts
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            st.metric("Total Posts", f"{filtered_metrics['total_posts']:,}")
        with col2:
            st.metric("Unique Threads", f"{filtered_metrics['unique_threads']:,}")
        with col3:
            st.metric("Avg Sentiment Score", f"{filtered_metrics['avg_sentiment_score']:.2f}")
        with col4:
            st.metric("Positive %", f"{filtered_metrics['positive_pct']:.1f}%")
        with col5:
            st.metric("Negative %", f"{filtered_metrics['negative_pct']:.1f}%")

    st.markdown("---")

    # ============== VISUALIZATIONS ==============
    st.markdown("### 📈 Filtered Data Visualizations")

    col1, col2 = st.columns(2)

    with col1:
        # Sentiment distribution
        sentiment_pie = sentiment_charts.create_sentiment_pie_chart(
            filtered_df, title="Sentiment Distribution"
        )
        st.plotly_chart(sentiment_pie, use_container_width=True, key="filtered_sentiment_pie")

    with col2:
        # Intent distribution
        intents_dist = processor.get_intents_distribution(filtered_df)
        if not intents_dist.empty:
            intents_chart = distribution_charts.create_intent_bar_chart(
                intents_dist, title="Intent Distribution", orientation='h'
            )
            st.plotly_chart(intents_chart, use_container_width=True, key="filtered_intents")

    # Product and competitor analysis for filtered data
    col1, col2 = st.columns(2)

    with col1:
        products_dist = processor.get_products_distribution(filtered_df)
        if not products_dist.empty:
            products_chart = brand_charts.create_products_horizontal_bar(
                products_dist.head(8), title="Products in Filtered Data"
            )
            st.plotly_chart(products_chart, use_container_width=True, key="filtered_products")

    with col2:
        competitors_dist = processor.get_competitors_distribution(filtered_df)
        if not competitors_dist.empty:
            competitors_chart = brand_charts.create_competitors_bar_chart(
                competitors_dist.head(8), title="Competitors in Filtered Data"
            )
            st.plotly_chart(competitors_chart, use_container_width=True, key="filtered_competitors")

    st.markdown("---")

    # ============== AI INSIGHTS ==============
    st.markdown("### 🤖 AI-Powered Brand Insights")

    if agent_available:
        # Build filter description including source info
        filter_parts = [f"Source: {selected_source}"]
        if selected_platforms:
            filter_parts.append(f"Platforms: {', '.join(selected_platforms)}")
        if selected_channels:
            filter_parts.append(f"Channels: {', '.join(selected_channels)}")
        if selected_sentiments:
            filter_parts.append(f"Sentiments: {', '.join(selected_sentiments)}")
        if selected_products:
            filter_parts.append(f"Products: {', '.join(selected_products)}")
        if selected_competitors:
            filter_parts.append(f"Competitors: {', '.join(selected_competitors)}")
        if selected_author_roles:
            filter_parts.append(f"Author Roles: {', '.join(selected_author_roles)}")

        filter_description = "; ".join(filter_parts)

        col1, col2 = st.columns([1, 3])

        with col1:
            generate_insights = st.button(
                "🔍 Generate AI Insights",
                help="Analyze filtered data and generate brand-focused insights",
                use_container_width=True
            )

        with col2:
            st.write(f"*Will analyze up to 50 sampled {item_label} from the {len(filtered_df):,} filtered {item_label}*")

        if generate_insights:
            with st.spinner("Analyzing data with AI... This may take a moment."):
                result = insight_agent.process({
                    'posts': filtered_df,
                    'filter_description': filter_description
                })

                if result['success']:
                    st.session_state.brand_insights = result
                else:
                    st.error(f"Failed to generate insights: {result.get('error', 'Unknown error')}")

        # Display insights if available
        if st.session_state.brand_insights and st.session_state.brand_insights.get('success'):
            insights = st.session_state.brand_insights['insights']
            metadata = st.session_state.brand_insights['metadata']

            with st.expander("📊 AI Analysis Report", expanded=True):
                ContentCards.display_ai_insights(insights)

                # Metadata
                st.markdown("---")
                meta_col1, meta_col2, meta_col3 = st.columns(3)
                with meta_col1:
                    st.metric("Posts Analyzed", metadata.get('total_posts_analyzed', 0))
                with meta_col2:
                    st.metric("Model Used", metadata.get('model_used', 'N/A'))
                with meta_col3:
                    st.metric("Tokens Used", metadata.get('tokens_used', 0))

            # Clear insights button
            if st.button("🗑️ Clear Insights"):
                st.session_state.brand_insights = None
                st.rerun()
    else:
        st.warning("AI insights are currently unavailable. Please ensure OPENAI_API_KEY is configured.")

    st.markdown("---")

    # ============== POSTS/COMMENTS EXPLORER ==============
    explorer_label = "Comments" if selected_source == "Social Media Comments" else "Posts"
    st.markdown(f"### 📋 Explore {explorer_label}")

    # Sorting options - adapt to data source
    sort_col1, sort_col2, _ = st.columns([1, 1, 2])

    with sort_col1:
        if selected_source == "Social Media Comments":
            sort_options = [
                ('comment_timestamp', 'Date (Newest First)'),
                ('sentiment_level', 'Sentiment'),
                ('author_role', 'Author Role'),
                ('platform', 'Platform')
            ]
        elif selected_source == "All Sources":
            sort_options = [
                ('post_created_at', 'Date (Newest First)'),
                ('sentiment_level', 'Sentiment'),
                ('author_role', 'Author Role'),
                ('data_source', 'Data Source')
            ]
        else:
            sort_options = [
                ('post_created_at', 'Date (Newest First)'),
                ('sentiment_level', 'Sentiment'),
                ('author_role', 'Author Role'),
                ('sabian_mention_context', 'Mention Context')
            ]

        sort_by = st.selectbox(
            "Sort by",
            options=sort_options,
            format_func=lambda x: x[1],
            index=0,
            key="sa_sort_by"
        )

    with sort_col2:
        posts_per_page = st.selectbox(
            "Items per page",
            options=[5, 10, 20, 50],
            index=1,
            key="sa_per_page"
        )

    # Sort dataframe (date columns descend = newest first; everything else ascends)
    sort_column = sort_by[0]
    if sort_column in filtered_df.columns:
        ascending = sort_column not in ('post_created_at', 'comment_timestamp')
        sorted_df = filtered_df.sort_values(sort_column, ascending=ascending, na_position='last')
    else:
        sorted_df = filtered_df

    # Pagination
    total_posts = len(sorted_df)
    total_pages = max(1, (total_posts - 1) // posts_per_page + 1)

    # Reset page if needed (page count shrinks when filters tighten)
    if st.session_state.analysis_page > total_pages:
        st.session_state.analysis_page = 1

    # Pagination controls at top
    if total_posts > posts_per_page:
        st.info(f"📄 Page {st.session_state.analysis_page} of {total_pages} ({total_posts:,} total {explorer_label.lower()})")

        col_prev, col_page, col_next = st.columns([1, 2, 1])

        with col_prev:
            if st.button("⬅️ Previous", key="prev_top", disabled=st.session_state.analysis_page == 1):
                st.session_state.analysis_page -= 1
                st.rerun()

        with col_page:
            # Page selector
            page_num = st.number_input(
                "Go to page",
                min_value=1,
                max_value=total_pages,
                value=st.session_state.analysis_page,
                key="page_selector"
            )
            if page_num != st.session_state.analysis_page:
                st.session_state.analysis_page = page_num
                st.rerun()

        with col_next:
            if st.button("Next ➡️", key="next_top", disabled=st.session_state.analysis_page >= total_pages):
                st.session_state.analysis_page += 1
                st.rerun()

    st.markdown("---")

    # Calculate indices
    start_idx = (st.session_state.analysis_page - 1) * posts_per_page
    end_idx = min(start_idx + posts_per_page, total_posts)

    # Display items with appropriate cards
    paginated_df = sorted_df.iloc[start_idx:end_idx]

    for idx, (_, row) in enumerate(paginated_df.iterrows(), start_idx + 1):
        is_comment = row.get('data_source') == 'comments'
        if is_comment:
            st.markdown(f"#### Comment #{idx}")
            ContentCards.display_comment_card(row, show_full=False)
        else:
            st.markdown(f"#### Post #{idx}")
            ContentCards.display_post_card(row, show_full=False)

    # Pagination controls at bottom
    if total_posts > posts_per_page:
        st.markdown("---")

        col_prev_b, col_info_b, col_next_b = st.columns([1, 2, 1])

        with col_prev_b:
            if st.button("⬅️ Previous", key="prev_bottom", disabled=st.session_state.analysis_page == 1):
                st.session_state.analysis_page -= 1
                st.rerun()

        with col_info_b:
            # NOTE(review): the original HTML markup was lost in extraction;
            # reconstructed as a centered page indicator — confirm against VCS.
            st.markdown(
                f"<div style='text-align: center'>Page {st.session_state.analysis_page} / {total_pages}</div>",
                unsafe_allow_html=True
            )

        with col_next_b:
            if st.button("Next ➡️", key="next_bottom", disabled=st.session_state.analysis_page >= total_pages):
                st.session_state.analysis_page += 1
                st.rerun()

    st.markdown("---")

    # ============== EXPORT ==============
    st.markdown("### 💾 Export Data")

    col1, col2 = st.columns([1, 3])

    with col1:
        # Prepare export columns based on source
        if selected_source == "Social Media Comments":
            export_columns = [
                'comment_id', 'platform', 'channel_display_name', 'comment_timestamp',
                'author_name', 'author_role', 'sentiment_level', 'emotion_type',
                'sabian_mention_context', 'products_mentioned', 'competitors_mentioned',
                'intents', 'pain_points', 'delight_factors', 'purchase_stage',
                'original_text', 'analysis_notes'
            ]
        elif selected_source == "All Sources":
            export_columns = [
                'data_source', 'post_id', 'comment_id', 'platform',
                'thread_title', 'channel_display_name',
                'post_created_at', 'comment_timestamp',
                'author_role', 'sentiment_level', 'emotion_type',
                'sabian_mention_context', 'products_mentioned', 'competitors_mentioned',
                'intents', 'pain_points', 'delight_factors', 'purchase_stage',
                'display_text', 'analysis_notes'
            ]
        else:
            export_columns = [
                'post_id', 'thread_id', 'thread_title', 'post_created_at',
                'author_role', 'sentiment_level', 'emotion_type', 'sabian_mention_context',
                'products_mentioned', 'competitors_mentioned', 'intents',
                'pain_points', 'delight_factors', 'purchase_stage',
                'cleaned_content', 'thread_context_summary', 'analysis_notes'
            ]

        # Filter to existing columns
        export_columns = [col for col in export_columns if col in filtered_df.columns]
        export_df = filtered_df[export_columns].copy()

        # Convert lists to strings for CSV
        list_columns = ['products_mentioned', 'competitors_mentioned', 'intents', 'pain_points', 'delight_factors']
        for col in list_columns:
            if col in export_df.columns:
                export_df[col] = export_df[col].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

        csv = export_df.to_csv(index=False)

        source_suffix = selected_source.lower().replace(' ', '_')
        st.download_button(
            label=f"📥 Download Filtered {explorer_label} (CSV)",
            data=csv,
            file_name=f"sabian_sentiment_{source_suffix}_{len(filtered_df)}.csv",
            mime="text/csv"
        )

    with col2:
        st.info(f"Download **{len(filtered_df):,}** filtered {explorer_label.lower()} for further analysis")
"#4CAF50", + "durability": "#2196F3", + "playability": "#9C27B0", + "versatility": "#FF9800", + "customer_service": "#00BCD4", + "availability": "#607D8B", + "aesthetics": "#E91E63" + }, + "competitors": { + "Zildjian": "#1E88E5", + "Meinl": "#43A047", + "Paiste": "#FF9800", + "Dream Cymbals": "#9C27B0", + "Istanbul Agop": "#F44336", + "Bosphorus": "#00BCD4", + "default": "#9E9E9E" + }, + "products": { + "HHX": "#C8102E", + "AAX": "#FF6B35", + "Artisan": "#FFD700", + "FRX": "#4CAF50", + "Omni": "#2196F3", + "Chopper": "#9C27B0", + "Stratus": "#00BCD4", + "XSR": "#FF9800", + "B8X": "#607D8B", + "SBR": "#9E9E9E", + "default": "#CCCCCC" + }, + "brand_switching": { + "switching_to_sabian": "#4CAF50", + "switching_from_sabian": "#F44336", + "no_switching": "#9E9E9E" + }, + "emotion_type": { + "frustration": "#FF5722", + "disappointment": "#FF9800", + "anger": "#D32F2F", + "satisfaction": "#4CAF50", + "excitement": "#00C851", + "curiosity": "#2196F3", + "indifference": "#9E9E9E" + }, + "processing_status": { + "completed": "#4CAF50", + "completed_with_flags": "#FF9800", + "validation_failed": "#F44336", + "workflow_error": "#D32F2F" + }, + "platform": { + "musora_forums": "#C8102E", + "youtube": "#FF0000", + "default": "#9E9E9E" + }, + "data_source": { + "forums": "#C8102E", + "comments": "#FF6B35" + } + }, + "sentiment_order": [ + "very_positive", + "positive", + "neutral", + "negative", + "very_negative" + ], + "intent_order": [ + "seeking_information", + "providing_information", + "sharing_experience", + "comparing", + "praising", + "criticizing", + "buying_selling", + "general_discussion" + ], + "author_role_order": [ + "current_owner", + "past_owner", + "potential_buyer", + "never_owned", + "unknown" + ], + "negative_sentiments": [ + "negative", + "very_negative" + ], + "positive_sentiments": [ + "positive", + "very_positive" + ], + "sabian_products": [ + "HHX", + "AAX", + "Artisan", + "FRX", + "Omni", + "Chopper", + "Stratus", + "XSR", + "B8X", + "SBR" + ], + 
"competitors": [ + "Zildjian", + "Meinl", + "Paiste", + "Dream Cymbals", + "Istanbul Agop", + "Bosphorus" + ], + "competitors_with_aliases": [ + { + "name": "Zildjian", + "aliases": ["zildjian", "z custom", "a custom", "k custom", "k zildjian", "a zildjian"] + }, + { + "name": "Meinl", + "aliases": ["meinl", "byzance"] + }, + { + "name": "Paiste", + "aliases": ["paiste", "formula 602"] + }, + { + "name": "Dream Cymbals", + "aliases": ["dream cymbals"] + }, + { + "name": "Istanbul Agop", + "aliases": ["istanbul agop", "agop", "istanbul mehmet"] + }, + { + "name": "Bosphorus", + "aliases": ["bosphorus"] + } + ], + "dashboard": { + "default_date_range_days": null, + "max_posts_display": 100, + "chart_height": 400, + "top_n_products": 10, + "top_n_competitors": 10 + }, + "page_config": { + "page_title": "Sabian Brand Sentiment Dashboard", + "page_icon": "🥁", + "layout": "wide", + "initial_sidebar_state": "expanded" + }, + "snowflake": { + "posts_query": "SELECT * FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS WHERE IS_RELEVANT = TRUE AND PROCESSING_SUCCESS = TRUE", + "comments_query": "SELECT * FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS_COMMENTS WHERE IS_RELEVANT = TRUE AND PROCESSING_SUCCESS = TRUE", + "users_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, u.DRUMS_PLAYING_SINCE_YEAR, u.DRUMS_GEAR_STICK_BRANDS, u.DRUMS_GEAR_SET_BRANDS, u.DRUMS_GEAR_HARDWARE_BRANDS, u.DRUMS_GEAR_CYMBAL_BRANDS FROM stitch.musora_ecom_db.usora_users u WHERE u.id IN (SELECT DISTINCT POST_AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.SABIAN_BRAND_ANALYSIS WHERE IS_RELEVANT = TRUE AND PROCESSING_SUCCESS = TRUE AND POST_AUTHOR_ID IS NOT NULL)" + }, + "demographics": { + "experience_groups": { + "Beginner (0-5 years)": [0, 5], + "Intermediate (5-15 years)": [5, 15], + "Advanced (15-25 years)": [15, 25], + "Expert (25+ years)": [25, 100] + }, + "age_groups": { + "18-24": [18, 24], + "25-34": [25, 34], + "35-44": [35, 44], + "45-54": [45, 54], + 
"55+": [55, 150] + } + } +} diff --git a/visualization_brand_sentiment/data/__init__.py b/visualization_brand_sentiment/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..30ff27e6ec285121782ad2814f1421347a6f0435 --- /dev/null +++ b/visualization_brand_sentiment/data/__init__.py @@ -0,0 +1,6 @@ +""" +Data loading modules for Brand Sentiment Visualization +""" +from .data_loader import BrandSentimentDataLoader + +__all__ = ['BrandSentimentDataLoader'] diff --git a/visualization_brand_sentiment/data/data_loader.py b/visualization_brand_sentiment/data/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..f04fefd13c5e7dcc4d3a68d3fc87078b44c447e3 --- /dev/null +++ b/visualization_brand_sentiment/data/data_loader.py @@ -0,0 +1,757 @@ +""" +Data loader module for Brand Sentiment Analysis Visualization +Handles Snowflake connection and data loading with caching +""" +import sys +import os +import pandas as pd +import numpy as np +import streamlit as st +from pathlib import Path +import json +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta + +# Add parent directory to path to import SnowFlakeConnection +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +from visualization.SnowFlakeConnection import SnowFlakeConn + + +class BrandSentimentDataLoader: + """ + Loads brand sentiment analysis data from Snowflake with caching + """ + + def __init__(self, config_path=None): + """ + Initialize data loader + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + self.posts_query = self.config['snowflake']['posts_query'] + self.comments_query = self.config['snowflake']['comments_query'] + self.users_query = self.config['snowflake']['users_query'] 
+ self.competitors = self.config['competitors'] + self.competitors_with_aliases = self.config.get('competitors_with_aliases', []) + + @st.cache_data(ttl=3600) + def load_posts_data(_self, reload=False): + """ + Load brand sentiment posts data from Snowflake + + Args: + reload: Force reload data (bypass cache) + + Returns: + pd.DataFrame: Brand sentiment analysis data + """ + try: + conn = SnowFlakeConn() + df = conn.run_read_query(_self.posts_query, "brand sentiment posts") + conn.close_connection() + + if df is None or df.empty: + st.error("No data returned from Snowflake") + return pd.DataFrame() + + df = _self._process_posts_dataframe(df) + return df + + except Exception as e: + st.error(f"Error loading posts data from Snowflake: {e}") + return pd.DataFrame() + + @st.cache_data(ttl=3600) + def load_comments_data(_self, reload=False): + """ + Load brand sentiment comments data from Snowflake + + Args: + reload: Force reload data (bypass cache) + + Returns: + pd.DataFrame: Brand sentiment comments data + """ + try: + conn = SnowFlakeConn() + df = conn.run_read_query(_self.comments_query, "brand sentiment comments") + conn.close_connection() + + if df is None or df.empty: + return pd.DataFrame() + + df = _self._process_comments_dataframe(df) + return df + + except Exception as e: + st.warning(f"Could not load comments data: {str(e)}") + return pd.DataFrame() + + @st.cache_data(ttl=3600) + def load_users_data(_self, reload=False): + """ + Load user demographic data from Snowflake + + Returns: + pd.DataFrame: User demographic data + """ + try: + conn = SnowFlakeConn() + df = conn.run_read_query(_self.users_query, "user demographics") + conn.close_connection() + + if df is None or df.empty: + return pd.DataFrame() + + df = _self._process_users_dataframe(df) + return df + + except Exception as e: + st.warning(f"Could not load demographic data: {str(e)}") + return pd.DataFrame() + + @st.cache_data(ttl=1800) + def load_youtube_comment_count(_self, date_range=None, 
reload=False): + """ + Load the total YouTube comment count from SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS. + + This is the *full* raw count within the date window, not just the + Sabian-relevant analyzed sample stored in SABIAN_BRAND_ANALYSIS_COMMENTS. + Use this figure to contextualise the analyzed sample size. + + Args: + date_range: Optional tuple of (start_date, end_date) as + datetime.date objects to restrict the date window. + reload: Force reload data (bypass cache) + + Returns: + int | None: Total comment count, or None if Snowflake is unavailable. + """ + try: + # viz_dir must be on sys.path so we can import from utils/ + _viz_dir = str(Path(__file__).resolve().parent.parent) + if _viz_dir not in sys.path: + sys.path.insert(0, _viz_dir) + from utils.report_context import ReportContextLoader + loader = ReportContextLoader() + return loader.load_youtube_comment_count(date_range=date_range) + except Exception as e: + return None + + @st.cache_data(ttl=1800) + def load_competitor_forum_mentions(_self, date_range=None, reload=False): + """ + Load competitor mention counts from raw forum posts that are NOT + already in the processed Sabian brand analysis. + + This enables an accurate overall brand mentions comparison by + capturing competitor discussions that don't mention Sabian. 
+ + Args: + date_range: Optional tuple of (start_date, end_date) as + datetime.date objects to restrict the date window + reload: Force reload data (bypass cache) + + Returns: + pd.DataFrame: DataFrame with columns (brand, additional_mentions) + """ + try: + query = _self._build_competitor_mention_query(date_range=date_range) + if not query: + return pd.DataFrame(columns=['brand', 'additional_mentions']) + + conn = SnowFlakeConn() + df = conn.run_read_query(query, "competitor forum mentions") + conn.close_connection() + + if df is None or df.empty: + return pd.DataFrame(columns=['brand', 'additional_mentions']) + + df.columns = df.columns.str.lower() + return df + + except Exception as e: + st.warning(f"Could not load competitor forum mentions: {str(e)}") + return pd.DataFrame(columns=['brand', 'additional_mentions']) + + def _build_competitor_mention_query(self, date_range=None): + """ + Build a UNION ALL query that counts each competitor's mentions + in raw forum posts not already processed in sabian_brand_analysis. + + Uses LIKE with word-boundary patterns (space, punctuation, start/end of + string) matching the proven approach from the sample query. 
+ + Args: + date_range: Optional tuple of (start_date, end_date) as + datetime.date objects to filter by POST_CREATED_AT + + Returns: + str: SQL query string, or empty string if no aliases configured + """ + if not self.competitors_with_aliases: + return "" + + excluded_posts_subquery = ( + "SELECT DISTINCT post_id " + "FROM social_media_db.ml_features.sabian_brand_analysis" + ) + + # Build date filter clause if a range is provided + date_filter = "" + if date_range and len(date_range) == 2: + start_date, end_date = date_range + end_exclusive = end_date + timedelta(days=1) + date_filter = ( + f"AND fc.POST_CREATED_AT >= '{start_date}' " + f"AND fc.POST_CREATED_AT < '{end_exclusive}'" + ) + + union_parts = [] + for competitor in self.competitors_with_aliases: + brand_name = competitor['name'] + aliases = competitor.get('aliases', []) + if not aliases: + continue + + # Build LIKE conditions for each alias with word-boundary matching + like_conditions = [] + for alias in aliases: + like_conditions.extend(self._build_like_patterns('fc.POST_CONTENT', alias)) + + conditions_sql = " OR ".join(like_conditions) + part = ( + f"SELECT '{brand_name}' AS brand, COUNT(*) AS additional_mentions " + f"FROM social_media_db.core.forum_posts fc " + f"WHERE ({conditions_sql}) " + f"AND fc.post_id NOT IN ({excluded_posts_subquery}) " + f"{date_filter}" + ) + union_parts.append(part) + + if not union_parts: + return "" + + return " UNION ALL ".join(union_parts) + + @staticmethod + def _build_like_patterns(column, alias): + """ + Generate a list of LIKE conditions that match an alias at word + boundaries (space, punctuation, start/end of text). + + Mirrors the pattern used in the verified Zildjian sample query. + + Args: + column: SQL column name (e.g. 
'fc.POST_CONTENT') + alias: Lowercase alias string to match + + Returns: + list[str]: List of SQL LIKE condition strings + """ + a = alias.replace("'", "''") # escape single quotes for SQL + col = f"LOWER({column})" + return [ + f"{col} LIKE '% {a}%'", + f"{col} LIKE '{a}%'", + f"{col} LIKE '% {a}'", + f"{col} LIKE '% {a}.%'", + f"{col} LIKE '% {a},%'", + f"{col} LIKE '% {a}!%'", + f"{col} LIKE '% {a}?%'", + f"{col} LIKE '% {a};%'", + f"{col} LIKE '% {a}:%'", + f"{col} LIKE '%({a})%'", + f"{col} LIKE '%({a}%'", + f"{col} LIKE '% {a})%'", + ] + + def _process_posts_dataframe(self, df): + """ + Process and clean the posts dataframe + + Args: + df: Raw dataframe from Snowflake + + Returns: + pd.DataFrame: Processed dataframe + """ + df.columns = df.columns.str.lower() + + # Parse datetime columns + datetime_cols = ['post_created_at', 'thread_started_at', 'processed_at'] + for col in datetime_cols: + if col in df.columns: + df[col] = pd.to_datetime(df[col], errors='coerce') + + # Handle null values in key columns + df['sentiment_level'] = df['sentiment_level'].fillna('unknown') + df['author_role'] = df['author_role'].fillna('unknown') + df['sabian_mention_context'] = df['sabian_mention_context'].fillna('unknown') + df['emotion_type'] = df['emotion_type'].fillna('unknown') + df['processing_status'] = df['processing_status'].fillna('unknown') + + # Parse JSON array columns + json_columns = [ + 'products_mentioned', 'product_attributes', 'competitors_mentioned', + 'competitor_products_owned', 'intents', 'decision_drivers', + 'pain_points', 'delight_factors', 'validation_flags', + 'validation_errors', 'validation_warnings' + ] + + for col in json_columns: + if col in df.columns: + df[col] = df[col].apply(self._safe_parse_json) + + # Create display text (cleaned content or original) + df['display_text'] = df.apply( + lambda row: row.get('cleaned_content', '') or row.get('original_content', ''), + axis=1 + ) + + # Create shortened display text + df['display_text_short'] = 
df['display_text'].apply( + lambda x: x[:200] + '...' if isinstance(x, str) and len(x) > 200 else x + ) + + # Extract brand switching direction + df['switching_direction'] = df['brand_switching'].apply(self._categorize_switching) + + return df + + def _process_comments_dataframe(self, df): + """ + Process and clean the comments dataframe. + Normalizes column names to enable reuse of shared chart functions. + + Args: + df: Raw comments dataframe from Snowflake + + Returns: + pd.DataFrame: Processed comments dataframe + """ + df.columns = df.columns.str.lower() + + # Mark data source + df['data_source'] = 'comments' + + # Parse datetime columns + datetime_cols = ['comment_timestamp', 'processed_at'] + for col in datetime_cols: + if col in df.columns: + df[col] = pd.to_datetime(df[col], errors='coerce') + + # Normalize platform to lowercase + if 'platform' in df.columns: + df['platform'] = df['platform'].fillna('unknown').str.lower() + + # Handle null values in shared key columns + df['sentiment_level'] = df['sentiment_level'].fillna('unknown') + df['author_role'] = df['author_role'].fillna('unknown') + df['sabian_mention_context'] = df['sabian_mention_context'].fillna('unknown') + df['emotion_type'] = df['emotion_type'].fillna('unknown') + df['processing_status'] = df['processing_status'].fillna('unknown') + + # Parse JSON array columns (same as posts) + json_columns = [ + 'products_mentioned', 'product_attributes', 'competitors_mentioned', + 'competitor_products_owned', 'intents', 'decision_drivers', + 'pain_points', 'delight_factors', 'validation_flags', + 'validation_errors', 'validation_warnings' + ] + for col in json_columns: + if col in df.columns: + df[col] = df[col].apply(self._safe_parse_json) + + # Create display text from original_text + df['display_text'] = df['original_text'].fillna('') + + # Create shortened display text + df['display_text_short'] = df['display_text'].apply( + lambda x: x[:200] + '...' 
if isinstance(x, str) and len(x) > 200 else x + ) + + # Extract brand switching direction + df['switching_direction'] = df['brand_switching'].apply(self._categorize_switching) if 'brand_switching' in df.columns else 'no_switching' + + return df + + def _process_users_dataframe(self, df): + """ + Process and enrich user demographic dataframe + + Args: + df: Raw demographics dataframe + + Returns: + pd.DataFrame: Processed demographics + """ + df.columns = df.columns.str.lower() + + # Parse birthday and calculate age + if 'birthday' in df.columns: + df['birthday'] = df['birthday'].astype(str) + df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce', utc=True) + df['birthday'] = df['birthday'].dt.tz_localize(None) + df['age'] = df['birthday'].apply(self._calculate_age) + df['age_group'] = df['age'].apply(self._categorize_age) + + # Calculate drumming experience from DRUMS_PLAYING_SINCE_YEAR + if 'drums_playing_since_year' in df.columns: + current_year = datetime.now().year + df['drums_experience_years'] = df['drums_playing_since_year'].apply( + lambda x: current_year - x if pd.notna(x) and x > 1900 and x <= current_year else None + ) + df['experience_group'] = df['drums_experience_years'].apply(self._categorize_experience) + + # Parse cymbal brands for competitor analysis + if 'drums_gear_cymbal_brands' in df.columns: + df['owns_sabian'] = df['drums_gear_cymbal_brands'].apply( + lambda x: self._check_brand_in_text(x, ['sabian']) + ) + df['cymbal_brands_list'] = df['drums_gear_cymbal_brands'].apply( + lambda x: self._parse_brands_from_text(x) + ) + + # Extract timezone region + if 'timezone' in df.columns: + df['timezone_region'] = df['timezone'].apply(self._extract_timezone_region) + + # Remove null user_ids + if 'user_id' in df.columns: + df = df[df['user_id'].notna()] + + return df + + @staticmethod + def _safe_parse_json(value): + """Safely parse JSON string to list""" + if pd.isna(value) or value is None: + return [] + if isinstance(value, list): + return 
value + try: + import json + parsed = json.loads(value) + return parsed if isinstance(parsed, list) else [] + except: + return [] + + @staticmethod + def _calculate_age(birthday): + """Calculate age from birthday""" + if pd.isna(birthday): + return None + try: + today = datetime.now() + age = relativedelta(today, birthday).years + if 0 <= age <= 120: + return age + return None + except: + return None + + def _categorize_age(self, age): + """Categorize age into groups""" + if pd.isna(age) or age is None: + return 'Unknown' + + age_groups = self.config.get('demographics', {}).get('age_groups', {}) + for group_name, (min_age, max_age) in age_groups.items(): + if min_age <= age <= max_age: + return group_name + return 'Unknown' + + def _categorize_experience(self, years): + """Categorize drumming experience into groups""" + if pd.isna(years) or years is None: + return 'Unknown' + + exp_groups = self.config.get('demographics', {}).get('experience_groups', {}) + for group_name, (min_years, max_years) in exp_groups.items(): + if min_years <= years < max_years: + return group_name + return 'Unknown' + + @staticmethod + def _extract_timezone_region(timezone): + """Extract region from timezone string""" + if pd.isna(timezone) or not isinstance(timezone, str): + return 'Unknown' + parts = timezone.split('/') + return parts[0] if len(parts) > 0 else 'Unknown' + + @staticmethod + def _categorize_switching(brand_switching): + """Categorize brand switching direction""" + if pd.isna(brand_switching) or not brand_switching: + return 'no_switching' + switching_lower = str(brand_switching).lower() + if 'to_sabian' in switching_lower or 'to sabian' in switching_lower: + return 'switching_to_sabian' + elif 'from_sabian' in switching_lower or 'from sabian' in switching_lower: + return 'switching_from_sabian' + return 'no_switching' + + def _check_brand_in_text(self, text, brands): + """Check if any brand is mentioned in text""" + if pd.isna(text) or not text: + return False + text_lower 
= str(text).lower() + return any(brand.lower() in text_lower for brand in brands) + + def _parse_brands_from_text(self, text): + """Parse all brands from free text""" + if pd.isna(text) or not text: + return [] + + text_lower = str(text).lower() + found_brands = [] + + # Check for Sabian + if 'sabian' in text_lower: + found_brands.append('Sabian') + + # Check for competitors + for competitor in self.competitors: + if competitor.lower() in text_lower: + found_brands.append(competitor) + + return found_brands + + def merge_posts_with_users(self, posts_df, users_df): + """ + Merge posts data with user demographic data + + Args: + posts_df: Posts dataframe + users_df: Users dataframe + + Returns: + pd.DataFrame: Merged dataframe + """ + if users_df.empty: + # Add empty demographic columns + demo_cols = ['age', 'age_group', 'drums_experience_years', 'experience_group', + 'owns_sabian', 'cymbal_brands_list', 'timezone', 'timezone_region', + 'drums_gear_stick_brands', 'drums_gear_set_brands', + 'drums_gear_hardware_brands', 'drums_gear_cymbal_brands'] + for col in demo_cols: + posts_df[col] = None + return posts_df + + # Merge on author_id + if 'post_author_id' in posts_df.columns and 'user_id' in users_df.columns: + posts_df['post_author_id_str'] = posts_df['post_author_id'].astype(str) + users_df['user_id_str'] = users_df['user_id'].astype(str) + + merge_cols = ['user_id_str', 'age', 'age_group', 'drums_experience_years', + 'experience_group', 'owns_sabian', 'cymbal_brands_list', + 'timezone', 'timezone_region', 'drums_gear_stick_brands', + 'drums_gear_set_brands', 'drums_gear_hardware_brands', + 'drums_gear_cymbal_brands'] + + # Only include columns that exist + merge_cols = [col for col in merge_cols if col in users_df.columns] + + merged_df = posts_df.merge( + users_df[merge_cols], + left_on='post_author_id_str', + right_on='user_id_str', + how='left' + ) + + merged_df = merged_df.drop(columns=['post_author_id_str', 'user_id_str'], errors='ignore') + return merged_df 
+ + return posts_df + + @staticmethod + def get_filter_options(df): + """ + Get unique values for filters + + Args: + df: Brand sentiment dataframe + + Returns: + dict: Filter options + """ + options = { + 'sentiments': sorted([s for s in df['sentiment_level'].unique() if s != 'unknown']), + 'author_roles': sorted([r for r in df['author_role'].unique() if r != 'unknown']), + 'mention_contexts': sorted([c for c in df['sabian_mention_context'].unique() if c != 'unknown']), + 'processing_statuses': sorted(df['processing_status'].unique().tolist()) + } + + # Get unique products from JSON arrays + all_products = [] + for products in df['products_mentioned']: + if isinstance(products, list): + all_products.extend(products) + options['products'] = sorted(list(set(all_products))) + + # Get unique competitors + all_competitors = [] + for competitors in df['competitors_mentioned']: + if isinstance(competitors, list): + all_competitors.extend(competitors) + options['competitors'] = sorted(list(set(all_competitors))) + + # Get unique intents + all_intents = [] + for intents in df['intents']: + if isinstance(intents, list): + all_intents.extend(intents) + options['intents'] = sorted(list(set(all_intents))) + + # Get unique pain points + all_pain_points = [] + for pain_points in df['pain_points']: + if isinstance(pain_points, list): + all_pain_points.extend(pain_points) + options['pain_points'] = sorted(list(set(all_pain_points))) + + # Get unique delight factors + all_delights = [] + for delights in df['delight_factors']: + if isinstance(delights, list): + all_delights.extend(delights) + options['delight_factors'] = sorted(list(set(all_delights))) + + # Get purchase stages + options['purchase_stages'] = sorted([s for s in df['purchase_stage'].dropna().unique() if s]) + + return options + + @staticmethod + def get_comment_filter_options(df): + """ + Get unique values for comment-specific filters + + Args: + df: Comments dataframe + + Returns: + dict: Filter options including 
platforms and channels + """ + options = BrandSentimentDataLoader.get_filter_options(df) + + # Add platform filter + if 'platform' in df.columns: + options['platforms'] = sorted([p for p in df['platform'].unique() if p != 'unknown']) + else: + options['platforms'] = [] + + # Add channel filter + if 'channel_display_name' in df.columns: + options['channels'] = sorted([ + c for c in df['channel_display_name'].dropna().unique() if c + ]) + else: + options['channels'] = [] + + return options + + @staticmethod + def apply_filters(df, sentiments=None, author_roles=None, mention_contexts=None, + products=None, competitors=None, intents=None, pain_points=None, + delight_factors=None, purchase_stages=None, date_range=None, + processing_statuses=None, platforms=None, channels=None): + """ + Apply filters to dataframe + + Args: + df: Brand sentiment dataframe + Various filter parameters + + Returns: + pd.DataFrame: Filtered dataframe + """ + filtered_df = df.copy() + + if sentiments and len(sentiments) > 0: + filtered_df = filtered_df[filtered_df['sentiment_level'].isin(sentiments)] + + if author_roles and len(author_roles) > 0: + filtered_df = filtered_df[filtered_df['author_role'].isin(author_roles)] + + if mention_contexts and len(mention_contexts) > 0: + filtered_df = filtered_df[filtered_df['sabian_mention_context'].isin(mention_contexts)] + + if processing_statuses and len(processing_statuses) > 0: + filtered_df = filtered_df[filtered_df['processing_status'].isin(processing_statuses)] + + if purchase_stages and len(purchase_stages) > 0: + filtered_df = filtered_df[filtered_df['purchase_stage'].isin(purchase_stages)] + + # Filter by platform + if platforms and len(platforms) > 0 and 'platform' in filtered_df.columns: + filtered_df = filtered_df[filtered_df['platform'].isin(platforms)] + + # Filter by channel + if channels and len(channels) > 0 and 'channel_display_name' in filtered_df.columns: + filtered_df = 
filtered_df[filtered_df['channel_display_name'].isin(channels)] + + # Filter by products (check if any selected product is in the list) + if products and len(products) > 0: + filtered_df = filtered_df[filtered_df['products_mentioned'].apply( + lambda x: any(p in x for p in products) if isinstance(x, list) else False + )] + + # Filter by competitors + if competitors and len(competitors) > 0: + filtered_df = filtered_df[filtered_df['competitors_mentioned'].apply( + lambda x: any(c in x for c in competitors) if isinstance(x, list) else False + )] + + # Filter by intents + if intents and len(intents) > 0: + filtered_df = filtered_df[filtered_df['intents'].apply( + lambda x: any(i in x for i in intents) if isinstance(x, list) else False + )] + + # Filter by pain points + if pain_points and len(pain_points) > 0: + filtered_df = filtered_df[filtered_df['pain_points'].apply( + lambda x: any(p in x for p in pain_points) if isinstance(x, list) else False + )] + + # Filter by delight factors + if delight_factors and len(delight_factors) > 0: + filtered_df = filtered_df[filtered_df['delight_factors'].apply( + lambda x: any(d in x for d in delight_factors) if isinstance(x, list) else False + )] + + # Filter by date range - support both post_created_at and comment_timestamp + if date_range and len(date_range) == 2: + date_col = None + if 'post_created_at' in filtered_df.columns and not filtered_df['post_created_at'].isna().all(): + date_col = 'post_created_at' + elif 'comment_timestamp' in filtered_df.columns and not filtered_df['comment_timestamp'].isna().all(): + date_col = 'comment_timestamp' + + if date_col: + start_date, end_date = date_range + col = filtered_df[date_col] + if col.dt.tz is not None: + tz = col.dt.tz + start_ts = pd.Timestamp(start_date).tz_localize(tz) + end_ts = pd.Timestamp(end_date).tz_localize(tz) + pd.Timedelta(days=1) - pd.Timedelta(seconds=1) + else: + start_ts = pd.Timestamp(start_date) + end_ts = pd.Timestamp(end_date) + pd.Timedelta(days=1) - 
pd.Timedelta(seconds=1) + filtered_df = filtered_df[ + (col >= start_ts) & (col <= end_ts) + ] + + return filtered_df diff --git a/visualization_brand_sentiment/img/musora.png b/visualization_brand_sentiment/img/musora.png new file mode 100644 index 0000000000000000000000000000000000000000..941dd210827ae56cb1d5bf08948b04c125d97e79 Binary files /dev/null and b/visualization_brand_sentiment/img/musora.png differ diff --git a/visualization_brand_sentiment/requirements.txt b/visualization_brand_sentiment/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ac9081aa3a574cb920f02aae6475bbb64fc5df2 --- /dev/null +++ b/visualization_brand_sentiment/requirements.txt @@ -0,0 +1,33 @@ +# Brand Sentiment - Visualization & Processing Requirements +# Install with: pip install -r requirements.txt + +# Core visualization +streamlit==1.50.0 +plotly==6.3.1 + +# Data processing +pandas==2.3.2 +numpy==2.0.2 +python-dateutil==2.9.0.post0 + +# Snowflake connectivity +snowflake-snowpark-python==1.39.0 + +# Environment management +python-dotenv==1.1.1 + +# AI / LLM (visualization agents + processing pipeline) +openai==1.108.0 +langchain==0.3.27 +langchain-openai==0.3.34 +langgraph==0.6.8 + +# Language detection (processing pipeline) +lingua-language-detector==2.0.2 + +# HTML parsing (processing pipeline) +beautifulsoup4==4.14.3 + +# PDF report generation +fpdf2==2.8.4 +kaleido==1.2.0 diff --git a/visualization_brand_sentiment/utils/__init__.py b/visualization_brand_sentiment/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..62ffb332425d61f2a2f197813620bc60c3488cf7 --- /dev/null +++ b/visualization_brand_sentiment/utils/__init__.py @@ -0,0 +1,17 @@ +""" +Utility modules for Brand Sentiment Visualization +""" +from .data_processor import BrandDataProcessor +from .metrics import BrandMetrics +from .auth import check_authentication, verify_login, get_current_user, logout +from .report_context import ReportContextLoader + 
+__all__ = [ + 'BrandDataProcessor', + 'BrandMetrics', + 'check_authentication', + 'verify_login', + 'get_current_user', + 'logout', + 'ReportContextLoader', +] diff --git a/visualization_brand_sentiment/utils/auth.py b/visualization_brand_sentiment/utils/auth.py new file mode 100644 index 0000000000000000000000000000000000000000..56411fdf91d9cd2a72483b195ea61a9567d44f1a --- /dev/null +++ b/visualization_brand_sentiment/utils/auth.py @@ -0,0 +1,87 @@ +""" +Authentication module for the Brand Sentiment Visualization Tool. + +Handles user authentication and access control. +Works both locally (loading .env) and on Hugging Face (using secrets). +""" + +import os +import streamlit as st +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +# On Hugging Face, env vars are set as secrets and os.getenv works directly. +# Locally, we load from .env files. +_env_path = Path(__file__).resolve().parent.parent / '.env' +if _env_path.exists(): + load_dotenv(_env_path) +else: + _root_env_path = Path(__file__).resolve().parent.parent.parent / '.env' + if _root_env_path.exists(): + load_dotenv(_root_env_path) + +# Authorized emails - team members only +AUTHORIZED_EMAILS = { + "danial@musora.com", + "caleb@musora.com", + "gabriel@musora.com", + "jmilligan@musora.com", + "dave@musora.com", +} + + +def get_valid_token() -> str: + """ + Get the valid access token from environment. + + Returns: + str: Valid access token + """ + return os.getenv("APP_TOKEN", "") + + +def verify_login(email: str, token: str) -> bool: + """ + Verify user login credentials. + + Args: + email: User email address + token: Access token + + Returns: + bool: True if credentials are valid, False otherwise + """ + valid_token = get_valid_token() + email_normalized = email.lower().strip() + + return (email_normalized in AUTHORIZED_EMAILS) and (token == valid_token) + + +def check_authentication() -> bool: + """ + Check if user is authenticated in current session. 
+ + Returns: + bool: True if authenticated, False otherwise + """ + return st.session_state.get("authenticated", False) + + +def get_current_user() -> str: + """ + Get the currently logged-in user's email. + + Returns: + str: User email or empty string if not authenticated + """ + return st.session_state.get("user_email", "") + + +def logout(): + """ + Log out the current user by clearing session state. + """ + for key in ["authenticated", "user_email"]: + if key in st.session_state: + del st.session_state[key] diff --git a/visualization_brand_sentiment/utils/data_processor.py b/visualization_brand_sentiment/utils/data_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..08a17d6ba8d201907da07667530177e774a9c440 --- /dev/null +++ b/visualization_brand_sentiment/utils/data_processor.py @@ -0,0 +1,585 @@ +""" +Data processing utilities for brand sentiment analysis +Handles aggregation, grouping, and transformation operations +""" +import pandas as pd +import numpy as np +from typing import List, Dict, Tuple +from collections import Counter + + +class BrandDataProcessor: + """ + Processes brand sentiment data for visualization + """ + + @staticmethod + def get_sentiment_distribution(df, group_by=None): + """ + Calculate sentiment distribution + + Args: + df: Brand sentiment dataframe + group_by: Optional column(s) to group by + + Returns: + pd.DataFrame: Sentiment distribution + """ + if group_by: + if isinstance(group_by, str): + group_by = [group_by] + + sentiment_counts = df.groupby( + group_by + ['sentiment_level'], + as_index=False + ).size().rename(columns={'size': 'count'}) + + sentiment_counts['percentage'] = sentiment_counts.groupby(group_by)['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + else: + sentiment_counts = df['sentiment_level'].value_counts().reset_index() + sentiment_counts.columns = ['sentiment_level', 'count'] + sentiment_counts['percentage'] = ( + sentiment_counts['count'] / 
sentiment_counts['count'].sum() * 100 + ).round(2) + + return sentiment_counts + + @staticmethod + def get_author_role_distribution(df): + """ + Get distribution of author roles + + Returns: + pd.DataFrame: Author role distribution + """ + role_counts = df['author_role'].value_counts().reset_index() + role_counts.columns = ['author_role', 'count'] + role_counts['percentage'] = ( + role_counts['count'] / role_counts['count'].sum() * 100 + ).round(2) + return role_counts + + @staticmethod + def get_mention_context_distribution(df): + """ + Get distribution of mention contexts + + Returns: + pd.DataFrame: Mention context distribution + """ + context_counts = df['sabian_mention_context'].value_counts().reset_index() + context_counts.columns = ['sabian_mention_context', 'count'] + context_counts['percentage'] = ( + context_counts['count'] / context_counts['count'].sum() * 100 + ).round(2) + return context_counts + + @staticmethod + def get_products_distribution(df): + """ + Get distribution of products mentioned (from JSON array) + + Returns: + pd.DataFrame: Products distribution + """ + all_products = [] + for products in df['products_mentioned']: + if isinstance(products, list): + all_products.extend(products) + + if not all_products: + return pd.DataFrame(columns=['product', 'count', 'percentage']) + + product_counts = pd.Series(all_products).value_counts().reset_index() + product_counts.columns = ['product', 'count'] + product_counts['percentage'] = ( + product_counts['count'] / product_counts['count'].sum() * 100 + ).round(2) + return product_counts + + @staticmethod + def get_competitors_distribution(df): + """ + Get distribution of competitors mentioned + + Returns: + pd.DataFrame: Competitors distribution + """ + all_competitors = [] + for competitors in df['competitors_mentioned']: + if isinstance(competitors, list): + all_competitors.extend(competitors) + + if not all_competitors: + return pd.DataFrame(columns=['competitor', 'count', 'percentage']) + + 
competitor_counts = pd.Series(all_competitors).value_counts().reset_index() + competitor_counts.columns = ['competitor', 'count'] + competitor_counts['percentage'] = ( + competitor_counts['count'] / competitor_counts['count'].sum() * 100 + ).round(2) + return competitor_counts + + @staticmethod + def get_intents_distribution(df): + """ + Get distribution of intents (multi-label from JSON array) + + Returns: + pd.DataFrame: Intents distribution + """ + all_intents = [] + for intents in df['intents']: + if isinstance(intents, list): + all_intents.extend(intents) + + if not all_intents: + return pd.DataFrame(columns=['intent', 'count', 'percentage']) + + intent_counts = pd.Series(all_intents).value_counts().reset_index() + intent_counts.columns = ['intent', 'count'] + intent_counts['percentage'] = ( + intent_counts['count'] / intent_counts['count'].sum() * 100 + ).round(2) + return intent_counts + + @staticmethod + def get_pain_points_distribution(df): + """ + Get distribution of pain points + + Returns: + pd.DataFrame: Pain points distribution + """ + all_pain_points = [] + for pain_points in df['pain_points']: + if isinstance(pain_points, list): + all_pain_points.extend(pain_points) + + if not all_pain_points: + return pd.DataFrame(columns=['pain_point', 'count', 'percentage']) + + pain_counts = pd.Series(all_pain_points).value_counts().reset_index() + pain_counts.columns = ['pain_point', 'count'] + pain_counts['percentage'] = ( + pain_counts['count'] / pain_counts['count'].sum() * 100 + ).round(2) + return pain_counts + + @staticmethod + def get_delight_factors_distribution(df): + """ + Get distribution of delight factors + + Returns: + pd.DataFrame: Delight factors distribution + """ + all_delights = [] + for delights in df['delight_factors']: + if isinstance(delights, list): + all_delights.extend(delights) + + if not all_delights: + return pd.DataFrame(columns=['delight_factor', 'count', 'percentage']) + + delight_counts = 
pd.Series(all_delights).value_counts().reset_index() + delight_counts.columns = ['delight_factor', 'count'] + delight_counts['percentage'] = ( + delight_counts['count'] / delight_counts['count'].sum() * 100 + ).round(2) + return delight_counts + + @staticmethod + def get_purchase_stage_distribution(df): + """ + Get distribution of purchase stages + + Returns: + pd.DataFrame: Purchase stage distribution + """ + stage_df = df[df['purchase_stage'].notna() & (df['purchase_stage'] != '')] + if stage_df.empty: + return pd.DataFrame(columns=['purchase_stage', 'count', 'percentage']) + + stage_counts = stage_df['purchase_stage'].value_counts().reset_index() + stage_counts.columns = ['purchase_stage', 'count'] + stage_counts['percentage'] = ( + stage_counts['count'] / stage_counts['count'].sum() * 100 + ).round(2) + return stage_counts + + @staticmethod + def get_brand_switching_distribution(df): + """ + Get distribution of brand switching behavior + + Returns: + pd.DataFrame: Brand switching distribution + """ + switching_df = df[df['switching_direction'] != 'no_switching'] + if switching_df.empty: + return pd.DataFrame(columns=['switching_direction', 'count', 'percentage']) + + switching_counts = switching_df['switching_direction'].value_counts().reset_index() + switching_counts.columns = ['switching_direction', 'count'] + switching_counts['percentage'] = ( + switching_counts['count'] / switching_counts['count'].sum() * 100 + ).round(2) + return switching_counts + + @staticmethod + def get_emotion_distribution(df): + """ + Get distribution of emotions + + Returns: + pd.DataFrame: Emotion distribution + """ + emotion_df = df[df['emotion_type'] != 'unknown'] + if emotion_df.empty: + return pd.DataFrame(columns=['emotion_type', 'count', 'percentage']) + + emotion_counts = emotion_df['emotion_type'].value_counts().reset_index() + emotion_counts.columns = ['emotion_type', 'count'] + emotion_counts['percentage'] = ( + emotion_counts['count'] / emotion_counts['count'].sum() * 
100 + ).round(2) + return emotion_counts + + @staticmethod + def get_processing_status_distribution(df): + """ + Get distribution of processing statuses + + Returns: + pd.DataFrame: Processing status distribution + """ + status_counts = df['processing_status'].value_counts().reset_index() + status_counts.columns = ['processing_status', 'count'] + status_counts['percentage'] = ( + status_counts['count'] / status_counts['count'].sum() * 100 + ).round(2) + return status_counts + + @staticmethod + def get_overall_brand_mentions(posts_df, comments_df, additional_mentions_df): + """ + Calculate overall brand mention counts across all sources. + + Combines: + - Sabian: count of all processed posts + comments (all mention Sabian by definition) + - Competitors: mentions from processed data (competitors_mentioned field) + + additional mentions from raw forum posts not in processed data + + Args: + posts_df: Processed forum posts DataFrame + comments_df: Processed social media comments DataFrame + additional_mentions_df: Raw competitor mentions DataFrame (brand, additional_mentions) + + Returns: + pd.DataFrame: DataFrame with columns (brand, total_mentions, percentage) sorted desc + """ + # Sabian total = all processed entries + sabian_total = len(posts_df) + (len(comments_df) if comments_df is not None and not comments_df.empty else 0) + + # Count competitor mentions from processed data + processed_counts = {} + for df in [posts_df, comments_df]: + if df is not None and not df.empty and 'competitors_mentioned' in df.columns: + for competitors in df['competitors_mentioned']: + if isinstance(competitors, list): + for comp in competitors: + processed_counts[comp] = processed_counts.get(comp, 0) + 1 + + # Add additional mentions from raw forum posts + if additional_mentions_df is not None and not additional_mentions_df.empty: + for _, row in additional_mentions_df.iterrows(): + brand = row['brand'] + additional = int(row['additional_mentions']) + processed_counts[brand] = 
processed_counts.get(brand, 0) + additional + + # Build result + rows = [{'brand': 'Sabian', 'total_mentions': sabian_total}] + for brand, count in processed_counts.items(): + rows.append({'brand': brand, 'total_mentions': count}) + + result = pd.DataFrame(rows).sort_values('total_mentions', ascending=False).reset_index(drop=True) + total = result['total_mentions'].sum() + result['percentage'] = (result['total_mentions'] / total * 100).round(2) if total > 0 else 0 + + return result + + @staticmethod + def get_product_sentiment_breakdown(df): + """ + Get sentiment breakdown per product + + Returns: + pd.DataFrame: Product sentiment breakdown + """ + rows = [] + for _, row in df.iterrows(): + products = row['products_mentioned'] + sentiment = row['sentiment_level'] + if isinstance(products, list): + for product in products: + rows.append({'product': product, 'sentiment_level': sentiment}) + + if not rows: + return pd.DataFrame() + + product_sentiment_df = pd.DataFrame(rows) + breakdown = product_sentiment_df.groupby(['product', 'sentiment_level']).size().reset_index(name='count') + + # Calculate percentage within each product + breakdown['percentage'] = breakdown.groupby('product')['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + + return breakdown + + @staticmethod + def get_competitor_sentiment_breakdown(df): + """ + Get sentiment breakdown when competitors are mentioned + + Returns: + pd.DataFrame: Competitor sentiment breakdown + """ + rows = [] + for _, row in df.iterrows(): + competitors = row['competitors_mentioned'] + sentiment = row['sentiment_level'] + if isinstance(competitors, list): + for competitor in competitors: + rows.append({'competitor': competitor, 'sentiment_level': sentiment}) + + if not rows: + return pd.DataFrame() + + comp_sentiment_df = pd.DataFrame(rows) + breakdown = comp_sentiment_df.groupby(['competitor', 'sentiment_level']).size().reset_index(name='count') + + breakdown['percentage'] = 
breakdown.groupby('competitor')['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + + return breakdown + + @staticmethod + def get_temporal_trends(df, freq='W'): + """ + Get temporal trends of sentiment over time + + Args: + df: Brand sentiment dataframe + freq: Frequency for aggregation ('D'=daily, 'W'=weekly, 'M'=monthly) + + Returns: + pd.DataFrame: Temporal sentiment trends + """ + # Determine date column + date_col = None + if 'post_created_at' in df.columns: + date_col = 'post_created_at' + elif 'comment_timestamp' in df.columns: + date_col = 'comment_timestamp' + + if date_col is None: + return pd.DataFrame() + + df_temporal = df.copy() + df_temporal['date'] = pd.to_datetime(df_temporal[date_col]).dt.to_period(freq) + + trends = df_temporal.groupby(['date', 'sentiment_level']).size().reset_index(name='count') + trends['date'] = trends['date'].dt.to_timestamp() + + return trends + + @staticmethod + def get_thread_summary(df): + """ + Get summary statistics grouped by thread + + Returns: + pd.DataFrame: Thread summary + """ + thread_summary = df.groupby(['thread_id', 'thread_title']).agg({ + 'post_id': 'count', + 'sentiment_level': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'unknown' + }).reset_index() + + thread_summary.columns = ['thread_id', 'thread_title', 'post_count', 'dominant_sentiment'] + + # Calculate sentiment score for sorting + sentiment_weights = { + 'very_negative': -2, 'negative': -1, 'neutral': 0, + 'positive': 1, 'very_positive': 2 + } + + # Get average sentiment per thread + df_temp = df.copy() + df_temp['sentiment_score'] = df_temp['sentiment_level'].map(sentiment_weights).fillna(0) + thread_scores = df_temp.groupby('thread_id')['sentiment_score'].mean().reset_index() + thread_scores.columns = ['thread_id', 'avg_sentiment_score'] + + thread_summary = thread_summary.merge(thread_scores, on='thread_id', how='left') + + return thread_summary.sort_values('post_count', ascending=False) + + @staticmethod + def 
get_demographics_distribution(df, field): + """ + Get distribution of a demographic field + + Args: + df: Dataframe with demographic fields + field: Field to analyze + + Returns: + pd.DataFrame: Distribution + """ + if field not in df.columns: + return pd.DataFrame() + + df_filtered = df[ + (df[field].notna()) & + (df[field] != 'Unknown') & + (df[field] != '') + ] + + if df_filtered.empty: + return pd.DataFrame() + + dist = df_filtered[field].value_counts().reset_index() + dist.columns = [field, 'count'] + dist['percentage'] = (dist['count'] / dist['count'].sum() * 100).round(2) + + return dist + + @staticmethod + def get_cymbal_ownership_analysis(df): + """ + Analyze cymbal ownership from user profiles + + Returns: + dict: Cymbal ownership analysis + """ + if 'cymbal_brands_list' not in df.columns: + return {} + + # Flatten all cymbal brands + all_brands = [] + for brands in df['cymbal_brands_list']: + if isinstance(brands, list): + all_brands.extend(brands) + + if not all_brands: + return {} + + brand_counts = Counter(all_brands) + total_users = df['cymbal_brands_list'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum() + + return { + 'brand_counts': dict(brand_counts), + 'total_users_with_cymbal_data': total_users, + 'sabian_ownership_rate': brand_counts.get('Sabian', 0) / total_users * 100 if total_users > 0 else 0 + } + + @staticmethod + def get_sentiment_by_cymbal_ownership(df): + """ + Get sentiment distribution based on cymbal ownership + + Returns: + pd.DataFrame: Sentiment by ownership + """ + if 'owns_sabian' not in df.columns: + return pd.DataFrame() + + df_with_ownership = df[df['owns_sabian'].notna()] + if df_with_ownership.empty: + return pd.DataFrame() + + sentiment_by_ownership = df_with_ownership.groupby( + ['owns_sabian', 'sentiment_level'] + ).size().reset_index(name='count') + + sentiment_by_ownership['percentage'] = sentiment_by_ownership.groupby('owns_sabian')['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + 
+ return sentiment_by_ownership + + @staticmethod + def get_author_role_by_sentiment(df): + """ + Get author role distribution by sentiment + + Returns: + pd.DataFrame: Author role by sentiment + """ + role_sentiment = df.groupby(['author_role', 'sentiment_level']).size().reset_index(name='count') + + role_sentiment['percentage'] = role_sentiment.groupby('author_role')['count'].transform( + lambda x: (x / x.sum() * 100).round(2) + ) + + return role_sentiment + + @staticmethod + def get_gear_brand_analysis(df, gear_column): + """ + Analyze gear brand mentions from user profiles + + Args: + df: Dataframe with gear columns + gear_column: Column to analyze + + Returns: + pd.DataFrame: Gear brand distribution + """ + if gear_column not in df.columns: + return pd.DataFrame() + + gear_df = df[df[gear_column].notna() & (df[gear_column] != '')] + if gear_df.empty: + return pd.DataFrame() + + # Simple word frequency (brands are typically single words or known names) + all_text = ' '.join(gear_df[gear_column].astype(str).tolist()).lower() + + # Common drum gear brands to look for + known_brands = { + 'sticks': ['vic firth', 'promark', 'zildjian', 'ahead', 'vater', 'regal tip'], + 'drums': ['pearl', 'dw', 'tama', 'ludwig', 'gretsch', 'yamaha', 'mapex', 'sonor', 'pdp'], + 'hardware': ['dw', 'tama', 'pearl', 'gibraltar', 'yamaha'], + 'cymbals': ['zildjian', 'sabian', 'meinl', 'paiste', 'istanbul', 'dream', 'bosphorus'] + } + + # Determine which brand list to use + if 'stick' in gear_column.lower(): + brands_to_check = known_brands['sticks'] + elif 'set' in gear_column.lower() or 'drum' in gear_column.lower(): + brands_to_check = known_brands['drums'] + elif 'hardware' in gear_column.lower(): + brands_to_check = known_brands['hardware'] + else: + brands_to_check = known_brands['cymbals'] + + brand_counts = {} + for brand in brands_to_check: + count = all_text.count(brand) + if count > 0: + brand_counts[brand.title()] = count + + if not brand_counts: + return pd.DataFrame() + + 
result_df = pd.DataFrame(list(brand_counts.items()), columns=['brand', 'count']) + result_df = result_df.sort_values('count', ascending=False) + result_df['percentage'] = (result_df['count'] / result_df['count'].sum() * 100).round(2) + + return result_df diff --git a/visualization_brand_sentiment/utils/llm_helper.py b/visualization_brand_sentiment/utils/llm_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..78c9ee11e76d0ee128a05f09cb10f149348e1faa --- /dev/null +++ b/visualization_brand_sentiment/utils/llm_helper.py @@ -0,0 +1,149 @@ +""" +LLM Helper for brand sentiment visualization agents +Handles OpenAI API calls with retry logic and error handling +""" +import os +import json +from typing import Dict, Any, Optional +from openai import OpenAI +from dotenv import load_dotenv +import time + +# Load environment variables from root directory +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +VISUALIZATION_DIR = os.path.dirname(SCRIPT_DIR) +ROOT_DIR = os.path.dirname(VISUALIZATION_DIR) +load_dotenv(os.path.join(ROOT_DIR, '.env')) + + +class LLMHelper: + """ + Helper class for LLM interactions + """ + + def __init__(self, model: str = "gpt-5-nano", temperature: float = 1): + """ + Initialize LLM helper + + Args: + model: Model name to use + temperature: Temperature for generation + """ + self.model = model + self.temperature = temperature + self.api_key = os.getenv('OPENAI_API_KEY') + + if not self.api_key: + raise ValueError("OPENAI_API_KEY not found in environment variables") + + self.client = OpenAI(api_key=self.api_key) + + def get_completion( + self, + prompt: str, + system_message: Optional[str] = None, + max_retries: int = 3, + json_mode: bool = False + ) -> Dict[str, Any]: + """ + Get completion from LLM with retry logic + + Args: + prompt: User prompt + system_message: Optional system message + max_retries: Maximum number of retries + json_mode: Whether to force JSON response + + Returns: + Dictionary with response data + 
""" + messages = [] + + if system_message: + messages.append({"role": "system", "content": system_message}) + + messages.append({"role": "user", "content": prompt}) + + for attempt in range(max_retries): + try: + # Prepare API call parameters + api_params = { + "model": self.model, + "messages": messages, + "temperature": self.temperature, + "reasoning_effort": "low", + "n": 1 + } + + # Add response format if JSON mode requested + if json_mode: + api_params["response_format"] = {"type": "json_object"} + + # Make API call + response = self.client.chat.completions.create(**api_params) + + # Extract response + content = response.choices[0].message.content + + # Parse JSON if requested + if json_mode: + try: + content = json.loads(content) + except json.JSONDecodeError as e: + return { + 'success': False, + 'error': f"Failed to parse JSON response: {str(e)}", + 'raw_content': content + } + + return { + 'success': True, + 'content': content, + 'model': response.model, + 'usage': { + 'prompt_tokens': response.usage.prompt_tokens, + 'completion_tokens': response.usage.completion_tokens, + 'total_tokens': response.usage.total_tokens + } + } + + except Exception as e: + if attempt < max_retries - 1: + # Wait before retry (exponential backoff) + time.sleep(2 ** attempt) + continue + else: + return { + 'success': False, + 'error': str(e), + 'error_type': type(e).__name__ + } + + return { + 'success': False, + 'error': f"Failed after {max_retries} attempts" + } + + def get_structured_completion( + self, + prompt: str, + system_message: str, + max_retries: int = 3 + ) -> Dict[str, Any]: + """ + Get structured JSON completion + + Args: + prompt: User prompt + system_message: System message + max_retries: Maximum retries + + Returns: + Structured response dictionary + """ + return self.get_completion( + prompt=prompt, + system_message=system_message, + max_retries=max_retries, + json_mode=True + ) diff --git a/visualization_brand_sentiment/utils/metrics.py 
b/visualization_brand_sentiment/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..ba24de091ca74413e199e8db4dee27a43cff9faa --- /dev/null +++ b/visualization_brand_sentiment/utils/metrics.py @@ -0,0 +1,400 @@ +""" +Metrics calculation for brand sentiment analysis dashboard +Provides key performance indicators and statistical metrics +""" +import pandas as pd +import numpy as np +from typing import Dict, List, Tuple + + +class BrandMetrics: + """ + Calculates various metrics for brand sentiment analysis + """ + + @staticmethod + def calculate_overall_metrics(df): + """ + Calculate overall summary metrics + + Args: + df: Brand sentiment dataframe + + Returns: + dict: Overall metrics + """ + total_posts = len(df) + + # Sentiment distribution + sentiment_dist = df['sentiment_level'].value_counts(normalize=True) * 100 + + # Calculate sentiment score + sentiment_weights = { + 'very_negative': -2, + 'negative': -1, + 'neutral': 0, + 'positive': 1, + 'very_positive': 2 + } + df_temp = df.copy() + df_temp['sentiment_score'] = df_temp['sentiment_level'].map(sentiment_weights) + avg_sentiment_score = df_temp['sentiment_score'].mean() + + # Negative sentiment percentage + negative_sentiments = ['negative', 'very_negative'] + negative_pct = (df['sentiment_level'].isin(negative_sentiments).sum() / total_posts * 100) if total_posts > 0 else 0 + + # Positive sentiment percentage + positive_sentiments = ['positive', 'very_positive'] + positive_pct = (df['sentiment_level'].isin(positive_sentiments).sum() / total_posts * 100) if total_posts > 0 else 0 + + # Unique threads (forums) or unique channels (comments) + unique_threads = df['thread_id'].nunique() if 'thread_id' in df.columns else 0 + + # Unique authors - support both forum and comment author columns + if 'post_author_id' in df.columns: + unique_authors = df['post_author_id'].nunique() + elif 'author_id' in df.columns: + unique_authors = df['author_id'].nunique() + elif 'author_name' in 
df.columns: + unique_authors = df['author_name'].nunique() + else: + unique_authors = 0 + + # Author roles + author_role_dist = df['author_role'].value_counts().to_dict() + + # Sarcasm detection + sarcasm_count = df['sarcasm_detected'].sum() if 'sarcasm_detected' in df.columns else 0 + + return { + 'total_posts': total_posts, + 'unique_threads': unique_threads, + 'unique_authors': unique_authors, + 'avg_sentiment_score': round(avg_sentiment_score, 2) if not pd.isna(avg_sentiment_score) else 0, + 'negative_pct': round(negative_pct, 1), + 'positive_pct': round(positive_pct, 1), + 'neutral_pct': round(100 - negative_pct - positive_pct, 1), + 'sentiment_distribution': sentiment_dist.to_dict(), + 'author_role_distribution': author_role_dist, + 'sarcasm_count': int(sarcasm_count) + } + + @staticmethod + def calculate_product_metrics(df): + """ + Calculate metrics for each Sabian product + + Returns: + dict: Product metrics + """ + product_metrics = {} + + # Get all products mentioned + all_products = [] + for products in df['products_mentioned']: + if isinstance(products, list): + all_products.extend(products) + + unique_products = list(set(all_products)) + + for product in unique_products: + # Filter posts mentioning this product + product_df = df[df['products_mentioned'].apply( + lambda x: product in x if isinstance(x, list) else False + )] + + if len(product_df) > 0: + product_metrics[product] = BrandMetrics._calculate_subset_metrics(product_df) + + return product_metrics + + @staticmethod + def calculate_competitor_metrics(df): + """ + Calculate metrics when competitors are mentioned + + Returns: + dict: Competitor metrics + """ + competitor_metrics = {} + + # Get all competitors mentioned + all_competitors = [] + for competitors in df['competitors_mentioned']: + if isinstance(competitors, list): + all_competitors.extend(competitors) + + unique_competitors = list(set(all_competitors)) + + for competitor in unique_competitors: + # Filter posts mentioning this 
competitor + comp_df = df[df['competitors_mentioned'].apply( + lambda x: competitor in x if isinstance(x, list) else False + )] + + if len(comp_df) > 0: + competitor_metrics[competitor] = BrandMetrics._calculate_subset_metrics(comp_df) + + return competitor_metrics + + @staticmethod + def _calculate_subset_metrics(df): + """ + Calculate basic metrics for a subset of data + + Returns: + dict: Subset metrics + """ + total = len(df) + + sentiment_weights = { + 'very_negative': -2, 'negative': -1, 'neutral': 0, + 'positive': 1, 'very_positive': 2 + } + df_temp = df.copy() + df_temp['sentiment_score'] = df_temp['sentiment_level'].map(sentiment_weights) + + negative_sentiments = ['negative', 'very_negative'] + positive_sentiments = ['positive', 'very_positive'] + + return { + 'total_posts': total, + 'avg_sentiment_score': round(df_temp['sentiment_score'].mean(), 2), + 'negative_pct': round((df['sentiment_level'].isin(negative_sentiments).sum() / total * 100), 1) if total > 0 else 0, + 'positive_pct': round((df['sentiment_level'].isin(positive_sentiments).sum() / total * 100), 1) if total > 0 else 0, + 'sentiment_distribution': df['sentiment_level'].value_counts().to_dict() + } + + @staticmethod + def get_sentiment_health_status(negative_pct): + """ + Determine health status based on negative sentiment percentage + + Args: + negative_pct: Percentage of negative sentiments + + Returns: + tuple: (status, color, emoji) + """ + if negative_pct < 10: + return ("Excellent", "#00C851", "✅") + elif negative_pct < 20: + return ("Good", "#7CB342", "👍") + elif negative_pct < 30: + return ("Fair", "#FFB300", "⚠️") + elif negative_pct < 50: + return ("Needs Attention", "#FF6F00", "⚡") + else: + return ("Critical", "#D32F2F", "🚨") + + @staticmethod + def calculate_brand_switching_metrics(df): + """ + Calculate brand switching metrics + + Returns: + dict: Brand switching metrics + """ + switching_to = len(df[df['switching_direction'] == 'switching_to_sabian']) + switching_from = 
len(df[df['switching_direction'] == 'switching_from_sabian']) + + return { + 'switching_to_sabian': switching_to, + 'switching_from_sabian': switching_from, + 'net_switching': switching_to - switching_from, + 'switching_ratio': round(switching_to / switching_from, 2) if switching_from > 0 else float('inf') if switching_to > 0 else 0 + } + + @staticmethod + def calculate_potential_buyer_metrics(df): + """ + Calculate metrics for potential buyers + + Returns: + dict: Potential buyer metrics + """ + potential_buyers = df[df['author_role'] == 'potential_buyer'] + total_potential = len(potential_buyers) + + if total_potential == 0: + return { + 'total_potential_buyers': 0, + 'positive_sentiment_pct': 0, + 'researching_count': 0, + 'deciding_count': 0 + } + + positive_sentiments = ['positive', 'very_positive'] + positive_pct = potential_buyers['sentiment_level'].isin(positive_sentiments).sum() / total_potential * 100 + + researching = len(potential_buyers[potential_buyers['purchase_stage'] == 'researching']) + deciding = len(potential_buyers[potential_buyers['purchase_stage'] == 'deciding']) + + return { + 'total_potential_buyers': total_potential, + 'positive_sentiment_pct': round(positive_pct, 1), + 'researching_count': researching, + 'deciding_count': deciding + } + + @staticmethod + def calculate_pain_delight_metrics(df): + """ + Calculate pain points and delight factors metrics + + Returns: + dict: Pain/delight metrics + """ + # Count all pain points + all_pain_points = [] + for pain_points in df['pain_points']: + if isinstance(pain_points, list): + all_pain_points.extend(pain_points) + + # Count all delight factors + all_delights = [] + for delights in df['delight_factors']: + if isinstance(delights, list): + all_delights.extend(delights) + + pain_counts = pd.Series(all_pain_points).value_counts().to_dict() if all_pain_points else {} + delight_counts = pd.Series(all_delights).value_counts().to_dict() if all_delights else {} + + return { + 'total_pain_points': 
len(all_pain_points), + 'total_delight_factors': len(all_delights), + 'top_pain_points': dict(list(pain_counts.items())[:5]), + 'top_delight_factors': dict(list(delight_counts.items())[:5]), + 'pain_to_delight_ratio': round(len(all_pain_points) / len(all_delights), 2) if all_delights else float('inf') if all_pain_points else 0 + } + + @staticmethod + def calculate_intent_metrics(df): + """ + Calculate intent distribution metrics + + Returns: + dict: Intent metrics + """ + all_intents = [] + for intents in df['intents']: + if isinstance(intents, list): + all_intents.extend(intents) + + if not all_intents: + return { + 'total_intents': 0, + 'intent_distribution': {}, + 'seeking_info_count': 0, + 'praising_count': 0, + 'criticizing_count': 0 + } + + intent_counts = pd.Series(all_intents).value_counts() + + return { + 'total_intents': len(all_intents), + 'intent_distribution': intent_counts.to_dict(), + 'seeking_info_count': intent_counts.get('seeking_information', 0), + 'praising_count': intent_counts.get('praising', 0), + 'criticizing_count': intent_counts.get('criticizing', 0), + 'comparing_count': intent_counts.get('comparing', 0) + } + + @staticmethod + def calculate_trend_metrics(df, period_days=30): + """ + Calculate trend metrics comparing periods + + Args: + df: Brand sentiment dataframe + period_days: Number of days for each period + + Returns: + dict: Trend metrics + """ + # Determine date column + date_col = None + if 'post_created_at' in df.columns and not df['post_created_at'].isna().all(): + date_col = 'post_created_at' + elif 'comment_timestamp' in df.columns and not df['comment_timestamp'].isna().all(): + date_col = 'comment_timestamp' + + if date_col is None: + return {'trend_available': False} + + df_sorted = df.sort_values(date_col) + max_date = df_sorted[date_col].max() + min_date = df_sorted[date_col].min() + + # Check if we have enough data for comparison + date_range = (max_date - min_date).days + if date_range < period_days * 2: + return 
{'trend_available': False} + + cutoff_date = max_date - pd.Timedelta(days=period_days) + current_period = df_sorted[df_sorted[date_col] >= cutoff_date] + previous_period = df_sorted[ + (df_sorted[date_col] < cutoff_date) & + (df_sorted[date_col] >= cutoff_date - pd.Timedelta(days=period_days)) + ] + + if len(current_period) == 0 or len(previous_period) == 0: + return {'trend_available': False} + + # Calculate sentiment scores for each period + sentiment_weights = { + 'very_negative': -2, 'negative': -1, 'neutral': 0, + 'positive': 1, 'very_positive': 2 + } + + current_score = current_period['sentiment_level'].map(sentiment_weights).mean() + previous_score = previous_period['sentiment_level'].map(sentiment_weights).mean() + + score_change = current_score - previous_score + volume_change = len(current_period) - len(previous_period) + + return { + 'trend_available': True, + 'current_period_posts': len(current_period), + 'previous_period_posts': len(previous_period), + 'current_sentiment_score': round(current_score, 2), + 'previous_sentiment_score': round(previous_score, 2), + 'sentiment_score_change': round(score_change, 2), + 'volume_change': volume_change, + 'sentiment_trend': 'improving' if score_change > 0.1 else 'declining' if score_change < -0.1 else 'stable' + } + + @staticmethod + def calculate_demographics_metrics(df): + """ + Calculate demographics-related metrics + + Returns: + dict: Demographics metrics + """ + metrics = { + 'users_with_demographics': 0, + 'avg_drumming_experience': None, + 'sabian_owners_count': 0, + 'competitor_owners_count': 0 + } + + if 'drums_experience_years' in df.columns: + valid_exp = df['drums_experience_years'].dropna() + if len(valid_exp) > 0: + metrics['avg_drumming_experience'] = round(valid_exp.mean(), 1) + metrics['users_with_demographics'] = len(valid_exp) + + if 'owns_sabian' in df.columns: + metrics['sabian_owners_count'] = df['owns_sabian'].sum() if df['owns_sabian'].dtype == bool else 0 + + if 'cymbal_brands_list' in 
df.columns: + # Count users who own competitor cymbals + competitor_owners = df['cymbal_brands_list'].apply( + lambda x: len([b for b in x if b != 'Sabian']) > 0 if isinstance(x, list) else False + ).sum() + metrics['competitor_owners_count'] = competitor_owners + + return metrics diff --git a/visualization_brand_sentiment/utils/pdf_exporter.py b/visualization_brand_sentiment/utils/pdf_exporter.py new file mode 100644 index 0000000000000000000000000000000000000000..7de2d65e33723c9e7e6965ad2718fc379adc3c66 --- /dev/null +++ b/visualization_brand_sentiment/utils/pdf_exporter.py @@ -0,0 +1,1544 @@ +""" +PDF Report Exporter for Brand Sentiment Dashboard +Generates comprehensive PDF reports from dashboard data and visualizations. + +Uses fpdf2 for PDF assembly and kaleido (via plotly) for high-DPI chart rendering. +""" +import sys +import tempfile +import os +import logging +import concurrent.futures +from datetime import datetime +from pathlib import Path + +# Add parent directory to path +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +import pandas as pd +import plotly.io as pio + +from fpdf import FPDF + +from utils.metrics import BrandMetrics +from utils.data_processor import BrandDataProcessor +from visualizations.sentiment_charts import SentimentCharts +from visualizations.distribution_charts import DistributionCharts +from visualizations.brand_charts import BrandCharts +from visualizations.demographic_charts import DemographicCharts + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Section descriptions – plain-language context shown below each section +# header. Written for a general audience with no data-science background. 
+# --------------------------------------------------------------------------- +SECTION_DESCRIPTIONS = { + 'executive_summary': ( + "A top-level snapshot of Sabian brand sentiment derived from forum posts and YouTube " + "comments. All findings are based on posts where Sabian was explicitly mentioned. " + "Sample comment cards at the bottom show real voices from the community." + ), + 'sentiment': ( + "Every post is read by our AI and assigned one of five sentiment levels: " + "Very Positive, Positive, Neutral, Negative, or Very Negative. " + "The pie chart shows how those levels split across all analyzed posts. " + "The Brand Sentiment Score (0-100) converts the average sentiment value to a " + "percentage scale: 50 = perfectly neutral, above 60 = primarily positive. " + "A score above 60 is considered healthy for a brand in an enthusiast community." + ), + 'author_role': ( + "Every post author is classified by their relationship with Sabian based on the " + "language in their post: Current Owner, Past Owner, Potential Buyer, Never Owned, " + "or Unknown. " + "IMPORTANT: Owner classifications are only as accurate as what users have shared " + "in their Musora profiles. These counts reflect only users who have filled in their " + "gear/ownership information — they do not represent all post authors." + ), + 'products': ( + "Our AI identifies every Sabian product line mentioned in a post (e.g., HHX, AAX). " + "A single post can reference multiple products. The charts show which product lines " + "attract the most discussion and what sentiment surrounds each one." + ), + 'overall_mentions': ( + "Total brand mentions across all sources: Sabian's count comes from all analyzed " + "posts and comments; competitor counts include mentions found in Sabian-focused posts " + "plus additional mentions in unrelated forum threads. This provides a 'share of voice' " + "picture of how much each cymbal brand is discussed in the Musora community." 
+ ), + 'competitive': ( + "When users mention Sabian and a competitor in the same post, we can see how Sabian " + "sentiment shifts in that comparison context. " + "The chart shows a sentiment score (-2 to +2) for Sabian whenever each competitor is " + "mentioned alongside it. Think of it like a temperature scale: +2 = very warm/positive " + "toward Sabian in that comparison, 0 = neutral, -2 = very cool/negative. " + "For example: a score of +1.0 when Zildjian is mentioned means people tend to speak " + "positively about Sabian in the same breath as Zildjian; -0.5 means slightly less " + "favorably. Brand switching tracks posts where authors explicitly say they are " + "moving to or from Sabian." + ), + 'intents_feedback': ( + "Beyond positive/negative, our AI identifies the purpose behind each post " + "(seeking information, sharing an experience, comparing brands, praising, " + "criticizing, etc.) and specific pain points and delight factors mentioned. " + "Pain points are recurring frustrations; delight factors are things people love. " + "The balance between them signals overall customer satisfaction." + ), + 'purchase_journey': ( + "Post authors are classified into their buying-journey stage: Researching (early " + "exploration), Deciding (comparing before buying), Recently Purchased, Long-Term " + "Owner, or Selling/Replacing. The Mention Context chart shows whether Sabian is " + "the main focus of a post (Primary Focus), a significant topic, a casual reference, " + "or only brought up as a comparison point." + ), + 'demographics': ( + "Demographics data comes from Musora user profiles and is available only for users " + "who have explicitly filled in their profile information. " + "IMPORTANT: Cymbal ownership, drumming experience level, and age data are counted " + "only when a user has entered this information in their Musora profile. 
These charts " + "reflect a subset of post authors and should not be interpreted as representing all " + "community members." + ), + 'emotion': ( + "Going deeper than positive/negative, our AI detects specific emotions expressed in " + "posts: excitement and satisfaction (positive), frustration, disappointment, and anger " + "(negative), curiosity (exploratory), and indifference (neutral). Understanding " + "emotional drivers helps tailor messaging that resonates with each audience segment." + ), + 'social_media': ( + "This section covers Sabian mentions found in YouTube comments. For this analysis " + "period, Sabian was mentioned exclusively on YouTube among the social media platforms " + "scanned. The charts below show sentiment, products, and competitors discussed in " + "these YouTube comments." + ), +} + +# Sentinel used when AI summary is unavailable +_NO_AI_SUMMARY = "" + + +class SabianPDF(FPDF): + """Custom FPDF subclass with Sabian branding and PDF-rendering helpers.""" + + def __init__(self, config): + super().__init__(orientation='P', unit='mm', format='A4') + self.config = config + self.brand_color = self._hex_to_rgb(config['brand']['primary_color']) + self.dark_bg = (30, 30, 30) + self.white = (255, 255, 255) + self.gray = (180, 180, 180) + self.light_gray = (220, 220, 220) + self.set_auto_page_break(auto=True, margin=20) + + # ------------------------------------------------------------------ + # Static helpers + # ------------------------------------------------------------------ + + @staticmethod + def _hex_to_rgb(hex_color): + hex_color = hex_color.lstrip('#') + return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4)) + + @staticmethod + def _sanitize(text): + """Remove characters outside the Latin-1 range supported by Helvetica.""" + if not isinstance(text, str): + text = str(text) + return text.encode('latin-1', errors='ignore').decode('latin-1') + + # ------------------------------------------------------------------ + # Standard FPDF 
overrides + # ------------------------------------------------------------------ + + def header(self): + if self.page_no() > 1: + self.set_font('Helvetica', 'B', 8) + self.set_text_color(*self.gray) + self.cell(0, 6, f"{self.config['brand']['name']} Brand Sentiment Report", align='L') + self.cell(0, 6, f"Page {self.page_no()}", align='R', new_x="LMARGIN", new_y="NEXT") + self.set_draw_color(*self.brand_color) + self.set_line_width(0.5) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(4) + + def footer(self): + self.set_y(-15) + self.set_font('Helvetica', 'I', 7) + self.set_text_color(*self.gray) + self.cell( + 0, 10, + f"Generated on {datetime.now().strftime('%Y-%m-%d %H:%M')} | Confidential", + align='C' + ) + + # ------------------------------------------------------------------ + # Structural helpers + # ------------------------------------------------------------------ + + def section_header(self, title): + """Add a styled section header with a brand-coloured underline.""" + title = self._sanitize(title) + self.check_page_break(20) + self.ln(4) + self.set_font('Helvetica', 'B', 14) + self.set_text_color(*self.brand_color) + self.cell(0, 10, title, new_x="LMARGIN", new_y="NEXT") + self.set_draw_color(*self.brand_color) + self.set_line_width(0.3) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(3) + self.set_text_color(0, 0, 0) + + def section_description(self, text): + """Render an italicised description block beneath the section header.""" + text = self._sanitize(text) + self.set_font('Helvetica', 'I', 9) + self.set_text_color(80, 80, 80) + self.multi_cell(0, 5, text) + self.ln(4) + self.set_text_color(0, 0, 0) + + def subsection_header(self, title): + """Add a lighter subsection header.""" + title = self._sanitize(title) + self.check_page_break(15) + self.ln(2) + self.set_font('Helvetica', 'B', 11) + self.set_text_color(60, 60, 60) + self.cell(0, 8, title, new_x="LMARGIN", new_y="NEXT") + self.ln(1) + self.set_text_color(0, 0, 0) + + def 
body_text(self, text): + """Add a body-text paragraph.""" + text = self._sanitize(text) + self.set_font('Helvetica', '', 9) + self.set_text_color(50, 50, 50) + self.multi_cell(0, 5, text) + self.ln(2) + self.set_text_color(0, 0, 0) + + def callout_box(self, text, bg_color=(240, 248, 255), border_color=None): + """ + Render a lightly-coloured callout / info box. + + Args: + text: Content string (sanitised internally). + bg_color: RGB tuple for box background. + border_color: RGB tuple for left accent border; defaults to brand_color. + """ + if border_color is None: + border_color = self.brand_color + text = self._sanitize(text) + self.check_page_break(20) + x = 10 + w = 180 + # Estimate height: ~5 mm per ~90-character line + approx_lines = max(2, len(text) // 90 + text.count('\n') + 1) + h = approx_lines * 5 + 6 + y = self.get_y() + # Background + self.set_fill_color(*bg_color) + self.rect(x, y, w, h, style='F') + # Left accent bar + self.set_fill_color(*border_color) + self.rect(x, y, 3, h, style='F') + # Text + self.set_font('Helvetica', '', 8.5) + self.set_text_color(40, 40, 40) + self.set_xy(x + 5, y + 3) + self.multi_cell(w - 7, 4.5, text) + # Advance past box + self.set_y(y + h + 3) + self.set_text_color(0, 0, 0) + + def metric_row(self, metrics): + """ + Add a horizontal row of metric boxes. + + Args: + metrics: list of (label, value) tuples + """ + self.check_page_break(18) + n = len(metrics) + if n == 0: + return + box_width = (190 - (n - 1) * 3) / n + start_x = 10 + # Capture Y once – avoids the staircase effect that occurs when + # get_y() is called inside the loop after each cell advances the cursor. 
+ y = self.get_y() + + for i, (label, value) in enumerate(metrics): + x = start_x + i * (box_width + 3) + + # Box background + self.set_fill_color(245, 245, 245) + self.rect(x, y, box_width, 14, style='F') + + # Value (top) + self.set_xy(x, y + 1) + self.set_font('Helvetica', 'B', 10) + self.set_text_color(*self.brand_color) + self.cell(box_width, 6, self._sanitize(str(value)), align='C') + + # Label (bottom) + self.set_xy(x, y + 7) + self.set_font('Helvetica', '', 7) + self.set_text_color(100, 100, 100) + self.cell(box_width, 5, self._sanitize(label), align='C') + + self.set_text_color(0, 0, 0) + # Advance cursor past the box row (14 mm height + 2 mm gap) + self.set_y(y + 16) + + def add_table(self, headers, rows, col_widths=None): + """ + Add a styled data table. + + Args: + headers: list of column header strings + rows: list of row tuples/lists + col_widths: optional list of column widths in mm + """ + self.check_page_break(10 + len(rows) * 6) + n = len(headers) + if col_widths is None: + col_widths = [190 / n] * n + + # Header row + self.set_font('Helvetica', 'B', 8) + self.set_fill_color(*self.brand_color) + self.set_text_color(*self.white) + for i, header in enumerate(headers): + self.cell(col_widths[i], 7, self._sanitize(header), border=1, fill=True, align='C') + self.ln() + + # Data rows with alternating row shading + self.set_font('Helvetica', '', 8) + self.set_text_color(0, 0, 0) + for row_idx, row in enumerate(rows): + self.set_fill_color(250, 250, 250) if row_idx % 2 == 0 else self.set_fill_color(*self.white) + for i, cell_val in enumerate(row): + self.cell(col_widths[i], 6, self._sanitize(str(cell_val)), border=1, fill=True, align='C') + self.ln() + self.ln(2) + + @staticmethod + def _estimate_text_lines(text, chars_per_line=82): + """ + Rough estimate of the number of printed lines for ``multi_cell`` at + Helvetica 8pt with a usable cell width of ~172 mm (~82 chars/line). 
+ + Uses paragraph-aware wrapping and a 25 % safety buffer so the + pre-drawn background rect always contains the full text. + """ + total = 0.0 + for para in (text or '').split('\n'): + stripped = para.strip() + if not stripped: + total += 0.5 + else: + total += max(1.0, len(stripped) / chars_per_line) + return max(1, int(total * 1.25) + 1) + + def comment_card(self, text, sentiment, source, author_role=''): + """ + Render a styled comment card showing the **complete** post/comment text. + + The card height is calculated dynamically from the text length so + nothing is ever cut off. Very long posts that would exceed a single + PDF page are handled gracefully: ``multi_cell`` continues the text on + the next page while the coloured border bar marks the card's logical start. + + Args: + text: Full comment or post text – displayed without truncation. + sentiment: Sentiment level string (e.g. 'very_positive', 'negative'). + source: 'Forum' or 'YouTube'. + author_role: Optional author-role label shown as a subtitle. 
+ """ + HEADER_H = 14 # mm reserved for labels + role line above body text + LINE_H = 4.5 # mm per text line at Helvetica 8pt + TEXT_W = 172 # mm usable text width inside the card + CARD_W = 180 # mm total card width + + _sentiment_rgb = { + 'very_positive': (0, 180, 60), + 'positive': (124, 179, 66), + 'neutral': (220, 150, 0), + 'negative': (255, 111, 0), + 'very_negative': (200, 40, 40), + } + sr, sg, sb = _sentiment_rgb.get(str(sentiment).lower(), (150, 150, 150)) + + text = self._sanitize(str(text)) + + # Calculate card height from estimated line count + n_lines = self._estimate_text_lines(text, chars_per_line=82) + card_h = HEADER_H + n_lines * LINE_H + 4 # 4 mm bottom padding + + self.check_page_break(min(card_h, 60) + 3) # cap check to avoid always forcing new page + y = self.get_y() + + # Card background + self.set_fill_color(247, 247, 247) + self.rect(10, y, CARD_W, card_h, style='F') + + # Thin outer border + self.set_draw_color(215, 215, 215) + self.set_line_width(0.12) + self.rect(10, y, CARD_W, card_h, style='D') + + # Left sentiment colour bar (3.5 mm wide) + self.set_fill_color(sr, sg, sb) + self.rect(10, y, 3.5, card_h, style='F') + + # Sentiment label (top-left) + sentiment_label = str(sentiment).replace('_', ' ').title() + self.set_font('Helvetica', 'B', 7) + self.set_text_color(sr, sg, sb) + self.set_xy(15, y + 2) + self.cell(60, 4, sentiment_label) + + # Source badge (top-right) – red for YouTube, brand colour for Forum + badge_rgb = (200, 0, 0) if source == 'YouTube' else self.brand_color + self.set_font('Helvetica', 'B', 7) + self.set_text_color(*badge_rgb) + self.set_xy(10 + CARD_W - 35, y + 2) + self.cell(33, 4, self._sanitize(f"[ {source} ]"), align='R') + + # Author role subtitle + if author_role and str(author_role).strip().lower() not in ('unknown', 'nan', ''): + role_display = str(author_role).replace('_', ' ').title() + self.set_font('Helvetica', 'I', 6.5) + self.set_text_color(140, 140, 140) + self.set_xy(15, y + 7) + self.cell(100, 
3.5, self._sanitize(role_display)) + + # Full comment / post text – no truncation + self.set_font('Helvetica', '', 8) + self.set_text_color(50, 50, 50) + self.set_xy(15, y + HEADER_H) + self.multi_cell(TEXT_W, LINE_H, text) + + # Advance cursor past the card (use max of estimated and actual end) + self.set_y(max(self.get_y(), y + card_h) + 2) + self.set_text_color(0, 0, 0) + self.set_draw_color(0, 0, 0) + + def check_page_break(self, needed_height): + """Add a page break if insufficient vertical space remains.""" + if self.get_y() + needed_height > self.h - 20: + self.add_page() + + +# --------------------------------------------------------------------------- +# Main exporter +# --------------------------------------------------------------------------- + +class DashboardPDFExporter: + """ + Generates a comprehensive PDF report from dashboard data. + + Usage:: + + exporter = DashboardPDFExporter(config) + pdf_bytes = exporter.generate_report( + posts_df, comments_df, additional_mentions_df, filter_info + ) + """ + + RENDER_SCALE = 3 # 3× → ~300 DPI at print size + + def __init__(self, config): + self.config = config + self.sentiment_charts = SentimentCharts() + self.distribution_charts = DistributionCharts() + self.brand_charts = BrandCharts() + self.demographic_charts = DemographicCharts() + self.processor = BrandDataProcessor() + self._temp_files = [] + + # ------------------------------------------------------------------ + # Public entry point + # ------------------------------------------------------------------ + + def generate_report(self, posts_df, comments_df, additional_mentions_df, filter_info=None): + """ + Generate a comprehensive PDF report. + + Args: + posts_df: Processed forum posts DataFrame. + comments_df: Processed social media comments DataFrame. + additional_mentions_df: Raw competitor mentions DataFrame. + filter_info: Optional dict describing active filters. + + Returns: + bytes: PDF file contents. 
+ """ + self.pdf = SabianPDF(self.config) + + # Store DFs at instance level so all section methods can access them + self._posts_df = posts_df + self._comments_df = comments_df + self._additional_mentions_df = additional_mentions_df + + # ---- Pre-flight: load context stats and AI summary ---- + context = self._load_report_context(posts_df) + ai_insights = self._generate_ai_summary(posts_df, comments_df) + + try: + self._add_cover_page(posts_df, comments_df, filter_info, context) + self._add_executive_summary(posts_df, comments_df, additional_mentions_df, context, ai_insights) + self._add_sentiment_section(posts_df) + self._add_author_role_section(posts_df) + self._add_products_section(posts_df) + self._add_overall_mentions_section(posts_df, comments_df, additional_mentions_df) + self._add_competitive_section(posts_df) + self._add_intents_feedback_section(posts_df) + self._add_purchase_journey_section(posts_df) + self._add_demographics_section(posts_df) + self._add_emotion_section(posts_df) + + if comments_df is not None and not comments_df.empty: + self._add_social_media_section(comments_df) + + self._add_data_summary(posts_df, comments_df, filter_info) + + output = self.pdf.output() + return bytes(output) + + finally: + self._cleanup_temp_files() + + # ------------------------------------------------------------------ + # Pre-flight helpers + # ------------------------------------------------------------------ + + def _load_report_context(self, posts_df): + """ + Query Snowflake for total forum-post and YouTube-video counts. + Returns a dict; all values may be None if Snowflake is unavailable. 
+ """ + try: + from utils.report_context import ReportContextLoader + + date_range = None + if posts_df is not None and not posts_df.empty and 'post_created_at' in posts_df.columns: + valid = posts_df['post_created_at'].dropna() + if not valid.empty: + date_range = (valid.min().date(), valid.max().date()) + + loader = ReportContextLoader() + return loader.load_all_context(date_range=date_range) + + except Exception as exc: + logger.warning("Could not load report context: %s", exc) + return {'forum_post_count': None, 'youtube_video_count': None, 'date_range': None} + + def _generate_ai_summary(self, posts_df, comments_df): + """ + Call BrandInsightAgent to generate AI-powered brand insights. + Returns the insights dict or an empty dict on any failure. + """ + try: + from agents.brand_insight_agent import BrandInsightAgent + + dfs = [posts_df] + if comments_df is not None and not comments_df.empty: + dfs.append(comments_df) + combined = pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else posts_df.copy() + + agent = BrandInsightAgent() + result = agent.process({'posts': combined, 'filter_description': ''}) + + if result.get('success'): + return result.get('insights', {}) + except Exception as exc: + logger.warning("AI summary generation failed: %s", exc) + return {} + + # ------------------------------------------------------------------ + # Chart rendering helpers + # ------------------------------------------------------------------ + + def _prepare_fig_for_pdf(self, fig, is_side_by_side=False): + """ + Apply white background, readable fonts, and automargin to a Plotly figure. + Safe for all chart types (pie, gauge, funnel, etc.). 
+ """ + base_font_size = 13 if is_side_by_side else 14 + + fig.update_layout( + paper_bgcolor='white', + plot_bgcolor='white', + font=dict(color='black', size=base_font_size), + title_font_size=base_font_size + 4, + ) + + margin = dict(l=60, r=40, t=60, b=60) if is_side_by_side else dict(l=80, r=40, t=60, b=80) + fig.update_layout(margin=margin) + + # Automargin on cartesian axes prevents label clipping; harmless for others + fig.update_xaxes(automargin=True) + fig.update_yaxes(automargin=True) + + if fig.layout.showlegend is not False: + fig.update_layout(legend_font_size=base_font_size - 2) + + def _fig_to_temp_path(self, fig, width=800, height=400, is_side_by_side=False): + """Render a Plotly figure to a temporary high-DPI PNG file. + + kaleido >= 1.0 uses choreographer (async Chrome DevTools Protocol) + internally, so pio.to_image() calls asyncio.run() under the hood. + Streamlit's script-runner thread already owns an asyncio event loop, + so calling pio.to_image() directly from that thread raises: + RuntimeError: This event loop is already running + which is silently swallowed by _add_chart/_add_two_charts, producing + the "[Charts could not be rendered]" placeholder. + + Fix: offload the render to a ThreadPoolExecutor worker. The worker + thread has no event loop attached, so kaleido's asyncio.run() succeeds. 
+ """ + self._prepare_fig_for_pdf(fig, is_side_by_side=is_side_by_side) + + def _do_render(): + return pio.to_image( + fig, format='png', + width=width, height=height, + scale=self.RENDER_SCALE, + engine='kaleido', + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + img_bytes = pool.submit(_do_render).result(timeout=60) + + tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + tmp.write(img_bytes) + tmp.close() + self._temp_files.append(tmp.name) + return tmp.name + + def _add_chart(self, fig, width=180, img_width=800, img_height=400): + """Render a Plotly figure and add it to the current PDF page.""" + try: + path = self._fig_to_temp_path(fig, img_width, img_height) + aspect_ratio = img_height / img_width + h_mm = width * aspect_ratio + self.pdf.check_page_break(h_mm + 5) + self.pdf.image(path, x=10, w=width) + self.pdf.ln(3) + except Exception as e: + logger.exception("Chart rendering failed: %s", e) + self.pdf.body_text("[Chart could not be rendered]") + + def _add_two_charts(self, fig1, fig2, width=92): + """Render two Plotly figures side by side.""" + try: + path1 = self._fig_to_temp_path(fig1, 700, 450, is_side_by_side=True) + path2 = self._fig_to_temp_path(fig2, 700, 450, is_side_by_side=True) + aspect = 450 / 700 + h_mm = width * aspect + self.pdf.check_page_break(h_mm + 5) + y = self.pdf.get_y() + self.pdf.image(path1, x=10, y=y, w=width) + self.pdf.image(path2, x=10 + width + 4, y=y, w=width) + self.pdf.set_y(y + h_mm + 3) + except Exception as e: + logger.exception("Charts rendering failed: %s", e) + self.pdf.body_text("[Charts could not be rendered]") + + def _cleanup_temp_files(self): + """Remove temporary image files created during rendering.""" + for path in self._temp_files: + try: + os.unlink(path) + except OSError: + pass + self._temp_files.clear() + + # ------------------------------------------------------------------ + # Private data helpers + # 
------------------------------------------------------------------ + + def _pick_sample_comments(self, posts_df, comments_df): + """ + Select 1 representative forum post + 1 YouTube comment for each of the + five sentiment levels (very_positive, positive, neutral, negative, + very_negative), yielding up to 10 comment cards. + + Matching is **exact** per level so very_positive and very_negative cards + are always distinct from their plain counterparts when the data contains + them. If a level has no data for a given source it is skipped silently. + + Preference is given to posts with substantive text (≥ 80 characters). + Among candidates the one closest to the median length is chosen as the + most representative example. + + Returns: + list[dict]: Each dict has keys: text, sentiment, source, author_role. + """ + samples = [] + + def _pick_one(df, sentiment_exact, source): + """Return one representative post/comment for an exact sentiment level.""" + if df is None or df.empty: + return None + + # Exact sentiment match + subset = df[df['sentiment_level'] == sentiment_exact].copy() + if subset.empty: + return None + + text_col = next( + (c for c in ('display_text', 'original_text', 'cleaned_content', 'original_content') + if c in subset.columns), + None + ) + if text_col is None: + return None + + subset = subset[subset[text_col].notna()].copy() + subset['_tlen'] = subset[text_col].astype(str).str.len() + + # Prefer substantive texts + good = subset[subset['_tlen'] >= 80] + if good.empty: + good = subset[subset['_tlen'] >= 20] + if good.empty: + good = subset + if good.empty: + return None + + # Pick closest to median length → most representative + median_len = good['_tlen'].median() + row = good.iloc[(good['_tlen'] - median_len).abs().argsort().iloc[0]] + + author_role = str(row.get('author_role', '')) if 'author_role' in row.index else '' + return { + 'text': str(row[text_col]), + 'sentiment': str(row.get('sentiment_level', sentiment_exact)), + 'source': source, + 
'author_role': author_role, + } + + # Iterate all five levels in display order: best → worst + for sent in ('very_positive', 'positive', 'neutral', 'negative', 'very_negative'): + forum_card = _pick_one(posts_df, sent, 'Forum') + if forum_card: + samples.append(forum_card) + + yt_card = _pick_one(comments_df, sent, 'YouTube') + if yt_card: + samples.append(yt_card) + + return samples + + def _build_data_narrative(self, posts_df, comments_df, additional_mentions_df, context, metrics): + """ + Build the opening data-driven narrative paragraph for the executive summary. + All figures are derived from real data; no AI needed for this block. + """ + total_comments = ( + len(comments_df) if comments_df is not None and not comments_df.empty else 0 + ) + total_analyzed = metrics['total_posts'] + total_comments + pos_neutral_pct = metrics['positive_pct'] + metrics['neutral_pct'] + + # Derive date range from the actual posts + date_start, date_end = "Feb 2025", "Dec 2025" + if posts_df is not None and not posts_df.empty and 'post_created_at' in posts_df.columns: + valid = posts_df['post_created_at'].dropna() + if not valid.empty: + date_start = valid.min().strftime('%b %Y') + date_end = valid.max().strftime('%b %Y') + + forum_total = context.get('forum_post_count') + yt_video_total = context.get('youtube_video_count') + yt_comment_total = context.get('youtube_comment_count') + + narrative = ( + f"Based on a sample of {total_analyzed:,} posts and YouTube comments, " + f"Sabian's brand sentiment is {pos_neutral_pct:.0f}% positive or neutral." + ) + + if forum_total and yt_video_total and yt_comment_total: + narrative += ( + f" These findings were discovered through analysis of over {forum_total:,} " + f"forum posts and {yt_comment_total:,} YouTube comments across " + f"{yt_video_total:,} YouTube videos, all collected between " + f"{date_start} and {date_end}." 
+ ) + elif forum_total and yt_video_total: + narrative += ( + f" These findings were discovered through analysis of over {forum_total:,} " + f"forum posts and {yt_video_total:,} YouTube videos collected between " + f"{date_start} and {date_end}." + ) + elif forum_total: + narrative += ( + f" Data was collected from over {forum_total:,} forum posts " + f"between {date_start} and {date_end}." + ) + else: + narrative += f" Data was collected between {date_start} and {date_end}." + + # Top competitor comparison from overall brand mentions + try: + brand_mentions = self.processor.get_overall_brand_mentions( + posts_df, comments_df, additional_mentions_df + ) + if not brand_mentions.empty: + sabian_row = brand_mentions[brand_mentions['brand'] == 'Sabian'] + non_sabian = ( + brand_mentions[brand_mentions['brand'] != 'Sabian'] + .sort_values('total_mentions', ascending=False) + ) + sabian_count = ( + int(sabian_row['total_mentions'].iloc[0]) + if not sabian_row.empty else total_analyzed + ) + if not non_sabian.empty: + top = non_sabian.iloc[0] + narrative += ( + f" Sabian was mentioned {sabian_count:,} times, compared to " + f"{top['brand']}'s {int(top['total_mentions']):,} mentions " + f"within the same timeframe." + ) + except Exception: + pass + + return narrative + + def _generate_key_findings(self, posts_df, comments_df, metrics): + """Return a list of up to 4 key-finding strings derived from the data.""" + findings = [] + pos_neutral_pct = metrics['positive_pct'] + metrics['neutral_pct'] + + # Primary sentiment finding + if metrics['positive_pct'] > 50: + findings.append( + f"Overall sentiment is predominantly positive at {metrics['positive_pct']:.1f}% " + f"({pos_neutral_pct:.0f}% positive or neutral combined)." + ) + elif metrics['negative_pct'] > 30: + findings.append( + f"Negative sentiment is elevated at {metrics['negative_pct']:.1f}%, " + f"warranting further attention." 
+ ) + else: + findings.append( + f"Sentiment is balanced: {metrics['positive_pct']:.1f}% positive, " + f"{metrics['neutral_pct']:.1f}% neutral, {metrics['negative_pct']:.1f}% negative." + ) + + # Potential buyers + buyer = BrandMetrics.calculate_potential_buyer_metrics(posts_df) + if buyer['total_potential_buyers'] > 0: + findings.append( + f"{buyer['total_potential_buyers']} potential buyers identified; " + f"{buyer['positive_sentiment_pct']:.0f}% expressed positive sentiment toward Sabian." + ) + + # Brand switching + switching = BrandMetrics.calculate_brand_switching_metrics(posts_df) + if switching['switching_to_sabian'] > 0 or switching['switching_from_sabian'] > 0: + net = switching['net_switching'] + sign = '+' if net >= 0 else '' + findings.append( + f"Brand switching: {switching['switching_to_sabian']} users moving to Sabian, " + f"{switching['switching_from_sabian']} moving away (net {sign}{net})." + ) + + # YouTube comments snapshot + if comments_df is not None and not comments_df.empty: + cm = BrandMetrics.calculate_overall_metrics(comments_df) + findings.append( + f"YouTube comments show {cm['positive_pct']:.0f}% positive sentiment " + f"across {len(comments_df):,} analyzed comments." 
+ ) + + return findings[:4] + + # ------------------------------------------------------------------ + # Report sections + # ------------------------------------------------------------------ + + def _add_cover_page(self, posts_df, comments_df, filter_info, context): + """Add branded cover page with data-source context.""" + self.pdf.add_page() + + self.pdf.ln(40) + + # Brand colour accent bar + r, g, b = self.pdf.brand_color + self.pdf.set_fill_color(r, g, b) + self.pdf.rect(0, 60, 210, 4, style='F') + + # Title + self.pdf.ln(20) + self.pdf.set_font('Helvetica', 'B', 28) + self.pdf.set_text_color(r, g, b) + self.pdf.cell( + 0, 15, self.config['brand']['name'], + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + self.pdf.set_font('Helvetica', '', 16) + self.pdf.set_text_color(80, 80, 80) + self.pdf.cell( + 0, 10, "Brand Sentiment Analysis Report", + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + # Separator + self.pdf.ln(10) + self.pdf.set_draw_color(r, g, b) + self.pdf.set_line_width(0.5) + self.pdf.line(60, self.pdf.get_y(), 150, self.pdf.get_y()) + self.pdf.ln(10) + + # Generation date + self.pdf.set_font('Helvetica', '', 12) + self.pdf.set_text_color(100, 100, 100) + self.pdf.cell( + 0, 8, + f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}", + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + # Analyzed sample counts + self.pdf.ln(5) + self.pdf.set_font('Helvetica', '', 10) + total_posts = len(posts_df) if posts_df is not None else 0 + total_comments = len(comments_df) if ( + comments_df is not None and not comments_df.empty + ) else 0 + self.pdf.cell( + 0, 7, + f"Forum Posts Analyzed: {total_posts:,} | " + f"Social Media Comments Analyzed: {total_comments:,}", + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + # Data-source context line + self.pdf.ln(3) + self.pdf.set_font('Helvetica', 'I', 9) + self.pdf.set_text_color(120, 120, 120) + + forum_total = context.get('forum_post_count') + yt_video_total = context.get('youtube_video_count') + 
yt_comment_total = context.get('youtube_comment_count') + ctx_range = context.get('date_range') + + if ctx_range: + start_str, end_str = ctx_range + # Format YYYY-MM-DD → "Feb 2025" + try: + from datetime import datetime as _dt + start_fmt = _dt.strptime(start_str, '%Y-%m-%d').strftime('%b %Y') + end_fmt = _dt.strptime(end_str, '%Y-%m-%d').strftime('%b %Y') + except Exception: + start_fmt, end_fmt = start_str, end_str + else: + start_fmt, end_fmt = "Feb 2025", "Dec 2025" + + if forum_total and yt_video_total and yt_comment_total: + ctx_line = ( + f"Data scope: {forum_total:,} forum posts | " + f"{yt_video_total:,} YouTube videos ({yt_comment_total:,} total comments) " + f"({start_fmt} - {end_fmt})" + ) + elif forum_total and yt_video_total: + ctx_line = ( + f"Data scope: {forum_total:,} forum posts & " + f"{yt_video_total:,} YouTube videos ({start_fmt} - {end_fmt})" + ) + elif forum_total: + ctx_line = ( + f"Data scope: {forum_total:,} forum posts ({start_fmt} - {end_fmt})" + ) + else: + ctx_line = f"Data collection period: {start_fmt} - {end_fmt}" + + self.pdf.cell(0, 6, SabianPDF._sanitize(ctx_line), align='C', new_x="LMARGIN", new_y="NEXT") + + # Active filters + if filter_info: + self.pdf.ln(8) + self.pdf.cell(0, 6, "Active Filters:", align='C', new_x="LMARGIN", new_y="NEXT") + for key, value in filter_info.items(): + if value: + display = value if isinstance(value, str) else ', '.join(str(v) for v in value) + self.pdf.cell( + 0, 5, SabianPDF._sanitize(f"{key}: {display}"), + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + # Footer note + self.pdf.ln(20) + self.pdf.set_font('Helvetica', 'I', 8) + self.pdf.set_text_color(150, 150, 150) + self.pdf.cell( + 0, 6, "Confidential - For Internal Use Only", + align='C', new_x="LMARGIN", new_y="NEXT" + ) + self.pdf.cell( + 0, 6, "Data Sources: Musora Forums, YouTube", + align='C', new_x="LMARGIN", new_y="NEXT" + ) + + def _add_executive_summary(self, posts_df, comments_df, additional_mentions_df, context, ai_insights): 
+ """ + Add a comprehensive executive summary. + + Structure: + 1. Data-driven narrative paragraph + 2. AI-generated insight paragraph (if available) + 3. Brand health status + 4. Key metric boxes (2 rows) + 5. Score explanation callout + 6. Key findings + 7. Sample comment cards + """ + self.pdf.add_page() + self.pdf.section_header("Executive Summary") + self.pdf.section_description(SECTION_DESCRIPTIONS['executive_summary']) + + metrics = BrandMetrics.calculate_overall_metrics(posts_df) + status_info = BrandMetrics.get_sentiment_health_status(metrics['negative_pct']) + status_text, _, _ = status_info + + total_comments = ( + len(comments_df) if comments_df is not None and not comments_df.empty else 0 + ) + normalized_score = ((metrics['avg_sentiment_score'] + 2) / 4) * 100 + yt_total = context.get('youtube_comment_count') + + # ---- 1. Data narrative ---- + narrative = self._build_data_narrative( + posts_df, comments_df, additional_mentions_df, context, metrics + ) + self.pdf.body_text(narrative) + + # ---- 2. AI insight paragraph ---- + ai_exec = ai_insights.get('executive_summary', _NO_AI_SUMMARY) + if ai_exec: + self.pdf.body_text(SabianPDF._sanitize(ai_exec)) + + self.pdf.ln(2) + + # ---- 3. Brand health status ---- + r, g, b = self.pdf.brand_color + self.pdf.set_font('Helvetica', 'B', 11) + self.pdf.set_text_color(r, g, b) + self.pdf.cell(0, 8, f"Brand Health Status: {status_text}", new_x="LMARGIN", new_y="NEXT") + self.pdf.ln(2) + self.pdf.set_text_color(0, 0, 0) + + # ---- 4. 
Key metric boxes ---- + # Row 1: sample counts + total YT comments + positive sentiment + yt_total_label = f"{yt_total:,}" if yt_total is not None else "N/A" + self.pdf.metric_row([ + ("Analyzed Forum Posts", f"{metrics['total_posts']:,}"), + ("Analyzed YT Comments", f"{total_comments:,}"), + ("Total YT Comments", yt_total_label), + ("Positive %", f"{metrics['positive_pct']:.1f}%"), + ]) + self.pdf.metric_row([ + ("Neutral %", f"{metrics['neutral_pct']:.1f}%"), + ("Negative %", f"{metrics['negative_pct']:.1f}%"), + ("Unique Authors", f"{metrics['unique_authors']:,}"), + ("Brand Score (0-100)", f"{normalized_score:.0f}"), + ]) + + # ---- 5. Score & counts explanation ---- + self.pdf.ln(2) + self.pdf.callout_box( + "How to read these numbers:\n" + "Analyzed Forum Posts / Analyzed YT Comments = the Sabian-relevant records " + "examined by the AI in this report. " + "Total YT Comments = all YouTube comments scanned in the collection window " + "(not all are Sabian-related). " + "Sentiment scores: each post/comment is rated Very Positive (+2), Positive (+1), " + "Neutral (0), Negative (-1), or Very Negative (-2). " + "Brand Score (0-100) converts the average: 50 = perfectly neutral, " + "above 60 = primarily positive, below 40 = primarily negative.", + bg_color=(240, 248, 255), + ) + + # ---- 6. Key findings ---- + self.pdf.subsection_header("Key Findings") + for finding in self._generate_key_findings(posts_df, comments_df, metrics): + self.pdf.body_text(f" * {finding}") + + # ---- 7. Sample comment cards ---- + samples = self._pick_sample_comments(posts_df, comments_df) + if samples: + self.pdf.ln(3) + self.pdf.check_page_break(30) + self.pdf.subsection_header("Voice of the Customer - Sample Posts") + self.pdf.section_description( + "Representative posts and comments from the community, one Forum post and " + "one YouTube comment per sentiment level." 
+ ) + for sample in samples: + self.pdf.comment_card( + text=sample['text'], + sentiment=sample['sentiment'], + source=sample['source'], + author_role=sample.get('author_role', ''), + ) + + def _add_sentiment_section(self, df): + """Add sentiment distribution section with score explanation.""" + self.pdf.add_page() + self.pdf.section_header("Sentiment Distribution") + self.pdf.section_description(SECTION_DESCRIPTIONS['sentiment']) + + pie = self.sentiment_charts.create_sentiment_pie_chart(df, title="Sentiment Distribution") + gauge = self.sentiment_charts.create_sentiment_score_gauge( + BrandMetrics.calculate_overall_metrics(df)['avg_sentiment_score'], + title="Brand Sentiment Score (0-100)" + ) + self._add_two_charts(pie, gauge) + + metrics = BrandMetrics.calculate_overall_metrics(df) + normalized_score = ((metrics['avg_sentiment_score'] + 2) / 4) * 100 + self.pdf.body_text( + f"Across {metrics['total_posts']:,} analyzed forum posts: " + f"{metrics['positive_pct']:.1f}% positive, " + f"{metrics['neutral_pct']:.1f}% neutral, " + f"{metrics['negative_pct']:.1f}% negative. " + f"The Brand Sentiment Score is {normalized_score:.0f} out of 100 " + f"(raw average: {metrics['avg_sentiment_score']:.2f} on a -2 to +2 scale, " + f"where 0 = neutral)." 
+ ) + + def _add_author_role_section(self, df): + """Add author role analysis section.""" + self.pdf.add_page() + self.pdf.section_header("Author Role Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['author_role']) + + author_dist = self.processor.get_author_role_distribution(df) + role_pie = self.distribution_charts.create_author_role_chart( + author_dist, title="Author Role Distribution" + ) + role_sentiment = self.sentiment_charts.create_sentiment_percentage_bar_chart( + df, group_by='author_role', title="Sentiment by Author Role" + ) + self._add_two_charts(role_pie, role_sentiment) + + buyer = BrandMetrics.calculate_potential_buyer_metrics(df) + if buyer['total_potential_buyers'] > 0: + self.pdf.subsection_header("Potential Buyer Insights") + self.pdf.metric_row([ + ("Total Potential Buyers", buyer['total_potential_buyers']), + ("Positive Sentiment", f"{buyer['positive_sentiment_pct']:.1f}%"), + ("Researching", buyer['researching_count']), + ("Deciding", buyer['deciding_count']), + ]) + + def _add_products_section(self, df): + """Add products analysis section.""" + self.pdf.add_page() + self.pdf.section_header("Sabian Products Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['products']) + + products_dist = self.processor.get_products_distribution(df) + product_sentiment = self.processor.get_product_sentiment_breakdown(df) + + if not products_dist.empty: + products_bar = self.brand_charts.create_products_horizontal_bar( + products_dist.head(10), title="Top 10 Sabian Products Mentioned" + ) + self._add_chart(products_bar) + + if not product_sentiment.empty: + product_sent = self.brand_charts.create_product_sentiment_breakdown( + product_sentiment, title="Product Sentiment Breakdown" + ) + self._add_chart(product_sent) + + # Product metrics table – ensure header and data land on the same page + product_metrics = BrandMetrics.calculate_product_metrics(df) + if product_metrics: + rows = [] + for product, m in sorted( + 
product_metrics.items(), + key=lambda x: x[1]['total_posts'], + reverse=True + )[:15]: + rows.append(( + product, + m['total_posts'], + f"{m['avg_sentiment_score']:.2f}", + f"{m['positive_pct']:.1f}%", + f"{m['negative_pct']:.1f}%", + )) + + # Reserve space for subsection header + table together + needed = 15 + 10 + min(len(rows), 15) * 6 + self.pdf.check_page_break(needed) + self.pdf.subsection_header("Detailed Product Metrics") + headers = ['Product', 'Posts', 'Avg Score', 'Positive %', 'Negative %'] + self.pdf.add_table(headers, rows, col_widths=[60, 25, 30, 35, 40]) + else: + self.pdf.body_text("No product mention data available.") + + def _add_overall_mentions_section(self, posts_df, comments_df, additional_mentions_df): + """Add overall brand mentions section.""" + self.pdf.add_page() + self.pdf.section_header("Overall Brand Mentions") + self.pdf.section_description(SECTION_DESCRIPTIONS['overall_mentions']) + + brand_mentions = self.processor.get_overall_brand_mentions( + posts_df, comments_df, additional_mentions_df + ) + + if not brand_mentions.empty: + chart = self.brand_charts.create_overall_brand_mentions_chart( + brand_mentions, title="Brand Mention Distribution" + ) + self._add_chart(chart) + + self.pdf.subsection_header("Brand Mention Breakdown") + headers = ['Brand', 'Total Mentions', 'Market Share'] + rows = [ + (row['brand'], f"{row['total_mentions']:,}", f"{row['percentage']:.1f}%") + for _, row in brand_mentions.iterrows() + ] + self.pdf.add_table(headers, rows, col_widths=[60, 65, 65]) + + self.pdf.body_text( + "Sabian count = all analyzed posts and comments. " + "Competitor counts include both mentions inside analyzed Sabian posts " + "and additional mentions from unrelated forum threads." 
+ ) + else: + self.pdf.body_text("No brand mention data available.") + + def _add_competitive_section(self, df): + """Add competitive analysis section with ELI5 score explanation.""" + self.pdf.add_page() + self.pdf.section_header("Competitive Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['competitive']) + + competitors_dist = self.processor.get_competitors_distribution(df) + competitor_sentiment = self.processor.get_competitor_sentiment_breakdown(df) + + if not competitors_dist.empty: + comp_bar = self.brand_charts.create_competitors_bar_chart( + competitors_dist.head(10), title="Competitors Mentioned (in Sabian Posts)" + ) + self._add_chart(comp_bar) + + if not competitor_sentiment.empty: + heatmap = self.brand_charts.create_competitive_heatmap( + competitor_sentiment, + title="Sabian Sentiment When Each Competitor Is Mentioned (-2 to +2)" + ) + self._add_chart(heatmap) + + # Plain-language callout about the -2 to +2 scale + self.pdf.callout_box( + "Reading the chart above:\n" + "The score next to each competitor shows how positively people talk about Sabian " + "when that competitor also appears in the same post. " + "Scale: +2 = very positive about Sabian in that context, " + "0 = neutral (no leaning either way), " + "-2 = quite negative. " + "Example: a score of +1.2 beside Zildjian means that whenever someone mentions " + "both Sabian and Zildjian together, they tend to speak favourably about Sabian. 
" + "A score of -0.3 would mean the opposite - slightly less favourable mentions.", + bg_color=(255, 252, 235), + ) + + # Switching metrics + switching = BrandMetrics.calculate_brand_switching_metrics(df) + if switching['switching_to_sabian'] > 0 or switching['switching_from_sabian'] > 0: + self.pdf.subsection_header("Brand Switching Analysis") + self.pdf.metric_row([ + ("Switching TO Sabian", switching['switching_to_sabian']), + ("Switching FROM Sabian", switching['switching_from_sabian']), + ("Net Switching", f"{switching['net_switching']:+d}"), + ]) + net = switching['net_switching'] + if net > 0: + self.pdf.body_text(f"Net positive: {net} more users switching TO Sabian than away.") + elif net < 0: + self.pdf.body_text(f"Net negative: {abs(net)} more users switching FROM Sabian than toward it.") + + def _add_intents_feedback_section(self, df): + """Add intents and feedback section.""" + self.pdf.add_page() + self.pdf.section_header("Intents & Feedback Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['intents_feedback']) + + intents_dist = self.processor.get_intents_distribution(df) + pain_dist = self.processor.get_pain_points_distribution(df) + delight_dist = self.processor.get_delight_factors_distribution(df) + + if not intents_dist.empty: + intents_bar = self.distribution_charts.create_intent_bar_chart( + intents_dist, title="User Intents", orientation='h' + ) + self._add_chart(intents_bar) + + if not pain_dist.empty or not delight_dist.empty: + pd_chart = self.distribution_charts.create_pain_delight_comparison_chart( + pain_dist.head(8), delight_dist.head(8), + title="Pain Points vs Delight Factors" + ) + self._add_chart(pd_chart) + + pd_metrics = BrandMetrics.calculate_pain_delight_metrics(df) + if pd_metrics['total_pain_points'] > 0 or pd_metrics['total_delight_factors'] > 0: + metrics_list = [ + ("Pain Points Identified", pd_metrics['total_pain_points']), + ("Delight Factors Identified", pd_metrics['total_delight_factors']), + ] + ratio = 
pd_metrics['pain_to_delight_ratio'] + if ratio != float('inf'): + metrics_list.append(("Pain / Delight Ratio", f"{ratio:.2f}")) + self.pdf.metric_row(metrics_list) + + def _add_purchase_journey_section(self, df): + """Add mention context and purchase journey section.""" + self.pdf.add_page() + self.pdf.section_header("Mention Context & Purchase Journey") + self.pdf.section_description(SECTION_DESCRIPTIONS['purchase_journey']) + + context_dist = self.processor.get_mention_context_distribution(df) + stage_dist = self.processor.get_purchase_stage_distribution(df) + + if not context_dist.empty and not stage_dist.empty: + context_chart = self.distribution_charts.create_mention_context_chart( + context_dist, title="How Sabian is Mentioned" + ) + stage_chart = self.distribution_charts.create_purchase_stage_chart( + stage_dist, title="Purchase Journey Stage" + ) + self._add_two_charts(context_chart, stage_chart) + elif not context_dist.empty: + context_chart = self.distribution_charts.create_mention_context_chart( + context_dist, title="How Sabian is Mentioned" + ) + self._add_chart(context_chart) + elif not stage_dist.empty: + stage_chart = self.distribution_charts.create_purchase_stage_chart( + stage_dist, title="Purchase Journey Stage" + ) + self._add_chart(stage_chart) + + def _add_demographics_section(self, df): + """Add demographics section (only if data is available).""" + has_demographics = ( + 'drums_experience_years' in df.columns + or 'age_group' in df.columns + or 'cymbal_brands_list' in df.columns + ) + if not has_demographics: + return + + self.pdf.add_page() + self.pdf.section_header("User Demographics Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['demographics']) + + # Experience distribution + if 'experience_group' in df.columns: + exp_dist = self.processor.get_demographics_distribution(df, 'experience_group') + if not exp_dist.empty: + self.pdf.subsection_header("Drumming Experience") + exp_chart = 
self.demographic_charts.create_experience_distribution_chart( + exp_dist, title="Experience Level Distribution" + ) + self._add_chart(exp_chart) + + # Cymbal ownership + if 'cymbal_brands_list' in df.columns: + ownership_data = self.processor.get_cymbal_ownership_analysis(df) + if ownership_data: + self.pdf.subsection_header("Cymbal Ownership") + self.pdf.body_text( + "Note: ownership data is available only for users who have filled in their " + "cymbal gear information in their Musora profile." + ) + ownership_chart = self.demographic_charts.create_cymbal_ownership_chart( + ownership_data, title="Cymbal Brands Owned by Forum Users" + ) + self._add_chart(ownership_chart) + + sabian_rate = ownership_data.get('sabian_ownership_rate', 0) + total = ownership_data.get('total_users_with_cymbal_data', 0) + self.pdf.metric_row([ + ("Users with Cymbal Data", total), + ("Sabian Ownership Rate", f"{sabian_rate:.1f}%"), + ]) + + # Age distribution + if 'age_group' in df.columns: + age_dist = self.processor.get_demographics_distribution(df, 'age_group') + if not age_dist.empty: + self.pdf.subsection_header("Age Distribution") + self.pdf.body_text( + "Note: age is calculated from the birthday field in each user's Musora " + "profile and is only available for users who provided this information." + ) + age_chart = self.demographic_charts.create_age_distribution_chart( + age_dist, title="Age Group Distribution" + ) + self._add_chart(age_chart) + + # Timezone / geographic distribution + if 'timezone_region' in df.columns: + tz_dist = self.processor.get_demographics_distribution(df, 'timezone_region') + if not tz_dist.empty: + self.pdf.subsection_header("Geographic Distribution") + tz_chart = self.demographic_charts.create_timezone_distribution_chart( + tz_dist, title="Users by Timezone Region" + ) + self._add_chart(tz_chart) + + def _add_emotion_section(self, df): + """ + Add emotion analysis section. 
+ + This section does NOT force a new page – it continues from wherever + the previous section ended, starting a new page only if insufficient + space remains. This reduces wasted half-page gaps. + """ + emotion_dist = self.processor.get_emotion_distribution(df) + if emotion_dist.empty: + return + + # Reserve enough space for header + description + chart + self.pdf.check_page_break(80) + self.pdf.section_header("Emotion Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['emotion']) + + emotion_chart = self.sentiment_charts.create_emotion_distribution_chart( + df, title="Emotion Distribution" + ) + self._add_chart(emotion_chart) + + def _add_social_media_section(self, comments_df): + """Add social media comments section.""" + self.pdf.add_page() + self.pdf.section_header("Social Media Comments Analysis") + self.pdf.section_description(SECTION_DESCRIPTIONS['social_media']) + + comments_metrics = BrandMetrics.calculate_overall_metrics(comments_df) + normalized_score = ((comments_metrics['avg_sentiment_score'] + 2) / 4) * 100 + + # Summary metrics + self.pdf.metric_row([ + ("Total Comments", f"{comments_metrics['total_posts']:,}"), + ("Brand Score (0-100)", f"{normalized_score:.0f}"), + ("Positive %", f"{comments_metrics['positive_pct']:.1f}%"), + ("Negative %", f"{comments_metrics['negative_pct']:.1f}%"), + ]) + + # Platform breakdown – Percentage column removed because we currently + # only have YouTube data (100% would add no information) + if 'platform' in comments_df.columns: + self.pdf.subsection_header("Platform Breakdown") + platform_counts = comments_df['platform'].value_counts().reset_index() + platform_counts.columns = ['platform', 'count'] + + headers = ['Platform', 'Comments'] + rows = [ + ( + row['platform'].replace('_', ' ').title(), + f"{row['count']:,}", + ) + for _, row in platform_counts.iterrows() + ] + self.pdf.add_table(headers, rows, col_widths=[95, 95]) + + # Sentiment distribution + sentiment_pie = 
self.sentiment_charts.create_sentiment_pie_chart( + comments_df, title="Comment Sentiment Distribution" + ) + self._add_chart(sentiment_pie) + + # Products in comments + products_dist = self.processor.get_products_distribution(comments_df) + if not products_dist.empty: + self.pdf.subsection_header("Products Mentioned in Comments") + products_bar = self.brand_charts.create_products_horizontal_bar( + products_dist.head(10), title="Top Products in Comments" + ) + self._add_chart(products_bar) + + # Competitors in comments + competitors_dist = self.processor.get_competitors_distribution(comments_df) + if not competitors_dist.empty: + self.pdf.subsection_header("Competitors Mentioned in Comments") + comp_bar = self.brand_charts.create_competitors_bar_chart( + competitors_dist.head(10), title="Competitors in Comments" + ) + self._add_chart(comp_bar) + + def _add_data_summary(self, posts_df, comments_df, filter_info): + """Add data summary appendix.""" + self.pdf.add_page() + self.pdf.section_header("Appendix: Data Summary") + + # Record counts + self.pdf.subsection_header("Data Volumes") + total_posts = len(posts_df) if posts_df is not None else 0 + total_comments = ( + len(comments_df) if comments_df is not None and not comments_df.empty else 0 + ) + rows = [ + ("Forum Posts (analyzed sample)", f"{total_posts:,}"), + ("Social Media Comments (analyzed sample)", f"{total_comments:,}"), + ("Total Records", f"{total_posts + total_comments:,}"), + ] + self.pdf.add_table(['Data Source', 'Count'], rows, col_widths=[130, 60]) + + # Date ranges + self.pdf.subsection_header("Date Ranges") + if 'post_created_at' in posts_df.columns and not posts_df['post_created_at'].isna().all(): + valid = posts_df[posts_df['post_created_at'].notna()]['post_created_at'] + self.pdf.body_text( + f"Forum Posts: {valid.min().strftime('%Y-%m-%d')} " + f"to {valid.max().strftime('%Y-%m-%d')}" + ) + + if ( + comments_df is not None and not comments_df.empty + and 'comment_timestamp' in 
comments_df.columns + and not comments_df['comment_timestamp'].isna().all() + ): + valid = comments_df[comments_df['comment_timestamp'].notna()]['comment_timestamp'] + self.pdf.body_text( + f"Social Media: {valid.min().strftime('%Y-%m-%d')} " + f"to {valid.max().strftime('%Y-%m-%d')}" + ) + + # Active filters + if filter_info: + self.pdf.subsection_header("Active Filters") + for key, value in filter_info.items(): + if value: + display = value if isinstance(value, str) else ', '.join(str(v) for v in value) + self.pdf.body_text(f"{key}: {display}") + else: + self.pdf.body_text("No filters applied — showing complete dataset.") + + # Methodology + self.pdf.subsection_header("Methodology") + self.pdf.body_text( + "This report is generated from AI-powered sentiment analysis performed on forum posts " + "and social media comments where Sabian cymbals are mentioned. Posts are collected " + "from Musora Forums and YouTube. Each post is analyzed to extract sentiment level, " + "author role, products and competitors mentioned, intents, pain points, delight " + "factors, and purchase stage. Demographic data is sourced from user profiles and " + "is available only for users who have provided this information. Brand mention counts " + "include both the analyzed sample and additional competitor mentions from raw forum " + "posts not focused on Sabian." + ) \ No newline at end of file diff --git a/visualization_brand_sentiment/utils/report_context.py b/visualization_brand_sentiment/utils/report_context.py new file mode 100644 index 0000000000000000000000000000000000000000..1d5df91c6cd18548540fe9bcf77ced02c2ae54b1 --- /dev/null +++ b/visualization_brand_sentiment/utils/report_context.py @@ -0,0 +1,158 @@ +""" +Report Context Loader +Queries Snowflake for data-source statistics used to contextualize sample sizes +in the PDF report (total forum posts available, total YouTube videos scanned, etc.). 
"""
Report Context Loader
Queries Snowflake for data-source statistics used to contextualize sample sizes
in the PDF report (total forum posts available, total YouTube videos scanned, etc.).

All queries are best-effort: if Snowflake is unavailable the methods return None
and the report degrades gracefully (omitting the context figures).
"""
import sys
import logging
from pathlib import Path
from datetime import date, timedelta

# Allow importing SnowFlakeConn from the parent project
parent_dir = Path(__file__).resolve().parent.parent.parent
sys.path.append(str(parent_dir))

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Default date range used when no range is supplied by the caller.
# Represents the Feb 2025 - Dec 2025 Sabian experiment window.
# ---------------------------------------------------------------------------
_DEFAULT_START = date(2025, 2, 1)
_DEFAULT_END = date(2025, 12, 31)


class ReportContextLoader:
    """
    Fetches aggregate data-source statistics from Snowflake.

    Usage::

        loader = ReportContextLoader()
        ctx = loader.load_all_context(date_range=(start_date, end_date))
        # ctx = {'forum_post_count': 5234, 'youtube_video_count': 116, 'date_range': ...}
    """

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def load_forum_post_count(self, date_range=None):
        """
        COUNT rows in SOCIAL_MEDIA_DB.CORE.FORUM_POSTS within the date window.

        Args:
            date_range: Optional (start_date, end_date) tuple of :class:`datetime.date`
                objects. Falls back to the default 2025 experiment window.

        Returns:
            int | None: Row count, or None if the query fails.
        """
        start, end = self._resolve_date_range(date_range)
        end_exclusive = end + timedelta(days=1)
        # NOTE: dates are interpolated directly because they are always
        # datetime.date objects resolved above (never raw user strings);
        # switch to bound parameters if that ever changes.
        query = (
            "SELECT COUNT(*) AS cnt "
            "FROM SOCIAL_MEDIA_DB.CORE.FORUM_POSTS "
            f"WHERE POST_CREATED_AT >= '{start}' "
            f"AND POST_CREATED_AT < '{end_exclusive}'"
        )
        return self._run_scalar_query(query, label="forum post count")

    def load_youtube_video_count(self, date_range=None):
        """
        COUNT distinct CONTENT_SK values for the 'youtube' platform in
        SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS within the date window.
        Each unique CONTENT_SK represents one YouTube video/post.

        Args:
            date_range: Optional (start_date, end_date) tuple of :class:`datetime.date`.

        Returns:
            int | None: Unique video count, or None on failure.
        """
        start, end = self._resolve_date_range(date_range)
        end_exclusive = end + timedelta(days=1)
        query = (
            "SELECT COUNT(DISTINCT CONTENT_SK) AS cnt "
            "FROM SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS "
            "WHERE LOWER(PLATFORM) = 'youtube' "
            f"AND CREATED_TIME >= '{start}' "
            f"AND CREATED_TIME < '{end_exclusive}'"
        )
        return self._run_scalar_query(query, label="YouTube video count")

    def load_youtube_comment_count(self, date_range=None):
        """
        COUNT total rows for the 'youtube' platform in
        SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS within the date window.
        Each row represents one individual YouTube comment.

        Args:
            date_range: Optional (start_date, end_date) tuple of :class:`datetime.date`.

        Returns:
            int | None: Total comment count, or None on failure.
        """
        start, end = self._resolve_date_range(date_range)
        end_exclusive = end + timedelta(days=1)
        query = (
            "SELECT COUNT(*) AS cnt "
            "FROM SOCIAL_MEDIA_DB.CORE.FACT_COMMENTS "
            "WHERE LOWER(PLATFORM) = 'youtube' "
            f"AND CREATED_TIME >= '{start}' "
            f"AND CREATED_TIME < '{end_exclusive}'"
        )
        return self._run_scalar_query(query, label="YouTube comment count")

    def load_all_context(self, date_range=None):
        """
        Load all context statistics in a single call.

        Returns:
            dict with keys:
                ``forum_post_count`` – total forum posts (int or None)
                ``youtube_video_count`` – unique YouTube videos/posts (int or None)
                ``youtube_comment_count`` – total YouTube comments (int or None)
                ``date_range`` – resolved ``(start_str, end_str)`` tuple
        """
        start, end = self._resolve_date_range(date_range)
        return {
            "forum_post_count": self.load_forum_post_count(date_range),
            "youtube_video_count": self.load_youtube_video_count(date_range),
            "youtube_comment_count": self.load_youtube_comment_count(date_range),
            "date_range": (str(start), str(end)),
        }

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _resolve_date_range(self, date_range):
        """Return a (start, end) pair of :class:`datetime.date` objects."""
        if date_range and len(date_range) == 2:
            start, end = date_range
            # Accept both date and datetime objects (datetime has a .date() method)
            if hasattr(start, "date"):
                start = start.date()
            if hasattr(end, "date"):
                end = end.date()
            return start, end
        return _DEFAULT_START, _DEFAULT_END

    @staticmethod
    def _run_scalar_query(query, label="query"):
        """Execute a scalar COUNT query and return the integer result.

        The Snowflake connection is always closed in ``finally`` — the
        previous version only closed it on the success path, leaking the
        connection whenever ``run_read_query`` raised.
        """
        conn = None
        try:
            from visualization.SnowFlakeConnection import SnowFlakeConn
            conn = SnowFlakeConn()
            df = conn.run_read_query(query, label)
            if df is not None and not df.empty:
                return int(df.iloc[0, 0])
            return None
        except Exception as exc:
            logger.warning("Could not load %s: %s", label, exc)
            return None
        finally:
            if conn is not None:
                try:
                    conn.close_connection()
                except Exception:
                    # Best-effort close; never mask the original outcome
                    logger.warning("Could not close Snowflake connection for %s", label)
Brand Sentiment Dashboard +""" +from .sentiment_charts import SentimentCharts +from .distribution_charts import DistributionCharts +from .brand_charts import BrandCharts +from .demographic_charts import DemographicCharts +from .content_cards import ContentCards + +__all__ = [ + 'SentimentCharts', + 'DistributionCharts', + 'BrandCharts', + 'DemographicCharts', + 'ContentCards' +] diff --git a/visualization_brand_sentiment/visualizations/brand_charts.py b/visualization_brand_sentiment/visualizations/brand_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..9c04d46575ec906287441440301be427c768f7da --- /dev/null +++ b/visualization_brand_sentiment/visualizations/brand_charts.py @@ -0,0 +1,484 @@ +""" +Brand-specific visualization components using Plotly +Creates charts for Sabian products, competitors, and brand analysis +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + + +class BrandCharts: + """ + Creates brand-specific visualizations for Sabian analysis + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + self.product_colors = self.config['color_schemes']['products'] + self.competitor_colors = self.config['color_schemes']['competitors'] + self.sentiment_colors = self.config['color_schemes']['sentiment_level'] + self.sentiment_order = self.config['sentiment_order'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_products_bar_chart(self, df, title="Sabian Products Mentioned"): + """ + Create bar chart for product mentions + + Args: + df: Distribution dataframe with product, count columns + + Returns: + 
plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No product data available") + + colors = [self.product_colors.get(p, self.product_colors['default']) for p in df['product']] + + fig = go.Figure(data=[go.Bar( + x=df['product'], + y=df['count'], + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Mentions: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Product", + yaxis_title="Number of Mentions", + height=self.chart_height + ) + + return fig + + def create_products_horizontal_bar(self, df, title="Sabian Products Mentioned"): + """ + Create horizontal bar chart for product mentions (better for many products) + + Args: + df: Distribution dataframe with product, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No product data available") + + # Sort by count and reverse for display + df = df.sort_values('count', ascending=True) + + colors = [self.product_colors.get(p, self.product_colors['default']) for p in df['product']] + + fig = go.Figure(data=[go.Bar( + y=df['product'], + x=df['count'], + orientation='h', + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{y}
Mentions: %{x}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Number of Mentions", + yaxis_title="Product", + height=max(self.chart_height, len(df) * 30) + ) + + return fig + + def create_product_sentiment_breakdown(self, df, title="Product Sentiment Breakdown"): + """ + Create stacked bar chart showing sentiment for each product + + Args: + df: Dataframe with product, sentiment_level, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No product sentiment data available") + + # Pivot the data + pivot_df = df.pivot(index='product', columns='sentiment_level', values='count').fillna(0) + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in pivot_df.columns] + pivot_df = pivot_df[ordered_columns] + + fig = go.Figure() + + for sentiment in pivot_df.columns: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + y=pivot_df.index, + x=pivot_df[sentiment], + orientation='h', + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{y}
' + sentiment.replace('_', ' ').title() + ': %{x}' + )) + + fig.update_layout( + title=title, + xaxis_title="Number of Posts", + yaxis_title="Product", + barmode='stack', + height=max(self.chart_height, len(pivot_df) * 40), + legend=dict(title="Sentiment", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_overall_brand_mentions_chart(self, df, title="Overall Brand Mentions"): + """ + Create horizontal bar chart showing total brand mentions across all sources. + Sabian is highlighted with the brand primary color. + + Args: + df: DataFrame with brand, total_mentions, percentage columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No brand mention data available") + + # Sort ascending for horizontal bar (top = highest) + df = df.sort_values('total_mentions', ascending=True) + + # Assign colors: Sabian gets primary brand color, competitors get their config colors + brand_primary = self.config['brand']['primary_color'] + colors = [] + for brand in df['brand']: + if brand == 'Sabian': + colors.append(brand_primary) + else: + colors.append(self.competitor_colors.get(brand, self.competitor_colors['default'])) + + fig = go.Figure(data=[go.Bar( + y=df['brand'], + x=df['total_mentions'], + orientation='h', + marker=dict( + color=colors, + line=dict( + color=[brand_primary if b == 'Sabian' else 'rgba(0,0,0,0)' for b in df['brand']], + width=[2 if b == 'Sabian' else 0 for b in df['brand']] + ) + ), + text=df.apply( + lambda row: f"{row['total_mentions']:,} ({row['percentage']:.1f}%)", axis=1 + ), + textposition='auto', + hovertemplate='%{y}
Mentions: %{x:,}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Total Mentions", + yaxis_title="Brand", + height=max(self.chart_height, len(df) * 50), + showlegend=False + ) + + return fig + + def create_competitors_bar_chart(self, df, title="Competitors Mentioned"): + """ + Create horizontal bar chart for competitor mentions. + + Using a horizontal layout ensures count labels always render + upright, even for bars with a count of 1. + + Args: + df: Distribution dataframe with competitor, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No competitor data available") + + # Sort ascending so the highest bar appears at the top + df = df.sort_values('count', ascending=True) + + colors = [self.competitor_colors.get(c, self.competitor_colors['default']) for c in df['competitor']] + + fig = go.Figure(data=[go.Bar( + y=df['competitor'], + x=df['count'], + orientation='h', + marker=dict(color=colors), + text=df['count'], + textposition='outside', + textangle=0, # Always render count labels upright + cliponaxis=False, # Don't clip outside-bar labels + hovertemplate='%{y}
Mentions: %{x}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Number of Mentions", + yaxis_title="Competitor", + height=max(self.chart_height, len(df) * 45 + 80), + xaxis=dict(automargin=True), + ) + + return fig + + def create_competitor_sentiment_breakdown(self, df, title="Sentiment When Competitors Mentioned"): + """ + Create stacked bar chart showing Sabian sentiment when competitors are mentioned + + Args: + df: Dataframe with competitor, sentiment_level, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No competitor sentiment data available") + + # Pivot the data + pivot_df = df.pivot(index='competitor', columns='sentiment_level', values='count').fillna(0) + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in pivot_df.columns] + pivot_df = pivot_df[ordered_columns] + + fig = go.Figure() + + for sentiment in pivot_df.columns: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + y=pivot_df.index, + x=pivot_df[sentiment], + orientation='h', + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{y}
' + sentiment.replace('_', ' ').title() + ': %{x}' + )) + + fig.update_layout( + title=title, + xaxis_title="Number of Posts", + yaxis_title="Competitor", + barmode='stack', + height=max(self.chart_height, len(pivot_df) * 40), + legend=dict(title="Sabian Sentiment", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_product_comparison_radar(self, product_metrics, title="Product Comparison"): + """ + Create radar chart comparing product metrics + + Args: + product_metrics: Dict with product -> metrics mapping + + Returns: + plotly.graph_objects.Figure + """ + if not product_metrics: + return self._create_empty_chart(title, "No product metrics available") + + categories = ['Total Posts', 'Positive %', 'Avg Score'] + + fig = go.Figure() + + for product, metrics in product_metrics.items(): + if product == 'default': + continue + + values = [ + min(metrics.get('total_posts', 0) / 10, 100), # Normalize + metrics.get('positive_pct', 0), + (metrics.get('avg_sentiment_score', 0) + 2) * 25 # Normalize -2 to 2 -> 0 to 100 + ] + values.append(values[0]) # Close the radar + + fig.add_trace(go.Scatterpolar( + r=values, + theta=categories + [categories[0]], + fill='toself', + name=product, + line=dict(color=self.product_colors.get(product, '#CCCCCC')) + )) + + fig.update_layout( + polar=dict( + radialaxis=dict(visible=True, range=[0, 100]) + ), + showlegend=True, + title=title, + height=self.chart_height + ) + + return fig + + def create_switching_flow_chart(self, switching_to, switching_from, title="Brand Switching Flow"): + """ + Create indicator chart for brand switching net flow + + Args: + switching_to: Number switching to Sabian + switching_from: Number switching from Sabian + + Returns: + plotly.graph_objects.Figure + """ + net_flow = switching_to - switching_from + + fig = make_subplots( + rows=1, cols=3, + specs=[[{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}]], + subplot_titles=["Coming to Sabian", 
"Leaving Sabian", "Net Flow"] + ) + + # Switching TO + fig.add_trace( + go.Indicator( + mode="number", + value=switching_to, + number={'font': {'color': '#4CAF50', 'size': 48}}, + ), + row=1, col=1 + ) + + # Switching FROM + fig.add_trace( + go.Indicator( + mode="number", + value=switching_from, + number={'font': {'color': '#F44336', 'size': 48}}, + ), + row=1, col=2 + ) + + # Net flow + fig.add_trace( + go.Indicator( + mode="number+delta", + value=net_flow, + number={'font': {'size': 48}}, + delta={'reference': 0, 'relative': False}, + ), + row=1, col=3 + ) + + fig.update_layout( + title=title, + height=250 + ) + + return fig + + def create_competitive_heatmap(self, df, title="Competitive Comparison Heatmap"): + """ + Create heatmap showing sentiment scores when different competitors are mentioned + + Args: + df: Dataframe with competitor sentiment breakdown + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No competitive data available") + + # Calculate sentiment score per competitor + sentiment_weights = { + 'very_negative': -2, 'negative': -1, 'neutral': 0, + 'positive': 1, 'very_positive': 2 + } + + # Pivot and calculate weighted average + pivot = df.pivot(index='competitor', columns='sentiment_level', values='count').fillna(0) + + scores = [] + for idx, row in pivot.iterrows(): + total = row.sum() + if total > 0: + score = sum(row[s] * sentiment_weights.get(s, 0) for s in row.index) / total + else: + score = 0 + scores.append({'competitor': idx, 'sentiment_score': score, 'total_mentions': total}) + + score_df = pd.DataFrame(scores).sort_values('sentiment_score') + + # Create horizontal bar with color scale + fig = go.Figure(data=[go.Bar( + y=score_df['competitor'], + x=score_df['sentiment_score'], + orientation='h', + marker=dict( + color=score_df['sentiment_score'], + colorscale='RdYlGn', + cmin=-2, + cmax=2, + colorbar=dict(title="Sentiment Score") + ), + text=score_df['sentiment_score'].round(2), + 
textposition='auto', + hovertemplate='%{y}
Score: %{x:.2f}
Mentions: %{customdata}', + customdata=score_df['total_mentions'] + )]) + + fig.update_layout( + title=title, + xaxis_title="Sabian Sentiment Score (when this competitor is mentioned)", + yaxis_title="Competitor", + height=max(300, len(score_df) * 40) + ) + + return fig + + def _create_empty_chart(self, title, message): + """ + Create an empty chart with a message + + Args: + title: Chart title + message: Message to display + + Returns: + plotly.graph_objects.Figure + """ + fig = go.Figure() + + fig.add_annotation( + text=message, + xref="paper", + yref="paper", + x=0.5, + y=0.5, + showarrow=False, + font=dict(size=14, color="gray") + ) + + fig.update_layout( + title=title, + height=self.chart_height, + xaxis=dict(visible=False), + yaxis=dict(visible=False) + ) + + return fig diff --git a/visualization_brand_sentiment/visualizations/content_cards.py b/visualization_brand_sentiment/visualizations/content_cards.py new file mode 100644 index 0000000000000000000000000000000000000000..afa8a6100186411d297a3c857b1eb762fd26179f --- /dev/null +++ b/visualization_brand_sentiment/visualizations/content_cards.py @@ -0,0 +1,580 @@ +""" +Content display components for brand sentiment visualization +Creates formatted cards and displays for posts +""" +import streamlit as st +import pandas as pd +from datetime import datetime +import json +from pathlib import Path + + +class ContentCards: + """ + Creates content display components for brand sentiment posts + """ + + def __init__(self, config_path=None): + """Initialize with configuration""" + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + @staticmethod + def display_summary_stats(df, metrics): + """ + Display summary statistics in a formatted layout + + Args: + df: Brand sentiment dataframe + metrics: Calculated metrics dict + """ + st.markdown("### 📊 Summary Statistics") + + col1, col2, 
col3, col4, col5 = st.columns(5) + + with col1: + st.metric("Total Posts", f"{metrics['total_posts']:,}") + + with col2: + st.metric("Unique Threads", f"{metrics['unique_threads']:,}") + + with col3: + st.metric("Unique Authors", f"{metrics['unique_authors']:,}") + + with col4: + st.metric("Positive %", f"{metrics['positive_pct']:.1f}%") + + with col5: + st.metric("Negative %", f"{metrics['negative_pct']:.1f}%") + + @staticmethod + def display_health_indicator(negative_pct, status_info): + """ + Display sentiment health indicator + + Args: + negative_pct: Percentage of negative sentiments + status_info: Tuple of (status, color, emoji) + """ + status, color, emoji = status_info + + st.markdown( + f""" +
+

{emoji} Brand Health: {status}

+

Negative Sentiment: {negative_pct:.1f}%

+
+ """, + unsafe_allow_html=True + ) + + @staticmethod + def display_post_card(post_row, show_full=False): + """ + Display a formatted post card + + Args: + post_row: Series containing post information + show_full: Whether to show full content + """ + with st.container(): + # Header with thread title + thread_title = post_row.get('thread_title', 'No title') + if pd.notna(thread_title) and thread_title: + st.markdown(f"**📌 {thread_title[:100]}{'...' if len(str(thread_title)) > 100 else ''}**") + + # Metadata row + col1, col2, col3, col4 = st.columns([2, 1, 1, 1]) + + with col1: + author_role = post_row.get('author_role', 'unknown') + role_emoji = { + 'current_owner': '👤', + 'past_owner': '👥', + 'potential_buyer': '🛒', + 'never_owned': '❓', + 'unknown': '❔' + }.get(author_role, '❔') + st.markdown(f"{role_emoji} **{author_role.replace('_', ' ').title()}**") + + with col2: + if 'post_created_at' in post_row and pd.notna(post_row['post_created_at']): + timestamp = pd.to_datetime(post_row['post_created_at']) + st.markdown(f"📅 {timestamp.strftime('%Y-%m-%d')}") + + with col3: + sentiment = post_row.get('sentiment_level', 'unknown') + sentiment_emoji = { + 'very_positive': '😄', + 'positive': '🙂', + 'neutral': '😐', + 'negative': '🙁', + 'very_negative': '😠' + }.get(sentiment, '❓') + st.markdown(f"{sentiment_emoji} {sentiment.replace('_', ' ').title()}") + + with col4: + mention_context = post_row.get('sabian_mention_context', 'unknown') + st.markdown(f"🎯 {mention_context.replace('_', ' ').title()}") + + # Content – always show in full (no character cutoff) + content = post_row.get('cleaned_content', '') or post_row.get('original_content', '') + if content: + st.markdown(f"💬 {content}") + + # Tags row + tags = [] + + # Products + products = post_row.get('products_mentioned', []) + if isinstance(products, list) and products: + tags.extend([f"🥁 {p}" for p in products[:3]]) + + # Competitors + competitors = post_row.get('competitors_mentioned', []) + if isinstance(competitors, 
list) and competitors: + tags.extend([f"🆚 {c}" for c in competitors[:2]]) + + # Intents + intents = post_row.get('intents', []) + if isinstance(intents, list) and intents: + tags.extend([f"💡 {i.replace('_', ' ').title()}" for i in intents[:2]]) + + if tags: + st.markdown(" | ".join(tags)) + + # Expandable details + with st.expander("📋 View Full Details"): + detail_col1, detail_col2 = st.columns(2) + + with detail_col1: + st.write("**Post ID:**", post_row.get('post_id', 'N/A')) + st.write("**Thread ID:**", post_row.get('thread_id', 'N/A')) + st.write("**Author Role:**", post_row.get('author_role', 'N/A')) + st.write("**Sentiment:**", post_row.get('sentiment_level', 'N/A')) + st.write("**Emotion:**", post_row.get('emotion_type', 'N/A')) + st.write("**Purchase Stage:**", post_row.get('purchase_stage', 'N/A') or 'N/A') + + with detail_col2: + st.write("**Mention Context:**", post_row.get('sabian_mention_context', 'N/A')) + st.write("**Products:**", ', '.join(products) if products else 'None') + st.write("**Competitors:**", ', '.join(competitors) if competitors else 'None') + + pain_points = post_row.get('pain_points', []) + if isinstance(pain_points, list) and pain_points: + st.write("**Pain Points:**", ', '.join(pain_points)) + + delights = post_row.get('delight_factors', []) + if isinstance(delights, list) and delights: + st.write("**Delight Factors:**", ', '.join(delights)) + + # Thread context + thread_context = post_row.get('thread_context_summary', '') + if thread_context: + st.write("**Thread Context:**", thread_context) + + # Analysis notes + analysis_notes = post_row.get('analysis_notes', '') + if analysis_notes: + st.write("**Analysis Notes:**", analysis_notes) + + st.markdown("---") + + @staticmethod + def display_comment_summary_stats(df, metrics, total_raw_count=None): + """ + Display summary statistics for comments in a formatted layout. 
+ + Args: + df: Comments dataframe + metrics: Calculated metrics dict + total_raw_count: Optional total raw comment count from FACT_COMMENTS + (the full scanned dataset, not just the analyzed Sabian-relevant sample). + When provided, an extra metric is shown so readers can understand + the sample-to-total ratio. + """ + st.markdown("### 📊 Comments Summary") + + platforms = df['platform'].nunique() if 'platform' in df.columns else 0 + channels = df['channel_display_name'].nunique() if 'channel_display_name' in df.columns else 0 + + if total_raw_count is not None: + col1, col2, col3, col4, col5, col6 = st.columns(6) + + with col1: + st.metric("Analyzed Comments", f"{metrics['total_posts']:,}", + help="Sabian-relevant comments included in this analysis") + with col2: + st.metric("Total YT Comments", f"{total_raw_count:,}", + help="All YouTube comments scanned in the data collection window") + with col3: + st.metric("Platforms", platforms) + with col4: + st.metric("Channels", channels) + with col5: + st.metric("Positive %", f"{metrics['positive_pct']:.1f}%") + with col6: + st.metric("Negative %", f"{metrics['negative_pct']:.1f}%") + else: + col1, col2, col3, col4, col5 = st.columns(5) + + with col1: + st.metric("Total Comments", f"{metrics['total_posts']:,}") + with col2: + st.metric("Platforms", platforms) + with col3: + st.metric("Channels", channels) + with col4: + st.metric("Positive %", f"{metrics['positive_pct']:.1f}%") + with col5: + st.metric("Negative %", f"{metrics['negative_pct']:.1f}%") + + @staticmethod + def display_comment_card(comment_row, show_full=False): + """ + Display a formatted comment card + + Args: + comment_row: Series containing comment information + show_full: Whether to show full content + """ + with st.container(): + # Header with content title + content_title = comment_row.get('content_title', '') + if pd.notna(content_title) and content_title: + st.markdown(f"**📺 {content_title[:100]}{'...' 
if len(str(content_title)) > 100 else ''}**") + + # Metadata row + col1, col2, col3, col4 = st.columns([2, 1, 1, 1]) + + with col1: + author_name = comment_row.get('author_name', 'Unknown') + author_role = comment_row.get('author_role', 'unknown') + role_emoji = { + 'current_owner': '👤', + 'past_owner': '👥', + 'potential_buyer': '🛒', + 'never_owned': '❓', + 'unknown': '❔' + }.get(author_role, '❔') + st.markdown(f"{role_emoji} **{author_name}** ({author_role.replace('_', ' ').title()})") + + with col2: + if 'comment_timestamp' in comment_row and pd.notna(comment_row['comment_timestamp']): + timestamp = pd.to_datetime(comment_row['comment_timestamp']) + st.markdown(f"📅 {timestamp.strftime('%Y-%m-%d')}") + + with col3: + sentiment = comment_row.get('sentiment_level', 'unknown') + sentiment_emoji = { + 'very_positive': '😄', + 'positive': '🙂', + 'neutral': '😐', + 'negative': '🙁', + 'very_negative': '😠' + }.get(sentiment, '❓') + st.markdown(f"{sentiment_emoji} {sentiment.replace('_', ' ').title()}") + + with col4: + platform = comment_row.get('platform', 'unknown') + platform_emoji = {'youtube': '📺', 'musora_forums': '💬'}.get(platform, '💬') + st.markdown(f"{platform_emoji} {platform.replace('_', ' ').title()}") + + # Channel info + channel_name = comment_row.get('channel_display_name', '') + if pd.notna(channel_name) and channel_name: + st.markdown(f"📡 *Channel: {channel_name}*") + + # Content – always show in full (no character cutoff) + content = comment_row.get('original_text', '') or comment_row.get('display_text', '') + if content: + st.markdown(f"💬 {content}") + + # Tags row + tags = [] + + products = comment_row.get('products_mentioned', []) + if isinstance(products, list) and products: + tags.extend([f"🥁 {p}" for p in products[:3]]) + + competitors = comment_row.get('competitors_mentioned', []) + if isinstance(competitors, list) and competitors: + tags.extend([f"🆚 {c}" for c in competitors[:2]]) + + intents = comment_row.get('intents', []) + if 
isinstance(intents, list) and intents: + tags.extend([f"💡 {i.replace('_', ' ').title()}" for i in intents[:2]]) + + if tags: + st.markdown(" | ".join(tags)) + + # Expandable details + with st.expander("📋 View Full Details"): + detail_col1, detail_col2 = st.columns(2) + + with detail_col1: + st.write("**Comment ID:**", comment_row.get('comment_id', 'N/A')) + st.write("**Platform:**", comment_row.get('platform', 'N/A')) + st.write("**Author:**", comment_row.get('author_name', 'N/A')) + st.write("**Author Role:**", comment_row.get('author_role', 'N/A')) + st.write("**Sentiment:**", comment_row.get('sentiment_level', 'N/A')) + st.write("**Emotion:**", comment_row.get('emotion_type', 'N/A')) + st.write("**Purchase Stage:**", comment_row.get('purchase_stage', 'N/A') or 'N/A') + + with detail_col2: + st.write("**Mention Context:**", comment_row.get('sabian_mention_context', 'N/A')) + st.write("**Channel:**", comment_row.get('channel_display_name', 'N/A')) + st.write("**Products:**", ', '.join(products) if products else 'None') + st.write("**Competitors:**", ', '.join(competitors) if competitors else 'None') + + pain_points = comment_row.get('pain_points', []) + if isinstance(pain_points, list) and pain_points: + st.write("**Pain Points:**", ', '.join(pain_points)) + + delights = comment_row.get('delight_factors', []) + if isinstance(delights, list) and delights: + st.write("**Delight Factors:**", ', '.join(delights)) + + # Parent comment context + parent_text = comment_row.get('parent_comment_text', '') + if pd.notna(parent_text) and parent_text: + st.write("**Replying to:**") + st.text_area("", parent_text, height=80, disabled=True, key=f"parent_{comment_row.get('comment_sk', '')}") + + # Content description + content_desc = comment_row.get('content_description', '') + if pd.notna(content_desc) and content_desc: + st.write("**Content Description:**", content_desc[:300]) + + # Analysis notes + analysis_notes = comment_row.get('analysis_notes', '') + if 
pd.notna(analysis_notes) and analysis_notes: + st.write("**Analysis Notes:**", analysis_notes) + + st.markdown("---") + + @staticmethod + def display_thread_card(thread_row, posts_df): + """ + Display a thread card with its posts + + Args: + thread_row: Series containing thread information + posts_df: DataFrame of posts in this thread + """ + with st.container(): + st.markdown(f"### 📌 {thread_row.get('thread_title', 'No title')}") + + col1, col2, col3 = st.columns(3) + + with col1: + st.metric("Posts", int(thread_row.get('post_count', 0))) + + with col2: + st.metric("Dominant Sentiment", thread_row.get('dominant_sentiment', 'N/A').replace('_', ' ').title()) + + with col3: + avg_score = thread_row.get('avg_sentiment_score', 0) + st.metric("Avg Score", f"{avg_score:.2f}") + + with st.expander(f"View {len(posts_df)} posts in this thread"): + for _, post in posts_df.iterrows(): + ContentCards.display_post_card(post, show_full=False) + + @staticmethod + def display_metric_cards(metrics_dict): + """ + Display a row of metric cards + + Args: + metrics_dict: Dictionary of metrics {label: value} + """ + cols = st.columns(len(metrics_dict)) + + for idx, (label, value) in enumerate(metrics_dict.items()): + with cols[idx]: + if isinstance(value, dict) and 'value' in value: + st.metric( + label, + value['value'], + delta=value.get('delta'), + delta_color=value.get('delta_color', 'normal') + ) + else: + st.metric(label, value) + + @staticmethod + def display_filter_summary(applied_filters): + """ + Display summary of applied filters + + Args: + applied_filters: Dictionary of applied filters + """ + active_filters = {k: v for k, v in applied_filters.items() if v and len(v) > 0} + + if not active_filters: + return + + st.markdown("### 🔍 Active Filters") + + filter_text = [] + for filter_name, filter_value in active_filters.items(): + if isinstance(filter_value, list): + filter_text.append(f"**{filter_name.replace('_', ' ').title()}:** {', '.join(map(str, filter_value))}") + else: 
+ filter_text.append(f"**{filter_name.replace('_', ' ').title()}:** {filter_value}") + + if filter_text: + st.info(" | ".join(filter_text)) + + @staticmethod + def display_ai_insights(insights): + """ + Display AI-generated insights + + Args: + insights: Dict containing AI analysis results + """ + if not insights: + st.warning("No insights available") + return + + # Executive Summary + st.markdown("### 📊 Executive Summary") + st.info(insights.get('executive_summary', 'No summary available')) + + # Sentiment Analysis + if insights.get('sentiment_analysis'): + st.markdown("### 🎭 Sentiment Analysis") + sentiment = insights['sentiment_analysis'] + + col1, col2 = st.columns(2) + with col1: + st.write(f"**Overall Tone:** {sentiment.get('overall_tone', 'N/A').title()}") + + drivers = sentiment.get('sentiment_drivers', []) + if drivers: + st.write("**Sentiment Drivers:**") + for driver in drivers: + st.write(f" - {driver}") + + with col2: + concerns = sentiment.get('concerning_patterns', []) + if concerns: + st.write("**Concerning Patterns:**") + for concern in concerns: + st.write(f" - ⚠️ {concern}") + + # Product Insights + if insights.get('product_insights'): + st.markdown("### 🥁 Product Insights") + products = insights['product_insights'] + + col1, col2, col3 = st.columns(3) + + with col1: + st.write("**Most Discussed:**") + for p in products.get('most_discussed_products', [])[:5]: + st.write(f" - {p}") + + with col2: + st.write("**Strengths:**") + for s in products.get('product_strengths', [])[:5]: + st.write(f" - ✅ {s}") + + with col3: + st.write("**Concerns:**") + for c in products.get('product_concerns', [])[:5]: + st.write(f" - ⚠️ {c}") + + # Competitive Position + if insights.get('competitive_position'): + st.markdown("### 🆚 Competitive Position") + competitive = insights['competitive_position'] + + st.write(f"**Summary:** {competitive.get('comparison_summary', 'N/A')}") + + col1, col2 = st.columns(2) + + with col1: + st.write("**Advantages vs Competitors:**") + 
for adv in competitive.get('advantages_vs_competitors', [])[:5]: + st.write(f" - ✅ {adv}") + + with col2: + st.write("**Disadvantages:**") + for dis in competitive.get('disadvantages_vs_competitors', [])[:5]: + st.write(f" - ⚠️ {dis}") + + if competitive.get('brand_switching_insights'): + st.write(f"**Switching Insights:** {competitive['brand_switching_insights']}") + + # Key Themes + if insights.get('key_themes'): + st.markdown("### 🎯 Key Themes") + for theme in insights['key_themes'][:5]: + sentiment_emoji = {'positive': '😊', 'negative': '😟', 'mixed': '🤔'}.get(theme.get('sentiment', 'mixed'), '🤔') + st.markdown(f""" +**{sentiment_emoji} {theme.get('theme', 'Unknown')}** ({theme.get('frequency', 'N/A')}) +- {theme.get('description', 'No description')} +""") + + # Actionable Recommendations + if insights.get('actionable_recommendations'): + st.markdown("### 🎯 Recommended Actions") + for action in insights['actionable_recommendations']: + priority = action.get('priority', 'medium').upper() + priority_emoji = {'HIGH': '🔴', 'MEDIUM': '🟡', 'LOW': '🟢'}.get(priority, '🟡') + category = action.get('category', '').replace('_', ' ').title() + + st.markdown(f""" +{priority_emoji} **[{priority}] {category}** +- **Action:** {action.get('recommendation', 'N/A')} +- **Rationale:** {action.get('rationale', 'N/A')} +""") + + # Notable Quotes + if insights.get('notable_quotes'): + st.markdown("### 💬 Notable Quotes") + for quote in insights['notable_quotes'][:5]: + sentiment_emoji = {'positive': '👍', 'negative': '👎', 'neutral': '➖'}.get(quote.get('sentiment', 'neutral'), '➖') + st.markdown(f""" +> "{quote.get('quote', 'N/A')}" +> +> {sentiment_emoji} *{quote.get('context', '')}* +""") + + @staticmethod + def display_pagination_controls(total_items, items_per_page, current_page, key_prefix=""): + """ + Display pagination controls + + Args: + total_items: Total number of items + items_per_page: Number of items per page + current_page: Current page number + key_prefix: Prefix for unique 
keys + + Returns: + int: New current page + """ + total_pages = (total_items - 1) // items_per_page + 1 + + col1, col2, col3 = st.columns([1, 2, 1]) + + with col1: + if st.button("⬅️ Previous", key=f"{key_prefix}_prev", disabled=(current_page <= 1)): + return current_page - 1 + + with col2: + st.markdown(f"
Page {current_page} of {total_pages}
", unsafe_allow_html=True) + + with col3: + if st.button("Next ➡️", key=f"{key_prefix}_next", disabled=(current_page >= total_pages)): + return current_page + 1 + + return current_page diff --git a/visualization_brand_sentiment/visualizations/demographic_charts.py b/visualization_brand_sentiment/visualizations/demographic_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..72b38534c5c10aef2b7d79bf8d4ba18411b1eef5 --- /dev/null +++ b/visualization_brand_sentiment/visualizations/demographic_charts.py @@ -0,0 +1,396 @@ +""" +Demographic visualization charts for brand sentiment analysis +Handles drumming experience, gear ownership, and user demographics +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + + +class DemographicCharts: + """ + Creates demographic-related visualizations for brand analysis + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + self.sentiment_colors = self.config['color_schemes']['sentiment_level'] + self.sentiment_order = self.config['sentiment_order'] + self.chart_height = self.config['dashboard']['chart_height'] + self.competitor_colors = self.config['color_schemes']['competitors'] + + def create_experience_distribution_chart(self, df, title="Drumming Experience Distribution"): + """ + Create bar chart for drumming experience distribution + + Args: + df: Distribution dataframe with experience_group, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No experience data available") + + # Define order for experience groups + exp_order = ['Beginner (0-5 
years)', 'Intermediate (5-15 years)', + 'Advanced (15-25 years)', 'Expert (25+ years)'] + + df['exp_order'] = df['experience_group'].apply( + lambda x: exp_order.index(x) if x in exp_order else 99 + ) + df = df.sort_values('exp_order') + + colors = ['#4CAF50', '#2196F3', '#9C27B0', '#FF9800'][:len(df)] + + fig = go.Figure(data=[go.Bar( + x=df['experience_group'], + y=df['count'], + marker=dict(color=colors), + text=df.apply(lambda row: f"{row['count']}
({row['percentage']:.1f}%)", axis=1), + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Experience Level", + yaxis_title="Number of Users", + height=self.chart_height + ) + + return fig + + def create_experience_sentiment_chart(self, df, title="Sentiment by Experience Level"): + """ + Create stacked bar chart showing sentiment by experience level + + Args: + df: Dataframe with experience_group, sentiment_level, count, percentage + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No data available") + + fig = go.Figure() + + for sentiment in self.sentiment_order: + sentiment_data = df[df['sentiment_level'] == sentiment] + if not sentiment_data.empty: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_data['experience_group'], + y=sentiment_data['percentage'], + marker=dict(color=self.sentiment_colors.get(sentiment, '#CCCCCC')), + hovertemplate='%{fullData.name}
%{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis_title="Experience Level", + yaxis=dict(title="Percentage (%)", range=[0, 100]), + barmode='stack', + height=self.chart_height, + legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_cymbal_ownership_chart(self, ownership_data, title="Cymbal Brand Ownership"): + """ + Create bar chart showing cymbal brand ownership among users + + Args: + ownership_data: Dict with brand_counts from cymbal ownership analysis + + Returns: + plotly.graph_objects.Figure + """ + if not ownership_data or 'brand_counts' not in ownership_data: + return self._create_empty_chart(title, "No cymbal ownership data available") + + brand_counts = ownership_data['brand_counts'] + + df = pd.DataFrame(list(brand_counts.items()), columns=['brand', 'count']) + df = df.sort_values('count', ascending=False) + + # Assign colors + colors = [] + for brand in df['brand']: + if brand == 'Sabian': + colors.append(self.config['brand']['primary_color']) + else: + colors.append(self.competitor_colors.get(brand, self.competitor_colors['default'])) + + fig = go.Figure(data=[go.Bar( + x=df['brand'], + y=df['count'], + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Users: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Cymbal Brand", + yaxis_title="Number of Users Owning", + height=self.chart_height + ) + + return fig + + def create_ownership_sentiment_chart(self, df, title="Sentiment by Sabian Ownership"): + """ + Create grouped bar chart comparing sentiment between Sabian owners and non-owners + + Args: + df: Dataframe with owns_sabian, sentiment_level, count, percentage + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No ownership data available") + + fig = go.Figure() + + for sentiment in self.sentiment_order: + sentiment_data = df[df['sentiment_level'] == sentiment] + if not sentiment_data.empty: + # Map True/False to readable labels + x_labels = sentiment_data['owns_sabian'].map({True: 'Sabian Owners', False: 'Non-Owners'}) + + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=x_labels, + y=sentiment_data['percentage'], + marker=dict(color=self.sentiment_colors.get(sentiment, '#CCCCCC')), + hovertemplate='%{fullData.name}
%{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis_title="Cymbal Ownership", + yaxis=dict(title="Percentage (%)", range=[0, 100]), + barmode='stack', + height=self.chart_height, + legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_gear_brand_chart(self, df, gear_type, title=None): + """ + Create bar chart for gear brand distribution + + Args: + df: Distribution dataframe with brand, count columns + gear_type: Type of gear (sticks, drums, hardware) + title: Optional title override + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title or f"{gear_type.title()} Brand Distribution", "No data available") + + if title is None: + title = f"{gear_type.title()} Brand Distribution" + + fig = go.Figure(data=[go.Bar( + x=df['brand'], + y=df['count'], + marker=dict(color='#2196F3'), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Mentions: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Brand", + yaxis_title="Number of Mentions", + height=self.chart_height + ) + + return fig + + def create_age_distribution_chart(self, df, title="Age Distribution"): + """ + Create bar chart for age group distribution + + Args: + df: Distribution dataframe with age_group, count, percentage columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No age data available") + + # Define age order + age_order = ['18-24', '25-34', '35-44', '45-54', '55+'] + + df['age_order'] = df['age_group'].apply( + lambda x: age_order.index(x) if x in age_order else 99 + ) + df = df.sort_values('age_order') + + fig = go.Figure(data=[go.Bar( + x=df['age_group'], + y=df['count'], + marker=dict(color='#4A90E2'), + text=df.apply(lambda row: f"{row['count']}
({row['percentage']:.1f}%)", axis=1), + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Age Group", + yaxis_title="Number of Users", + height=self.chart_height + ) + + return fig + + def create_timezone_distribution_chart(self, df, title="Geographic Distribution (by Timezone Region)"): + """ + Create pie chart for timezone region distribution + + Args: + df: Distribution dataframe with timezone_region, count, percentage columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No timezone data available") + + colors = px.colors.qualitative.Set2 + + fig = go.Figure(data=[go.Pie( + labels=df['timezone_region'], + values=df['count'], + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
%{percent}' + )]) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05) + ) + + return fig + + def create_demographics_overview(self, df, title="Demographics Overview"): + """ + Create combined demographics chart with multiple subplots + + Args: + df: Dataframe with demographic fields + + Returns: + plotly.graph_objects.Figure + """ + fig = make_subplots( + rows=2, cols=2, + specs=[[{"type": "pie"}, {"type": "bar"}], + [{"type": "pie"}, {"type": "bar"}]], + subplot_titles=["Experience Level", "Age Distribution", + "Geographic Region", "Cymbal Ownership"] + ) + + # Experience level pie + if 'experience_group' in df.columns: + exp_counts = df['experience_group'].value_counts() + fig.add_trace( + go.Pie(labels=exp_counts.index, values=exp_counts.values, name="Experience"), + row=1, col=1 + ) + + # Age distribution bar + if 'age_group' in df.columns: + age_counts = df['age_group'].value_counts().sort_index() + fig.add_trace( + go.Bar(x=age_counts.index, y=age_counts.values, name="Age"), + row=1, col=2 + ) + + # Timezone region pie + if 'timezone_region' in df.columns: + tz_counts = df['timezone_region'].value_counts().head(5) + fig.add_trace( + go.Pie(labels=tz_counts.index, values=tz_counts.values, name="Region"), + row=2, col=1 + ) + + # Cymbal ownership bar + if 'owns_sabian' in df.columns: + ownership = df['owns_sabian'].value_counts() + labels = ownership.index.map({True: 'Owns Sabian', False: 'No Sabian'}) + fig.add_trace( + go.Bar(x=labels, y=ownership.values, name="Ownership"), + row=2, col=2 + ) + + fig.update_layout( + title=title, + height=600, + showlegend=False + ) + + return fig + + def _create_empty_chart(self, title, message): + """ + Create an empty chart with a message + + Args: + title: Chart title + message: Message to display + + Returns: + plotly.graph_objects.Figure + """ + fig = go.Figure() + + fig.add_annotation( + text=message, + 
xref="paper", + yref="paper", + x=0.5, + y=0.5, + showarrow=False, + font=dict(size=14, color="gray") + ) + + fig.update_layout( + title=title, + height=self.chart_height, + xaxis=dict(visible=False), + yaxis=dict(visible=False) + ) + + return fig diff --git a/visualization_brand_sentiment/visualizations/distribution_charts.py b/visualization_brand_sentiment/visualizations/distribution_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..f37840f110713f16e17288e570042523812145cd --- /dev/null +++ b/visualization_brand_sentiment/visualizations/distribution_charts.py @@ -0,0 +1,399 @@ +""" +Distribution visualization components using Plotly +Creates charts for various distributions in brand sentiment analysis +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + + +class DistributionCharts: + """ + Creates distribution visualizations for brand analysis + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + self.intent_colors = self.config['color_schemes']['intents'] + self.author_role_colors = self.config['color_schemes']['author_role'] + self.mention_context_colors = self.config['color_schemes']['mention_context'] + self.purchase_stage_colors = self.config['color_schemes']['purchase_stage'] + self.feedback_colors = self.config['color_schemes']['feedback_aspects'] + self.processing_colors = self.config['color_schemes']['processing_status'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_author_role_chart(self, df, title="Author Role Distribution"): + """ + Create pie chart for author role distribution + + Args: + df: 
Distribution dataframe with author_role, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No data available") + + colors = [self.author_role_colors.get(r, '#CCCCCC') for r in df['author_role']] + + fig = go.Figure(data=[go.Pie( + labels=[r.replace('_', ' ').title() for r in df['author_role']], + values=df['count'], + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05) + ) + + return fig + + def create_mention_context_chart(self, df, title="Mention Context Distribution"): + """ + Create bar chart for mention context distribution + + Args: + df: Distribution dataframe with sabian_mention_context, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No data available") + + colors = [self.mention_context_colors.get(c, '#CCCCCC') for c in df['sabian_mention_context']] + + fig = go.Figure(data=[go.Bar( + x=[c.replace('_', ' ').title() for c in df['sabian_mention_context']], + y=df['count'], + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Mention Context", + yaxis_title="Number of Posts", + height=self.chart_height + ) + + return fig + + def create_intent_bar_chart(self, df, title="Intent Distribution", orientation='h'): + """ + Create bar chart for intent distribution + + Args: + df: Distribution dataframe with intent, count columns + title: Chart title + orientation: 'h' for horizontal, 'v' for vertical + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No intent data available") + + colors = [self.intent_colors.get(i, '#CCCCCC') for i in df['intent']] + + if orientation == 'h': + fig = go.Figure(data=[go.Bar( + y=[i.replace('_', ' ').title() for i in df['intent']], + x=df['count'], + orientation='h', + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{y}
Count: %{x}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Number of Posts", + yaxis_title="Intent", + height=self.chart_height, + yaxis={'categoryorder': 'total ascending'} + ) + else: + fig = go.Figure(data=[go.Bar( + x=[i.replace('_', ' ').title() for i in df['intent']], + y=df['count'], + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Intent", + yaxis_title="Number of Posts", + height=self.chart_height + ) + + return fig + + def create_purchase_stage_chart(self, df, title="Purchase Stage Distribution"): + """ + Create funnel chart for purchase stage distribution + + Args: + df: Distribution dataframe with purchase_stage, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No purchase stage data available") + + # Define funnel order + stage_order = ['researching', 'deciding', 'recently_purchased', 'long_term_owner', 'selling_replacing'] + + # Reorder dataframe + df['stage_order'] = df['purchase_stage'].apply( + lambda x: stage_order.index(x) if x in stage_order else 99 + ) + df = df.sort_values('stage_order') + + colors = [self.purchase_stage_colors.get(s, '#CCCCCC') for s in df['purchase_stage']] + + fig = go.Figure(go.Funnel( + y=[s.replace('_', ' ').title() for s in df['purchase_stage']], + x=df['count'], + textinfo="value+percent total", + marker=dict(color=colors), + hovertemplate='%{y}
Count: %{x}' + )) + + fig.update_layout( + title=title, + height=self.chart_height + ) + + return fig + + def create_pain_delight_comparison_chart(self, pain_df, delight_df, title="Pain Points vs Delight Factors"): + """ + Create comparison bar chart for pain points and delight factors. + + The right subplot's y-axis is placed on the right side so its labels + never overlap with the left subplot. Horizontal spacing is also + increased to provide a clear visual separation. + + Args: + pain_df: Pain points distribution dataframe + delight_df: Delight factors distribution dataframe + + Returns: + plotly.graph_objects.Figure + """ + fig = make_subplots( + rows=1, cols=2, + subplot_titles=['Pain Points', 'Delight Factors'], + horizontal_spacing=0.35, # wider gap between panels + ) + + # Pain points (left panel) + if not pain_df.empty: + # Truncate long labels so they don't overlap + pain_labels = [ + p.replace('_', ' ').title()[:28] + for p in pain_df['pain_point'] + ] + fig.add_trace( + go.Bar( + y=pain_labels, + x=pain_df['count'], + orientation='h', + marker=dict(color='#D32F2F'), + name='Pain Points', + hovertemplate='%{y}
Count: %{x}' + ), + row=1, col=1 + ) + + # Delight factors (right panel – y-axis on the right side) + if not delight_df.empty: + delight_labels = [ + d.replace('_', ' ').title()[:28] + for d in delight_df['delight_factor'] + ] + fig.add_trace( + go.Bar( + y=delight_labels, + x=delight_df['count'], + orientation='h', + marker=dict(color='#4CAF50'), + name='Delight Factors', + hovertemplate='%{y}
Count: %{x}' + ), + row=1, col=2 + ) + + # Move the right-panel y-axis to the right side so labels + # don't bleed into the left panel + fig.update_yaxes(side='right', row=1, col=2) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=False, + ) + + return fig + + def create_brand_switching_chart(self, df, title="Brand Switching Behavior"): + """ + Create bar chart for brand switching behavior + + Args: + df: Distribution dataframe with switching_direction, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No brand switching data available") + + switching_colors = self.config['color_schemes']['brand_switching'] + colors = [switching_colors.get(s, '#CCCCCC') for s in df['switching_direction']] + + labels = { + 'switching_to_sabian': 'Coming TO Sabian', + 'switching_from_sabian': 'Leaving Sabian' + } + + fig = go.Figure(data=[go.Bar( + x=[labels.get(s, s.replace('_', ' ').title()) for s in df['switching_direction']], + y=df['count'], + marker=dict(color=colors), + text=df['count'], + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Switching Direction", + yaxis_title="Number of Posts", + height=self.chart_height + ) + + return fig + + def create_processing_status_chart(self, df, title="Processing Status"): + """ + Create pie chart for processing status distribution + + Args: + df: Distribution dataframe with processing_status, count columns + + Returns: + plotly.graph_objects.Figure + """ + if df.empty: + return self._create_empty_chart(title, "No data available") + + colors = [self.processing_colors.get(s, '#CCCCCC') for s in df['processing_status']] + + fig = go.Figure(data=[go.Pie( + labels=[s.replace('_', ' ').title() for s in df['processing_status']], + values=df['count'], + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + + fig.update_layout( + title=title, + height=300, + showlegend=True, + legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5) + ) + + return fig + + def create_combined_sunburst(self, df, title="Hierarchical View"): + """ + Create sunburst chart showing hierarchical distribution + (Author Role > Mention Context > Sentiment) + + Args: + df: Brand sentiment dataframe + + Returns: + plotly.graph_objects.Figure + """ + sunburst_data = df.groupby( + ['author_role', 'sabian_mention_context', 'sentiment_level'] + ).size().reset_index(name='count') + + fig = px.sunburst( + sunburst_data, + path=['author_role', 'sabian_mention_context', 'sentiment_level'], + values='count', + title=title, + height=500 + ) + + fig.update_layout( + margin=dict(t=50, l=0, r=0, b=0) + ) + + return fig + + def _create_empty_chart(self, title, message): + """ + Create an empty chart with a message + + Args: + title: Chart title + message: Message to display + + Returns: + plotly.graph_objects.Figure + """ + fig = go.Figure() + + fig.add_annotation( + text=message, + xref="paper", + yref="paper", + x=0.5, + y=0.5, + showarrow=False, + font=dict(size=14, color="gray") + ) + + fig.update_layout( + title=title, + height=self.chart_height, + xaxis=dict(visible=False), + yaxis=dict(visible=False) + ) + + return fig diff --git a/visualization_brand_sentiment/visualizations/sentiment_charts.py b/visualization_brand_sentiment/visualizations/sentiment_charts.py new file mode 100644 index 0000000000000000000000000000000000000000..1654676706064673643c23fb5246c750e5bef255 --- /dev/null +++ b/visualization_brand_sentiment/visualizations/sentiment_charts.py @@ -0,0 +1,350 @@ +""" +Sentiment visualization components using Plotly +Creates interactive charts for brand sentiment analysis +""" +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +import pandas as pd +import json +from pathlib import Path + 
+ +class SentimentCharts: + """ + Creates sentiment-related visualizations for brand analysis + """ + + def __init__(self, config_path=None): + """ + Initialize with configuration + + Args: + config_path: Path to configuration file + """ + if config_path is None: + config_path = Path(__file__).parent.parent / "config" / "viz_config.json" + + with open(config_path, 'r', encoding='utf-8') as f: + self.config = json.load(f) + + self.sentiment_colors = self.config['color_schemes']['sentiment_level'] + self.sentiment_order = self.config['sentiment_order'] + self.chart_height = self.config['dashboard']['chart_height'] + + def create_sentiment_pie_chart(self, df, title="Sentiment Distribution"): + """ + Create pie chart for sentiment distribution + + Args: + df: Brand sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + sentiment_counts = df['sentiment_level'].value_counts() + + # Order by sentiment_order + ordered_sentiments = [s for s in self.sentiment_order if s in sentiment_counts.index] + sentiment_counts = sentiment_counts[ordered_sentiments] + + colors = [self.sentiment_colors.get(s, '#CCCCCC') for s in sentiment_counts.index] + + fig = go.Figure(data=[go.Pie( + labels=[s.replace('_', ' ').title() for s in sentiment_counts.index], + values=sentiment_counts.values, + marker=dict(colors=colors), + textinfo='label+percent', + textposition='auto', + hovertemplate='%{label}
Count: %{value}
Percentage: %{percent}' + )]) + + fig.update_layout( + title=title, + height=self.chart_height, + showlegend=True, + legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05) + ) + + return fig + + def create_sentiment_bar_chart(self, df, group_by, title="Sentiment Distribution", category_order=None): + """ + Create stacked bar chart for sentiment distribution by group + + Args: + df: Brand sentiment dataframe + group_by: Column to group by + title: Chart title + category_order: Optional list specifying the order of categories on the x-axis + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table + sentiment_pivot = pd.crosstab(df[group_by], df['sentiment_level']) + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in sentiment_pivot.columns] + sentiment_pivot = sentiment_pivot[ordered_columns] + + # Apply category order if specified + if category_order: + existing_categories = [c for c in category_order if c in sentiment_pivot.index] + sentiment_pivot = sentiment_pivot.reindex(existing_categories) + + fig = go.Figure() + + for sentiment in sentiment_pivot.columns: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_pivot.index, + y=sentiment_pivot[sentiment], + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{x}
%{y} posts' + )) + + fig.update_layout( + title=title, + xaxis_title=group_by.replace('_', ' ').title(), + yaxis_title="Number of Posts", + barmode='stack', + height=self.chart_height, + legend=dict(title="Sentiment", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_sentiment_percentage_bar_chart(self, df, group_by, title="Sentiment Distribution (%)"): + """ + Create 100% stacked bar chart for sentiment distribution + + Args: + df: Brand sentiment dataframe + group_by: Column to group by + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Create pivot table with percentages + sentiment_pivot = pd.crosstab(df[group_by], df['sentiment_level'], normalize='index') * 100 + + # Reorder columns by sentiment_order + ordered_columns = [s for s in self.sentiment_order if s in sentiment_pivot.columns] + sentiment_pivot = sentiment_pivot[ordered_columns] + + fig = go.Figure() + + for sentiment in sentiment_pivot.columns: + fig.add_trace(go.Bar( + name=sentiment.replace('_', ' ').title(), + x=sentiment_pivot.index, + y=sentiment_pivot[sentiment], + marker_color=self.sentiment_colors.get(sentiment, '#CCCCCC'), + hovertemplate='%{x}
%{y:.1f}%' + )) + + fig.update_layout( + title=title, + xaxis_title=group_by.replace('_', ' ').title(), + yaxis_title="Percentage (%)", + barmode='stack', + height=self.chart_height, + yaxis=dict(range=[0, 100]), + legend=dict(title="Sentiment", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) + ) + + return fig + + def create_sentiment_heatmap(self, df, row_dimension, col_dimension, title="Sentiment Heatmap"): + """ + Create heatmap showing sentiment distribution across two dimensions + + Args: + df: Brand sentiment dataframe + row_dimension: Row dimension + col_dimension: Column dimension + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + negative_sentiments = self.config['negative_sentiments'] + + heatmap_data = pd.crosstab( + df[row_dimension], + df[col_dimension], + values=(df['sentiment_level'].isin(negative_sentiments)).astype(int), + aggfunc='mean' + ) * 100 + + fig = go.Figure(data=go.Heatmap( + z=heatmap_data.values, + x=heatmap_data.columns, + y=heatmap_data.index, + colorscale='RdYlGn_r', + text=heatmap_data.values.round(1), + texttemplate='%{text}%', + textfont={"size": 12}, + hovertemplate='%{y} - %{x}
Negative: %{z:.1f}%', + colorbar=dict(title="Negative %") + )) + + fig.update_layout( + title=title, + xaxis_title=col_dimension.replace('_', ' ').title(), + yaxis_title=row_dimension.replace('_', ' ').title(), + height=self.chart_height + ) + + return fig + + def create_sentiment_timeline(self, df, freq='W', title="Sentiment Over Time"): + """ + Create line chart showing sentiment trends over time + + Args: + df: Brand sentiment dataframe with post_created_at + freq: Frequency for aggregation ('D', 'W', 'M') + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Determine date column + date_col = None + if 'post_created_at' in df.columns and not df['post_created_at'].isna().all(): + date_col = 'post_created_at' + elif 'comment_timestamp' in df.columns and not df['comment_timestamp'].isna().all(): + date_col = 'comment_timestamp' + + if date_col is None: + return go.Figure().add_annotation( + text="No timestamp data available", + xref="paper", yref="paper", + x=0.5, y=0.5, showarrow=False + ) + + df_temp = df.copy() + df_temp['date'] = pd.to_datetime(df_temp[date_col]).dt.to_period(freq).dt.to_timestamp() + + # Aggregate by date and sentiment + timeline_data = df_temp.groupby(['date', 'sentiment_level']).size().reset_index(name='count') + + fig = go.Figure() + + for sentiment in self.sentiment_order: + sentiment_data = timeline_data[timeline_data['sentiment_level'] == sentiment] + if not sentiment_data.empty: + fig.add_trace(go.Scatter( + x=sentiment_data['date'], + y=sentiment_data['count'], + name=sentiment.replace('_', ' ').title(), + mode='lines+markers', + line=dict(color=self.sentiment_colors.get(sentiment, '#CCCCCC'), width=2), + marker=dict(size=6), + hovertemplate='%{x}
Count: %{y}' + )) + + fig.update_layout( + title=title, + xaxis_title="Date", + yaxis_title="Number of Posts", + height=self.chart_height, + legend=dict(title="Sentiment", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), + hovermode='x unified' + ) + + return fig + + def create_sentiment_score_gauge(self, avg_score, title="Overall Sentiment Score"): + """ + Create gauge chart for average sentiment score + + Args: + avg_score: Average sentiment score (-2 to +2) + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + # Normalize score to 0-100 scale + normalized_score = ((avg_score + 2) / 4) * 100 + + fig = go.Figure(go.Indicator( + mode="gauge+number", + value=normalized_score, + domain={'x': [0, 1], 'y': [0, 1]}, + title={'text': title, 'font': {'size': 18}}, + number={'font': {'size': 36}}, + gauge={ + 'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "darkblue"}, + 'bar': {'color': self.config['brand']['primary_color']}, + 'bgcolor': "white", + 'borderwidth': 2, + 'bordercolor': "gray", + 'steps': [ + {'range': [0, 20], 'color': self.sentiment_colors['very_negative']}, + {'range': [20, 40], 'color': self.sentiment_colors['negative']}, + {'range': [40, 60], 'color': self.sentiment_colors['neutral']}, + {'range': [60, 80], 'color': self.sentiment_colors['positive']}, + {'range': [80, 100], 'color': self.sentiment_colors['very_positive']} + ], + 'threshold': { + 'line': {'color': "black", 'width': 4}, + 'thickness': 0.75, + 'value': normalized_score + } + } + )) + + fig.update_layout( + height=280, + margin=dict(l=20, r=20, t=60, b=20) + ) + + return fig + + def create_emotion_distribution_chart(self, df, title="Emotion Distribution"): + """ + Create bar chart for emotion distribution + + Args: + df: Brand sentiment dataframe + title: Chart title + + Returns: + plotly.graph_objects.Figure + """ + emotion_colors = self.config['color_schemes']['emotion_type'] + + emotion_df = df[df['emotion_type'] != 'unknown'] + if 
emotion_df.empty: + return go.Figure().add_annotation( + text="No emotion data available", + xref="paper", yref="paper", + x=0.5, y=0.5, showarrow=False + ) + + emotion_counts = emotion_df['emotion_type'].value_counts() + colors = [emotion_colors.get(e, '#CCCCCC') for e in emotion_counts.index] + + fig = go.Figure(data=[go.Bar( + x=emotion_counts.index, + y=emotion_counts.values, + marker=dict(color=colors), + text=emotion_counts.values, + textposition='auto', + hovertemplate='%{x}
Count: %{y}' + )]) + + fig.update_layout( + title=title, + xaxis_title="Emotion", + yaxis_title="Number of Posts", + height=self.chart_height + ) + + return fig