# Source: Hugging Face Space "MusoraProductDepartment/Sentiment_analysis" (status: Running)
# File size: 13,863 bytes
"""
Content Summary Agent
Analyzes and summarizes comments for content pieces
"""
import pandas as pd
from typing import Dict, Any, List
import sys
from pathlib import Path

# Add parent directory to path
parent_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(parent_dir))

from agents.base_agent import BaseVisualizationAgent
from utils.llm_helper import LLMHelper


class ContentSummaryAgent(BaseVisualizationAgent):
    """
    Agent that analyzes and summarizes comments for content
    Extracts themes, praise points, complaints, FAQs, and insights

    Pipeline implemented by ``process``: validate the input, filter the
    comments by sentiment, cap the number of comments sent to the LLM,
    format them as text, request a JSON summary and fill in any summary
    field the LLM left out.
    """

    # Column that carries the per-comment sentiment label.
    _SENTIMENT_COLUMN = 'sentiment_polarity'
    # Labels that count as negative / positive feedback.
    _NEGATIVE_LABELS = ('negative', 'very_negative')
    _POSITIVE_LABELS = ('positive', 'very_positive')
    # Upper bound on the number of comments placed in a single prompt.
    _MAX_PROMPT_COMMENTS = 100
    # Comment texts longer than this are truncated and suffixed with '...'.
    _MAX_COMMENT_CHARS = 300
    # Fixed seed so the same input always yields the same sample.
    _SAMPLE_SEED = 42

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1):
        """
        Initialize Content Summary Agent

        Args:
            model: LLM model to use
            temperature: Temperature for generation (lower for more focused summaries)
        """
        super().__init__(name="ContentSummaryAgent", model=model, temperature=temperature)
        self.llm_helper = LLMHelper(model=model, temperature=temperature)

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data

        Args:
            input_data: Input dictionary

        Returns:
            True if valid, False otherwise
        """
        # Anything without a mapping-style ``get`` (None, list, str, ...)
        # cannot carry the required fields; reject it instead of crashing later.
        if not hasattr(input_data, 'get'):
            self.log_processing("Input data must be a dictionary", level="error")
            return False

        required_fields = ['content_sk', 'content_description', 'comments']

        for field in required_fields:
            if field not in input_data:
                self.log_processing(f"Missing required field: {field}", level="error")
                return False

        if not isinstance(input_data['comments'], (list, pd.DataFrame)):
            self.log_processing("Comments must be a list or DataFrame", level="error")
            return False

        return True

    @staticmethod
    def _safe_field(input_data: Any, key: str, default: Any) -> Any:
        """Read ``key`` from ``input_data``, returning ``default`` if it is not mapping-like."""
        getter = getattr(input_data, 'get', None)
        return getter(key, default) if callable(getter) else default

    @staticmethod
    def _to_dataframe(comments: Any) -> Any:
        """Return ``comments`` as a DataFrame (lists are converted, other inputs pass through)."""
        # No copy is taken: every later step builds new frames and never
        # mutates the caller's data.
        return pd.DataFrame(comments) if isinstance(comments, list) else comments

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True for None / NaN / NA scalars."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna returns an array for list-like values; such a value is
            # present, just not a scalar.
            return False

    @classmethod
    def _comment_text(cls, row: Any) -> str:
        """Return the comment text, preferring ``display_text`` over ``original_text``."""
        for column in ('display_text', 'original_text'):
            value = row.get(column)
            if cls._is_missing(value):
                continue
            text = value if isinstance(value, str) else str(value)
            if text.strip():
                return text
        return ''

    @classmethod
    def _label(cls, row: Any, column: str) -> str:
        """Return the label stored in ``column`` or 'unknown' when it is absent or missing."""
        value = row.get(column)
        return 'unknown' if cls._is_missing(value) else str(value)

    @classmethod
    def _filter_by_sentiment(cls, comments_df: pd.DataFrame, sentiment_type: str) -> pd.DataFrame:
        """
        Keep only the rows that match ``sentiment_type``.

        'negative' and 'positive' select the corresponding polarity labels;
        any other value (including 'combined') keeps every row.
        """
        if sentiment_type == 'negative':
            labels = cls._NEGATIVE_LABELS
        elif sentiment_type == 'positive':
            labels = cls._POSITIVE_LABELS
        else:
            return comments_df

        # An empty frame (e.g. built from []) has no columns to filter on.
        if len(comments_df) == 0:
            return comments_df

        return comments_df[comments_df[cls._SENTIMENT_COLUMN].isin(list(labels))]

    @classmethod
    def _sample_for_prompt(cls, comments_df: pd.DataFrame, sentiment_type: str) -> pd.DataFrame:
        """
        Cap the number of comments to ``_MAX_PROMPT_COMMENTS``.

        For 'combined', up to half of the budget is drawn from negative
        comments and up to half from positive ones. Every other sentiment
        type gets a plain random sample. Sampling uses a fixed seed.
        """
        limit = cls._MAX_PROMPT_COMMENTS
        if len(comments_df) <= limit:
            return comments_df

        if sentiment_type == 'combined' and cls._SENTIMENT_COLUMN in comments_df.columns:
            per_side = limit // 2
            polarity = comments_df[cls._SENTIMENT_COLUMN]
            sides = []
            for labels in (cls._NEGATIVE_LABELS, cls._POSITIVE_LABELS):
                side = comments_df[polarity.isin(list(labels))]
                sides.append(
                    side.sample(n=min(per_side, len(side)), random_state=cls._SAMPLE_SEED)
                )
            balanced = pd.concat(sides)
            # If no comment is positive or negative, fall through to a plain
            # sample rather than sending an empty prompt to the LLM.
            if len(balanced) > 0:
                return balanced

        return comments_df.sample(n=limit, random_state=cls._SAMPLE_SEED)

    def _prepare_comments_context(self, comments: Any, sentiment_type: str = 'negative') -> str:
        """
        Prepare comments data for LLM analysis

        Args:
            comments: Comments as DataFrame or list of dicts
            sentiment_type: Type of sentiment to analyze ('negative', 'positive', 'combined')

        Returns:
            Formatted string with comment data (empty string when no comment
            matches)
        """
        comments_df = self._to_dataframe(comments)
        comments_df = self._filter_by_sentiment(comments_df, sentiment_type)
        comments_df = self._sample_for_prompt(comments_df, sentiment_type)

        limit = self._MAX_COMMENT_CHARS
        entries = []
        # Number comments by position: index labels are non-sequential after
        # filtering/sampling and are not guaranteed to be integers.
        for position, (_, row) in enumerate(comments_df.iterrows(), start=1):
            text = self._comment_text(row)
            snippet = text[:limit] + ('...' if len(text) > limit else '')
            entries.append("\n".join([
                "",
                f"Comment #{position}:",
                f"- Text: {snippet}",
                f"- Sentiment: {self._label(row, self._SENTIMENT_COLUMN)}",
                f"- Intent: {self._label(row, 'intent')}",
                "",
            ]))

        return "\n".join(entries)

    def _generate_summary_prompt(
        self,
        content_description: str,
        comments_context: str,
        total_comments: int,
        sentiment_type: str = 'negative'
    ) -> str:
        """
        Generate prompt for LLM

        Args:
            content_description: Description of the content
            comments_context: Formatted comments
            total_comments: Total number of comments
            sentiment_type: Type of sentiment being analyzed ('negative', 'positive', 'combined')

        Returns:
            Prompt string
        """
        # Customize prompt based on sentiment type
        if sentiment_type == 'negative':
            focus_instruction = "Focus on understanding negative feedback, complaints, and issues that need attention."
        elif sentiment_type == 'positive':
            focus_instruction = "Focus on understanding what users love, praise points, and successful elements that should be maintained or amplified."
        else:  # combined
            focus_instruction = "Provide a balanced analysis covering both positive feedback and areas for improvement."

        prompt = f"""Analyze the {sentiment_type} comments below for the following content and provide a brief executive summary.

**Content:** {content_description}

**Total Comments Analyzed:** {total_comments}

**Analysis Focus:** {focus_instruction}

**Comments to Analyze:**
{comments_context}

**Task:** Provide a concise executive summary in JSON format with the following structure:

{{
    "executive_summary": "2-3 sentence high-level overview focusing on {sentiment_type} sentiment",
    "main_themes": [
        {{
            "theme": "theme name",
            "sentiment": "positive/negative/mixed",
            "description": "brief description"
        }}
    ],
    "praise_points": ["point 1", "point 2", "point 3"],
    "key_complaints": ["complaint 1", "complaint 2", "complaint 3"],
    "frequently_asked_questions": ["question 1", "question 2"],
    "unexpected_insights": ["insight 1", "insight 2"],
    "action_recommendations": [
        {{
            "priority": "high/medium/low",
            "action": "recommended action"
        }}
    ]
}}

**Guidelines:**
- Be concise and actionable
- Focus on the most important insights from {sentiment_type} comments
- Limit each list to top 3-5 items
- If a section has no relevant items, use an empty list
- Executive summary should capture the overall patterns and key takeaways
"""
        return prompt

    @staticmethod
    def _empty_summary(executive_summary: str = '') -> Dict[str, Any]:
        """Return a fresh summary dict with every expected field present."""
        return {
            'executive_summary': executive_summary,
            'main_themes': [],
            'praise_points': [],
            'key_complaints': [],
            'frequently_asked_questions': [],
            'unexpected_insights': [],
            'action_recommendations': []
        }

    def _no_comments_result(self, content_sk: Any, sentiment_type: str, message: str) -> Dict[str, Any]:
        """Build the successful result returned when there is nothing to analyze."""
        return {
            'success': True,
            'content_sk': content_sk,
            'sentiment_type': sentiment_type,
            'summary': self._empty_summary(message),
            'metadata': {
                'total_comments_analyzed': 0,
                'model_used': self.model,
                'tokens_used': 0
            }
        }

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process comments and generate summary

        Args:
            input_data: {
                'content_sk': content identifier,
                'content_description': content title/description,
                'comments': DataFrame or list of comment dicts,
                'sentiment_type': 'negative', 'positive', or 'combined' (optional, defaults to 'negative');
                    any other value is handled like 'combined' for filtering
            }

        Returns:
            {
                'success': bool,
                'content_sk': str,
                'sentiment_type': str,
                'summary': {
                    'executive_summary': str,
                    'main_themes': list,
                    'praise_points': list,
                    'key_complaints': list,
                    'frequently_asked_questions': list,
                    'unexpected_insights': list,
                    'action_recommendations': list
                },
                'metadata': {
                    'total_comments_analyzed': int (comments matching the sentiment filter),
                    'model_used': str,
                    'tokens_used': int
                }
            }
            On invalid input a dict with 'success': False and 'error' is
            returned; on any other failure the value of ``self.handle_error``
            is returned.
        """
        try:
            # Validate input
            if not self.validate_input(input_data):
                return {
                    'success': False,
                    'error': 'Invalid input data',
                    'content_sk': self._safe_field(input_data, 'content_sk', 'unknown')
                }

            content_sk = input_data['content_sk']
            content_description = input_data['content_description']
            # Default to negative for backward compatibility
            sentiment_type = input_data.get('sentiment_type', 'negative')

            self.log_processing(f"Starting {sentiment_type} analysis for content: {content_sk}")

            comments_df = self._to_dataframe(input_data['comments'])

            if len(comments_df) == 0:
                return self._no_comments_result(
                    content_sk, sentiment_type, 'No comments available for analysis.'
                )

            # Filter once and reuse the result for both the count and the prompt
            filtered_df = self._filter_by_sentiment(comments_df, sentiment_type)
            filtered_count = len(filtered_df)

            if filtered_count == 0:
                return self._no_comments_result(
                    content_sk,
                    sentiment_type,
                    f'No {sentiment_type} comments available for analysis.'
                )

            # Prepare comments context based on sentiment type
            comments_context = self._prepare_comments_context(filtered_df, sentiment_type)

            # Generate prompt
            prompt = self._generate_summary_prompt(
                content_description,
                comments_context,
                filtered_count,
                sentiment_type
            )

            # System message
            system_message = """You are an expert social media analyst specializing in
sentiment analysis and community insights. Provide concise, actionable summaries
that help content creators understand their audience feedback."""

            # Get LLM response
            self.log_processing(f"Calling LLM for {sentiment_type} summary generation")
            response = self.llm_helper.get_structured_completion(
                prompt=prompt,
                system_message=system_message,
                max_retries=3
            )

            error_context = f"content_sk={content_sk}, sentiment_type={sentiment_type}"

            if not response['success']:
                return self.handle_error(
                    Exception(response.get('error', 'LLM call failed')),
                    context=error_context
                )

            # Extract summary; anything but a dict cannot be merged with defaults
            raw_summary = response.get('content')
            if not isinstance(raw_summary, dict):
                return self.handle_error(
                    ValueError(
                        f"LLM returned a non-dict summary ({type(raw_summary).__name__})"
                    ),
                    context=error_context
                )

            # Ensure all expected fields exist without mutating the response
            summary = {**self._empty_summary(), **raw_summary}

            self.log_processing(f"Successfully generated {sentiment_type} summary for content: {content_sk}")

            # Missing usage/model info must not discard a successful summary
            usage = response.get('usage')
            tokens_used = usage.get('total_tokens', 0) if isinstance(usage, dict) else 0

            return {
                'success': True,
                'content_sk': content_sk,
                'sentiment_type': sentiment_type,
                'summary': summary,
                'metadata': {
                    'total_comments_analyzed': filtered_count,
                    'model_used': response.get('model', self.model),
                    'tokens_used': tokens_used
                }
            }

        except Exception as e:
            # Use safe accessors: input_data itself may be what is malformed.
            failed_sk = self._safe_field(input_data, 'content_sk', 'unknown')
            failed_type = self._safe_field(input_data, 'sentiment_type', 'negative')
            return self.handle_error(
                e,
                context=f"content_sk={failed_sk}, sentiment_type={failed_type}"
            )