Spaces:

Tanmoy-AI
/

customer-connect

Sleeping

App Files Files Community

FarhinSadia committed on Jun 19, 2025

Commit

bd85768

1 Parent(s): 7b8f7c1

Add requirements.txt for deployment

Browse files

Files changed (2) hide show

src/data_processor.py +145 -80
src/insights_generator.py +270 -0

src/data_processor.py CHANGED Viewed

@@ -10,7 +10,6 @@ try:
     OPENAI_AVAILABLE = True
 except ImportError:
     OPENAI_AVAILABLE = False
-    print("OpenAI not installed. GPT features will be disabled.")
 try:
     import nltk
@@ -19,7 +18,6 @@ try:
     NLTK_AVAILABLE = True
 except ImportError:
     NLTK_AVAILABLE = False
-    print("NLTK not installed. Using TextBlob only.")
 class DataProcessor:
     def __init__(self, openai_api_key=None):
@@ -40,43 +38,87 @@ class DataProcessor:
             openai.api_key = openai_api_key
             self.use_gpt = True
-        # Banking-specific patterns
-        self.banking_keywords = {
-            'service_quality': ['customer service', 'staff', 'support', 'help', 'assistance'],
-            'transaction': ['transfer', 'deposit', 'withdraw', 'payment', 'transaction'],
-            'account': ['account', 'savings', 'checking', 'balance'],
-            'loan': ['loan', 'mortgage', 'credit', 'interest rate'],
-            'digital': ['app', 'online banking', 'mobile', 'website', 'digital'],
-            'branch': ['branch', 'atm', 'location', 'queue', 'waiting']
         }
-    def process_csv_files(self, uploaded_files):
-        """Process multiple CSV files"""
-        all_dataframes = []
-        for uploaded_file in uploaded_files:
-            try:
-                df = pd.read_csv(uploaded_file)
-                df['source_file'] = uploaded_file.name
-                all_dataframes.append(df)
-            except Exception as e:
-                print(f"Error reading {uploaded_file.name}: {e}")
-        if all_dataframes:
-            combined_df = pd.concat(all_dataframes, ignore_index=True)
-            return combined_df
         return pd.DataFrame()
-    def process_txt_file(self, txt_file):
-        """Process text file with reviews"""
-        content = txt_file.read().decode('utf-8')
-        reviews = content.split('\n')
-        df = pd.DataFrame({
-            'text': [review.strip() for review in reviews if review.strip()],
-            'source_file': txt_file.name
-        })
-        return df
     def analyze_sentiment(self, text):
         """Analyze sentiment - use VADER if available, else TextBlob"""
@@ -112,67 +154,71 @@ class DataProcessor:
             return 'Neutral', 0
     def detect_emotion(self, text):
-        """Detect emotion in text"""
         if pd.isna(text):
-            return 'Neutral'
         text_lower = str(text).lower()
-        # Emotion keywords
         emotions = {
-            'Joy': ['happy', 'excellent', 'amazing', 'great', 'wonderful', 'fantastic', 'love', 'best', 'thank you'],
-            'Frustration': ['frustrated', 'angry', 'terrible', 'horrible', 'worst', 'hate', 'annoyed', 'disappointed'],
-            'Confusion': ['confused', 'unclear', "don't understand", 'what', 'how', 'why', '?', 'help me']
         }
         emotion_scores = {}
-        for emotion, keywords in emotions.items():
-            score = sum(keyword in text_lower for keyword in keywords)
             emotion_scores[emotion] = score
         if max(emotion_scores.values()) > 0:
-            return max(emotion_scores, key=emotion_scores.get)
-        return 'Neutral'
     def categorize_post(self, text):
-        """Categorize post type"""
         if pd.isna(text):
-            return 'Other'
         text_lower = str(text).lower()
-        if '?' in text_lower or any(word in text_lower for word in ['how', 'what', 'when', 'where']):
-            return 'Inquiry'
-        elif any(word in text_lower for word in ['complaint', 'problem', 'issue', 'bad', 'terrible']):
-            return 'Complaint'
-        elif any(word in text_lower for word in ['thank', 'great', 'excellent', 'love', 'best']):
-            return 'Praise'
         else:
-            return 'Other'
-    def count_prime_mentions(self, text):
-        """Count Prime Bank mentions"""
-        if pd.isna(text):
-            return 0
-        text_lower = str(text).lower()
-        patterns = [
-            r'prime\s*bank',
-            r'primebank',
-            r'@primebank'
-        ]
-        total_mentions = 0
-        for pattern in patterns:
-            mentions = len(re.findall(pattern, text_lower))
-            total_mentions += mentions
-        return total_mentions
     def process_all_data(self, df):
         """Apply all processing to dataframe"""
         # Find text column
-        text_columns = ['text', 'content', 'message', 'review', 'comment', 'post']
         text_col = None
         for col in text_columns:
@@ -186,20 +232,39 @@ class DataProcessor:
         if 'text' not in df.columns:
             return df
-        # Apply all analyses
         df[['sentiment', 'polarity']] = df['text'].apply(
             lambda x: pd.Series(self.analyze_sentiment(x))
         )
-        df['emotion'] = df['text'].apply(self.detect_emotion)
-        df['category'] = df['text'].apply(self.categorize_post)
-        df['prime_mentions'] = df['text'].apply(self.count_prime_mentions)
-        # Calculate viral score
-        df['viral_score'] = df['prime_mentions'] * 10
         if 'likes' in df.columns:
             df['viral_score'] += df['likes'].fillna(0)
         if 'shares' in df.columns:
             df['viral_score'] += df['shares'].fillna(0) * 2
         return df

     OPENAI_AVAILABLE = True
 except ImportError:
     OPENAI_AVAILABLE = False
 try:
     import nltk
     NLTK_AVAILABLE = True
 except ImportError:
     NLTK_AVAILABLE = False
 class DataProcessor:
     def __init__(self, openai_api_key=None):
             openai.api_key = openai_api_key
             self.use_gpt = True
+        # Banking patterns - INCLUDING OTHER BANKS
+        self.bank_patterns = {
+            'prime_bank': [r'prime\s*bank', r'primebank', r'@primebank', r'prime\s*b\.?'],
+            'eastern_bank': [r'eastern\s*bank', r'ebl', r'@easternbank'],
+            'brac_bank': [r'brac\s*bank', r'@bracbank'],
+            'city_bank': [r'city\s*bank', r'@citybank'],
+            'dutch_bangla': [r'dutch\s*bangla', r'dbbl', r'@dutchbangla']
         }
+    def load_data_from_files(self, csv_files=None, txt_files=None):
+        """Load posts from CSV and plain-text files into one DataFrame.
+
+        Args:
+            csv_files: Optional iterable of CSV paths (``str`` or path-like).
+            txt_files: Optional iterable of UTF-8 text file paths. Each file
+                is split on blank lines and every non-empty chunk becomes
+                one row in a ``text`` column.
+
+        Returns:
+            The loaded frames concatenated with a fresh index; every row
+            carries a ``source_file`` column holding the file's base name.
+            An empty DataFrame is returned when nothing could be loaded.
+        """
+        # Local import keeps this method self-contained.
+        import os
+        all_data = []
+        # Load CSV files
+        if csv_files:
+            for file_path in csv_files:
+                try:
+                    df = pd.read_csv(file_path)
+                    # basename() accepts path-like objects and honours the
+                    # platform separator, unlike str.split('/').
+                    df['source_file'] = os.path.basename(file_path)
+                    all_data.append(df)
+                except Exception as e:
+                    # Best effort: report the unreadable file, keep loading.
+                    print(f"Error loading {file_path}: {e}")
+        # Load TXT files
+        if txt_files:
+            for file_path in txt_files:
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        content = f.read()
+                    # Split by double newlines to separate posts
+                    posts = content.split('\n\n')
+                    # Create dataframe
+                    df = pd.DataFrame({
+                        'text': [post.strip() for post in posts if post.strip()],
+                        'source_file': os.path.basename(file_path)
+                    })
+                    all_data.append(df)
+                except Exception as e:
+                    print(f"Error loading {file_path}: {e}")
+        if all_data:
+            return pd.concat(all_data, ignore_index=True)
         return pd.DataFrame()
+    def identify_bank(self, text):
+        """Report which known bank(s) a piece of text refers to.
+
+        Returns:
+            A ``(label, banks)`` tuple. ``label`` is the bank key when exactly
+            one bank matched, ``'multiple'`` when several matched and
+            ``'none'`` when the text is missing or no pattern matched.
+            ``banks`` lists every matched bank key in ``bank_patterns`` order.
+        """
+        if pd.isna(text):
+            return 'none', []
+        lowered = str(text).lower()
+        # A bank counts once, no matter how many of its patterns match.
+        hits = [
+            bank
+            for bank, patterns in self.bank_patterns.items()
+            if any(re.search(pattern, lowered) for pattern in patterns)
+        ]
+        if len(hits) == 1:
+            return hits[0], hits
+        if hits:
+            return 'multiple', hits
+        return 'none', []
+    def count_bank_mentions(self, text, bank='prime_bank'):
+        """Count how many times ``bank`` is mentioned in ``text``.
+
+        Args:
+            text: Post text; missing values count as zero mentions.
+            bank: Key into ``self.bank_patterns``; unknown keys yield 0.
+
+        Returns:
+            Number of non-overlapping mentions found in the lower-cased text.
+        """
+        if pd.isna(text):
+            return 0
+        patterns = self.bank_patterns.get(bank)
+        if not patterns:
+            return 0
+        text_lower = str(text).lower()
+        # The patterns for one bank overlap (e.g. 'prime\s*bank' and
+        # 'prime\s*b\.?' both hit "prime bank"), so counting each pattern
+        # separately inflates the total. A single alternation scan counts
+        # every mention exactly once.
+        combined = '|'.join(f'(?:{pattern})' for pattern in patterns)
+        return len(re.findall(combined, text_lower))
     def analyze_sentiment(self, text):
         """Analyze sentiment - use VADER if available, else TextBlob"""
             return 'Neutral', 0
     def detect_emotion(self, text):
+        """Pick the dominant emotion in ``text`` by keyword matching.
+
+        Each emotion is scored by how many of its keywords occur as
+        substrings of the lower-cased text (so 'how' also matches inside
+        'show'). The highest-scoring emotion wins; on a tie the emotion
+        listed first in the table below is chosen.
+
+        Returns:
+            A ``(emotion, keywords)`` tuple: the winning emotion name and the
+            keywords that matched for it, or ``('Neutral', [])`` when the
+            text is missing or no keyword matched.
+        """
         if pd.isna(text):
+            return 'Neutral', []
         text_lower = str(text).lower()
+        # Emotion keywords with context
+        # NOTE: the 'context' strings are not read anywhere in this method.
         emotions = {
+            'Joy': {
+                'keywords': ['happy', 'excellent', 'amazing', 'great', 'wonderful', 'fantastic', 'love', 'best', 'thank you', 'appreciate'],
+                'context': 'expressing satisfaction and happiness'
+            },
+            'Frustration': {
+                'keywords': ['frustrated', 'angry', 'terrible', 'horrible', 'worst', 'hate', 'annoyed', 'disappointed', 'pathetic'],
+                'context': 'expressing anger and dissatisfaction'
+            },
+            'Confusion': {
+                'keywords': ['confused', 'unclear', "don't understand", 'what', 'how', 'why', '?', 'help me', 'lost'],
+                'context': 'seeking clarification or expressing confusion'
+            },
+            'Anxiety': {
+                'keywords': ['worried', 'concern', 'anxious', 'nervous', 'scared', 'fear', 'panic', 'urgent'],
+                'context': 'expressing worry or urgency'
+            }
         }
         emotion_scores = {}
+        detected_keywords = {}
+        for emotion, data in emotions.items():
+            # Score = number of distinct keywords present, not occurrences.
+            keywords_found = [kw for kw in data['keywords'] if kw in text_lower]
+            score = len(keywords_found)
             emotion_scores[emotion] = score
+            if keywords_found:
+                detected_keywords[emotion] = keywords_found
         if max(emotion_scores.values()) > 0:
+            primary_emotion = max(emotion_scores, key=emotion_scores.get)
+            return primary_emotion, detected_keywords.get(primary_emotion, [])
+        return 'Neutral', []
     def categorize_post(self, text):
+        """Classify a post and give the reason for the classification.
+
+        Rules are checked in a fixed order -- Inquiry, Complaint, Praise,
+        Suggestion -- and the first one that matches wins. All checks are
+        substring tests on the lower-cased text.
+
+        Returns:
+            A ``(category, reason)`` tuple; the category is ``'Other'`` when
+            the text is missing or no rule matched.
+        """
         if pd.isna(text):
+            return 'Other', 'No text content'
         text_lower = str(text).lower()
+        # Categories with detection logic
+        # A '?' or a question phrase marks an inquiry; checked before all else.
+        if '?' in text_lower or any(phrase in text_lower for phrase in ['how do', 'what is', 'when', 'where', 'can i', 'could you']):
+            return 'Inquiry', 'Contains questions or information seeking'
+        elif any(word in text_lower for word in ['complaint', 'problem', 'issue', 'error', 'failed', 'not working', 'terrible', 'worst']):
+            return 'Complaint', 'Contains complaint or problem description'
+        elif any(word in text_lower for word in ['thank', 'great', 'excellent', 'love', 'best', 'appreciate', 'amazing']):
+            return 'Praise', 'Contains positive feedback or appreciation'
+        elif any(word in text_lower for word in ['suggest', 'should', 'could', 'recommend', 'request', 'please add']):
+            return 'Suggestion', 'Contains suggestions or feature requests'
         else:
+            return 'Other', 'General discussion or observation'
     def process_all_data(self, df):
         """Apply all processing to dataframe"""
         # Find text column
+        text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content']
         text_col = None
         for col in text_columns:
         if 'text' not in df.columns:
             return df
+        # Identify which bank each post is about
+        df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
+            lambda x: pd.Series(self.identify_bank(x))
+        )
+        # Count mentions for each bank
+        df['prime_mentions'] = df['text'].apply(lambda x: self.count_bank_mentions(x, 'prime_bank'))
+        # Apply sentiment analysis
         df[['sentiment', 'polarity']] = df['text'].apply(
             lambda x: pd.Series(self.analyze_sentiment(x))
         )
+        # Apply emotion detection with keywords
+        df[['emotion', 'emotion_keywords']] = df['text'].apply(
+            lambda x: pd.Series(self.detect_emotion(x))
+        )
+        # Categorize posts with reasons
+        df[['category', 'category_reason']] = df['text'].apply(
+            lambda x: pd.Series(self.categorize_post(x))
+        )
+        # Calculate viral score (only for posts with engagement metrics)
+        df['viral_score'] = 0
         if 'likes' in df.columns:
             df['viral_score'] += df['likes'].fillna(0)
         if 'shares' in df.columns:
             df['viral_score'] += df['shares'].fillna(0) * 2
+        if 'comments' in df.columns:
+            df['viral_score'] += df['comments'].fillna(0) * 1.5
+        # Add Prime Bank specific viral score boost
+        df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
         return df

src/insights_generator.py ADDED Viewed

	@@ -0,0 +1,270 @@

+import pandas as pd
+from collections import Counter
+class InsightsGenerator:
+    def __init__(self):
+        """Start with an empty insight store; generate_all_insights fills it."""
+        self.insights = dict()
+    def generate_all_insights(self, df, prime_df):
+        """Build every insight section and return the combined report.
+
+        Args:
+            df: All processed posts (passed to the comparison section).
+            prime_df: The posts treated as Prime Bank posts (passed to every
+                other section).
+
+        Returns:
+            ``self.insights``, holding the keys 'overview', 'sentiment',
+            'emotion', 'category', 'topics', 'comparison' and 'actions'.
+        """
+        n_all = len(df)
+        n_prime = len(prime_df)
+        # Guard the share calculation against an empty dataset.
+        if n_all > 0:
+            prime_share = n_prime / n_all * 100
+        else:
+            prime_share = 0
+        self.insights['overview'] = {
+            'summary': f"Analyzed {n_all:,} total posts, of which {n_prime:,} ({prime_share:.1f}%) specifically mention Prime Bank.",
+            'context': f"The remaining {n_all - n_prime:,} posts mention other banks or general banking topics."
+        }
+        # Section name, builder and the frame that builder works on, in
+        # report order.
+        sections = (
+            ('sentiment', self._generate_sentiment_insights, prime_df),
+            ('emotion', self._generate_emotion_insights, prime_df),
+            ('category', self._generate_category_insights, prime_df),
+            ('topics', self._generate_topic_insights, prime_df),
+            ('comparison', self._generate_comparison_insights, df),
+            ('actions', self._generate_action_insights, prime_df),
+        )
+        for section, build, frame in sections:
+            self.insights[section] = build(frame)
+        return self.insights
+    def _generate_sentiment_insights(self, df):
+        """Summarise the sentiment mix of the given posts.
+
+        Reads the 'sentiment' and 'text' columns. Returns a dict with a
+        percentage breakdown, one context sentence per sentiment, up to two
+        example texts per sentiment and the complaint themes spotted in the
+        negative posts. An empty frame yields only a 'summary' message.
+        """
+        if len(df) == 0:
+            return {'summary': 'No Prime Bank posts found for sentiment analysis.'}
+        shares = df['sentiment'].value_counts(normalize=True) * 100
+        pos = shares.get('Positive', 0)
+        neg = shares.get('Negative', 0)
+        neu = shares.get('Neutral', 0)
+        # Up to two sample texts per sentiment label
+        samples = {
+            label: df[df['sentiment'] == label]['text'].head(2).tolist()
+            for label in ['Positive', 'Negative', 'Neutral']
+        }
+        # Trigger words -> theme, checked against the pooled negative text
+        theme_triggers = (
+            (('wait', 'queue'), 'long wait times'),
+            (('fee', 'charge'), 'fees and charges'),
+            (('app', 'online'), 'digital banking issues'),
+            (('staff', 'service'), 'customer service'),
+        )
+        negatives = df[df['sentiment'] == 'Negative']['text']
+        themes = []
+        if len(negatives) > 0:
+            pooled = ' '.join(negatives.astype(str).tolist()).lower()
+            themes = [
+                theme
+                for triggers, theme in theme_triggers
+                if any(trigger in pooled for trigger in triggers)
+            ]
+        theme_text = ', '.join(themes) if themes else 'various service issues'
+        return {
+            'summary': f"Sentiment breakdown: {pos:.1f}% positive, {neg:.1f}% negative, {neu:.1f}% neutral.",
+            'positive_context': f"Positive posts ({pos:.1f}%) primarily praise customer service, digital banking features, and efficient processes.",
+            'negative_context': f"Negative posts ({neg:.1f}%) mainly complain about: {theme_text}.",
+            'neutral_context': f"Neutral posts ({neu:.1f}%) are mostly inquiries about services and general discussions.",
+            'examples': samples,
+            'concern_areas': themes
+        }
+    def _generate_emotion_insights(self, df):
+        """Summarise the emotions detected in the given posts.
+
+        Reads the 'emotion' and 'emotion_keywords' columns. Returns a dict
+        with the share of non-neutral posts, the per-emotion counts, fixed
+        context blurbs, the most frequent emotion, the three most common
+        matched keywords per emotion and a recommendation. An empty frame
+        yields only a 'summary' message.
+        """
+        if len(df) == 0:
+            return {'summary': 'No Prime Bank posts found for emotion analysis.'}
+        counts = df['emotion'].value_counts()
+        n_posts = len(df)
+        n_emotional = len(df[df['emotion'] != 'Neutral'])
+        contexts = {
+            'Joy': 'Customers expressing joy are satisfied with services, particularly praising staff helpfulness and quick problem resolution.',
+            'Frustration': 'Frustrated customers mainly face issues with wait times, technical problems, and unresolved complaints.',
+            'Confusion': 'Confused customers need better information about products, fees, and online banking procedures.',
+            'Anxiety': 'Anxious customers are worried about account security, loan applications, and urgent transaction issues.'
+        }
+        # Top three matched keywords for each emotion that actually occurs
+        top_keywords = {}
+        for emotion in ['Joy', 'Frustration', 'Confusion', 'Anxiety']:
+            rows = df[df['emotion'] == emotion]
+            if len(rows) == 0:
+                continue
+            # Only list cells contribute; anything else is ignored.
+            pooled = [
+                kw
+                for cell in rows['emotion_keywords']
+                if isinstance(cell, list)
+                for kw in cell
+            ]
+            if pooled:
+                top_keywords[emotion] = Counter(pooled).most_common(3)
+        top_emotion = counts.index[0] if len(counts) > 0 else 'None'
+        return {
+            'summary': f"{n_emotional} out of {n_posts} Prime Bank posts ({n_emotional/n_posts*100:.1f}%) express clear emotions.",
+            'distribution': dict(counts.items()),
+            'contexts': contexts,
+            'top_emotion': top_emotion,
+            'keywords': top_keywords,
+            'recommendation': self._get_emotion_recommendation(counts)
+        }
+    def _generate_category_insights(self, df):
+        """Summarise how the given posts split across categories.
+
+        Reads the 'category' column. Returns a dict with a count breakdown,
+        a fixed per-category playbook (common topics plus a suggested
+        action) and the complaint and suggestion totals. An empty frame
+        yields only a 'summary' message.
+        """
+        if len(df) == 0:
+            return {'summary': 'No Prime Bank posts found for category analysis.'}
+        counts = df['category'].value_counts()
+        breakdown = ', '.join(f'{name} ({total})' for name, total in counts.items())
+        n_complaints = counts.get('Complaint', 0)
+        n_suggestions = counts.get('Suggestion', 0)
+        # Static guidance; it does not depend on the data.
+        playbook = {
+            'Inquiry': {
+                'common_topics': ['account opening', 'loan applications', 'online banking setup', 'branch locations'],
+                'action': 'Improve FAQ section and provide clearer information channels'
+            },
+            'Complaint': {
+                'common_topics': ['service delays', 'technical issues', 'hidden fees', 'staff behavior'],
+                'action': 'Establish rapid response team for complaint resolution'
+            },
+            'Praise': {
+                'common_topics': ['helpful staff', 'quick service', 'user-friendly app', 'problem resolution'],
+                'action': 'Recognize and reward mentioned staff members'
+            },
+            'Suggestion': {
+                'common_topics': ['new features', 'branch expansion', 'service improvements', 'digital enhancements'],
+                'action': 'Review suggestions for product development roadmap'
+            }
+        }
+        return {
+            'summary': f"Post categories: {breakdown}",
+            'details': playbook,
+            'urgent_attention': f"{n_complaints} complaints require immediate attention",
+            'opportunities': f"{n_suggestions} suggestions for improvement"
+        }
+    def _generate_topic_insights(self, df):
+        """Identify trending topics by counting keyword mentions.
+
+        Reads the 'text' column, pools all posts into one lower-cased string
+        and, for each topic, adds up how often each of its keywords occurs.
+
+        Returns:
+            A dict with a summary of the top three topics, every topic that
+            was mentioned with its count (highest first), the single top
+            topic and a recommendation. An empty frame yields only a
+            'summary' message.
+        """
+        if len(df) == 0:
+            return {'summary': 'No Prime Bank posts found for topic analysis.'}
+        # Combine all text
+        all_text = ' '.join(df['text'].astype(str).tolist()).lower()
+        # Define topic keywords
+        topics = {
+            'Digital Banking': ['app', 'online', 'mobile', 'website', 'internet banking'],
+            'Customer Service': ['staff', 'service', 'help', 'support', 'employee'],
+            'Fees & Charges': ['fee', 'charge', 'cost', 'expensive', 'price'],
+            'Loans': ['loan', 'credit', 'mortgage', 'interest', 'emi'],
+            'ATM & Branch': ['atm', 'branch', 'location', 'machine', 'cash'],
+            'Account Services': ['account', 'savings', 'current', 'balance', 'statement']
+        }
+        topic_counts = {}
+        for topic, keywords in topics.items():
+            # Count occurrences, not merely which keywords appear: the result
+            # is reported as "mentions" and used to rank discussion volume.
+            # Matching is by substring, so 'app' also counts inside 'happy'.
+            count = sum(all_text.count(keyword) for keyword in keywords)
+            if count > 0:
+                topic_counts[topic] = count
+        sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
+        insights = {
+            'summary': f"Top discussed topics: {', '.join([f'{topic} ({count} mentions)' for topic, count in sorted_topics[:3]])}",
+            'all_topics': dict(sorted_topics),
+            'trending': sorted_topics[0][0] if sorted_topics else 'None',
+            'recommendation': f"Focus on improving {sorted_topics[0][0].lower()} based on high discussion volume" if sorted_topics else "No clear topic trends"
+        }
+        return insights
+    def _generate_comparison_insights(self, df):
+        """Compare Prime Bank's positive-sentiment rate with other banks.
+
+        Reads the 'primary_bank' and 'sentiment' columns. For every known
+        bank with at least one post it records the post count and the
+        percentage of posts labelled 'Positive'. Prime Bank is then judged
+        against the mean rate of the other banks present in the data, or
+        against a 50% baseline when no other bank has posts.
+
+        Returns:
+            A dict with 'summary', 'comparison' (per-bank stats),
+            'benchmark' (the rate Prime Bank was compared with) and
+            'recommendation'; or only a 'summary' message when Prime Bank
+            has no posts or the required columns are missing.
+        """
+        # Without these columns there is nothing to compare.
+        if 'primary_bank' not in df.columns or 'sentiment' not in df.columns:
+            return {'summary': 'No comparative data available.'}
+        bank_sentiment = {}
+        for bank in ['prime_bank', 'eastern_bank', 'brac_bank', 'city_bank', 'dutch_bangla']:
+            bank_posts = df[df['primary_bank'] == bank]
+            if len(bank_posts) > 0:
+                positive_rate = (bank_posts['sentiment'] == 'Positive').sum() / len(bank_posts) * 100
+                bank_sentiment[bank] = {
+                    'posts': len(bank_posts),
+                    'positive_rate': positive_rate
+                }
+        if 'prime_bank' not in bank_sentiment:
+            return {'summary': 'No comparative data available.'}
+        prime_positive = bank_sentiment['prime_bank']['positive_rate']
+        # "Average" must come from the competitors actually observed; the
+        # fixed 50% is only a fallback when there is nobody to compare with.
+        competitor_rates = [
+            stats['positive_rate']
+            for bank, stats in bank_sentiment.items()
+            if bank != 'prime_bank'
+        ]
+        if competitor_rates:
+            benchmark = sum(competitor_rates) / len(competitor_rates)
+        else:
+            benchmark = 50
+        ahead = prime_positive > benchmark
+        comparison = "above average" if ahead else "below average"
+        insights = {
+            'summary': f"Prime Bank has {prime_positive:.1f}% positive sentiment, which is {comparison} in the banking sector.",
+            'comparison': bank_sentiment,
+            'benchmark': benchmark,
+            'recommendation': "Focus on maintaining positive momentum" if ahead else "Urgent improvement needed to match competitor satisfaction levels"
+        }
+        return insights
+    def _generate_action_insights(self, df):
+        """Turn the processed posts into a short action plan.
+
+        Reads the 'sentiment', 'emotion' and 'category' columns. Returns a
+        dict with three entries: 'immediate' (negative complaints voiced
+        with frustration or anxiety), 'quick_wins' (positive praise) and
+        'strategic' (a fixed list of long-term actions). An empty frame
+        yields only a 'summary' message.
+        """
+        if len(df) == 0:
+            return {'summary': 'No Prime Bank posts found for action analysis.'}
+        is_negative = df['sentiment'] == 'Negative'
+        is_positive = df['sentiment'] == 'Positive'
+        is_upset = df['emotion'].isin(['Frustration', 'Anxiety'])
+        is_complaint = df['category'] == 'Complaint'
+        is_praise = df['category'] == 'Praise'
+        # Upset, negative complaints need a response first.
+        urgent_posts = df[is_negative & is_upset & is_complaint]
+        # Positive praise can be amplified with little effort.
+        testimonials = df[is_positive & is_praise]
+        immediate = {
+            'count': len(urgent_posts),
+            'description': 'High-priority complaints requiring immediate response',
+            'action': 'Contact these customers within 24 hours'
+        }
+        quick_wins = {
+            'count': len(testimonials),
+            'description': 'Positive testimonials for marketing use',
+            'action': 'Share success stories and thank customers publicly'
+        }
+        strategic = {
+            'description': 'Long-term improvements based on feedback patterns',
+            'actions': [
+                'Enhance digital banking infrastructure',
+                'Implement customer service training program',
+                'Review and simplify fee structure'
+            ]
+        }
+        return {
+            'immediate': immediate,
+            'quick_wins': quick_wins,
+            'strategic': strategic
+        }
+    def _get_emotion_recommendation(self, emotion_dist):
+        """Return the recommendation for the most frequent emotion.
+
+        Args:
+            emotion_dist: Emotion counts whose first index entry is treated
+                as the top emotion (e.g. the result of ``value_counts()``).
+
+        Returns:
+            The recommendation text for that emotion, a generic monitoring
+            message for an unrecognised emotion, or a "no data" message when
+            ``emotion_dist`` is empty.
+        """
+        if len(emotion_dist) == 0:
+            return "No emotional data to analyze"
+        top_emotion = emotion_dist.index[0]
+        recommendations = {
+            'Joy': "Leverage positive emotions by encouraging happy customers to share testimonials",
+            'Frustration': "Implement rapid response protocol for frustrated customers to prevent escalation",
+            'Confusion': "Create clearer communication materials and improve customer education",
+            'Anxiety': "Provide reassurance through proactive communication about security and processes",
+            # The original text stopped mid-sentence after "to create".
+            'Neutral': "Engage neutral customers with targeted campaigns to create stronger emotional connections"
+        }
+        return recommendations.get(top_emotion, "Monitor customer emotions closely")