Spaces:
Sleeping
Sleeping
Commit
·
bd85768
1
Parent(s):
7b8f7c1
Add requirements.txt for deployment
Browse files- src/data_processor.py +145 -80
- src/insights_generator.py +270 -0
src/data_processor.py
CHANGED
|
@@ -10,7 +10,6 @@ try:
|
|
| 10 |
OPENAI_AVAILABLE = True
|
| 11 |
except ImportError:
|
| 12 |
OPENAI_AVAILABLE = False
|
| 13 |
-
print("OpenAI not installed. GPT features will be disabled.")
|
| 14 |
|
| 15 |
try:
|
| 16 |
import nltk
|
|
@@ -19,7 +18,6 @@ try:
|
|
| 19 |
NLTK_AVAILABLE = True
|
| 20 |
except ImportError:
|
| 21 |
NLTK_AVAILABLE = False
|
| 22 |
-
print("NLTK not installed. Using TextBlob only.")
|
| 23 |
|
| 24 |
class DataProcessor:
|
| 25 |
def __init__(self, openai_api_key=None):
|
|
@@ -40,43 +38,87 @@ class DataProcessor:
|
|
| 40 |
openai.api_key = openai_api_key
|
| 41 |
self.use_gpt = True
|
| 42 |
|
| 43 |
-
# Banking-
|
| 44 |
-
self.
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
| 48 |
-
'
|
| 49 |
-
'
|
| 50 |
-
'branch': ['branch', 'atm', 'location', 'queue', 'waiting']
|
| 51 |
}
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
"""
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
return pd.DataFrame()
|
| 69 |
|
| 70 |
-
def
|
| 71 |
-
"""
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
def analyze_sentiment(self, text):
|
| 82 |
"""Analyze sentiment - use VADER if available, else TextBlob"""
|
|
@@ -112,67 +154,71 @@ class DataProcessor:
|
|
| 112 |
return 'Neutral', 0
|
| 113 |
|
| 114 |
def detect_emotion(self, text):
|
| 115 |
-
"""Detect emotion in text"""
|
| 116 |
if pd.isna(text):
|
| 117 |
-
return 'Neutral'
|
| 118 |
|
| 119 |
text_lower = str(text).lower()
|
| 120 |
|
| 121 |
-
# Emotion keywords
|
| 122 |
emotions = {
|
| 123 |
-
'Joy':
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
}
|
| 127 |
|
| 128 |
emotion_scores = {}
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
emotion_scores[emotion] = score
|
|
|
|
|
|
|
| 132 |
|
| 133 |
if max(emotion_scores.values()) > 0:
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
|
| 137 |
def categorize_post(self, text):
|
| 138 |
-
"""Categorize post type"""
|
| 139 |
if pd.isna(text):
|
| 140 |
-
return 'Other'
|
| 141 |
|
| 142 |
text_lower = str(text).lower()
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
else:
|
| 151 |
-
return 'Other'
|
| 152 |
-
|
| 153 |
-
def count_prime_mentions(self, text):
    """Count Prime Bank mentions in *text*.

    Matching is case-insensitive (the text is lower-cased first) and covers
    "prime bank" with any amount of whitespace, "primebank", and the
    "@primebank" handle.

    Args:
        text: The post text; any value accepted by ``pd.isna``/``str``.

    Returns:
        The number of distinct mentions, or 0 when *text* is missing
        (``None``/NaN).
    """
    if pd.isna(text):
        return 0

    # A single pattern is used on purpose: the previous three patterns
    # overlapped, so "primebank" was counted twice and "@primebank" three
    # times. findall() returns non-overlapping matches, one per mention.
    return len(re.findall(r'@?prime\s*bank', str(text).lower()))
|
| 171 |
|
| 172 |
def process_all_data(self, df):
|
| 173 |
"""Apply all processing to dataframe"""
|
| 174 |
# Find text column
|
| 175 |
-
text_columns = ['text', 'content', 'message', 'review', 'comment', 'post']
|
| 176 |
text_col = None
|
| 177 |
|
| 178 |
for col in text_columns:
|
|
@@ -186,20 +232,39 @@ class DataProcessor:
|
|
| 186 |
if 'text' not in df.columns:
|
| 187 |
return df
|
| 188 |
|
| 189 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
df[['sentiment', 'polarity']] = df['text'].apply(
|
| 191 |
lambda x: pd.Series(self.analyze_sentiment(x))
|
| 192 |
)
|
| 193 |
|
| 194 |
-
|
| 195 |
-
df['
|
| 196 |
-
|
|
|
|
| 197 |
|
| 198 |
-
#
|
| 199 |
-
df['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
if 'likes' in df.columns:
|
| 201 |
df['viral_score'] += df['likes'].fillna(0)
|
| 202 |
if 'shares' in df.columns:
|
| 203 |
df['viral_score'] += df['shares'].fillna(0) * 2
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
return df
|
|
|
|
| 10 |
OPENAI_AVAILABLE = True
|
| 11 |
except ImportError:
|
| 12 |
OPENAI_AVAILABLE = False
|
|
|
|
| 13 |
|
| 14 |
try:
|
| 15 |
import nltk
|
|
|
|
| 18 |
NLTK_AVAILABLE = True
|
| 19 |
except ImportError:
|
| 20 |
NLTK_AVAILABLE = False
|
|
|
|
| 21 |
|
| 22 |
class DataProcessor:
|
| 23 |
def __init__(self, openai_api_key=None):
|
|
|
|
| 38 |
openai.api_key = openai_api_key
|
| 39 |
self.use_gpt = True
|
| 40 |
|
| 41 |
+
# Banking patterns - INCLUDING OTHER BANKS
|
| 42 |
+
self.bank_patterns = {
|
| 43 |
+
'prime_bank': [r'prime\s*bank', r'primebank', r'@primebank', r'prime\s*b\.?'],
|
| 44 |
+
'eastern_bank': [r'eastern\s*bank', r'ebl', r'@easternbank'],
|
| 45 |
+
'brac_bank': [r'brac\s*bank', r'@bracbank'],
|
| 46 |
+
'city_bank': [r'city\s*bank', r'@citybank'],
|
| 47 |
+
'dutch_bangla': [r'dutch\s*bangla', r'dbbl', r'@dutchbangla']
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
+
def load_data_from_files(self, csv_files=None, txt_files=None):
    """Load posts from CSV and plain-text files into one DataFrame.

    CSV files are read with ``pd.read_csv``. Text files are read as UTF-8
    and split into posts on blank lines; empty posts are dropped. Every row
    gets a ``source_file`` column holding the file's base name.

    Loading is best-effort: a file that cannot be read is reported with
    ``print`` and skipped, so one bad file does not abort the batch.

    Args:
        csv_files: Optional iterable of CSV paths (``str`` or path-like).
        txt_files: Optional iterable of text-file paths (``str`` or
            path-like).

    Returns:
        The concatenation of all loaded frames with a fresh index, or an
        empty ``pd.DataFrame`` when nothing was loaded.
    """
    # Local import: this block cannot see the file's import section.
    import os

    def source_name(path):
        # basename + fspath handles path-like objects and the platform's
        # separator; the previous ``path.split('/')`` raised on Path objects
        # (which the except below turned into a silently skipped file).
        return os.path.basename(os.fspath(path))

    frames = []

    for file_path in csv_files or []:
        try:
            df = pd.read_csv(file_path)
            df['source_file'] = source_name(file_path)
            frames.append(df)
        except Exception as e:  # deliberate best-effort: report and continue
            print(f"Error loading {file_path}: {e}")

    for file_path in txt_files or []:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Posts are separated by blank lines; allow the "blank" line to
            # contain stray spaces/tabs so such posts are not glued together.
            posts = [
                post.strip()
                for post in re.split(r'\n\s*\n', content)
                if post.strip()
            ]

            frames.append(pd.DataFrame({
                'text': posts,
                'source_file': source_name(file_path),
            }))
        except Exception as e:  # deliberate best-effort: report and continue
            print(f"Error loading {file_path}: {e}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()
|
| 86 |
|
| 87 |
+
def identify_bank(self, text):
    """Identify which bank(s) are mentioned in *text*.

    Each bank in ``self.bank_patterns`` (a mapping of bank key -> list of
    regex patterns) is considered mentioned when any of its patterns matches
    the lower-cased text at a position that is not in the middle of a word.

    Args:
        text: The post text; any value accepted by ``pd.isna``/``str``.

    Returns:
        A ``(primary, banks)`` tuple where ``banks`` lists every matched
        bank key in ``self.bank_patterns`` order and ``primary`` is
        ``'none'`` (no match or missing text), the single matched key, or
        ``'multiple'`` when more than one bank matched.
    """
    if pd.isna(text):
        return 'none', []

    text_lower = str(text).lower()

    def mentioned(pattern):
        # The lookbehind rejects matches that start inside another word;
        # without it short patterns such as ``ebl`` fire on "weblog" or
        # "feeble" and ``city\s*bank`` fires on "capacity bank".
        return re.search(r'(?<![a-z0-9])(?:' + pattern + ')', text_lower) is not None

    mentioned_banks = [
        bank
        for bank, patterns in self.bank_patterns.items()
        if any(mentioned(pattern) for pattern in patterns)
    ]

    if not mentioned_banks:
        return 'none', []
    if len(mentioned_banks) == 1:
        return mentioned_banks[0], mentioned_banks
    return 'multiple', mentioned_banks
|
| 107 |
+
|
| 108 |
+
def count_bank_mentions(self, text, bank='prime_bank'):
    """Count mentions of one bank in *text*.

    The patterns registered for *bank* in ``self.bank_patterns`` are
    combined into a single alternation and matched against the lower-cased
    text, so each mention is counted exactly once.

    Args:
        text: The post text; any value accepted by ``pd.isna``/``str``.
        bank: Key into ``self.bank_patterns``; defaults to ``'prime_bank'``.

    Returns:
        The number of non-overlapping mentions; 0 when *text* is missing or
        *bank* has no registered patterns.
    """
    if pd.isna(text):
        return 0

    patterns = self.bank_patterns.get(bank)
    if not patterns:
        return 0

    # Summing findall() per pattern over-counted because the patterns
    # overlap (e.g. "prime bank" also matches ``prime\s*b\.?``). One combined
    # scan yields non-overlapping matches. The lookbehind keeps a match from
    # starting in the middle of an unrelated word (``ebl`` in "weblog").
    combined = r'(?<![a-z0-9])(?:' + '|'.join(f'(?:{p})' for p in patterns) + ')'
    return sum(1 for _ in re.finditer(combined, str(text).lower()))
|
| 122 |
|
| 123 |
def analyze_sentiment(self, text):
|
| 124 |
"""Analyze sentiment - use VADER if available, else TextBlob"""
|
|
|
|
| 154 |
return 'Neutral', 0
|
| 155 |
|
| 156 |
def detect_emotion(self, text):
    """Detect the dominant emotion in *text* from keyword matches.

    Each emotion is scored by how many of its keywords occur in the
    lower-cased text. The highest-scoring emotion wins; ties go to the
    emotion listed first (Joy, Frustration, Confusion, Anxiety).

    Args:
        text: The post text; any value accepted by ``pd.isna``/``str``.

    Returns:
        A ``(emotion, keywords)`` tuple: the winning emotion name and the
        keywords that triggered it, or ``('Neutral', [])`` when *text* is
        missing or no keyword matched.
    """
    if pd.isna(text):
        return 'Neutral', []

    text_lower = str(text).lower()

    # Emotion keywords with context
    emotions = {
        'Joy': {
            'keywords': ['happy', 'excellent', 'amazing', 'great', 'wonderful', 'fantastic', 'love', 'best', 'thank you', 'appreciate'],
            'context': 'expressing satisfaction and happiness'
        },
        'Frustration': {
            'keywords': ['frustrated', 'angry', 'terrible', 'horrible', 'worst', 'hate', 'annoyed', 'disappointed', 'pathetic'],
            'context': 'expressing anger and dissatisfaction'
        },
        'Confusion': {
            'keywords': ['confused', 'unclear', "don't understand", 'what', 'how', 'why', '?', 'help me', 'lost'],
            'context': 'seeking clarification or expressing confusion'
        },
        'Anxiety': {
            'keywords': ['worried', 'concern', 'anxious', 'nervous', 'scared', 'fear', 'panic', 'urgent'],
            'context': 'expressing worry or urgency'
        }
    }

    def keyword_present(keyword):
        # Word keywords must start at a word boundary so that "how" does not
        # fire on "show" or "hate" on "whatever". Suffixes are still allowed
        # ("concern" matches "concerned"). Punctuation keywords such as "?"
        # are matched anywhere, since they normally follow a word.
        prefix = r'(?<![a-z0-9])' if keyword[0].isalnum() else ''
        return re.search(prefix + re.escape(keyword), text_lower) is not None

    emotion_scores = {}
    detected_keywords = {}

    for emotion, data in emotions.items():
        keywords_found = [kw for kw in data['keywords'] if keyword_present(kw)]
        emotion_scores[emotion] = len(keywords_found)
        if keywords_found:
            detected_keywords[emotion] = keywords_found

    # max() returns the first maximal key, which gives the documented
    # tie-break by declaration order.
    primary_emotion = max(emotion_scores, key=emotion_scores.get)
    if emotion_scores[primary_emotion] > 0:
        return primary_emotion, detected_keywords[primary_emotion]

    return 'Neutral', []
|
| 198 |
|
| 199 |
def categorize_post(self, text):
    """Categorize a post and give the reason for the chosen category.

    Categories are tested in priority order: Inquiry, Complaint, Praise,
    Suggestion; the first one whose trigger phrases occur in the lower-cased
    text wins. A question mark anywhere also marks the post as an Inquiry.

    Args:
        text: The post text; any value accepted by ``pd.isna``/``str``.

    Returns:
        A ``(category, reason)`` tuple of strings; ``('Other', ...)`` when
        the text is missing or nothing matched.
    """
    if pd.isna(text):
        return 'Other', 'No text content'

    text_lower = str(text).lower()

    def mentions_any(phrases):
        # Phrases must start at a word boundary: plain substring tests
        # misfire on e.g. "show documents" ("how do"), "terror" ("error")
        # or "nowhere" ("where"). Suffixes remain allowed ("thanks").
        return any(
            re.search(r'(?<![a-z0-9])' + re.escape(phrase), text_lower)
            for phrase in phrases
        )

    # Categories with detection logic, highest priority first
    if '?' in text_lower or mentions_any(['how do', 'what is', 'when', 'where', 'can i', 'could you']):
        return 'Inquiry', 'Contains questions or information seeking'
    if mentions_any(['complaint', 'problem', 'issue', 'error', 'failed', 'not working', 'terrible', 'worst']):
        return 'Complaint', 'Contains complaint or problem description'
    if mentions_any(['thank', 'great', 'excellent', 'love', 'best', 'appreciate', 'amazing']):
        return 'Praise', 'Contains positive feedback or appreciation'
    if mentions_any(['suggest', 'should', 'could', 'recommend', 'request', 'please add']):
        return 'Suggestion', 'Contains suggestions or feature requests'
    return 'Other', 'General discussion or observation'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
def process_all_data(self, df):
|
| 219 |
"""Apply all processing to dataframe"""
|
| 220 |
# Find text column
|
| 221 |
+
text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content']
|
| 222 |
text_col = None
|
| 223 |
|
| 224 |
for col in text_columns:
|
|
|
|
| 232 |
if 'text' not in df.columns:
|
| 233 |
return df
|
| 234 |
|
| 235 |
+
# Identify which bank each post is about
|
| 236 |
+
df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
|
| 237 |
+
lambda x: pd.Series(self.identify_bank(x))
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
# Count mentions for each bank
|
| 241 |
+
df['prime_mentions'] = df['text'].apply(lambda x: self.count_bank_mentions(x, 'prime_bank'))
|
| 242 |
+
|
| 243 |
+
# Apply sentiment analysis
|
| 244 |
df[['sentiment', 'polarity']] = df['text'].apply(
|
| 245 |
lambda x: pd.Series(self.analyze_sentiment(x))
|
| 246 |
)
|
| 247 |
|
| 248 |
+
# Apply emotion detection with keywords
|
| 249 |
+
df[['emotion', 'emotion_keywords']] = df['text'].apply(
|
| 250 |
+
lambda x: pd.Series(self.detect_emotion(x))
|
| 251 |
+
)
|
| 252 |
|
| 253 |
+
# Categorize posts with reasons
|
| 254 |
+
df[['category', 'category_reason']] = df['text'].apply(
|
| 255 |
+
lambda x: pd.Series(self.categorize_post(x))
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
# Calculate viral score (only for posts with engagement metrics)
|
| 259 |
+
df['viral_score'] = 0
|
| 260 |
if 'likes' in df.columns:
|
| 261 |
df['viral_score'] += df['likes'].fillna(0)
|
| 262 |
if 'shares' in df.columns:
|
| 263 |
df['viral_score'] += df['shares'].fillna(0) * 2
|
| 264 |
+
if 'comments' in df.columns:
|
| 265 |
+
df['viral_score'] += df['comments'].fillna(0) * 1.5
|
| 266 |
+
|
| 267 |
+
# Add Prime Bank specific viral score boost
|
| 268 |
+
df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
|
| 269 |
+
|
| 270 |
return df
|
src/insights_generator.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from collections import Counter
|
| 3 |
+
|
| 4 |
+
class InsightsGenerator:
    """Turn a processed posts DataFrame into narrative insight dictionaries.

    The methods read these columns from the frames they are given:
    ``text``, ``sentiment``, ``emotion``, ``emotion_keywords``,
    ``category`` and (for the comparison) ``primary_bank``.
    """

    def __init__(self):
        # Filled by generate_all_insights(); keyed by insight section name.
        self.insights = {}

    @staticmethod
    def _count_keyword(text, keyword):
        """Count occurrences of *keyword* in lower-cased *text*.

        A match must start at a word boundary, so "app" is found in "apps"
        but not in "happy" or "disappointed".
        """
        # Local import: the file's import section does not provide ``re``.
        import re

        return len(re.findall(r'(?<![a-z0-9])' + re.escape(keyword), text))

    def generate_all_insights(self, df, prime_df):
        """Generate every insight section and return them as one dict.

        Args:
            df: All posts (used for the overview and bank comparison).
            prime_df: The subset of posts about Prime Bank (used for all
                other sections).

        Returns:
            ``self.insights`` with the keys ``overview``, ``sentiment``,
            ``emotion``, ``category``, ``topics``, ``comparison`` and
            ``actions``.
        """
        # Overall statistics
        total_posts = len(df)
        prime_posts = len(prime_df)
        prime_percentage = (prime_posts / total_posts * 100) if total_posts > 0 else 0

        self.insights['overview'] = {
            'summary': f"Analyzed {total_posts:,} total posts, of which {prime_posts:,} ({prime_percentage:.1f}%) specifically mention Prime Bank.",
            'context': f"The remaining {total_posts - prime_posts:,} posts mention other banks or general banking topics."
        }

        self.insights['sentiment'] = self._generate_sentiment_insights(prime_df)
        self.insights['emotion'] = self._generate_emotion_insights(prime_df)
        self.insights['category'] = self._generate_category_insights(prime_df)
        self.insights['topics'] = self._generate_topic_insights(prime_df)
        # The comparison needs every bank's posts, hence the full frame.
        self.insights['comparison'] = self._generate_comparison_insights(df)
        self.insights['actions'] = self._generate_action_insights(prime_df)

        return self.insights

    def _generate_sentiment_insights(self, df):
        """Summarize the sentiment split, sample posts and negative themes."""
        if len(df) == 0:
            return {'summary': 'No Prime Bank posts found for sentiment analysis.'}

        # Percentage share of each sentiment label.
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100

        # Up to two sample posts per sentiment.
        sentiment_examples = {
            sentiment: df[df['sentiment'] == sentiment]['text'].head(2).tolist()
            for sentiment in ['Positive', 'Negative', 'Neutral']
        }

        # Look for recurring issue themes across all negative posts.
        theme_keywords = [
            ('long wait times', ('wait', 'queue')),
            ('fees and charges', ('fee', 'charge')),
            ('digital banking issues', ('app', 'online')),
            ('customer service', ('staff', 'service')),
        ]
        negative_posts = df[df['sentiment'] == 'Negative']['text']
        negative_themes = []
        if len(negative_posts) > 0:
            all_negative_text = ' '.join(negative_posts.astype(str).tolist()).lower()
            # Word-boundary matching: a plain substring test flagged
            # "digital banking issues" for any post containing
            # "disappointed" or "happy" (both contain "app").
            negative_themes = [
                theme
                for theme, keywords in theme_keywords
                if any(self._count_keyword(all_negative_text, kw) for kw in keywords)
            ]

        return {
            'summary': f"Sentiment breakdown: {sentiment_dist.get('Positive', 0):.1f}% positive, {sentiment_dist.get('Negative', 0):.1f}% negative, {sentiment_dist.get('Neutral', 0):.1f}% neutral.",
            'positive_context': f"Positive posts ({sentiment_dist.get('Positive', 0):.1f}%) primarily praise customer service, digital banking features, and efficient processes.",
            'negative_context': f"Negative posts ({sentiment_dist.get('Negative', 0):.1f}%) mainly complain about: {', '.join(negative_themes) if negative_themes else 'various service issues'}.",
            'neutral_context': f"Neutral posts ({sentiment_dist.get('Neutral', 0):.1f}%) are mostly inquiries about services and general discussions.",
            'examples': sentiment_examples,
            'concern_areas': negative_themes
        }

    def _generate_emotion_insights(self, df):
        """Summarize the emotion distribution and top trigger keywords."""
        if len(df) == 0:
            return {'summary': 'No Prime Bank posts found for emotion analysis.'}

        emotion_dist = df['emotion'].value_counts()
        total_emotional = len(df[df['emotion'] != 'Neutral'])

        emotion_contexts = {
            'Joy': 'Customers expressing joy are satisfied with services, particularly praising staff helpfulness and quick problem resolution.',
            'Frustration': 'Frustrated customers mainly face issues with wait times, technical problems, and unresolved complaints.',
            'Confusion': 'Confused customers need better information about products, fees, and online banking procedures.',
            'Anxiety': 'Anxious customers are worried about account security, loan applications, and urgent transaction issues.'
        }

        # Three most common trigger keywords per emotion.
        emotion_keywords = {}
        for emotion in ['Joy', 'Frustration', 'Confusion', 'Anxiety']:
            emotion_posts = df[df['emotion'] == emotion]
            # Cells that are not lists (e.g. NaN) are ignored.
            all_keywords = [
                kw
                for keywords in emotion_posts['emotion_keywords']
                if isinstance(keywords, list)
                for kw in keywords
            ]
            if all_keywords:
                emotion_keywords[emotion] = Counter(all_keywords).most_common(3)

        return {
            'summary': f"{total_emotional} out of {len(df)} Prime Bank posts ({total_emotional/len(df)*100:.1f}%) express clear emotions.",
            'distribution': {emotion: count for emotion, count in emotion_dist.items()},
            'contexts': emotion_contexts,
            # value_counts() sorts descending, so index[0] is the mode.
            'top_emotion': emotion_dist.index[0] if len(emotion_dist) > 0 else 'None',
            'keywords': emotion_keywords,
            'recommendation': self._get_emotion_recommendation(emotion_dist)
        }

    def _generate_category_insights(self, df):
        """Summarize post categories with canned topics and actions."""
        if len(df) == 0:
            return {'summary': 'No Prime Bank posts found for category analysis.'}

        category_dist = df['category'].value_counts()

        # Static guidance per category; not derived from the data.
        category_insights = {
            'Inquiry': {
                'common_topics': ['account opening', 'loan applications', 'online banking setup', 'branch locations'],
                'action': 'Improve FAQ section and provide clearer information channels'
            },
            'Complaint': {
                'common_topics': ['service delays', 'technical issues', 'hidden fees', 'staff behavior'],
                'action': 'Establish rapid response team for complaint resolution'
            },
            'Praise': {
                'common_topics': ['helpful staff', 'quick service', 'user-friendly app', 'problem resolution'],
                'action': 'Recognize and reward mentioned staff members'
            },
            'Suggestion': {
                'common_topics': ['new features', 'branch expansion', 'service improvements', 'digital enhancements'],
                'action': 'Review suggestions for product development roadmap'
            }
        }

        return {
            'summary': f"Post categories: {', '.join([f'{cat} ({count})' for cat, count in category_dist.items()])}",
            'details': category_insights,
            'urgent_attention': f"{category_dist.get('Complaint', 0)} complaints require immediate attention",
            'opportunities': f"{category_dist.get('Suggestion', 0)} suggestions for improvement"
        }

    def _generate_topic_insights(self, df):
        """Rank predefined topics by how often their keywords are mentioned."""
        if len(df) == 0:
            return {'summary': 'No Prime Bank posts found for topic analysis.'}

        # Combine all text
        all_text = ' '.join(df['text'].astype(str).tolist()).lower()

        # Define topic keywords
        topics = {
            'Digital Banking': ['app', 'online', 'mobile', 'website', 'internet banking'],
            'Customer Service': ['staff', 'service', 'help', 'support', 'employee'],
            'Fees & Charges': ['fee', 'charge', 'cost', 'expensive', 'price'],
            'Loans': ['loan', 'credit', 'mortgage', 'interest', 'emi'],
            'ATM & Branch': ['atm', 'branch', 'location', 'machine', 'cash'],
            'Account Services': ['account', 'savings', 'current', 'balance', 'statement']
        }

        # Count actual keyword occurrences. The earlier version counted how
        # many distinct keywords appeared at all (at most five per topic)
        # while reporting the number as "mentions".
        topic_counts = {}
        for topic, keywords in topics.items():
            count = sum(self._count_keyword(all_text, keyword) for keyword in keywords)
            if count > 0:
                topic_counts[topic] = count

        sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

        return {
            'summary': f"Top discussed topics: {', '.join([f'{topic} ({count} mentions)' for topic, count in sorted_topics[:3]])}",
            'all_topics': dict(sorted_topics),
            'trending': sorted_topics[0][0] if sorted_topics else 'None',
            'recommendation': f"Focus on improving {sorted_topics[0][0].lower()} based on high discussion volume" if sorted_topics else "No clear topic trends"
        }

    def _generate_comparison_insights(self, df):
        """Compare Prime Bank's positive-sentiment rate with other banks."""
        bank_sentiment = {}

        for bank in ['prime_bank', 'eastern_bank', 'brac_bank', 'city_bank', 'dutch_bangla']:
            bank_posts = df[df['primary_bank'] == bank]
            if len(bank_posts) > 0:
                positive_rate = (bank_posts['sentiment'] == 'Positive').sum() / len(bank_posts) * 100
                bank_sentiment[bank] = {
                    'posts': len(bank_posts),
                    'positive_rate': positive_rate
                }

        if 'prime_bank' not in bank_sentiment:
            return {'summary': 'No comparative data available.'}

        prime_positive = bank_sentiment['prime_bank']['positive_rate']
        # NOTE: "average" here is a fixed 50% threshold, not a mean computed
        # from the other banks' rates.
        comparison = "above average" if prime_positive > 50 else "below average"

        return {
            'summary': f"Prime Bank has {prime_positive:.1f}% positive sentiment, which is {comparison} in the banking sector.",
            'comparison': bank_sentiment,
            'recommendation': "Focus on maintaining positive momentum" if prime_positive > 50 else "Urgent improvement needed to match competitor satisfaction levels"
        }

    def _generate_action_insights(self, df):
        """Count posts needing urgent response and posts worth amplifying."""
        if len(df) == 0:
            return {'summary': 'No Prime Bank posts found for action analysis.'}

        # High priority: negative complaints written in frustration/anxiety.
        high_priority = df[
            (df['sentiment'] == 'Negative') &
            (df['emotion'].isin(['Frustration', 'Anxiety'])) &
            (df['category'] == 'Complaint')
        ]

        # Quick wins: positive praise that can be amplified.
        quick_wins = df[
            (df['sentiment'] == 'Positive') &
            (df['category'] == 'Praise')
        ]

        return {
            'immediate': {
                'count': len(high_priority),
                'description': 'High-priority complaints requiring immediate response',
                'action': 'Contact these customers within 24 hours'
            },
            'quick_wins': {
                'count': len(quick_wins),
                'description': 'Positive testimonials for marketing use',
                'action': 'Share success stories and thank customers publicly'
            },
            'strategic': {
                'description': 'Long-term improvements based on feedback patterns',
                'actions': [
                    'Enhance digital banking infrastructure',
                    'Implement customer service training program',
                    'Review and simplify fee structure'
                ]
            }
        }

    def _get_emotion_recommendation(self, emotion_dist):
        """Return a recommendation for the most frequent emotion.

        Args:
            emotion_dist: A ``value_counts()``-style Series whose first
                index entry is treated as the most frequent emotion.
        """
        if len(emotion_dist) == 0:
            return "No emotional data to analyze"

        top_emotion = emotion_dist.index[0]

        recommendations = {
            'Joy': "Leverage positive emotions by encouraging happy customers to share testimonials",
            'Frustration': "Implement rapid response protocol for frustrated customers to prevent escalation",
            'Confusion': "Create clearer communication materials and improve customer education",
            'Anxiety': "Provide reassurance through proactive communication about security and processes",
            # The original text stopped mid-sentence after "to create".
            'Neutral': "Engage neutral customers with targeted campaigns to create positive emotional connections"
        }
        return recommendations.get(top_emotion, "Monitor customer emotions closely")
|