# NOTE: removed non-Python web-scrape residue that preceded this module
# (Hugging Face Spaces page chrome, commit hashes, and a line-number gutter).
# src/data_processor.py
import pandas as pd
import re
import numpy as np
import json
import streamlit as st
try:
import openai
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
try:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)
NLTK_AVAILABLE = True
except ImportError:
NLTK_AVAILABLE = False
class DataProcessor:
    """Analyze social-media/review posts about banks.

    Per-post pipeline: regex-based bank identification, hybrid sentiment
    analysis (VADER baseline with an optional GPT-3.5 second opinion),
    keyword-based emotion detection, and rule-based post categorization.
    """

    def __init__(self, openai_api_key=None):
        """Set up the optional analyzers.

        Args:
            openai_api_key: OpenAI API key. When provided (and the ``openai``
                package imported successfully) GPT is used as a sentiment
                second opinion; otherwise only VADER is used.
        """
        self.processed_data = None
        # VADER is optional: fall back to None when nltk is missing or the
        # analyzer cannot be constructed (e.g. lexicon download failed).
        if NLTK_AVAILABLE:
            try:
                self.sia = SentimentIntensityAnalyzer()
            except Exception:
                self.sia = None
        else:
            self.sia = None
        self.use_gpt = False
        if openai_api_key and OPENAI_AVAILABLE:
            openai.api_key = openai_api_key
            self.use_gpt = True
        # Regex patterns per bank (applied to lowercased text), covering
        # Prime Bank and competitor banks.
        self.bank_patterns = {
            'prime_bank': [r'prime\s*bank', r'primebank', r'@primebank', r'prime\s*b\.?'],
            'eastern_bank': [r'eastern\s*bank', r'ebl', r'@easternbank'],
            'brac_bank': [r'brac\s*bank', r'@bracbank'],
            'city_bank': [r'city\s*bank', r'@citybank'],
            'dutch_bangla': [r'dutch\s*bangla', r'dbbl', r'@dutchbangla']
        }

    def load_data_from_files(self, csv_files=None, txt_files=None):
        """Load posts from CSV and/or plain-text files.

        TXT files are split on newlines, one post per non-blank line. Each
        row is tagged with its 'source_file' basename. Files that fail to
        load are skipped with a printed message (best-effort by design).

        Returns:
            A single concatenated DataFrame, or an empty DataFrame when
            nothing could be loaded.
        """
        all_data = []
        if csv_files:
            for file_path in csv_files:
                try:
                    df = pd.read_csv(file_path)
                    df['source_file'] = file_path.split('/')[-1]
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
        if txt_files:
            for file_path in txt_files:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    posts = content.split('\n')
                    df = pd.DataFrame({
                        'text': [post.strip() for post in posts if post.strip()],
                        'source_file': file_path.split('/')[-1]
                    })
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
        if all_data:
            return pd.concat(all_data, ignore_index=True)
        return pd.DataFrame()

    def identify_bank(self, text):
        """Identify which bank(s) a post mentions.

        Returns:
            (primary, all_mentioned): ``primary`` is the single matching bank
            key, 'multiple' when more than one bank matches, or 'none';
            ``all_mentioned`` lists every matching bank key.
        """
        if pd.isna(text):
            return 'none', []
        text_lower = str(text).lower()
        mentioned_banks = []
        for bank, patterns in self.bank_patterns.items():
            # One hit on any pattern is enough to record the bank once.
            if any(re.search(pattern, text_lower) for pattern in patterns):
                mentioned_banks.append(bank)
        if not mentioned_banks:
            return 'none', []
        if len(mentioned_banks) == 1:
            return mentioned_banks[0], mentioned_banks
        return 'multiple', mentioned_banks

    def count_bank_mentions(self, text, bank='prime_bank'):
        """Count regex matches of one bank's patterns in the text.

        Returns 0 for NaN text or an unknown bank key.

        NOTE(review): a bank's patterns can overlap (the spaced and unspaced
        Prime Bank variants can both match the same phrase), so one mention
        may be counted more than once — confirm before treating this as an
        exact mention count.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()
        total_mentions = 0
        if bank in self.bank_patterns:
            for pattern in self.bank_patterns[bank]:
                total_mentions += len(re.findall(pattern, text_lower))
        return total_mentions

    def analyze_sentiment(self, text):
        """Classify sentiment with a hybrid VADER + optional GPT approach.

        1. VADER gives a fast, cheap baseline.
        2. When GPT is enabled, complaints and VADER-neutral posts are
           re-scored by GPT for context awareness.

        Returns:
            (sentiment, polarity, post_link): ``sentiment`` is 'Positive',
            'Negative' or 'Neutral'; ``polarity`` is in [-1.0, 1.0];
            ``post_link`` is whatever the GPT response provides ('' on all
            non-GPT paths).
        """
        # BUGFIX: this early exit previously returned a 2-tuple, which broke
        # the three-column unpacking in process_all_data for blank/NaN posts.
        if pd.isna(text) or str(text).strip() == '':
            return 'Neutral', 0.0, ''
        text_str = str(text)
        # --- Step 1: VADER baseline (fast and cheap) ---
        vader_sentiment = 'Neutral'
        vader_polarity = 0.0
        if self.sia:
            compound = self.sia.polarity_scores(text_str)['compound']
            vader_polarity = compound
            # Standard VADER thresholds: >= 0.05 positive, <= -0.05 negative.
            if compound >= 0.05:
                vader_sentiment = 'Positive'
            elif compound <= -0.05:
                vader_sentiment = 'Negative'
        # --- Step 2: GPT second opinion on tricky cases (if enabled) ---
        is_complaint = any(word in text_str.lower() for word in ['complaint', 'problem', 'issue', 'error', 'failed', 'not working', 'terrible', 'worst', 'pathetic', 'disappointed'])
        # Escalate to the model when the post looks like a complaint or
        # when VADER is unsure (Neutral).
        if self.use_gpt and (is_complaint or vader_sentiment == 'Neutral'):
            try:
                # NOTE(review): the prompt asks the model for a "post_link",
                # which it cannot know from the comment text alone — confirm
                # whether downstream code still needs this field.
                prompt = f"""
                Analyze the sentiment of the following customer comment for a bank.
                The context is critical. A statement like "my balance is zero" is highly negative.
                Classify the sentiment as one of: 'Positive', 'Negative', or 'Neutral'.
                Also provide a polarity score from -1.0 (most negative) to 1.0 (most positive).
                Return your answer ONLY as a JSON object with keys "sentiment", "polarity" and "post_link".
                Customer Comment: "{text_str}"
                """
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    response_format={"type": "json_object"}
                )
                result = json.loads(response.choices[0].message.content)
                sentiment = result.get('sentiment', 'Neutral')
                polarity = float(result.get('polarity', 0.0))
                post_link = result.get('post_link', '')
                return (sentiment, polarity, post_link)
            except Exception as e:
                # Network/API failures degrade gracefully to the VADER result.
                print(f"OpenAI call failed: {e}. Falling back to VADER.")
                return (vader_sentiment, vader_polarity, '')
        # --- Step 3: VADER result when GPT is not used or not triggered ---
        return (vader_sentiment, vader_polarity, '')

    def detect_emotion(self, text):
        """Detect the dominant emotion via keyword spotting.

        Returns:
            (emotion, keywords_found): ``emotion`` is one of 'Joy',
            'Frustration', 'Confusion', 'Anxiety' or 'Neutral';
            ``keywords_found`` lists the matched keywords for that emotion.
        """
        if pd.isna(text):
            return 'Neutral', []
        text_lower = str(text).lower()
        emotions = {
            'Joy': {'keywords': ['happy', 'excellent', 'amazing', 'great', 'wonderful', 'fantastic', 'love', 'best', 'thank you', 'appreciate']},
            'Frustration': {'keywords': ['frustrated', 'angry', 'terrible', 'horrible', 'worst', 'hate', 'annoyed', 'disappointed', 'pathetic']},
            'Confusion': {'keywords': ['confused', 'unclear', "don't understand", 'what', 'how', 'why', '?', 'help me', 'lost']},
            'Anxiety': {'keywords': ['worried', 'concern', 'anxious', 'nervous', 'scared', 'fear', 'panic', 'urgent']}
        }
        emotion_scores = {}
        detected_keywords = {}
        for emotion, data in emotions.items():
            keywords_found = [kw for kw in data['keywords'] if kw in text_lower]
            emotion_scores[emotion] = len(keywords_found)
            if keywords_found:
                detected_keywords[emotion] = keywords_found
        if max(emotion_scores.values()) > 0:
            # Ties resolve to the first emotion in dict insertion order.
            primary_emotion = max(emotion_scores, key=emotion_scores.get)
            return primary_emotion, detected_keywords.get(primary_emotion, [])
        return 'Neutral', []

    def categorize_post(self, text):
        """Categorize a post by simple keyword rules.

        Rules are checked in priority order: Inquiry, Complaint, Praise,
        Suggestion, then Other.

        Returns:
            (category, reason): the category label and a human-readable
            explanation of why it was assigned.
        """
        if pd.isna(text):
            return 'Other', 'No text content'
        text_lower = str(text).lower()
        if '?' in text_lower or any(phrase in text_lower for phrase in ['how do', 'what is', 'when', 'where', 'can i', 'could you', 'explain']):
            return 'Inquiry', 'Contains questions or information seeking'
        elif any(word in text_lower for word in ['complaint', 'problem', 'issue', 'error', 'failed', 'not working', 'terrible', 'worst']):
            return 'Complaint', 'Contains complaint or problem description'
        elif any(word in text_lower for word in ['thank', 'great', 'excellent', 'love', 'best', 'appreciate', 'amazing']):
            return 'Praise', 'Contains positive feedback or appreciation'
        elif any(word in text_lower for word in ['suggest', 'should', 'could', 'recommend', 'request', 'please add']):
            return 'Suggestion', 'Contains suggestions or feature requests'
        else:
            return 'Other', 'General discussion or observation'

    def process_all_data(self, df):
        """Run the full analysis pipeline over a DataFrame of posts.

        Locates the text column (renaming it to 'text' in place), then adds
        bank, sentiment, emotion, category and engagement columns. The input
        DataFrame is modified and returned.

        Returns:
            The enriched DataFrame; the input unchanged when empty; an empty
            DataFrame (with the original columns) when no text column exists.
        """
        if df.empty:
            return df
        # Accept a variety of text column names from different data sources.
        text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content', 'Post', 'Review Text']
        text_col = None
        for col in text_columns:
            if col in df.columns:
                text_col = col
                break
        if not text_col:
            st.warning("Could not find a text column in one of the data sources.")
            return pd.DataFrame(columns=df.columns)
        if text_col != 'text':
            df.rename(columns={text_col: 'text'}, inplace=True)
        df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(lambda x: pd.Series(self.identify_bank(x)))
        df['prime_mentions'] = df['text'].apply(lambda x: self.count_bank_mentions(x, 'prime_bank'))
        df[['sentiment', 'polarity', 'post_link']] = df['text'].apply(lambda x: pd.Series(self.analyze_sentiment(x)))
        df[['emotion', 'emotion_keywords']] = df['text'].apply(lambda x: pd.Series(self.detect_emotion(x)))
        df[['category', 'category_reason']] = df['text'].apply(lambda x: pd.Series(self.categorize_post(x)))
        # NOTE: The patch for 'Inquiry' sentiment has been removed, as the
        # AI-powered sentiment analysis handles this contextually.
        # Engagement-weighted score: shares count double, comments 1.5x.
        # BUGFIX: start from a float so the 1.2x boost below never writes a
        # float into an integer column (incompatible-dtype assignment is
        # deprecated in pandas 2.x and will raise in a future release).
        df['viral_score'] = 0.0
        if 'likes' in df.columns:
            df['viral_score'] += df['likes'].fillna(0)
        if 'shares' in df.columns:
            df['viral_score'] += df['shares'].fillna(0) * 2
        if 'comments' in df.columns:
            df['viral_score'] += df['comments'].fillna(0) * 1.5
        # Posts mentioning Prime Bank get a 20% visibility boost.
        if not df.empty and 'prime_mentions' in df.columns:
            df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
        return df