# src/data_processor.py
"""Load, classify and score bank-related social-media posts.

Heavy third-party dependencies (OpenAI, NLTK, Streamlit) are optional: each
is imported behind a try/except so the module still imports — with reduced
functionality — when a dependency is missing.
"""
import json
import re

import numpy as np
import pandas as pd

# Optional: OpenAI for context-aware sentiment (enabled per instance via API key).
try:
    import openai
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

# Optional: NLTK's VADER for fast lexicon-based sentiment.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    nltk.download('vader_lexicon', quiet=True)
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Optional: Streamlit, used only to surface a warning in the UI.  Guarded like
# the other optional dependencies so the module also imports outside Streamlit.
try:
    import streamlit as st
    STREAMLIT_AVAILABLE = True
except ImportError:
    st = None
    STREAMLIT_AVAILABLE = False


class DataProcessor:
    """Pipeline that enriches raw post/comment data with bank mentions,
    sentiment, emotion, post category and a simple virality score."""

    def __init__(self, openai_api_key=None):
        """
        Args:
            openai_api_key: if given (and the ``openai`` package is
                installed), GPT is used as a second-opinion sentiment
                analyzer on tricky cases; otherwise only VADER (if
                available) is used.
        """
        self.processed_data = None

        # VADER analyzer; None when NLTK is unavailable or the analyzer
        # cannot be constructed (e.g. vader_lexicon data missing).
        if NLTK_AVAILABLE:
            try:
                self.sia = SentimentIntensityAnalyzer()
            except Exception:
                self.sia = None
        else:
            self.sia = None

        self.use_gpt = False
        if openai_api_key and OPENAI_AVAILABLE:
            openai.api_key = openai_api_key
            self.use_gpt = True

        # Banking patterns - INCLUDING OTHER BANKS.  Patterns are matched
        # against lowercased text, so they are effectively case-insensitive.
        self.bank_patterns = {
            'prime_bank': [r'prime\s*bank', r'primebank', r'@primebank', r'prime\s*b\.?'],
            'eastern_bank': [r'eastern\s*bank', r'ebl', r'@easternbank'],
            'brac_bank': [r'brac\s*bank', r'@bracbank'],
            'city_bank': [r'city\s*bank', r'@citybank'],
            'dutch_bangla': [r'dutch\s*bangla', r'dbbl', r'@dutchbangla'],
        }

    def load_data_from_files(self, csv_files=None, txt_files=None):
        """Load data from CSV and TXT files.

        Args:
            csv_files: iterable of CSV file paths; each becomes a DataFrame.
            txt_files: iterable of plain-text paths; each non-blank line
                becomes one row in a ``text`` column.

        Returns:
            A single concatenated DataFrame (with a ``source_file`` column),
            or an empty DataFrame when nothing could be loaded.  Per-file
            load errors are printed and skipped (best-effort).
        """
        all_data = []

        if csv_files:
            for file_path in csv_files:
                try:
                    df = pd.read_csv(file_path)
                    df['source_file'] = file_path.split('/')[-1]
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")

        if txt_files:
            for file_path in txt_files:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    # One post per line; blank lines are dropped.
                    posts = content.split('\n')
                    df = pd.DataFrame({
                        'text': [post.strip() for post in posts if post.strip()],
                        'source_file': file_path.split('/')[-1]
                    })
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")

        if all_data:
            return pd.concat(all_data, ignore_index=True)
        return pd.DataFrame()

    def identify_bank(self, text):
        """Identify which bank is mentioned in the text.

        Returns:
            ``(primary, mentioned)`` where ``primary`` is the bank key,
            ``'none'`` when no bank matches, or ``'multiple'`` when more
            than one bank matches; ``mentioned`` is the list of all
            matched bank keys.
        """
        if pd.isna(text):
            return 'none', []

        text_lower = str(text).lower()
        mentioned_banks = []

        for bank, patterns in self.bank_patterns.items():
            for pattern in patterns:
                if re.search(pattern, text_lower):
                    mentioned_banks.append(bank)
                    break  # one hit is enough to record this bank

        if not mentioned_banks:
            return 'none', []
        elif len(mentioned_banks) == 1:
            return mentioned_banks[0], mentioned_banks
        else:
            return 'multiple', mentioned_banks

    def count_bank_mentions(self, text, bank='prime_bank'):
        """Count mentions of specific bank.

        Note: patterns for a bank can overlap (e.g. ``prime\\s*bank`` and
        ``prime\\s*b\\.?``), so one textual mention may count more than once.
        """
        if pd.isna(text):
            return 0

        text_lower = str(text).lower()
        total_mentions = 0

        if bank in self.bank_patterns:
            for pattern in self.bank_patterns[bank]:
                mentions = len(re.findall(pattern, text_lower))
                total_mentions += mentions

        return total_mentions

    def analyze_sentiment(self, text):
        """
        Analyzes sentiment using a hybrid approach.
        1. Uses VADER for a fast, initial assessment.
        2. If GPT is available, uses it for a more nuanced analysis on
           tricky cases (e.g., complaints, or where VADER is neutral but
           context seems negative).

        Returns:
            ``(sentiment, polarity, post_link)`` — always a 3-tuple so the
            result can be unpacked into three DataFrame columns.
        """
        # BUGFIX: this early exit previously returned a 2-tuple, which broke
        # the 3-column unpacking in process_all_data for NaN/empty rows.
        if pd.isna(text) or str(text).strip() == '':
            return 'Neutral', 0.0, ''

        text_str = str(text)

        # --- Step 1: Initial analysis with VADER (fast and cheap) ---
        vader_sentiment = 'Neutral'
        vader_polarity = 0.0
        if self.sia:
            scores = self.sia.polarity_scores(text_str)
            compound = scores['compound']
            vader_polarity = compound
            # Standard VADER thresholds for positive/negative classification.
            if compound >= 0.05:
                vader_sentiment = 'Positive'
            elif compound <= -0.05:
                vader_sentiment = 'Negative'

        # --- Step 2: Use GPT for a more accurate, context-aware second opinion (if enabled) ---
        is_complaint = any(word in text_str.lower() for word in
                           ['complaint', 'problem', 'issue', 'error', 'failed',
                            'not working', 'terrible', 'worst', 'pathetic', 'disappointed'])

        # Trigger AI if it's a complaint, or if VADER is unsure (Neutral)
        if self.use_gpt and (is_complaint or vader_sentiment == 'Neutral'):
            try:
                # NOTE(review): the prompt asks the model for a "post_link"
                # it cannot actually know from the comment text alone, so the
                # returned link is unreliable — confirm whether it is used.
                prompt = f""" Analyze the sentiment of the following customer comment for a bank. The context is critical. A statement like "my balance is zero" is highly negative. Classify the sentiment as one of: 'Positive', 'Negative', or 'Neutral'. 
 Also provide a polarity score from -1.0 (most negative) to 1.0 (most positive). Return your answer ONLY as a JSON object with keys "sentiment", "polarity" and "post_link". Customer Comment: "{text_str}" """
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    response_format={"type": "json_object"}
                )
                result = json.loads(response.choices[0].message.content)
                sentiment = result.get('sentiment', 'Neutral')
                polarity = float(result.get('polarity', 0.0))
                post_link = result.get('post_link', '')
                return (sentiment, polarity, post_link)
            except Exception as e:
                # Best-effort: any API/parse failure falls back to VADER.
                print(f"OpenAI call failed: {e}. Falling back to VADER.")
                return (vader_sentiment, vader_polarity, '')

        # --- Step 3: Fallback to VADER if GPT is not used or triggered ---
        return (vader_sentiment, vader_polarity, '')

    def detect_emotion(self, text):
        """Detect emotion in text with context.

        Returns:
            ``(emotion, keywords)`` — the highest-scoring emotion (ties
            resolved by dict order) and the keywords that triggered it, or
            ``('Neutral', [])`` when nothing matches.
        """
        if pd.isna(text):
            return 'Neutral', []

        text_lower = str(text).lower()

        # Keyword lexicon per emotion; score = number of keywords present.
        emotions = {
            'Joy': {'keywords': ['happy', 'excellent', 'amazing', 'great', 'wonderful',
                                 'fantastic', 'love', 'best', 'thank you', 'appreciate']},
            'Frustration': {'keywords': ['frustrated', 'angry', 'terrible', 'horrible', 'worst',
                                         'hate', 'annoyed', 'disappointed', 'pathetic']},
            'Confusion': {'keywords': ['confused', 'unclear', "don't understand", 'what', 'how',
                                       'why', '?', 'help me', 'lost']},
            'Anxiety': {'keywords': ['worried', 'concern', 'anxious', 'nervous', 'scared',
                                     'fear', 'panic', 'urgent']}
        }

        emotion_scores = {}
        detected_keywords = {}
        for emotion, data in emotions.items():
            keywords_found = [kw for kw in data['keywords'] if kw in text_lower]
            score = len(keywords_found)
            emotion_scores[emotion] = score
            if keywords_found:
                detected_keywords[emotion] = keywords_found

        if max(emotion_scores.values()) > 0:
            primary_emotion = max(emotion_scores, key=emotion_scores.get)
            return primary_emotion, detected_keywords.get(primary_emotion, [])
        return 'Neutral', []

    def categorize_post(self, text):
        """Categorize post type with reason.

        Returns:
            ``(category, reason)`` where category is one of Inquiry,
            Complaint, Praise, Suggestion or Other.  The first matching
            rule wins, so a complaining question is still an Inquiry.
        """
        if pd.isna(text):
            return 'Other', 'No text content'

        text_lower = str(text).lower()

        if '?' in text_lower or any(phrase in text_lower for phrase in
                                    ['how do', 'what is', 'when', 'where', 'can i',
                                     'could you', 'explain']):
            return 'Inquiry', 'Contains questions or information seeking'
        elif any(word in text_lower for word in
                 ['complaint', 'problem', 'issue', 'error', 'failed',
                  'not working', 'terrible', 'worst']):
            return 'Complaint', 'Contains complaint or problem description'
        elif any(word in text_lower for word in
                 ['thank', 'great', 'excellent', 'love', 'best', 'appreciate', 'amazing']):
            return 'Praise', 'Contains positive feedback or appreciation'
        elif any(word in text_lower for word in
                 ['suggest', 'should', 'could', 'recommend', 'request', 'please add']):
            return 'Suggestion', 'Contains suggestions or feature requests'
        else:
            return 'Other', 'General discussion or observation'

    def process_all_data(self, df):
        """Apply all processing to dataframe.

        Locates the text column (renaming it to ``'text'`` in place, as
        before), then adds bank, sentiment, emotion, category and
        viral-score columns.  Returns an empty frame (same columns) when
        no recognizable text column exists.
        """
        if df.empty:
            return df

        # Accept a variety of text-column names from different data sources.
        text_columns = ['text', 'content', 'message', 'review', 'comment',
                        'post', 'Text', 'Content', 'Post', 'Review Text']
        text_col = None
        for col in text_columns:
            if col in df.columns:
                text_col = col
                break

        if not text_col:
            warning_msg = "Could not find a text column in one of the data sources."
            if st is not None:
                st.warning(warning_msg)
            else:
                # Running outside Streamlit: fall back to stdout.
                print(warning_msg)
            return pd.DataFrame(columns=df.columns)

        if text_col != 'text':
            df.rename(columns={text_col: 'text'}, inplace=True)

        # Each helper returns a tuple; pd.Series spreads it across columns.
        df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
            lambda x: pd.Series(self.identify_bank(x)))
        df['prime_mentions'] = df['text'].apply(
            lambda x: self.count_bank_mentions(x, 'prime_bank'))
        df[['sentiment', 'polarity', 'post_link']] = df['text'].apply(
            lambda x: pd.Series(self.analyze_sentiment(x)))
        df[['emotion', 'emotion_keywords']] = df['text'].apply(
            lambda x: pd.Series(self.detect_emotion(x)))
        df[['category', 'category_reason']] = df['text'].apply(
            lambda x: pd.Series(self.categorize_post(x)))

        # NOTE: The patch for 'Inquiry' sentiment has been removed, as the new
        # AI-powered sentiment analysis handles this contextually.

        # Simple weighted engagement score; missing metrics contribute 0.
        df['viral_score'] = 0
        if 'likes' in df.columns:
            df['viral_score'] += df['likes'].fillna(0)
        if 'shares' in df.columns:
            df['viral_score'] += df['shares'].fillna(0) * 2
        if 'comments' in df.columns:
            df['viral_score'] += df['comments'].fillna(0) * 1.5

        # Posts mentioning Prime Bank get a 20% visibility boost.
        if not df.empty and 'prime_mentions' in df.columns:
            df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2

        return df