# src/data_processor.py
"""Load, classify and score bank-related social-media posts.

Heavy third-party dependencies (OpenAI, NLTK, Streamlit) are optional: each
is imported behind a try/except so the module still imports — with reduced
functionality — when a dependency is missing.
"""
import json
import re

import numpy as np
import pandas as pd

# Optional: OpenAI for context-aware sentiment (enabled per instance via API key).
try:
    import openai
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

# Optional: NLTK's VADER for fast lexicon-based sentiment.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    nltk.download('vader_lexicon', quiet=True)
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Optional: Streamlit, used only to surface a warning in the UI.  Guarded like
# the other optional dependencies so the module also imports outside Streamlit.
try:
    import streamlit as st
    STREAMLIT_AVAILABLE = True
except ImportError:
    st = None
    STREAMLIT_AVAILABLE = False


class DataProcessor:
    """Pipeline that enriches raw post/comment data with bank mentions,
    sentiment, emotion, post category and a simple virality score."""

    def __init__(self, openai_api_key=None):
        """
        Args:
            openai_api_key: if given (and the ``openai`` package is
                installed), GPT is used as a second-opinion sentiment
                analyzer on tricky cases; otherwise only VADER (if
                available) is used.
        """
        self.processed_data = None

        # VADER analyzer; None when NLTK is unavailable or the analyzer
        # cannot be constructed (e.g. vader_lexicon data missing).
        if NLTK_AVAILABLE:
            try:
                self.sia = SentimentIntensityAnalyzer()
            except Exception:
                self.sia = None
        else:
            self.sia = None

        self.use_gpt = False
        if openai_api_key and OPENAI_AVAILABLE:
            openai.api_key = openai_api_key
            self.use_gpt = True

        # Banking patterns - INCLUDING OTHER BANKS.  Patterns are matched
        # against lowercased text, so they are effectively case-insensitive.
        self.bank_patterns = {
            'prime_bank': [r'prime\s*bank', r'primebank', r'@primebank', r'prime\s*b\.?'],
            'eastern_bank': [r'eastern\s*bank', r'ebl', r'@easternbank'],
            'brac_bank': [r'brac\s*bank', r'@bracbank'],
            'city_bank': [r'city\s*bank', r'@citybank'],
            'dutch_bangla': [r'dutch\s*bangla', r'dbbl', r'@dutchbangla'],
        }

    def load_data_from_files(self, csv_files=None, txt_files=None):
        """Load data from CSV and TXT files.

        Args:
            csv_files: iterable of CSV file paths; each becomes a DataFrame.
            txt_files: iterable of plain-text paths; each non-blank line
                becomes one row in a ``text`` column.

        Returns:
            A single concatenated DataFrame (with a ``source_file`` column),
            or an empty DataFrame when nothing could be loaded.  Per-file
            load errors are printed and skipped (best-effort).
        """
        all_data = []

        if csv_files:
            for file_path in csv_files:
                try:
                    df = pd.read_csv(file_path)
                    df['source_file'] = file_path.split('/')[-1]
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")

        if txt_files:
            for file_path in txt_files:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    # One post per line; blank lines are dropped.
                    posts = content.split('\n')
                    df = pd.DataFrame({
                        'text': [post.strip() for post in posts if post.strip()],
                        'source_file': file_path.split('/')[-1]
                    })
                    all_data.append(df)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")

        if all_data:
            return pd.concat(all_data, ignore_index=True)
        return pd.DataFrame()

    def identify_bank(self, text):
        """Identify which bank is mentioned in the text.

        Returns:
            ``(primary, mentioned)`` where ``primary`` is the bank key,
            ``'none'`` when no bank matches, or ``'multiple'`` when more
            than one bank matches; ``mentioned`` is the list of all
            matched bank keys.
        """
        if pd.isna(text):
            return 'none', []

        text_lower = str(text).lower()
        mentioned_banks = []

        for bank, patterns in self.bank_patterns.items():
            for pattern in patterns:
                if re.search(pattern, text_lower):
                    mentioned_banks.append(bank)
                    break  # one hit is enough to record this bank

        if not mentioned_banks:
            return 'none', []
        elif len(mentioned_banks) == 1:
            return mentioned_banks[0], mentioned_banks
        else:
            return 'multiple', mentioned_banks

    def count_bank_mentions(self, text, bank='prime_bank'):
        """Count mentions of specific bank.

        Note: patterns for a bank can overlap (e.g. ``prime\\s*bank`` and
        ``prime\\s*b\\.?``), so one textual mention may count more than once.
        """
        if pd.isna(text):
            return 0

        text_lower = str(text).lower()
        total_mentions = 0

        if bank in self.bank_patterns:
            for pattern in self.bank_patterns[bank]:
                mentions = len(re.findall(pattern, text_lower))
                total_mentions += mentions

        return total_mentions

    def analyze_sentiment(self, text):
        """
        Analyzes sentiment using a hybrid approach.
        1. Uses VADER for a fast, initial assessment.
        2. If GPT is available, uses it for a more nuanced analysis on
           tricky cases (e.g., complaints, or where VADER is neutral but
           context seems negative).

        Returns:
            ``(sentiment, polarity, post_link)`` — always a 3-tuple so the
            result can be unpacked into three DataFrame columns.
        """
        # BUGFIX: this early exit previously returned a 2-tuple, which broke
        # the 3-column unpacking in process_all_data for NaN/empty rows.
        if pd.isna(text) or str(text).strip() == '':
            return 'Neutral', 0.0, ''

        text_str = str(text)

        # --- Step 1: Initial analysis with VADER (fast and cheap) ---
        vader_sentiment = 'Neutral'
        vader_polarity = 0.0
        if self.sia:
            scores = self.sia.polarity_scores(text_str)
            compound = scores['compound']
            vader_polarity = compound
            # Standard VADER thresholds for positive/negative classification.
            if compound >= 0.05:
                vader_sentiment = 'Positive'
            elif compound <= -0.05:
                vader_sentiment = 'Negative'

        # --- Step 2: Use GPT for a more accurate, context-aware second opinion (if enabled) ---
        is_complaint = any(word in text_str.lower() for word in
                           ['complaint', 'problem', 'issue', 'error', 'failed',
                            'not working', 'terrible', 'worst', 'pathetic', 'disappointed'])

        # Trigger AI if it's a complaint, or if VADER is unsure (Neutral)
        if self.use_gpt and (is_complaint or vader_sentiment == 'Neutral'):
            try:
                # NOTE(review): the prompt asks the model for a "post_link"
                # it cannot actually know from the comment text alone, so the
                # returned link is unreliable — confirm whether it is used.
                prompt = f""" Analyze the sentiment of the following customer comment for a bank. The context is critical. A statement like "my balance is zero" is highly negative. Classify the sentiment as one of: 'Positive', 'Negative', or 'Neutral'. 
 Also provide a polarity score from -1.0 (most negative) to 1.0 (most positive). Return your answer ONLY as a JSON object with keys "sentiment", "polarity" and "post_link". Customer Comment: "{text_str}" """
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    response_format={"type": "json_object"}
                )
                result = json.loads(response.choices[0].message.content)
                sentiment = result.get('sentiment', 'Neutral')
                polarity = float(result.get('polarity', 0.0))
                post_link = result.get('post_link', '')
                return (sentiment, polarity, post_link)
            except Exception as e:
                # Best-effort: any API/parse failure falls back to VADER.
                print(f"OpenAI call failed: {e}. Falling back to VADER.")
                return (vader_sentiment, vader_polarity, '')

        # --- Step 3: Fallback to VADER if GPT is not used or triggered ---
        return (vader_sentiment, vader_polarity, '')

    def detect_emotion(self, text):
        """Detect emotion in text with context.

        Returns:
            ``(emotion, keywords)`` — the highest-scoring emotion (ties
            resolved by dict order) and the keywords that triggered it, or
            ``('Neutral', [])`` when nothing matches.
        """
        if pd.isna(text):
            return 'Neutral', []

        text_lower = str(text).lower()

        # Keyword lexicon per emotion; score = number of keywords present.
        emotions = {
            'Joy': {'keywords': ['happy', 'excellent', 'amazing', 'great', 'wonderful',
                                 'fantastic', 'love', 'best', 'thank you', 'appreciate']},
            'Frustration': {'keywords': ['frustrated', 'angry', 'terrible', 'horrible', 'worst',
                                         'hate', 'annoyed', 'disappointed', 'pathetic']},
            'Confusion': {'keywords': ['confused', 'unclear', "don't understand", 'what', 'how',
                                       'why', '?', 'help me', 'lost']},
            'Anxiety': {'keywords': ['worried', 'concern', 'anxious', 'nervous', 'scared',
                                     'fear', 'panic', 'urgent']}
        }

        emotion_scores = {}
        detected_keywords = {}
        for emotion, data in emotions.items():
            keywords_found = [kw for kw in data['keywords'] if kw in text_lower]
            score = len(keywords_found)
            emotion_scores[emotion] = score
            if keywords_found:
                detected_keywords[emotion] = keywords_found

        if max(emotion_scores.values()) > 0:
            primary_emotion = max(emotion_scores, key=emotion_scores.get)
            return primary_emotion, detected_keywords.get(primary_emotion, [])
        return 'Neutral', []

    def categorize_post(self, text):
        """Categorize post type with reason.

        Returns:
            ``(category, reason)`` where category is one of Inquiry,
            Complaint, Praise, Suggestion or Other.  The first matching
            rule wins, so a complaining question is still an Inquiry.
        """
        if pd.isna(text):
            return 'Other', 'No text content'

        text_lower = str(text).lower()

        if '?' in text_lower or any(phrase in text_lower for phrase in
                                    ['how do', 'what is', 'when', 'where', 'can i',
                                     'could you', 'explain']):
            return 'Inquiry', 'Contains questions or information seeking'
        elif any(word in text_lower for word in
                 ['complaint', 'problem', 'issue', 'error', 'failed',
                  'not working', 'terrible', 'worst']):
            return 'Complaint', 'Contains complaint or problem description'
        elif any(word in text_lower for word in
                 ['thank', 'great', 'excellent', 'love', 'best', 'appreciate', 'amazing']):
            return 'Praise', 'Contains positive feedback or appreciation'
        elif any(word in text_lower for word in
                 ['suggest', 'should', 'could', 'recommend', 'request', 'please add']):
            return 'Suggestion', 'Contains suggestions or feature requests'
        else:
            return 'Other', 'General discussion or observation'

    def process_all_data(self, df):
        """Apply all processing to dataframe.

        Locates the text column (renaming it to ``'text'`` in place, as
        before), then adds bank, sentiment, emotion, category and
        viral-score columns.  Returns an empty frame (same columns) when
        no recognizable text column exists.
        """
        if df.empty:
            return df

        # Accept a variety of text-column names from different data sources.
        text_columns = ['text', 'content', 'message', 'review', 'comment',
                        'post', 'Text', 'Content', 'Post', 'Review Text']
        text_col = None
        for col in text_columns:
            if col in df.columns:
                text_col = col
                break

        if not text_col:
            warning_msg = "Could not find a text column in one of the data sources."
            if st is not None:
                st.warning(warning_msg)
            else:
                # Running outside Streamlit: fall back to stdout.
                print(warning_msg)
            return pd.DataFrame(columns=df.columns)

        if text_col != 'text':
            df.rename(columns={text_col: 'text'}, inplace=True)

        # Each helper returns a tuple; pd.Series spreads it across columns.
        df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
            lambda x: pd.Series(self.identify_bank(x)))
        df['prime_mentions'] = df['text'].apply(
            lambda x: self.count_bank_mentions(x, 'prime_bank'))
        df[['sentiment', 'polarity', 'post_link']] = df['text'].apply(
            lambda x: pd.Series(self.analyze_sentiment(x)))
        df[['emotion', 'emotion_keywords']] = df['text'].apply(
            lambda x: pd.Series(self.detect_emotion(x)))
        df[['category', 'category_reason']] = df['text'].apply(
            lambda x: pd.Series(self.categorize_post(x)))

        # NOTE: The patch for 'Inquiry' sentiment has been removed, as the new
        # AI-powered sentiment analysis handles this contextually.

        # Simple weighted engagement score; missing metrics contribute 0.
        df['viral_score'] = 0
        if 'likes' in df.columns:
            df['viral_score'] += df['likes'].fillna(0)
        if 'shares' in df.columns:
            df['viral_score'] += df['shares'].fillna(0) * 2
        if 'comments' in df.columns:
            df['viral_score'] += df['comments'].fillna(0) * 1.5

        # Posts mentioning Prime Bank get a 20% visibility boost.
        if not df.empty and 'prime_mentions' in df.columns:
            df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2

        return df