import pdfplumber
import pandas as pd
import re
from typing import List, Dict, Optional
import logging
from datetime import datetime

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class PDFTransactionProcessor:
    """Process transaction data from PDF bank statements.

    The pipeline is: extract the text of every page with pdfplumber, guess
    the issuing bank from keywords in that text, regex-match transactions
    line by line, and return them as a DataFrame with ``date``,
    ``description`` and ``amount`` columns.
    """

    # Numeric part of an amount: comma-grouped thousands ("1,234.56") or a
    # plain digit run ("1234.56"), always with exactly two decimals. The
    # plain form matters: without it "1500.00" cannot match, and on lines
    # that also carry a running balance the balance would be captured as the
    # amount instead.
    _AMOUNT_DIGITS = r'(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'

    # Keyword patterns per bank, tried in insertion order. They are anchored
    # on word boundaries so that "purchase" does not count as "chase" and
    # "cities"/"Citizens" do not count as "citi". "jpmorgan" is only anchored
    # on the left so the fused spelling "JPMorganChase" is still recognised.
    _BANK_INDICATORS = {
        'chase': re.compile(r'\bchase\b|\bjpmorgan', re.IGNORECASE),
        'bank_of_america': re.compile(
            r'\bbank\s+of\s+america\b|\bbofa\b', re.IGNORECASE
        ),
        'wells_fargo': re.compile(r'\bwells\s+fargo\b', re.IGNORECASE),
        'citi': re.compile(
            r'\bciti(?:bank|group|cards?|corp|business|gold)?\b',
            re.IGNORECASE,
        ),
    }

    def __init__(self):
        """Compile the generic and bank-specific transaction line patterns.

        Every pattern exposes the named groups ``date``, ``description`` and
        ``amount``. The description is matched lazily, so the first amount
        on a line is the one captured.
        """
        digits = self._AMOUNT_DIGITS
        self.transaction_patterns = {
            'standard': re.compile(
                r'(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+'
                r'(?P<amount>-?\$?' + digits + r')'
            ),
            'bank_specific': {
                # Dates on this layout carry no year (MM/DD).
                'chase': re.compile(
                    r'(?P<date>\d{2}/\d{2})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?\$?' + digits + r')'
                ),
                # Four-digit year and no currency symbol on the amount.
                'bank_of_america': re.compile(
                    r'(?P<date>\d{2}/\d{2}/\d{4})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?' + digits + r')'
                ),
            },
        }

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file.

        Args:
            pdf_path: Path of the PDF to open with pdfplumber.

        Returns:
            The text of all pages that yielded any text, joined with
            newlines. Pages with no extractable text are skipped.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        try:
            full_text = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        full_text.append(text)
            return "\n".join(full_text)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    def identify_bank(self, text: str) -> Optional[str]:
        """Identify bank from PDF text.

        Args:
            text: Extracted statement text.

        Returns:
            The key of the first bank whose keyword pattern is found
            (``'chase'``, ``'bank_of_america'``, ``'wells_fargo'`` or
            ``'citi'``), or ``None`` when nothing matches or the text is
            empty.
        """
        if not text:
            return None
        for bank, indicator in self._BANK_INDICATORS.items():
            if indicator.search(text):
                return bank
        return None

    def parse_transactions(self, text: str, bank: Optional[str] = None) -> List[Dict]:
        """Parse transactions from extracted text.

        Args:
            text: Extracted statement text; each line is matched on its own.
            bank: Optional bank key. When a bank-specific pattern exists for
                it, that pattern is tried first.

        Returns:
            A list of dicts with ``date`` (str, as written in the
            statement), ``description`` (str) and ``amount`` (float). Empty
            when no line matches.
        """
        transactions: List[Dict] = []
        lines = text.splitlines()

        # Try bank-specific parser first
        bank_patterns = self.transaction_patterns['bank_specific']
        if bank and bank in bank_patterns:
            transactions = self._parse_with_pattern(lines, bank_patterns[bank])

        # Fall back to standard parser if no matches found
        if not transactions:
            transactions = self._parse_with_pattern(
                lines, self.transaction_patterns['standard']
            )

        return transactions

    def _parse_with_pattern(self, lines: List[str], pattern: re.Pattern) -> List[Dict]:
        """Helper method to parse transactions using a regex pattern.

        Args:
            lines: Individual text lines to scan.
            pattern: Compiled regex providing the named groups ``date``,
                ``description`` and ``amount``.

        Returns:
            One dict per matching line, in input order. Lines that do not
            match, or whose amount cannot be converted to float, are
            skipped.
        """
        transactions = []
        for line in lines:
            match = pattern.search(line)
            if not match:
                continue
            try:
                # Drop the currency symbol and thousands separators so the
                # remaining text is a plain float literal.
                amount = match.group('amount').replace('$', '').replace(',', '')
                transactions.append({
                    'date': match.group('date'),
                    'description': match.group('description').strip(),
                    'amount': float(amount)
                })
            except (ValueError, AttributeError):
                logger.debug("Skipping line due to parsing error: %s", line)
        return transactions

    def process_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Process PDF file and return transactions as DataFrame.

        Args:
            pdf_path: Path of the PDF statement.

        Returns:
            DataFrame with ``date`` (parsed by ``pd.to_datetime``),
            ``description`` and ``amount`` columns. Rows whose date could
            not be parsed are dropped, so the frame can be empty.

        Raises:
            ValueError: If no transaction line was found in the PDF.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            bank = self.identify_bank(text)
            transactions = self.parse_transactions(text, bank)

            if not transactions:
                raise ValueError("No transactions found in PDF")

            df = pd.DataFrame(transactions)

            # Clean and standardize data
            # NOTE(review): dates matched by the 'chase' pattern are MM/DD
            # with no year; how pd.to_datetime resolves those is not pinned
            # down here - confirm they are not coerced to NaT or given an
            # unintended year.
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            total = len(df)
            df = df.dropna(subset=['date', 'amount'])
            dropped = total - len(df)
            if dropped:
                # Make the data loss visible instead of silently returning
                # fewer rows than were parsed.
                logger.warning(
                    "Dropped %d of %d transactions with unparseable date or amount",
                    dropped,
                    total,
                )
            df['description'] = df['description'].str.strip()

            return df

        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise