Spaces:

kumar715
/

FinMK

Running

File size: 25,358 Bytes

24e6f5b

import pandas as pd
from datetime import datetime, timedelta
from expense_tracker.utils import MongoDBClient
from bson import ObjectId
import pdfplumber
import re
import io
import os
from dotenv import load_dotenv

load_dotenv()


def process_recurring_incomes(user_id):
    """

    Checks all recurring incomes for a user and creates new transactions

    if the recurrence interval has passed since the last run.

    """
    db = MongoDBClient.get_client()
    user = db.users.find_one({'_id': ObjectId(user_id)}, {'financial_data.incomes': 1})
    
    if not user or 'financial_data' not in user or 'incomes' not in user['financial_data']:
        return 0
        
    incomes = user['financial_data']['incomes']
    generated_count = 0
    updated_incomes = []
    
    for income in incomes:
        if not income.get('is_recurring'):
            updated_incomes.append(income)
            continue
            
        last_date = income.get('last_run_date')
        if not last_date:
            last_date = income.get('date') # Default to creation date if never run
        
        # Ensure last_date is datetime
        if isinstance(last_date, str):
             try:
                 last_date = datetime.strptime(last_date, '%Y-%m-%d')
             except:
                 updated_incomes.append(income)
                 continue
                 
        interval = income.get('recurrence_interval')
        if not interval:
            updated_incomes.append(income)
            continue
            
        next_due_date = calculate_next_date(last_date, interval)
        current_date = datetime.now()
        
        new_instances = []
        last_inst_date = last_date
        
        # While the next due date is in the past, generate a transaction
        while next_due_date <= current_date:
            new_tx = income.copy()
            new_tx['_id'] = ObjectId() 
            new_tx['date'] = next_due_date
            new_tx['is_recurring'] = False 
            new_tx['recurrence_interval'] = None
            new_tx['created_at'] = datetime.now()
            new_tx['title'] = f"{income.get('title')} (Recurring)"
            new_tx['parent_id'] = str(income.get('_id'))
            
            new_instances.append(new_tx)
            generated_count += 1
            last_inst_date = next_due_date
            next_due_date = calculate_next_date(last_inst_date, interval)
        
        # Update the template income in the list
        income['last_run_date'] = last_inst_date
        updated_incomes.append(income)
        
        # Add new instances to the batch
        if new_instances:
            updated_incomes.extend(new_instances)
            
    # Save all updates back to user document
    if generated_count > 0:
        db.users.update_one(
            {'_id': ObjectId(user_id)},
            {'$set': {'financial_data.incomes': updated_incomes}}
        )
            
    return generated_count

def calculate_next_date(start_date, interval):
    if interval == 'daily':
        return start_date + timedelta(days=1)
    elif interval == 'weekly':
        return start_date + timedelta(weeks=1)
    elif interval == 'monthly':
        # Simple monthly addition (approx 30 days or same day next month logic)
        # For MVP, using 30 days approximation to avoid calendar complexity
        return start_date + timedelta(days=30) 
    elif interval == 'yearly':
        return start_date + timedelta(days=365)
    return start_date + timedelta(days=36500) # Fallback to far future

def parse_and_import_transactions(file, user_id):
    """

    Parses a CSV or Excel file and imports transaction records (Income/Expense).

    Expected columns: Type, Title, Amount, Category, Date

    If Type is missing, positive amount = Income, negative amount = Expense (optional logic)

    """
    try:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file)
        elif file.name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file)
        elif file.name.endswith('.pdf'):
            df = parse_pdf_transactions(file)
        else:
            return 0, "Unsupported file format. Please upload CSV, Excel, or PDF."
            
        if df is None or df.empty:
             return 0, "Could not extract transactions from file."
             
        # Standardize column names (lowercase)
        df.columns = [c.lower().strip() for c in df.columns]
        
        # 'type' is optional but recommended. 'amount' is required.
        required_cols = ['title', 'amount', 'date']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            return 0, f"Missing columns: {', '.join(missing_cols)}"
            
        db = MongoDBClient.get_client()
        count = 0
        income_docs = []
        expense_docs = []
        
        for _, row in df.iterrows():
            try:
                amount = float(row['amount'])
                title = row['title']
                category = row.get('category', 'Uncategorized')
                date_str = row.get('date')
                
                try:
                    date = pd.to_datetime(date_str).to_pydatetime()
                except:
                    date = datetime.now()
                
                # Determine Type
                tran_type = None
                if 'type' in df.columns:
                    val = str(row['type']).lower()
                    if 'income' in val: tran_type = 'income'
                    elif 'expense' in val: tran_type = 'expense'
                
                if not tran_type:
                    if amount >= 0: tran_type = 'income'
                    else: 
                        tran_type = 'expense'
                        amount = abs(amount)
                
                doc = {
                    '_id': ObjectId(),
                    'title': title,
                    'amount': abs(amount),
                    'category': category,
                    'date': date,
                    'created_at': datetime.now(),
                    'source': 'import'
                }
                
                if tran_type == 'income':
                    income_docs.append(doc)
                else:
                    expense_docs.append(doc)

                count += 1
            except Exception:
                continue
        
        # --- NEW: Auto-categorize Batch for CSV/Excel ---
        all_docs = income_docs + expense_docs
        to_categorize = [d for d in all_docs if d.get('category') in ['Uncategorized', '', None]]
        
        if to_categorize:
            try:
                from .category_classifier import batch_classify_transactions
                batch_input = []
                for d in to_categorize:
                    # Find if it's income or expense to pass correct type
                    d_type = 'Income' if any(id(d) == id(inc) for inc in income_docs) else 'Expense'
                    batch_input.append({'title': d['title'], 'type': d_type})
                
                results = batch_classify_transactions(batch_input)
                for idx, res in enumerate(results):
                    to_categorize[idx]['category'] = res['category']
            except Exception as e:
                print(f"Error during import batch categorization: {e}")

        # Batch Update for performance
        if income_docs:
            db.users.update_one(
                {'_id': ObjectId(user_id)},
                {'$push': {'financial_data.incomes': {'$each': income_docs}}}
            )
        
        if expense_docs:
            db.users.update_one(
                {'_id': ObjectId(user_id)},
                {'$push': {'financial_data.expenses': {'$each': expense_docs}}}
            )
                
        return count, None
    except Exception as e:
        import traceback
        traceback.print_exc()
        return 0, str(e)

def parse_pdf_transactions(file):
    """

    Extracts transactions from PDF using deterministic table extraction (pdfplumber).

    Fallback to text regex if tables are not found (TODO).

    """""
    try:
        transactions = []
        
        # Open PDF (file object)
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                tables = page.extract_tables()
                
                for table in tables:
                    if not table: continue
                    
                    # 1. Identify Headers
                    # We look for a row that contains common header keywords
                    header_map = {}
                    header_row_idx = -1
                    
                    for idx, row in enumerate(table):
                        # Clean row: filter None, to_lower
                        row_text = [str(cell).lower().strip() if cell else '' for cell in row]
                        
                        # Check for Date
                        if any(k in row_text for k in ['date', 'txn date', 'transaction date']):
                            header_row_idx = idx
                            
                            # Map columns
                            for col_idx, cell in enumerate(row_text):
                                if 'date' in cell: header_map['date'] = col_idx
                                elif any(k in cell for k in ['title', 'description', 'particulars', 'details', 'narrative', 'transaction']): header_map['title'] = col_idx
                                elif any(k in cell for k in ['debit', 'withdrawal', 'dr']): header_map['debit'] = col_idx
                                elif any(k in cell for k in ['credit', 'deposit', 'cr']): header_map['credit'] = col_idx
                                elif 'amount' in cell: header_map['amount'] = col_idx # Generic amount (check sign or type col)
                                elif 'type' in cell: header_map['type'] = col_idx # cr/dr type column
                                elif 'category' in cell: header_map['category'] = col_idx
                                elif 'balance' in cell: header_map['balance'] = col_idx
                            
                            break # Found header, stop looking
                    
                    if header_row_idx == -1:
                        # No clear header found in this table, skip or try heuristic (first row?)
                        continue
                        
                    # 2. Extract Data Rows
                    for row in table[header_row_idx+1:]:
                        if not row: continue
                        
                        try:
                            # Extract Date
                            date_str = None
                            if 'date' in header_map and header_map['date'] < len(row):
                                date_str = row[header_map['date']]
                            
                            if not date_str: continue # Skip line without date
                            
                            # Normalize Date (Try formats)
                            # Remove newlines
                            date_str = str(date_str).replace('\n', ' ').strip()
                            date_obj = None
                            for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d %b %Y', '%m/%d/%Y']:
                                try:
                                    date_obj = datetime.strptime(date_str, fmt)
                                    break
                                except: pass
                                
                            if not date_obj: continue # Invalid date
                            
                            # Extract Title
                            title = "Transaction"
                            if 'title' in header_map and header_map['title'] < len(row):
                                t_val = row[header_map['title']]
                                if t_val: title = str(t_val).replace('\n', ' ').strip()
                            
                            # Extract Category (if present in PDF)
                            category = "Uncategorized"
                            if 'category' in header_map and header_map['category'] < len(row):
                                c_val = row[header_map['category']]
                                if c_val: category = str(c_val).replace('\n', ' ').strip()

                            # Extract Amount
                            amount = 0.0
                            tran_type = 'expense' # Default
                            
                            # Case A: Debit / Credit Columns
                            if 'debit' in header_map and 'credit' in header_map:
                                debit_val = row[header_map['debit']] if header_map['debit'] < len(row) else None
                                credit_val = row[header_map['credit']] if header_map['credit'] < len(row) else None
                                
                                # Clean values (remove currency symbols, commas)
                                def clean_amt(val):
                                    if not val: return 0.0
                                    v = str(val).replace(',', '').replace('$', '').replace('£', '').replace(' ', '')
                                    if not v: return 0.0
                                    try: return float(v)
                                    except: return 0.0
                                
                                d_amt = clean_amt(debit_val)
                                c_amt = clean_amt(credit_val)
                                
                                if c_amt > 0:
                                    amount = c_amt
                                    tran_type = 'income'
                                elif d_amt > 0:
                                    amount = d_amt
                                    tran_type = 'expense'
                                else:
                                    continue # Zero transaction
                                    
                            # Case B: Single Amount Column
                            elif 'amount' in header_map and header_map['amount'] < len(row):
                                amt_val = row[header_map['amount']]
                                if not amt_val: continue
                                
                                # Check if negative parenthesized (100.00) or -100.00
                                s_val = str(amt_val).replace(',', '').replace('$', '').replace(' ', '')
                                is_neg = False
                                if '(' in s_val and ')' in s_val:
                                    is_neg = True
                                    s_val = s_val.replace('(', '').replace(')', '')
                                elif s_val.startswith('-'):
                                    is_neg = True
                                    s_val = s_val.replace('-', '')
                                
                                try:
                                    amount = float(s_val)
                                except: continue
                                
                                if is_neg:
                                    tran_type = 'expense'
                                else:
                                    # If positive, is it income? Not necessarily. 
                                    # Bank statements often show expenses as positive in a 'Withdrawals' list.
                                    # But since we found generic 'Amount' header, we can check 'type' column
                                    if 'type' in header_map:
                                        t_col = str(row[header_map['type']]).lower()
                                        if any(k in t_col for k in ['cr', 'credit', 'deposit', 'income']):
                                            tran_type = 'income'
                                        elif any(k in t_col for k in ['dr', 'debit', 'withdrawal', 'expense']):
                                            tran_type = 'expense'
                                        else:
                                            tran_type = 'expense'
                                    else:
                                        # Ambiguous: Default to expense? Or look for + sign?
                                        # Let's assume positive = income, negative = expense if mixed.
                                        # BUT: If all are positive, they might be expenses if statement is 'Credit Card'.
                                        # For now: assume positive = income if we can't tell.
                                        # Wait, standard CSV logic (parse_and_import) assumes positive amount + explicit type OR implicit sign.
                                        # Let's stick to signed logic: Positive = Income, Negative = Expense.
                                        # If regex 'CR' found in amount string (e.g. 100.00CR), it's income.
                                        if 'cr' in str(amt_val).lower():
                                            tran_type = 'income'
                                        elif 'dr' in str(amt_val).lower():
                                            tran_type = 'expense'
                                        else:
                                            # Safer: Default to expense unless explicitly marked as income.
                                            # BUT the user said "all are added as income" which was due to this line.
                                            # Let's default to 'expense' if we are unsure, as most transactions are expenses.
                                            tran_type = 'expense'
                                
                            
                            # Add to list
                            transactions.append({
                                'date': date_obj.strftime('%Y-%m-%d'),
                                'title': title,
                                'amount': amount,
                                'type': tran_type,
                                'category': category 
                            })

                            
                        except Exception as e:
                            # Skip row error
                            continue

        if not transactions:
            # Fallback to Regex Text Parsing
            file.seek(0)  # Reset file pointer after table extraction attempt
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text: continue
                    
                    lines = text.split('\n')
                    for line in lines:
                        line = line.strip()
                        if not line: continue
                        
                        # Regex 1: "Date ... Description ... Amount" (Standard Bank Statement)
                        # Supports: DD/MM/YYYY Description 123.45 OR 123.45CR
                        match = re.search(r'^(\d{1,2}[/\.-]\d{1,2}[/\.-]\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line)
                        
                        # Regex 2: "Title Amount Category Date Type" (User's Specific Format)
                        # Pattern: Trader Joes 141.28 Groceries 2023-01-01 Expense
                        if not match:
                             # ^(.+?) matches Title (lazy)
                             # \s+([\d,]+\.\d+) matches Amount
                             # \s+(.+?) matches Category
                             # \s+(\d{4}-\d{2}-\d{2}) matches Date
                             # \s+([A-Za-z]+)$ matches Type
                             match_v2 = re.search(r'^(.+?)\s+([\d,]+\.\d+)\s+(.+?)\s+(\d{4}-\d{2}-\d{2})\s+([A-Za-z]+)$', line)
                             
                             if match_v2:
                                 try:
                                     title = match_v2.group(1).strip()
                                     amount = float(match_v2.group(2).replace(',', ''))
                                     category = match_v2.group(3).strip()
                                     date_str = match_v2.group(4)
                                     type_str = match_v2.group(5).lower()
                                     
                                     if title.lower() == 'title' and amount == 0: continue # Skip Header if matched roughly
                                     if 'amount' in match_v2.group(2).lower(): continue # Skip header strict

                                     date_obj = datetime.strptime(date_str, '%Y-%m-%d')
                                     
                                     transactions.append({
                                         'date': date_obj.strftime('%Y-%m-%d'),
                                         'title': title,
                                         'amount': abs(amount),
                                         'type': 'income' if 'income' in type_str else 'expense',
                                         'category': category
                                     })
                                     continue # Skip other checks
                                 except: pass

                        if not match:
                             # Try "DD Mon YYYY" format for Regex 1
                             match = re.search(r'^(\d{1,2}\s+[A-Za-z]{3}\s+\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line)
                        
                        if match:
                            date_str = match.group(1)
                            desc_str = match.group(2)
                            amt_str = match.group(3)
                            
                            # Parse Date
                            date_obj = None
                            for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d.%m.%Y', '%d %b %Y', '%d %b %y']:
                                try:
                                    date_obj = datetime.strptime(date_str, fmt)
                                    break
                                except: pass
                            
                            if not date_obj: continue

                            # Parse Amount
                            try:
                                is_credit = False
                                s_val = amt_str.replace(',', '').replace('$', '').replace(' ', '')
                                if 'cr' in s_val.lower():
                                    is_credit = True
                                    s_val = s_val.lower().replace('cr', '')
                                
                                amount = float(s_val)
                                
                                tran_type = 'income' if (amount > 0 and is_credit) else 'expense' 
                                # Refined logic: Usually statements use -ve for expense. If positive and CR, it's income.
                                # If just positive number at end of line? Usually expense (debit card) or income (deposit)?
                                # Context needed. Let's assume Expense if typical shopping statement.
                                
                                transactions.append({
                                    'date': date_obj.strftime('%Y-%m-%d'),
                                    'title': desc_str.strip(),
                                    'amount': abs(amount),
                                    'type': tran_type,
                                    'category': 'Uncategorized'
                                })
                            except: continue


        if not transactions:
            return None
            
        # --- NEW: Auto-categorize Batch ---
        try:
            from .category_classifier import batch_classify_transactions
            # Prepare for classifier
            batch_input = []
            for tx in transactions:
                batch_input.append({
                    'title': tx['title'],
                    'type': tx['type'].capitalize() if tx.get('type') else 'Expense'
                })
            
            results = batch_classify_transactions(batch_input)
            for idx, res in enumerate(results):
                transactions[idx]['category'] = res['category']
        except Exception as e:
            print(f"Error during PDF batch categorization: {e}")

        return pd.DataFrame(transactions)
        
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None