import pandas as pd from datetime import datetime, timedelta from expense_tracker.utils import MongoDBClient from bson import ObjectId import pdfplumber import re import io import os from dotenv import load_dotenv load_dotenv() def process_recurring_incomes(user_id): """ Checks all recurring incomes for a user and creates new transactions if the recurrence interval has passed since the last run. """ db = MongoDBClient.get_client() user = db.users.find_one({'_id': ObjectId(user_id)}, {'financial_data.incomes': 1}) if not user or 'financial_data' not in user or 'incomes' not in user['financial_data']: return 0 incomes = user['financial_data']['incomes'] generated_count = 0 updated_incomes = [] for income in incomes: if not income.get('is_recurring'): updated_incomes.append(income) continue last_date = income.get('last_run_date') if not last_date: last_date = income.get('date') # Default to creation date if never run # Ensure last_date is datetime if isinstance(last_date, str): try: last_date = datetime.strptime(last_date, '%Y-%m-%d') except: updated_incomes.append(income) continue interval = income.get('recurrence_interval') if not interval: updated_incomes.append(income) continue next_due_date = calculate_next_date(last_date, interval) current_date = datetime.now() new_instances = [] last_inst_date = last_date # While the next due date is in the past, generate a transaction while next_due_date <= current_date: new_tx = income.copy() new_tx['_id'] = ObjectId() new_tx['date'] = next_due_date new_tx['is_recurring'] = False new_tx['recurrence_interval'] = None new_tx['created_at'] = datetime.now() new_tx['title'] = f"{income.get('title')} (Recurring)" new_tx['parent_id'] = str(income.get('_id')) new_instances.append(new_tx) generated_count += 1 last_inst_date = next_due_date next_due_date = calculate_next_date(last_inst_date, interval) # Update the template income in the list income['last_run_date'] = last_inst_date updated_incomes.append(income) # Add new instances to the batch if new_instances: updated_incomes.extend(new_instances) # Save all updates back to user document if generated_count > 0: db.users.update_one( {'_id': ObjectId(user_id)}, {'$set': {'financial_data.incomes': updated_incomes}} ) return generated_count def calculate_next_date(start_date, interval): if interval == 'daily': return start_date + timedelta(days=1) elif interval == 'weekly': return start_date + timedelta(weeks=1) elif interval == 'monthly': # Simple monthly addition (approx 30 days or same day next month logic) # For MVP, using 30 days approximation to avoid calendar complexity return start_date + timedelta(days=30) elif interval == 'yearly': return start_date + timedelta(days=365) return start_date + timedelta(days=36500) # Fallback to far future def parse_and_import_transactions(file, user_id): """ Parses a CSV or Excel file and imports transaction records (Income/Expense). Expected columns: Type, Title, Amount, Category, Date If Type is missing, positive amount = Income, negative amount = Expense (optional logic) """ try: if file.name.endswith('.csv'): df = pd.read_csv(file) elif file.name.endswith(('.xls', '.xlsx')): df = pd.read_excel(file) elif file.name.endswith('.pdf'): df = parse_pdf_transactions(file) else: return 0, "Unsupported file format. Please upload CSV, Excel, or PDF." if df is None or df.empty: return 0, "Could not extract transactions from file." # Standardize column names (lowercase) df.columns = [c.lower().strip() for c in df.columns] # 'type' is optional but recommended. 'amount' is required. required_cols = ['title', 'amount', 'date'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: return 0, f"Missing columns: {', '.join(missing_cols)}" db = MongoDBClient.get_client() count = 0 income_docs = [] expense_docs = [] for _, row in df.iterrows(): try: amount = float(row['amount']) title = row['title'] category = row.get('category', 'Uncategorized') date_str = row.get('date') try: date = pd.to_datetime(date_str).to_pydatetime() except: date = datetime.now() # Determine Type tran_type = None if 'type' in df.columns: val = str(row['type']).lower() if 'income' in val: tran_type = 'income' elif 'expense' in val: tran_type = 'expense' if not tran_type: if amount >= 0: tran_type = 'income' else: tran_type = 'expense' amount = abs(amount) doc = { '_id': ObjectId(), 'title': title, 'amount': abs(amount), 'category': category, 'date': date, 'created_at': datetime.now(), 'source': 'import' } if tran_type == 'income': income_docs.append(doc) else: expense_docs.append(doc) count += 1 except Exception: continue # --- NEW: Auto-categorize Batch for CSV/Excel --- all_docs = income_docs + expense_docs to_categorize = [d for d in all_docs if d.get('category') in ['Uncategorized', '', None]] if to_categorize: try: from .category_classifier import batch_classify_transactions batch_input = [] for d in to_categorize: # Find if it's income or expense to pass correct type d_type = 'Income' if any(id(d) == id(inc) for inc in income_docs) else 'Expense' batch_input.append({'title': d['title'], 'type': d_type}) results = batch_classify_transactions(batch_input) for idx, res in enumerate(results): to_categorize[idx]['category'] = res['category'] except Exception as e: print(f"Error during import batch categorization: {e}") # Batch Update for performance if income_docs: db.users.update_one( {'_id': ObjectId(user_id)}, {'$push': {'financial_data.incomes': {'$each': income_docs}}} ) if expense_docs: db.users.update_one( {'_id': ObjectId(user_id)}, {'$push': {'financial_data.expenses': {'$each': expense_docs}}} ) return count, None except Exception as e: import traceback traceback.print_exc() return 0, str(e) def parse_pdf_transactions(file): """ Extracts transactions from PDF using deterministic table extraction (pdfplumber). Fallback to text regex if tables are not found (TODO). """"" try: transactions = [] # Open PDF (file object) with pdfplumber.open(file) as pdf: for page in pdf.pages: tables = page.extract_tables() for table in tables: if not table: continue # 1. Identify Headers # We look for a row that contains common header keywords header_map = {} header_row_idx = -1 for idx, row in enumerate(table): # Clean row: filter None, to_lower row_text = [str(cell).lower().strip() if cell else '' for cell in row] # Check for Date if any(k in row_text for k in ['date', 'txn date', 'transaction date']): header_row_idx = idx # Map columns for col_idx, cell in enumerate(row_text): if 'date' in cell: header_map['date'] = col_idx elif any(k in cell for k in ['title', 'description', 'particulars', 'details', 'narrative', 'transaction']): header_map['title'] = col_idx elif any(k in cell for k in ['debit', 'withdrawal', 'dr']): header_map['debit'] = col_idx elif any(k in cell for k in ['credit', 'deposit', 'cr']): header_map['credit'] = col_idx elif 'amount' in cell: header_map['amount'] = col_idx # Generic amount (check sign or type col) elif 'type' in cell: header_map['type'] = col_idx # cr/dr type column elif 'category' in cell: header_map['category'] = col_idx elif 'balance' in cell: header_map['balance'] = col_idx break # Found header, stop looking if header_row_idx == -1: # No clear header found in this table, skip or try heuristic (first row?) continue # 2. Extract Data Rows for row in table[header_row_idx+1:]: if not row: continue try: # Extract Date date_str = None if 'date' in header_map and header_map['date'] < len(row): date_str = row[header_map['date']] if not date_str: continue # Skip line without date # Normalize Date (Try formats) # Remove newlines date_str = str(date_str).replace('\n', ' ').strip() date_obj = None for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d %b %Y', '%m/%d/%Y']: try: date_obj = datetime.strptime(date_str, fmt) break except: pass if not date_obj: continue # Invalid date # Extract Title title = "Transaction" if 'title' in header_map and header_map['title'] < len(row): t_val = row[header_map['title']] if t_val: title = str(t_val).replace('\n', ' ').strip() # Extract Category (if present in PDF) category = "Uncategorized" if 'category' in header_map and header_map['category'] < len(row): c_val = row[header_map['category']] if c_val: category = str(c_val).replace('\n', ' ').strip() # Extract Amount amount = 0.0 tran_type = 'expense' # Default # Case A: Debit / Credit Columns if 'debit' in header_map and 'credit' in header_map: debit_val = row[header_map['debit']] if header_map['debit'] < len(row) else None credit_val = row[header_map['credit']] if header_map['credit'] < len(row) else None # Clean values (remove currency symbols, commas) def clean_amt(val): if not val: return 0.0 v = str(val).replace(',', '').replace('$', '').replace('£', '').replace(' ', '') if not v: return 0.0 try: return float(v) except: return 0.0 d_amt = clean_amt(debit_val) c_amt = clean_amt(credit_val) if c_amt > 0: amount = c_amt tran_type = 'income' elif d_amt > 0: amount = d_amt tran_type = 'expense' else: continue # Zero transaction # Case B: Single Amount Column elif 'amount' in header_map and header_map['amount'] < len(row): amt_val = row[header_map['amount']] if not amt_val: continue # Check if negative parenthesized (100.00) or -100.00 s_val = str(amt_val).replace(',', '').replace('$', '').replace(' ', '') is_neg = False if '(' in s_val and ')' in s_val: is_neg = True s_val = s_val.replace('(', '').replace(')', '') elif s_val.startswith('-'): is_neg = True s_val = s_val.replace('-', '') try: amount = float(s_val) except: continue if is_neg: tran_type = 'expense' else: # If positive, is it income? Not necessarily. # Bank statements often show expenses as positive in a 'Withdrawals' list. # But since we found generic 'Amount' header, we can check 'type' column if 'type' in header_map: t_col = str(row[header_map['type']]).lower() if any(k in t_col for k in ['cr', 'credit', 'deposit', 'income']): tran_type = 'income' elif any(k in t_col for k in ['dr', 'debit', 'withdrawal', 'expense']): tran_type = 'expense' else: tran_type = 'expense' else: # Ambiguous: Default to expense? Or look for + sign? # Let's assume positive = income, negative = expense if mixed. # BUT: If all are positive, they might be expenses if statement is 'Credit Card'. # For now: assume positive = income if we can't tell. # Wait, standard CSV logic (parse_and_import) assumes positive amount + explicit type OR implicit sign. # Let's stick to signed logic: Positive = Income, Negative = Expense. # If regex 'CR' found in amount string (e.g. 100.00CR), it's income. if 'cr' in str(amt_val).lower(): tran_type = 'income' elif 'dr' in str(amt_val).lower(): tran_type = 'expense' else: # Safer: Default to expense unless explicitly marked as income. # BUT the user said "all are added as income" which was due to this line. # Let's default to 'expense' if we are unsure, as most transactions are expenses. tran_type = 'expense' # Add to list transactions.append({ 'date': date_obj.strftime('%Y-%m-%d'), 'title': title, 'amount': amount, 'type': tran_type, 'category': category }) except Exception as e: # Skip row error continue if not transactions: # Fallback to Regex Text Parsing file.seek(0) # Reset file pointer after table extraction attempt with pdfplumber.open(file) as pdf: for page in pdf.pages: text = page.extract_text() if not text: continue lines = text.split('\n') for line in lines: line = line.strip() if not line: continue # Regex 1: "Date ... Description ... Amount" (Standard Bank Statement) # Supports: DD/MM/YYYY Description 123.45 OR 123.45CR match = re.search(r'^(\d{1,2}[/\.-]\d{1,2}[/\.-]\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line) # Regex 2: "Title Amount Category Date Type" (User's Specific Format) # Pattern: Trader Joes 141.28 Groceries 2023-01-01 Expense if not match: # ^(.+?) matches Title (lazy) # \s+([\d,]+\.\d+) matches Amount # \s+(.+?) matches Category # \s+(\d{4}-\d{2}-\d{2}) matches Date # \s+([A-Za-z]+)$ matches Type match_v2 = re.search(r'^(.+?)\s+([\d,]+\.\d+)\s+(.+?)\s+(\d{4}-\d{2}-\d{2})\s+([A-Za-z]+)$', line) if match_v2: try: title = match_v2.group(1).strip() amount = float(match_v2.group(2).replace(',', '')) category = match_v2.group(3).strip() date_str = match_v2.group(4) type_str = match_v2.group(5).lower() if title.lower() == 'title' and amount == 0: continue # Skip Header if matched roughly if 'amount' in match_v2.group(2).lower(): continue # Skip header strict date_obj = datetime.strptime(date_str, '%Y-%m-%d') transactions.append({ 'date': date_obj.strftime('%Y-%m-%d'), 'title': title, 'amount': abs(amount), 'type': 'income' if 'income' in type_str else 'expense', 'category': category }) continue # Skip other checks except: pass if not match: # Try "DD Mon YYYY" format for Regex 1 match = re.search(r'^(\d{1,2}\s+[A-Za-z]{3}\s+\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line) if match: date_str = match.group(1) desc_str = match.group(2) amt_str = match.group(3) # Parse Date date_obj = None for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d.%m.%Y', '%d %b %Y', '%d %b %y']: try: date_obj = datetime.strptime(date_str, fmt) break except: pass if not date_obj: continue # Parse Amount try: is_credit = False s_val = amt_str.replace(',', '').replace('$', '').replace(' ', '') if 'cr' in s_val.lower(): is_credit = True s_val = s_val.lower().replace('cr', '') amount = float(s_val) tran_type = 'income' if (amount > 0 and is_credit) else 'expense' # Refined logic: Usually statements use -ve for expense. If positive and CR, it's income. # If just positive number at end of line? Usually expense (debit card) or income (deposit)? # Context needed. Let's assume Expense if typical shopping statement. transactions.append({ 'date': date_obj.strftime('%Y-%m-%d'), 'title': desc_str.strip(), 'amount': abs(amount), 'type': tran_type, 'category': 'Uncategorized' }) except: continue if not transactions: return None # --- NEW: Auto-categorize Batch --- try: from .category_classifier import batch_classify_transactions # Prepare for classifier batch_input = [] for tx in transactions: batch_input.append({ 'title': tx['title'], 'type': tx['type'].capitalize() if tx.get('type') else 'Expense' }) results = batch_classify_transactions(batch_input) for idx, res in enumerate(results): transactions[idx]['category'] = res['category'] except Exception as e: print(f"Error during PDF batch categorization: {e}") return pd.DataFrame(transactions) except Exception as e: import traceback traceback.print_exc() return None