FinMK / backend /finance /services.py
Kumar
Refactor: Exclude PDF and CSV files from Git to fix HF push error
24e6f5b
import pandas as pd
from datetime import datetime, timedelta
from expense_tracker.utils import MongoDBClient
from bson import ObjectId
import pdfplumber
import re
import io
import os
from dotenv import load_dotenv
load_dotenv()
def process_recurring_incomes(user_id):
"""
Checks all recurring incomes for a user and creates new transactions
if the recurrence interval has passed since the last run.
"""
db = MongoDBClient.get_client()
user = db.users.find_one({'_id': ObjectId(user_id)}, {'financial_data.incomes': 1})
if not user or 'financial_data' not in user or 'incomes' not in user['financial_data']:
return 0
incomes = user['financial_data']['incomes']
generated_count = 0
updated_incomes = []
for income in incomes:
if not income.get('is_recurring'):
updated_incomes.append(income)
continue
last_date = income.get('last_run_date')
if not last_date:
last_date = income.get('date') # Default to creation date if never run
# Ensure last_date is datetime
if isinstance(last_date, str):
try:
last_date = datetime.strptime(last_date, '%Y-%m-%d')
except:
updated_incomes.append(income)
continue
interval = income.get('recurrence_interval')
if not interval:
updated_incomes.append(income)
continue
next_due_date = calculate_next_date(last_date, interval)
current_date = datetime.now()
new_instances = []
last_inst_date = last_date
# While the next due date is in the past, generate a transaction
while next_due_date <= current_date:
new_tx = income.copy()
new_tx['_id'] = ObjectId()
new_tx['date'] = next_due_date
new_tx['is_recurring'] = False
new_tx['recurrence_interval'] = None
new_tx['created_at'] = datetime.now()
new_tx['title'] = f"{income.get('title')} (Recurring)"
new_tx['parent_id'] = str(income.get('_id'))
new_instances.append(new_tx)
generated_count += 1
last_inst_date = next_due_date
next_due_date = calculate_next_date(last_inst_date, interval)
# Update the template income in the list
income['last_run_date'] = last_inst_date
updated_incomes.append(income)
# Add new instances to the batch
if new_instances:
updated_incomes.extend(new_instances)
# Save all updates back to user document
if generated_count > 0:
db.users.update_one(
{'_id': ObjectId(user_id)},
{'$set': {'financial_data.incomes': updated_incomes}}
)
return generated_count
def calculate_next_date(start_date, interval):
if interval == 'daily':
return start_date + timedelta(days=1)
elif interval == 'weekly':
return start_date + timedelta(weeks=1)
elif interval == 'monthly':
# Simple monthly addition (approx 30 days or same day next month logic)
# For MVP, using 30 days approximation to avoid calendar complexity
return start_date + timedelta(days=30)
elif interval == 'yearly':
return start_date + timedelta(days=365)
return start_date + timedelta(days=36500) # Fallback to far future
def parse_and_import_transactions(file, user_id):
"""
Parses a CSV or Excel file and imports transaction records (Income/Expense).
Expected columns: Type, Title, Amount, Category, Date
If Type is missing, positive amount = Income, negative amount = Expense (optional logic)
"""
try:
if file.name.endswith('.csv'):
df = pd.read_csv(file)
elif file.name.endswith(('.xls', '.xlsx')):
df = pd.read_excel(file)
elif file.name.endswith('.pdf'):
df = parse_pdf_transactions(file)
else:
return 0, "Unsupported file format. Please upload CSV, Excel, or PDF."
if df is None or df.empty:
return 0, "Could not extract transactions from file."
# Standardize column names (lowercase)
df.columns = [c.lower().strip() for c in df.columns]
# 'type' is optional but recommended. 'amount' is required.
required_cols = ['title', 'amount', 'date']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
return 0, f"Missing columns: {', '.join(missing_cols)}"
db = MongoDBClient.get_client()
count = 0
income_docs = []
expense_docs = []
for _, row in df.iterrows():
try:
amount = float(row['amount'])
title = row['title']
category = row.get('category', 'Uncategorized')
date_str = row.get('date')
try:
date = pd.to_datetime(date_str).to_pydatetime()
except:
date = datetime.now()
# Determine Type
tran_type = None
if 'type' in df.columns:
val = str(row['type']).lower()
if 'income' in val: tran_type = 'income'
elif 'expense' in val: tran_type = 'expense'
if not tran_type:
if amount >= 0: tran_type = 'income'
else:
tran_type = 'expense'
amount = abs(amount)
doc = {
'_id': ObjectId(),
'title': title,
'amount': abs(amount),
'category': category,
'date': date,
'created_at': datetime.now(),
'source': 'import'
}
if tran_type == 'income':
income_docs.append(doc)
else:
expense_docs.append(doc)
count += 1
except Exception:
continue
# --- NEW: Auto-categorize Batch for CSV/Excel ---
all_docs = income_docs + expense_docs
to_categorize = [d for d in all_docs if d.get('category') in ['Uncategorized', '', None]]
if to_categorize:
try:
from .category_classifier import batch_classify_transactions
batch_input = []
for d in to_categorize:
# Find if it's income or expense to pass correct type
d_type = 'Income' if any(id(d) == id(inc) for inc in income_docs) else 'Expense'
batch_input.append({'title': d['title'], 'type': d_type})
results = batch_classify_transactions(batch_input)
for idx, res in enumerate(results):
to_categorize[idx]['category'] = res['category']
except Exception as e:
print(f"Error during import batch categorization: {e}")
# Batch Update for performance
if income_docs:
db.users.update_one(
{'_id': ObjectId(user_id)},
{'$push': {'financial_data.incomes': {'$each': income_docs}}}
)
if expense_docs:
db.users.update_one(
{'_id': ObjectId(user_id)},
{'$push': {'financial_data.expenses': {'$each': expense_docs}}}
)
return count, None
except Exception as e:
import traceback
traceback.print_exc()
return 0, str(e)
def parse_pdf_transactions(file):
"""
Extracts transactions from PDF using deterministic table extraction (pdfplumber).
Fallback to text regex if tables are not found (TODO).
"""""
try:
transactions = []
# Open PDF (file object)
with pdfplumber.open(file) as pdf:
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if not table: continue
# 1. Identify Headers
# We look for a row that contains common header keywords
header_map = {}
header_row_idx = -1
for idx, row in enumerate(table):
# Clean row: filter None, to_lower
row_text = [str(cell).lower().strip() if cell else '' for cell in row]
# Check for Date
if any(k in row_text for k in ['date', 'txn date', 'transaction date']):
header_row_idx = idx
# Map columns
for col_idx, cell in enumerate(row_text):
if 'date' in cell: header_map['date'] = col_idx
elif any(k in cell for k in ['title', 'description', 'particulars', 'details', 'narrative', 'transaction']): header_map['title'] = col_idx
elif any(k in cell for k in ['debit', 'withdrawal', 'dr']): header_map['debit'] = col_idx
elif any(k in cell for k in ['credit', 'deposit', 'cr']): header_map['credit'] = col_idx
elif 'amount' in cell: header_map['amount'] = col_idx # Generic amount (check sign or type col)
elif 'type' in cell: header_map['type'] = col_idx # cr/dr type column
elif 'category' in cell: header_map['category'] = col_idx
elif 'balance' in cell: header_map['balance'] = col_idx
break # Found header, stop looking
if header_row_idx == -1:
# No clear header found in this table, skip or try heuristic (first row?)
continue
# 2. Extract Data Rows
for row in table[header_row_idx+1:]:
if not row: continue
try:
# Extract Date
date_str = None
if 'date' in header_map and header_map['date'] < len(row):
date_str = row[header_map['date']]
if not date_str: continue # Skip line without date
# Normalize Date (Try formats)
# Remove newlines
date_str = str(date_str).replace('\n', ' ').strip()
date_obj = None
for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d %b %Y', '%m/%d/%Y']:
try:
date_obj = datetime.strptime(date_str, fmt)
break
except: pass
if not date_obj: continue # Invalid date
# Extract Title
title = "Transaction"
if 'title' in header_map and header_map['title'] < len(row):
t_val = row[header_map['title']]
if t_val: title = str(t_val).replace('\n', ' ').strip()
# Extract Category (if present in PDF)
category = "Uncategorized"
if 'category' in header_map and header_map['category'] < len(row):
c_val = row[header_map['category']]
if c_val: category = str(c_val).replace('\n', ' ').strip()
# Extract Amount
amount = 0.0
tran_type = 'expense' # Default
# Case A: Debit / Credit Columns
if 'debit' in header_map and 'credit' in header_map:
debit_val = row[header_map['debit']] if header_map['debit'] < len(row) else None
credit_val = row[header_map['credit']] if header_map['credit'] < len(row) else None
# Clean values (remove currency symbols, commas)
def clean_amt(val):
if not val: return 0.0
v = str(val).replace(',', '').replace('$', '').replace('£', '').replace(' ', '')
if not v: return 0.0
try: return float(v)
except: return 0.0
d_amt = clean_amt(debit_val)
c_amt = clean_amt(credit_val)
if c_amt > 0:
amount = c_amt
tran_type = 'income'
elif d_amt > 0:
amount = d_amt
tran_type = 'expense'
else:
continue # Zero transaction
# Case B: Single Amount Column
elif 'amount' in header_map and header_map['amount'] < len(row):
amt_val = row[header_map['amount']]
if not amt_val: continue
# Check if negative parenthesized (100.00) or -100.00
s_val = str(amt_val).replace(',', '').replace('$', '').replace(' ', '')
is_neg = False
if '(' in s_val and ')' in s_val:
is_neg = True
s_val = s_val.replace('(', '').replace(')', '')
elif s_val.startswith('-'):
is_neg = True
s_val = s_val.replace('-', '')
try:
amount = float(s_val)
except: continue
if is_neg:
tran_type = 'expense'
else:
# If positive, is it income? Not necessarily.
# Bank statements often show expenses as positive in a 'Withdrawals' list. # But since we found generic 'Amount' header, we can check 'type' column
if 'type' in header_map:
t_col = str(row[header_map['type']]).lower()
if any(k in t_col for k in ['cr', 'credit', 'deposit', 'income']):
tran_type = 'income'
elif any(k in t_col for k in ['dr', 'debit', 'withdrawal', 'expense']):
tran_type = 'expense'
else:
tran_type = 'expense'
else:
# Ambiguous: Default to expense? Or look for + sign?
# Let's assume positive = income, negative = expense if mixed.
# BUT: If all are positive, they might be expenses if statement is 'Credit Card'.
# For now: assume positive = income if we can't tell.
# Wait, standard CSV logic (parse_and_import) assumes positive amount + explicit type OR implicit sign.
# Let's stick to signed logic: Positive = Income, Negative = Expense.
# If regex 'CR' found in amount string (e.g. 100.00CR), it's income.
if 'cr' in str(amt_val).lower():
tran_type = 'income'
elif 'dr' in str(amt_val).lower():
tran_type = 'expense'
else:
# Safer: Default to expense unless explicitly marked as income.
# BUT the user said "all are added as income" which was due to this line.
# Let's default to 'expense' if we are unsure, as most transactions are expenses.
tran_type = 'expense'
# Add to list
transactions.append({
'date': date_obj.strftime('%Y-%m-%d'),
'title': title,
'amount': amount,
'type': tran_type,
'category': category
})
except Exception as e:
# Skip row error
continue
if not transactions:
# Fallback to Regex Text Parsing
file.seek(0) # Reset file pointer after table extraction attempt
with pdfplumber.open(file) as pdf:
for page in pdf.pages:
text = page.extract_text()
if not text: continue
lines = text.split('\n')
for line in lines:
line = line.strip()
if not line: continue
# Regex 1: "Date ... Description ... Amount" (Standard Bank Statement)
# Supports: DD/MM/YYYY Description 123.45 OR 123.45CR
match = re.search(r'^(\d{1,2}[/\.-]\d{1,2}[/\.-]\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line)
# Regex 2: "Title Amount Category Date Type" (User's Specific Format)
# Pattern: Trader Joes 141.28 Groceries 2023-01-01 Expense
if not match:
# ^(.+?) matches Title (lazy)
# \s+([\d,]+\.\d+) matches Amount
# \s+(.+?) matches Category
# \s+(\d{4}-\d{2}-\d{2}) matches Date
# \s+([A-Za-z]+)$ matches Type
match_v2 = re.search(r'^(.+?)\s+([\d,]+\.\d+)\s+(.+?)\s+(\d{4}-\d{2}-\d{2})\s+([A-Za-z]+)$', line)
if match_v2:
try:
title = match_v2.group(1).strip()
amount = float(match_v2.group(2).replace(',', ''))
category = match_v2.group(3).strip()
date_str = match_v2.group(4)
type_str = match_v2.group(5).lower()
if title.lower() == 'title' and amount == 0: continue # Skip Header if matched roughly
if 'amount' in match_v2.group(2).lower(): continue # Skip header strict
date_obj = datetime.strptime(date_str, '%Y-%m-%d')
transactions.append({
'date': date_obj.strftime('%Y-%m-%d'),
'title': title,
'amount': abs(amount),
'type': 'income' if 'income' in type_str else 'expense',
'category': category
})
continue # Skip other checks
except: pass
if not match:
# Try "DD Mon YYYY" format for Regex 1
match = re.search(r'^(\d{1,2}\s+[A-Za-z]{3}\s+\d{2,4})\s+(.+?)\s+(-?[\d,]+\.\d{2}[CRcr]*)', line)
if match:
date_str = match.group(1)
desc_str = match.group(2)
amt_str = match.group(3)
# Parse Date
date_obj = None
for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%d.%m.%Y', '%d %b %Y', '%d %b %y']:
try:
date_obj = datetime.strptime(date_str, fmt)
break
except: pass
if not date_obj: continue
# Parse Amount
try:
is_credit = False
s_val = amt_str.replace(',', '').replace('$', '').replace(' ', '')
if 'cr' in s_val.lower():
is_credit = True
s_val = s_val.lower().replace('cr', '')
amount = float(s_val)
tran_type = 'income' if (amount > 0 and is_credit) else 'expense'
# Refined logic: Usually statements use -ve for expense. If positive and CR, it's income.
# If just positive number at end of line? Usually expense (debit card) or income (deposit)?
# Context needed. Let's assume Expense if typical shopping statement.
transactions.append({
'date': date_obj.strftime('%Y-%m-%d'),
'title': desc_str.strip(),
'amount': abs(amount),
'type': tran_type,
'category': 'Uncategorized'
})
except: continue
if not transactions:
return None
# --- NEW: Auto-categorize Batch ---
try:
from .category_classifier import batch_classify_transactions
# Prepare for classifier
batch_input = []
for tx in transactions:
batch_input.append({
'title': tx['title'],
'type': tx['type'].capitalize() if tx.get('type') else 'Expense'
})
results = batch_classify_transactions(batch_input)
for idx, res in enumerate(results):
transactions[idx]['category'] = res['category']
except Exception as e:
print(f"Error during PDF batch categorization: {e}")
return pd.DataFrame(transactions)
except Exception as e:
import traceback
traceback.print_exc()
return None