BudgetBuddy / finance_utils.py
ParthBhuptani's picture
Upload 7 files
030fa79 verified
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
COMMON_CATEGORIES = {
"groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"],
"transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"],
"dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"],
"salary": ["salary", "payroll", "adp", "direct deposit"],
"rent": ["rent", "landlord", "apartment"],
"utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"],
"entertainment": ["netflix", "spotify", "movie", "theatre", "concert"],
"health": ["clinic", "hospital", "pharmacy", "drugstore"],
"shopping": ["amazon", "flipkart", "mall", "store", "shop"],
"others": []
}
def preprocess_transactions(df):
# Try to detect common column names and standardize
df = df.copy()
# Lowercase columns
df.columns = [c.strip().lower() for c in df.columns]
# Heuristics for date, amount, description columns
date_col = None
amount_col = None
desc_col = None
for c in df.columns:
if "date" in c:
date_col = c
if c in ("amount", "amt", "transaction amount", "value"):
amount_col = c
if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")):
desc_col = c
# fallback guesses
if date_col is None:
for c in df.columns:
if df[c].dtype == "object" and df[c].str.contains("-").any():
date_col = c
break
if amount_col is None:
for c in df.columns:
if np.issubdtype(df[c].dtype, np.number):
amount_col = c
break
if desc_col is None:
for c in df.columns:
if df[c].dtype == "object":
desc_col = c
break
# Rename to standard
if date_col:
df["date"] = pd.to_datetime(df[date_col], errors="coerce")
else:
df["date"] = pd.NaT
if amount_col:
df["amount"] = pd.to_numeric(df[amount_col], errors="coerce")
else:
df["amount"] = 0.0
if desc_col:
df["description"] = df[desc_col].astype(str)
else:
df["description"] = ""
# Basic cleaning
df = df[["date","amount","description"]]
df = df.dropna(subset=["amount"])
# Fill missing dates with forward fill
df["date"] = df["date"].fillna(method="ffill")
# Assign categories using rules
df["category"] = df["description"].apply(rule_based_category)
# For any uncategorized, use a simple text clustering
uncategorized = df["category"].isin(["others"])
if uncategorized.any():
df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"])
# Normalize amounts (assume negative for expenses sometimes)
df["amount"] = df["amount"].astype(float)
return df
def rule_based_category(text):
t = text.lower()
for cat, tokens in COMMON_CATEGORIES.items():
for tk in tokens:
if tk in t:
return cat
# check for debit/credit language
if any(x in t for x in ("salary","credit","deposit")):
return "salary"
if any(x in t for x in ("payment","bill","charge")):
return "utilities"
return "others"
def cluster_categorize(text_series):
txts = text_series.fillna("").astype(str).values
if len(txts) == 0:
return ["others"] * len(txts)
vect = TfidfVectorizer(ngram_range=(1,2), max_features=500)
X = vect.fit_transform(txts)
n_clusters = min(4, max(1, len(txts)//5))
k = KMeans(n_clusters=n_clusters, random_state=42)
labels = k.fit_predict(X)
# Map clusters to generic names
mapping = {i: f"cluster_{i}" for i in range(n_clusters)}
return [mapping[l] for l in labels]
def summarize_transactions(df):
df = df.copy()
df["month"] = df["date"].dt.to_period("M").astype(str)
total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum()
# We'll show per-month summary
monthly = df.groupby("month")["amount"].sum().reset_index()
cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount")
text = []
text.append(f"Total transactions: {len(df)}")
text.append(f"Total amount (signed): {df['amount'].sum():.2f}")
text.append("Monthly totals (month -> amount):")
for _, row in monthly.iterrows():
text.append(f" {row['month']} -> {row['amount']:.2f}")
text.append("Category breakdown (category -> amount):")
for _, row in cat.iterrows():
text.append(f" {row['category']} -> {row['amount']:.2f}")
return "\\n".join(text)
def category_summary(df):
cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False)
return cat
def predict_expenses(df):
# Simple linear regression per category using month index
df = df.copy()
df["month"] = df["date"].dt.to_period("M")
df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month
preds = {}
for cat, g in df.groupby("category"):
monthly = g.groupby("month_idx")["amount"].sum().reset_index()
if len(monthly) < 2:
preds[cat] = monthly["amount"].sum()
continue
X = monthly[["month_idx"]].values
y = monthly["amount"].values
model = LinearRegression()
try:
model.fit(X, y)
next_month = np.array([[monthly["month_idx"].max() + 1]])
preds[cat] = float(model.predict(next_month)[0])
except Exception:
preds[cat] = float(y.mean())
return preds