Spaces:

ParthBhuptani
/

BudgetBuddy

Sleeping

App Files Files Community

BudgetBuddy / finance_utils.py

ParthBhuptani

Upload 7 files

030fa79 verified 2 months ago

raw

history blame contribute delete

5.84 kB

	import pandas as pd
	import numpy as np
	from sklearn.cluster import KMeans
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LinearRegression

	COMMON_CATEGORIES = {
	"groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"],
	"transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"],
	"dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"],
	"salary": ["salary", "payroll", "adp", "direct deposit"],
	"rent": ["rent", "landlord", "apartment"],
	"utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"],
	"entertainment": ["netflix", "spotify", "movie", "theatre", "concert"],
	"health": ["clinic", "hospital", "pharmacy", "drugstore"],
	"shopping": ["amazon", "flipkart", "mall", "store", "shop"],
	"others": []
	}

	def preprocess_transactions(df):
	# Try to detect common column names and standardize
	df = df.copy()
	# Lowercase columns
	df.columns = [c.strip().lower() for c in df.columns]
	# Heuristics for date, amount, description columns
	date_col = None
	amount_col = None
	desc_col = None
	for c in df.columns:
	if "date" in c:
	date_col = c
	if c in ("amount", "amt", "transaction amount", "value"):
	amount_col = c
	if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")):
	desc_col = c
	# fallback guesses
	if date_col is None:
	for c in df.columns:
	if df[c].dtype == "object" and df[c].str.contains("-").any():
	date_col = c
	break
	if amount_col is None:
	for c in df.columns:
	if np.issubdtype(df[c].dtype, np.number):
	amount_col = c
	break
	if desc_col is None:
	for c in df.columns:
	if df[c].dtype == "object":
	desc_col = c
	break
	# Rename to standard
	if date_col:
	df["date"] = pd.to_datetime(df[date_col], errors="coerce")
	else:
	df["date"] = pd.NaT
	if amount_col:
	df["amount"] = pd.to_numeric(df[amount_col], errors="coerce")
	else:
	df["amount"] = 0.0
	if desc_col:
	df["description"] = df[desc_col].astype(str)
	else:
	df["description"] = ""
	# Basic cleaning
	df = df[["date","amount","description"]]
	df = df.dropna(subset=["amount"])
	# Fill missing dates with forward fill
	df["date"] = df["date"].fillna(method="ffill")
	# Assign categories using rules
	df["category"] = df["description"].apply(rule_based_category)
	# For any uncategorized, use a simple text clustering
	uncategorized = df["category"].isin(["others"])
	if uncategorized.any():
	df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"])
	# Normalize amounts (assume negative for expenses sometimes)
	df["amount"] = df["amount"].astype(float)
	return df

	def rule_based_category(text):
	t = text.lower()
	for cat, tokens in COMMON_CATEGORIES.items():
	for tk in tokens:
	if tk in t:
	return cat
	# check for debit/credit language
	if any(x in t for x in ("salary","credit","deposit")):
	return "salary"
	if any(x in t for x in ("payment","bill","charge")):
	return "utilities"
	return "others"

	def cluster_categorize(text_series):
	txts = text_series.fillna("").astype(str).values
	if len(txts) == 0:
	return ["others"] * len(txts)
	vect = TfidfVectorizer(ngram_range=(1,2), max_features=500)
	X = vect.fit_transform(txts)
	n_clusters = min(4, max(1, len(txts)//5))
	k = KMeans(n_clusters=n_clusters, random_state=42)
	labels = k.fit_predict(X)
	# Map clusters to generic names
	mapping = {i: f"cluster_{i}" for i in range(n_clusters)}
	return [mapping[l] for l in labels]

	def summarize_transactions(df):
	df = df.copy()
	df["month"] = df["date"].dt.to_period("M").astype(str)
	total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum()
	# We'll show per-month summary
	monthly = df.groupby("month")["amount"].sum().reset_index()
	cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount")
	text = []
	text.append(f"Total transactions: {len(df)}")
	text.append(f"Total amount (signed): {df['amount'].sum():.2f}")
	text.append("Monthly totals (month -> amount):")
	for _, row in monthly.iterrows():
	text.append(f" {row['month']} -> {row['amount']:.2f}")
	text.append("Category breakdown (category -> amount):")
	for _, row in cat.iterrows():
	text.append(f" {row['category']} -> {row['amount']:.2f}")
	return "\\n".join(text)

	def category_summary(df):
	cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False)
	return cat

	def predict_expenses(df):
	# Simple linear regression per category using month index
	df = df.copy()
	df["month"] = df["date"].dt.to_period("M")
	df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month
	preds = {}
	for cat, g in df.groupby("category"):
	monthly = g.groupby("month_idx")["amount"].sum().reset_index()
	if len(monthly) < 2:
	preds[cat] = monthly["amount"].sum()
	continue
	X = monthly[["month_idx"]].values
	y = monthly["amount"].values
	model = LinearRegression()
	try:
	model.fit(X, y)
	next_month = np.array([[monthly["month_idx"].max() + 1]])
	preds[cat] = float(model.predict(next_month)[0])
	except Exception:
	preds[cat] = float(y.mean())
	return preds