import pandas as pd import numpy as np from sklearn.cluster import KMeans from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LinearRegression COMMON_CATEGORIES = { "groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"], "transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"], "dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"], "salary": ["salary", "payroll", "adp", "direct deposit"], "rent": ["rent", "landlord", "apartment"], "utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"], "entertainment": ["netflix", "spotify", "movie", "theatre", "concert"], "health": ["clinic", "hospital", "pharmacy", "drugstore"], "shopping": ["amazon", "flipkart", "mall", "store", "shop"], "others": [] } def preprocess_transactions(df): # Try to detect common column names and standardize df = df.copy() # Lowercase columns df.columns = [c.strip().lower() for c in df.columns] # Heuristics for date, amount, description columns date_col = None amount_col = None desc_col = None for c in df.columns: if "date" in c: date_col = c if c in ("amount", "amt", "transaction amount", "value"): amount_col = c if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")): desc_col = c # fallback guesses if date_col is None: for c in df.columns: if df[c].dtype == "object" and df[c].str.contains("-").any(): date_col = c break if amount_col is None: for c in df.columns: if np.issubdtype(df[c].dtype, np.number): amount_col = c break if desc_col is None: for c in df.columns: if df[c].dtype == "object": desc_col = c break # Rename to standard if date_col: df["date"] = pd.to_datetime(df[date_col], errors="coerce") else: df["date"] = pd.NaT if amount_col: df["amount"] = pd.to_numeric(df[amount_col], errors="coerce") else: df["amount"] = 0.0 if desc_col: df["description"] = df[desc_col].astype(str) else: df["description"] = "" # Basic cleaning df = df[["date","amount","description"]] df = df.dropna(subset=["amount"]) # Fill missing dates with forward fill df["date"] = df["date"].fillna(method="ffill") # Assign categories using rules df["category"] = df["description"].apply(rule_based_category) # For any uncategorized, use a simple text clustering uncategorized = df["category"].isin(["others"]) if uncategorized.any(): df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"]) # Normalize amounts (assume negative for expenses sometimes) df["amount"] = df["amount"].astype(float) return df def rule_based_category(text): t = text.lower() for cat, tokens in COMMON_CATEGORIES.items(): for tk in tokens: if tk in t: return cat # check for debit/credit language if any(x in t for x in ("salary","credit","deposit")): return "salary" if any(x in t for x in ("payment","bill","charge")): return "utilities" return "others" def cluster_categorize(text_series): txts = text_series.fillna("").astype(str).values if len(txts) == 0: return ["others"] * len(txts) vect = TfidfVectorizer(ngram_range=(1,2), max_features=500) X = vect.fit_transform(txts) n_clusters = min(4, max(1, len(txts)//5)) k = KMeans(n_clusters=n_clusters, random_state=42) labels = k.fit_predict(X) # Map clusters to generic names mapping = {i: f"cluster_{i}" for i in range(n_clusters)} return [mapping[l] for l in labels] def summarize_transactions(df): df = df.copy() df["month"] = df["date"].dt.to_period("M").astype(str) total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum() # We'll show per-month summary monthly = df.groupby("month")["amount"].sum().reset_index() cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount") text = [] text.append(f"Total transactions: {len(df)}") text.append(f"Total amount (signed): {df['amount'].sum():.2f}") text.append("Monthly totals (month -> amount):") for _, row in monthly.iterrows(): text.append(f" {row['month']} -> {row['amount']:.2f}") text.append("Category breakdown (category -> amount):") for _, row in cat.iterrows(): text.append(f" {row['category']} -> {row['amount']:.2f}") return "\\n".join(text) def category_summary(df): cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False) return cat def predict_expenses(df): # Simple linear regression per category using month index df = df.copy() df["month"] = df["date"].dt.to_period("M") df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month preds = {} for cat, g in df.groupby("category"): monthly = g.groupby("month_idx")["amount"].sum().reset_index() if len(monthly) < 2: preds[cat] = monthly["amount"].sum() continue X = monthly[["month_idx"]].values y = monthly["amount"].values model = LinearRegression() try: model.fit(X, y) next_month = np.array([[monthly["month_idx"].max() + 1]]) preds[cat] = float(model.predict(next_month)[0]) except Exception: preds[cat] = float(y.mean()) return preds