Spaces:

ParthBhuptani
/

BudgetBuddy

Sleeping

File size: 5,835 Bytes

030fa79

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

COMMON_CATEGORIES = {
    "groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"],
    "transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"],
    "dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"],
    "salary": ["salary", "payroll", "adp", "direct deposit"],
    "rent": ["rent", "landlord", "apartment"],
    "utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"],
    "entertainment": ["netflix", "spotify", "movie", "theatre", "concert"],
    "health": ["clinic", "hospital", "pharmacy", "drugstore"],
    "shopping": ["amazon", "flipkart", "mall", "store", "shop"],
    "others": []
}

def preprocess_transactions(df):
    # Try to detect common column names and standardize
    df = df.copy()
    # Lowercase columns
    df.columns = [c.strip().lower() for c in df.columns]
    # Heuristics for date, amount, description columns
    date_col = None
    amount_col = None
    desc_col = None
    for c in df.columns:
        if "date" in c:
            date_col = c
        if c in ("amount", "amt", "transaction amount", "value"):
            amount_col = c
        if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")):
            desc_col = c
    # fallback guesses
    if date_col is None:
        for c in df.columns:
            if df[c].dtype == "object" and df[c].str.contains("-").any():
                date_col = c
                break
    if amount_col is None:
        for c in df.columns:
            if np.issubdtype(df[c].dtype, np.number):
                amount_col = c
                break
    if desc_col is None:
        for c in df.columns:
            if df[c].dtype == "object":
                desc_col = c
                break
    # Rename to standard
    if date_col:
        df["date"] = pd.to_datetime(df[date_col], errors="coerce")
    else:
        df["date"] = pd.NaT
    if amount_col:
        df["amount"] = pd.to_numeric(df[amount_col], errors="coerce")
    else:
        df["amount"] = 0.0
    if desc_col:
        df["description"] = df[desc_col].astype(str)
    else:
        df["description"] = ""
    # Basic cleaning
    df = df[["date","amount","description"]]
    df = df.dropna(subset=["amount"])
    # Fill missing dates with forward fill
    df["date"] = df["date"].fillna(method="ffill")
    # Assign categories using rules
    df["category"] = df["description"].apply(rule_based_category)
    # For any uncategorized, use a simple text clustering
    uncategorized = df["category"].isin(["others"])
    if uncategorized.any():
        df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"])
    # Normalize amounts (assume negative for expenses sometimes)
    df["amount"] = df["amount"].astype(float)
    return df

def rule_based_category(text):
    t = text.lower()
    for cat, tokens in COMMON_CATEGORIES.items():
        for tk in tokens:
            if tk in t:
                return cat
    # check for debit/credit language
    if any(x in t for x in ("salary","credit","deposit")):
        return "salary"
    if any(x in t for x in ("payment","bill","charge")):
        return "utilities"
    return "others"

def cluster_categorize(text_series):
    txts = text_series.fillna("").astype(str).values
    if len(txts) == 0:
        return ["others"] * len(txts)
    vect = TfidfVectorizer(ngram_range=(1,2), max_features=500)
    X = vect.fit_transform(txts)
    n_clusters = min(4, max(1, len(txts)//5))
    k = KMeans(n_clusters=n_clusters, random_state=42)
    labels = k.fit_predict(X)
    # Map clusters to generic names
    mapping = {i: f"cluster_{i}" for i in range(n_clusters)}
    return [mapping[l] for l in labels]

def summarize_transactions(df):
    df = df.copy()
    df["month"] = df["date"].dt.to_period("M").astype(str)
    total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum()
    # We'll show per-month summary
    monthly = df.groupby("month")["amount"].sum().reset_index()
    cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount")
    text = []
    text.append(f"Total transactions: {len(df)}")
    text.append(f"Total amount (signed): {df['amount'].sum():.2f}")
    text.append("Monthly totals (month -> amount):")
    for _, row in monthly.iterrows():
        text.append(f"  {row['month']} -> {row['amount']:.2f}")
    text.append("Category breakdown (category -> amount):")
    for _, row in cat.iterrows():
        text.append(f"  {row['category']} -> {row['amount']:.2f}")
    return "\\n".join(text)

def category_summary(df):
    cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False)
    return cat

def predict_expenses(df):
    # Simple linear regression per category using month index
    df = df.copy()
    df["month"] = df["date"].dt.to_period("M")
    df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month
    preds = {}
    for cat, g in df.groupby("category"):
        monthly = g.groupby("month_idx")["amount"].sum().reset_index()
        if len(monthly) < 2:
            preds[cat] = monthly["amount"].sum()
            continue
        X = monthly[["month_idx"]].values
        y = monthly["amount"].values
        model = LinearRegression()
        try:
            model.fit(X, y)
            next_month = np.array([[monthly["month_idx"].max() + 1]])
            preds[cat] = float(model.predict(next_month)[0])
        except Exception:
            preds[cat] = float(y.mean())
    return preds