Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LinearRegression | |
| COMMON_CATEGORIES = { | |
| "groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"], | |
| "transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"], | |
| "dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"], | |
| "salary": ["salary", "payroll", "adp", "direct deposit"], | |
| "rent": ["rent", "landlord", "apartment"], | |
| "utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"], | |
| "entertainment": ["netflix", "spotify", "movie", "theatre", "concert"], | |
| "health": ["clinic", "hospital", "pharmacy", "drugstore"], | |
| "shopping": ["amazon", "flipkart", "mall", "store", "shop"], | |
| "others": [] | |
| } | |
| def preprocess_transactions(df): | |
| # Try to detect common column names and standardize | |
| df = df.copy() | |
| # Lowercase columns | |
| df.columns = [c.strip().lower() for c in df.columns] | |
| # Heuristics for date, amount, description columns | |
| date_col = None | |
| amount_col = None | |
| desc_col = None | |
| for c in df.columns: | |
| if "date" in c: | |
| date_col = c | |
| if c in ("amount", "amt", "transaction amount", "value"): | |
| amount_col = c | |
| if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")): | |
| desc_col = c | |
| # fallback guesses | |
| if date_col is None: | |
| for c in df.columns: | |
| if df[c].dtype == "object" and df[c].str.contains("-").any(): | |
| date_col = c | |
| break | |
| if amount_col is None: | |
| for c in df.columns: | |
| if np.issubdtype(df[c].dtype, np.number): | |
| amount_col = c | |
| break | |
| if desc_col is None: | |
| for c in df.columns: | |
| if df[c].dtype == "object": | |
| desc_col = c | |
| break | |
| # Rename to standard | |
| if date_col: | |
| df["date"] = pd.to_datetime(df[date_col], errors="coerce") | |
| else: | |
| df["date"] = pd.NaT | |
| if amount_col: | |
| df["amount"] = pd.to_numeric(df[amount_col], errors="coerce") | |
| else: | |
| df["amount"] = 0.0 | |
| if desc_col: | |
| df["description"] = df[desc_col].astype(str) | |
| else: | |
| df["description"] = "" | |
| # Basic cleaning | |
| df = df[["date","amount","description"]] | |
| df = df.dropna(subset=["amount"]) | |
| # Fill missing dates with forward fill | |
| df["date"] = df["date"].fillna(method="ffill") | |
| # Assign categories using rules | |
| df["category"] = df["description"].apply(rule_based_category) | |
| # For any uncategorized, use a simple text clustering | |
| uncategorized = df["category"].isin(["others"]) | |
| if uncategorized.any(): | |
| df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"]) | |
| # Normalize amounts (assume negative for expenses sometimes) | |
| df["amount"] = df["amount"].astype(float) | |
| return df | |
| def rule_based_category(text): | |
| t = text.lower() | |
| for cat, tokens in COMMON_CATEGORIES.items(): | |
| for tk in tokens: | |
| if tk in t: | |
| return cat | |
| # check for debit/credit language | |
| if any(x in t for x in ("salary","credit","deposit")): | |
| return "salary" | |
| if any(x in t for x in ("payment","bill","charge")): | |
| return "utilities" | |
| return "others" | |
| def cluster_categorize(text_series): | |
| txts = text_series.fillna("").astype(str).values | |
| if len(txts) == 0: | |
| return ["others"] * len(txts) | |
| vect = TfidfVectorizer(ngram_range=(1,2), max_features=500) | |
| X = vect.fit_transform(txts) | |
| n_clusters = min(4, max(1, len(txts)//5)) | |
| k = KMeans(n_clusters=n_clusters, random_state=42) | |
| labels = k.fit_predict(X) | |
| # Map clusters to generic names | |
| mapping = {i: f"cluster_{i}" for i in range(n_clusters)} | |
| return [mapping[l] for l in labels] | |
| def summarize_transactions(df): | |
| df = df.copy() | |
| df["month"] = df["date"].dt.to_period("M").astype(str) | |
| total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum() | |
| # We'll show per-month summary | |
| monthly = df.groupby("month")["amount"].sum().reset_index() | |
| cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount") | |
| text = [] | |
| text.append(f"Total transactions: {len(df)}") | |
| text.append(f"Total amount (signed): {df['amount'].sum():.2f}") | |
| text.append("Monthly totals (month -> amount):") | |
| for _, row in monthly.iterrows(): | |
| text.append(f" {row['month']} -> {row['amount']:.2f}") | |
| text.append("Category breakdown (category -> amount):") | |
| for _, row in cat.iterrows(): | |
| text.append(f" {row['category']} -> {row['amount']:.2f}") | |
| return "\\n".join(text) | |
| def category_summary(df): | |
| cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False) | |
| return cat | |
| def predict_expenses(df): | |
| # Simple linear regression per category using month index | |
| df = df.copy() | |
| df["month"] = df["date"].dt.to_period("M") | |
| df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month | |
| preds = {} | |
| for cat, g in df.groupby("category"): | |
| monthly = g.groupby("month_idx")["amount"].sum().reset_index() | |
| if len(monthly) < 2: | |
| preds[cat] = monthly["amount"].sum() | |
| continue | |
| X = monthly[["month_idx"]].values | |
| y = monthly["amount"].values | |
| model = LinearRegression() | |
| try: | |
| model.fit(X, y) | |
| next_month = np.array([[monthly["month_idx"].max() + 1]]) | |
| preds[cat] = float(model.predict(next_month)[0]) | |
| except Exception: | |
| preds[cat] = float(y.mean()) | |
| return preds | |