Spaces:
Sleeping
Sleeping
File size: 5,835 Bytes
030fa79 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
COMMON_CATEGORIES = {
"groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"],
"transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"],
"dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"],
"salary": ["salary", "payroll", "adp", "direct deposit"],
"rent": ["rent", "landlord", "apartment"],
"utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"],
"entertainment": ["netflix", "spotify", "movie", "theatre", "concert"],
"health": ["clinic", "hospital", "pharmacy", "drugstore"],
"shopping": ["amazon", "flipkart", "mall", "store", "shop"],
"others": []
}
def preprocess_transactions(df):
# Try to detect common column names and standardize
df = df.copy()
# Lowercase columns
df.columns = [c.strip().lower() for c in df.columns]
# Heuristics for date, amount, description columns
date_col = None
amount_col = None
desc_col = None
for c in df.columns:
if "date" in c:
date_col = c
if c in ("amount", "amt", "transaction amount", "value"):
amount_col = c
if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")):
desc_col = c
# fallback guesses
if date_col is None:
for c in df.columns:
if df[c].dtype == "object" and df[c].str.contains("-").any():
date_col = c
break
if amount_col is None:
for c in df.columns:
if np.issubdtype(df[c].dtype, np.number):
amount_col = c
break
if desc_col is None:
for c in df.columns:
if df[c].dtype == "object":
desc_col = c
break
# Rename to standard
if date_col:
df["date"] = pd.to_datetime(df[date_col], errors="coerce")
else:
df["date"] = pd.NaT
if amount_col:
df["amount"] = pd.to_numeric(df[amount_col], errors="coerce")
else:
df["amount"] = 0.0
if desc_col:
df["description"] = df[desc_col].astype(str)
else:
df["description"] = ""
# Basic cleaning
df = df[["date","amount","description"]]
df = df.dropna(subset=["amount"])
# Fill missing dates with forward fill
df["date"] = df["date"].fillna(method="ffill")
# Assign categories using rules
df["category"] = df["description"].apply(rule_based_category)
# For any uncategorized, use a simple text clustering
uncategorized = df["category"].isin(["others"])
if uncategorized.any():
df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"])
# Normalize amounts (assume negative for expenses sometimes)
df["amount"] = df["amount"].astype(float)
return df
def rule_based_category(text):
t = text.lower()
for cat, tokens in COMMON_CATEGORIES.items():
for tk in tokens:
if tk in t:
return cat
# check for debit/credit language
if any(x in t for x in ("salary","credit","deposit")):
return "salary"
if any(x in t for x in ("payment","bill","charge")):
return "utilities"
return "others"
def cluster_categorize(text_series):
txts = text_series.fillna("").astype(str).values
if len(txts) == 0:
return ["others"] * len(txts)
vect = TfidfVectorizer(ngram_range=(1,2), max_features=500)
X = vect.fit_transform(txts)
n_clusters = min(4, max(1, len(txts)//5))
k = KMeans(n_clusters=n_clusters, random_state=42)
labels = k.fit_predict(X)
# Map clusters to generic names
mapping = {i: f"cluster_{i}" for i in range(n_clusters)}
return [mapping[l] for l in labels]
def summarize_transactions(df):
df = df.copy()
df["month"] = df["date"].dt.to_period("M").astype(str)
total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum()
# We'll show per-month summary
monthly = df.groupby("month")["amount"].sum().reset_index()
cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount")
text = []
text.append(f"Total transactions: {len(df)}")
text.append(f"Total amount (signed): {df['amount'].sum():.2f}")
text.append("Monthly totals (month -> amount):")
for _, row in monthly.iterrows():
text.append(f" {row['month']} -> {row['amount']:.2f}")
text.append("Category breakdown (category -> amount):")
for _, row in cat.iterrows():
text.append(f" {row['category']} -> {row['amount']:.2f}")
return "\\n".join(text)
def category_summary(df):
cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False)
return cat
def predict_expenses(df):
# Simple linear regression per category using month index
df = df.copy()
df["month"] = df["date"].dt.to_period("M")
df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month
preds = {}
for cat, g in df.groupby("category"):
monthly = g.groupby("month_idx")["amount"].sum().reset_index()
if len(monthly) < 2:
preds[cat] = monthly["amount"].sum()
continue
X = monthly[["month_idx"]].values
y = monthly["amount"].values
model = LinearRegression()
try:
model.fit(X, y)
next_month = np.array([[monthly["month_idx"].max() + 1]])
preds[cat] = float(model.predict(next_month)[0])
except Exception:
preds[cat] = float(y.mean())
return preds
|