File size: 5,835 Bytes
030fa79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

COMMON_CATEGORIES = {
    "groceries": ["supermarket", "grocery", "walmart", "kroger", "bigbasket", "dmart"],
    "transport": ["uber", "ola", "taxi", "metro", "bus", "train", "rail"],
    "dining": ["restaurant", "cafe", "mcdonald", "domino", "pizza", "burger"],
    "salary": ["salary", "payroll", "adp", "direct deposit"],
    "rent": ["rent", "landlord", "apartment"],
    "utilities": ["electricity", "water bill", "internet", "wifi", "phone bill"],
    "entertainment": ["netflix", "spotify", "movie", "theatre", "concert"],
    "health": ["clinic", "hospital", "pharmacy", "drugstore"],
    "shopping": ["amazon", "flipkart", "mall", "store", "shop"],
    "others": []
}

def preprocess_transactions(df):
    # Try to detect common column names and standardize
    df = df.copy()
    # Lowercase columns
    df.columns = [c.strip().lower() for c in df.columns]
    # Heuristics for date, amount, description columns
    date_col = None
    amount_col = None
    desc_col = None
    for c in df.columns:
        if "date" in c:
            date_col = c
        if c in ("amount", "amt", "transaction amount", "value"):
            amount_col = c
        if any(x in c for x in ("description","narration","details","remarks","merchant","particulars")):
            desc_col = c
    # fallback guesses
    if date_col is None:
        for c in df.columns:
            if df[c].dtype == "object" and df[c].str.contains("-").any():
                date_col = c
                break
    if amount_col is None:
        for c in df.columns:
            if np.issubdtype(df[c].dtype, np.number):
                amount_col = c
                break
    if desc_col is None:
        for c in df.columns:
            if df[c].dtype == "object":
                desc_col = c
                break
    # Rename to standard
    if date_col:
        df["date"] = pd.to_datetime(df[date_col], errors="coerce")
    else:
        df["date"] = pd.NaT
    if amount_col:
        df["amount"] = pd.to_numeric(df[amount_col], errors="coerce")
    else:
        df["amount"] = 0.0
    if desc_col:
        df["description"] = df[desc_col].astype(str)
    else:
        df["description"] = ""
    # Basic cleaning
    df = df[["date","amount","description"]]
    df = df.dropna(subset=["amount"])
    # Fill missing dates with forward fill
    df["date"] = df["date"].fillna(method="ffill")
    # Assign categories using rules
    df["category"] = df["description"].apply(rule_based_category)
    # For any uncategorized, use a simple text clustering
    uncategorized = df["category"].isin(["others"])
    if uncategorized.any():
        df.loc[uncategorized, "category"] = cluster_categorize(df.loc[uncategorized, "description"])
    # Normalize amounts (assume negative for expenses sometimes)
    df["amount"] = df["amount"].astype(float)
    return df

def rule_based_category(text):
    t = text.lower()
    for cat, tokens in COMMON_CATEGORIES.items():
        for tk in tokens:
            if tk in t:
                return cat
    # check for debit/credit language
    if any(x in t for x in ("salary","credit","deposit")):
        return "salary"
    if any(x in t for x in ("payment","bill","charge")):
        return "utilities"
    return "others"

def cluster_categorize(text_series):
    txts = text_series.fillna("").astype(str).values
    if len(txts) == 0:
        return ["others"] * len(txts)
    vect = TfidfVectorizer(ngram_range=(1,2), max_features=500)
    X = vect.fit_transform(txts)
    n_clusters = min(4, max(1, len(txts)//5))
    k = KMeans(n_clusters=n_clusters, random_state=42)
    labels = k.fit_predict(X)
    # Map clusters to generic names
    mapping = {i: f"cluster_{i}" for i in range(n_clusters)}
    return [mapping[l] for l in labels]

def summarize_transactions(df):
    df = df.copy()
    df["month"] = df["date"].dt.to_period("M").astype(str)
    total_spent = df[df["amount"]<0]["amount"].abs().sum() if (df["amount"]<0).any() else df[df["amount"]>0]["amount"].sum()
    # We'll show per-month summary
    monthly = df.groupby("month")["amount"].sum().reset_index()
    cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount")
    text = []
    text.append(f"Total transactions: {len(df)}")
    text.append(f"Total amount (signed): {df['amount'].sum():.2f}")
    text.append("Monthly totals (month -> amount):")
    for _, row in monthly.iterrows():
        text.append(f"  {row['month']} -> {row['amount']:.2f}")
    text.append("Category breakdown (category -> amount):")
    for _, row in cat.iterrows():
        text.append(f"  {row['category']} -> {row['amount']:.2f}")
    return "\\n".join(text)

def category_summary(df):
    cat = df.groupby("category")["amount"].sum().reset_index().sort_values("amount", ascending=False)
    return cat

def predict_expenses(df):
    # Simple linear regression per category using month index
    df = df.copy()
    df["month"] = df["date"].dt.to_period("M")
    df["month_idx"] = (df["date"].dt.year - df["date"].dt.year.min())*12 + df["date"].dt.month
    preds = {}
    for cat, g in df.groupby("category"):
        monthly = g.groupby("month_idx")["amount"].sum().reset_index()
        if len(monthly) < 2:
            preds[cat] = monthly["amount"].sum()
            continue
        X = monthly[["month_idx"]].values
        y = monthly["amount"].values
        model = LinearRegression()
        try:
            model.fit(X, y)
            next_month = np.array([[monthly["month_idx"].max() + 1]])
            preds[cat] = float(model.predict(next_month)[0])
        except Exception:
            preds[cat] = float(y.mean())
    return preds