File size: 15,140 Bytes
2045ab3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# analyzer.py
# Rules engine: Top 13, Recurring, Subscriptions, YoY, Data quality checks

import pandas as pd
import numpy as np
from datetime import datetime
from typing import TypedDict


# ─────────────────────────────────────────────────────────────────────────────
# Type hints
# ─────────────────────────────────────────────────────────────────────────────

class DataSummary(TypedDict):
    total_transactions: int
    total_spent: float
    date_range_start: str
    date_range_end: str
    months_covered: int
    years_covered: list[int]
    has_yoy: bool          # 2+ distinct years
    has_full_year: bool    # 12+ months


# ─────────────────────────────────────────────────────────────────────────────
# Data summary
# ─────────────────────────────────────────────────────────────────────────────

def get_data_summary(df: pd.DataFrame) -> DataSummary:
    years = sorted(df["date"].dt.year.unique().tolist())
    months_covered = df["date"].dt.to_period("M").nunique()
    return DataSummary(
        total_transactions=len(df),
        total_spent=round(df["amount"].sum(), 2),
        date_range_start=df["date"].min().strftime("%b %d, %Y"),
        date_range_end=df["date"].max().strftime("%b %d, %Y"),
        months_covered=months_covered,
        years_covered=years,
        has_yoy=len(years) >= 2,
        has_full_year=months_covered >= 12,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Top 13 most expensive single purchases
# ─────────────────────────────────────────────────────────────────────────────

def get_top_13(df: pd.DataFrame) -> pd.DataFrame:
    """
    Top 13 single transactions by amount.
    Excludes recurring charges (those are shown separately).
    """
    # Get recurring merchants so we can flag them
    recurring = _detect_recurring_merchants(df)
    recurring_names = set(recurring["merchant"].tolist()) if not recurring.empty else set()

    result = (
        df.copy()
        .sort_values("amount", ascending=False)
        .head(13)
        .reset_index(drop=True)
    )
    result.index += 1
    result["is_recurring"] = result["merchant"].isin(recurring_names)
    result["date_fmt"] = result["date"].dt.strftime("%b %d, %Y")
    result["amount_fmt"] = result["amount"].apply(lambda x: f"${x:,.2f}")
    return result[["date_fmt", "merchant", "amount_fmt", "amount", "is_recurring", "source_file"]]


# ─────────────────────────────────────────────────────────────────────────────
# Recurring charge detection (internal helper)
# ─────────────────────────────────────────────────────────────────────────────

def _detect_recurring_merchants(df: pd.DataFrame, min_occurrences: int = 3) -> pd.DataFrame:
    """
    Core recurring detection. A merchant is recurring if it appears
    at least min_occurrences times AND the median gap between charges
    is 25–35 days (monthly) or 6–8 days (weekly) or 88–95 days (quarterly).
    """
    if df.empty:
        return pd.DataFrame()

    results = []
    grouped = df.groupby("merchant")

    for merchant, group in grouped:
        group = group.sort_values("date")
        if len(group) < min_occurrences:
            continue

        dates = group["date"].tolist()
        gaps = [(dates[i+1] - dates[i]).days for i in range(len(dates)-1)]
        if not gaps:
            continue

        median_gap = np.median(gaps)
        avg_amount = group["amount"].mean()
        amounts = group["amount"].tolist()

        # Classify frequency
        if 25 <= median_gap <= 35:
            frequency = "Monthly"
            periods_per_year = 12
        elif 6 <= median_gap <= 8:
            frequency = "Weekly"
            periods_per_year = 52
        elif 88 <= median_gap <= 95:
            frequency = "Quarterly"
            periods_per_year = 4
        elif 355 <= median_gap <= 375:
            frequency = "Annual"
            periods_per_year = 1
        elif 13 <= median_gap <= 17:
            frequency = "Bi-Weekly"
            periods_per_year = 26
        else:
            continue  # Irregular β€” skip

        annual_cost = avg_amount * periods_per_year
        amount_variance = np.std(amounts)
        amount_consistent = amount_variance < (avg_amount * 0.1)  # <10% variation

        results.append({
            "merchant": merchant,
            "frequency": frequency,
            "avg_charge": round(avg_amount, 2),
            "annual_cost": round(annual_cost, 2),
            "occurrences": len(group),
            "amount_consistent": amount_consistent,
            "first_seen": group["date"].min(),
            "last_seen": group["date"].max(),
            "amounts": amounts,
        })

    if not results:
        return pd.DataFrame()

    result_df = pd.DataFrame(results)
    result_df = result_df.sort_values("annual_cost", ascending=False).reset_index(drop=True)
    result_df.index += 1
    return result_df


# ─────────────────────────────────────────────────────────────────────────────
# Recurring charges (public β€” for Recurring tab)
# ─────────────────────────────────────────────────────────────────────────────

def get_recurring_charges(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns recurring charges with annualized cost.
    Excludes likely subscriptions (handled separately).
    """
    rec = _detect_recurring_merchants(df, min_occurrences=3)
    if rec.empty:
        return pd.DataFrame()

    # Exclude those that look like subscriptions (small + consistent)
    mask = ~((rec["avg_charge"] <= 30) & (rec["amount_consistent"]))
    rec = rec[mask].copy()

    rec["avg_charge_fmt"] = rec["avg_charge"].apply(lambda x: f"${x:,.2f}")
    rec["annual_cost_fmt"] = rec["annual_cost"].apply(lambda x: f"${x:,.2f}")
    rec["first_seen_fmt"] = rec["first_seen"].dt.strftime("%b %Y")
    rec["last_seen_fmt"] = rec["last_seen"].dt.strftime("%b %Y")
    return rec


# ─────────────────────────────────────────────────────────────────────────────
# Possible subscriptions
# ─────────────────────────────────────────────────────────────────────────────

def get_possible_subscriptions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Small (≀$30), highly consistent recurring charges.
    These are the 'set it and forget it' subscriptions people often forget.
    """
    rec = _detect_recurring_merchants(df, min_occurrences=2)
    if rec.empty:
        return pd.DataFrame()

    # Keep only small + consistent charges
    mask = (rec["avg_charge"] <= 30) & (rec["amount_consistent"])
    subs = rec[mask].copy()

    if subs.empty:
        return pd.DataFrame()

    # Forgettability score: lower charge + more occurrences = more forgettable
    subs["forgettability"] = (subs["occurrences"] / subs["avg_charge"]).round(2)
    subs = subs.sort_values("forgettability", ascending=False).reset_index(drop=True)
    subs.index += 1

    subs["avg_charge_fmt"] = subs["avg_charge"].apply(lambda x: f"${x:,.2f}")
    subs["annual_cost_fmt"] = subs["annual_cost"].apply(lambda x: f"${x:,.2f}")
    subs["first_seen_fmt"] = subs["first_seen"].dt.strftime("%b %Y")
    return subs


# ─────────────────────────────────────────────────────────────────────────────
# Year-over-Year changes
# ─────────────────────────────────────────────────────────────────────────────

def get_yoy_changes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compare total annual spend per merchant across years.
    Returns merchants with notable increases or decreases.
    Requires at least 2 years of data.
    """
    years = sorted(df["date"].dt.year.unique())
    if len(years) < 2:
        return pd.DataFrame()

    # Annual totals per merchant
    df2 = df.copy()
    df2["year"] = df2["date"].dt.year
    pivot = df2.groupby(["merchant", "year"])["amount"].sum().unstack(fill_value=0)

    results = []
    year_pairs = list(zip(years[:-1], years[1:]))

    for (yr_a, yr_b) in year_pairs:
        if yr_a not in pivot.columns or yr_b not in pivot.columns:
            continue
        for merchant in pivot.index:
            amt_a = pivot.loc[merchant, yr_a]
            amt_b = pivot.loc[merchant, yr_b]

            # Skip if either year is zero (new/dropped merchant)
            if amt_a <= 0 or amt_b <= 0:
                continue
            # Skip very small amounts
            if amt_a < 10 and amt_b < 10:
                continue

            delta = amt_b - amt_a
            pct_change = (delta / amt_a) * 100

            # Only flag meaningful changes (β‰₯5% or β‰₯$25)
            if abs(pct_change) >= 5 or abs(delta) >= 25:
                results.append({
                    "merchant": merchant,
                    "year_a": yr_a,
                    "year_b": yr_b,
                    "amount_a": round(amt_a, 2),
                    "amount_b": round(amt_b, 2),
                    "delta": round(delta, 2),
                    "pct_change": round(pct_change, 1),
                    "direction": "↑ Increase" if delta > 0 else "↓ Decrease",
                })

    if not results:
        return pd.DataFrame()

    result_df = pd.DataFrame(results)
    # Sort: biggest increases first, then decreases
    result_df = result_df.sort_values("delta", ascending=False).reset_index(drop=True)
    result_df.index += 1

    result_df["amount_a_fmt"] = result_df["amount_a"].apply(lambda x: f"${x:,.2f}")
    result_df["amount_b_fmt"] = result_df["amount_b"].apply(lambda x: f"${x:,.2f}")
    result_df["delta_fmt"] = result_df["delta"].apply(
        lambda x: f"+${x:,.2f}" if x > 0 else f"-${abs(x):,.2f}"
    )
    result_df["pct_fmt"] = result_df["pct_change"].apply(
        lambda x: f"+{x:.1f}%" if x > 0 else f"{x:.1f}%"
    )
    return result_df


# ─────────────────────────────────────────────────────────────────────────────
# Build LLM summary payload
# ─────────────────────────────────────────────────────────────────────────────

def build_llm_summary(
    df: pd.DataFrame,
    summary: DataSummary,
    top13: pd.DataFrame,
    recurring: pd.DataFrame,
    subscriptions: pd.DataFrame,
    yoy: pd.DataFrame,
) -> str:
    """
    Build a concise text summary to send to the LLM.
    We send aggregated data, NOT raw transactions, for privacy.
    """
    lines = []
    lines.append("=== CREDIT CARD STATEMENT ANALYSIS DATA ===")
    lines.append(f"Date range: {summary['date_range_start']} to {summary['date_range_end']}")
    lines.append(f"Total transactions: {summary['total_transactions']}")
    lines.append(f"Total spent: ${summary['total_spent']:,.2f}")
    lines.append(f"Months covered: {summary['months_covered']}")
    lines.append(f"Years: {', '.join(str(y) for y in summary['years_covered'])}")
    lines.append("")

    lines.append("--- TOP 13 LARGEST SINGLE PURCHASES ---")
    if not top13.empty:
        for _, row in top13.iterrows():
            lines.append(f"  {row['date_fmt']}  {row['merchant']}  {row['amount_fmt']}")
    lines.append("")

    lines.append("--- RECURRING CHARGES (ANNUALIZED) ---")
    if not recurring.empty:
        for _, row in recurring.iterrows():
            lines.append(
                f"  {row['merchant']}  {row['frequency']}  "
                f"avg {row['avg_charge_fmt']}/period  "
                f"annual est. {row['annual_cost_fmt']}"
            )
    lines.append("")

    lines.append("--- POSSIBLE FORGOTTEN SUBSCRIPTIONS ---")
    if not subscriptions.empty:
        for _, row in subscriptions.iterrows():
            lines.append(
                f"  {row['merchant']}  {row['frequency']}  "
                f"{row['avg_charge_fmt']}/period  "
                f"since {row['first_seen_fmt']}"
            )
    lines.append("")

    if not yoy.empty:
        lines.append("--- YEAR-OVER-YEAR CHANGES ---")
        for _, row in yoy.iterrows():
            lines.append(
                f"  {row['merchant']}  {row['year_a']}β†’{row['year_b']}  "
                f"{row['amount_a_fmt']}β†’{row['amount_b_fmt']}  "
                f"({row['pct_fmt']}, {row['delta_fmt']})"
            )
        lines.append("")

    # Monthly totals for context
    monthly = df.groupby(df["date"].dt.to_period("M"))["amount"].sum()
    lines.append("--- MONTHLY SPEND TOTALS ---")
    for period, total in monthly.items():
        lines.append(f"  {period}: ${total:,.2f}")
    lines.append("")

    # Category-level summary (merchant frequency)
    lines.append("--- TOP MERCHANTS BY TOTAL SPEND ---")
    top_merchants = (
        df.groupby("merchant")["amount"]
        .sum()
        .sort_values(ascending=False)
        .head(20)
    )
    for merchant, total in top_merchants.items():
        lines.append(f"  {merchant}: ${total:,.2f}")

    return "\n".join(lines)