alm7640 commited on
Commit
2045ab3
Β·
0 Parent(s):

Initial commit

Browse files
Files changed (7) hide show
  1. README.md +76 -0
  2. analyzer.py +349 -0
  3. app.py +533 -0
  4. llm.py +106 -0
  5. merchant_map.py +164 -0
  6. parser.py +382 -0
  7. requirements.txt +13 -0
README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 💳 Statement Analyzer
2
+
3
+ A Streamlit app that ingests credit card statements (PDF, CSV, XLS/XLSX, DOCX)
4
+ and surfaces spending intelligence you'd never catch manually.
5
+
6
+ ## Features
7
+
8
+ | Tab | What it does |
9
+ |---|---|
10
+ | 💰 Top 13 | Largest single purchases ranked by amount |
11
+ | 🔁 Recurring Charges | Monthly/weekly/quarterly charges with true annual cost |
12
+ | 📋 Possible Subscriptions | Small forgotten recurring charges |
13
+ | 📈 Year-over-Year | Spend changes across years (requires 2+ years) |
14
+ | 🔍 AI Insights | LLM-powered narrative analysis (BYOK) |
15
+
16
+ ## Setup
17
+
18
+ ### 1. Install dependencies
19
+ ```bash
20
+ pip install -r requirements.txt
21
+ ```
22
+
23
+ ### 2. Run locally
24
+ ```bash
25
+ streamlit run app.py
26
+ ```
27
+
28
+ ### 3. Open in browser
29
+ ```
30
+ http://localhost:8501
31
+ ```
32
+
33
+ ## Deploy to Streamlit Cloud (free)
34
+
35
+ 1. Push this folder to a GitHub repo
36
+ 2. Go to https://share.streamlit.io
37
+ 3. Connect your repo, set `app.py` as the main file
38
+ 4. Deploy — you get a shareable URL instantly
39
+
40
+ ## Privacy
41
+
42
+ - Files are processed **entirely in memory** — never written to disk or any server
43
+ - Your API key lives only in your browser session and is discarded when you close the tab
44
+ - The AI Insights tab sends only **aggregated data** (merchant names + totals) to the LLM provider — no account numbers, card numbers, or personal details
45
+
46
+ ## Supported Banks
47
+
48
+ Any bank that exports in PDF, CSV, or XLS format is supported. Tested against common
49
+ export formats from Chase, Bank of America, Citi, Capital One, American Express,
50
+ Wells Fargo, and Discover.
51
+
52
+ If your bank's export isn't parsing correctly, the CSV export format is the most
53
+ reliable — most banks offer this under "Download transactions" in their portal.
54
+
55
+ ## File Structure
56
+
57
+ ```
58
+ statement_analyzer/
59
+ ├── app.py Main Streamlit application
60
+ ├── parser.py File ingestion & normalization (PDF/CSV/XLS/DOCX)
61
+ ├── analyzer.py Rules engine (Top 13, Recurring, Subscriptions, YoY)
62
+ ├── llm.py Multi-provider AI calls (OpenAI / Gemini / Anthropic)
63
+ ├── merchant_map.py Merchant alias normalization dictionary
64
+ ├── requirements.txt
65
+ └── README.md
66
+ ```
67
+
68
+ ## Data Quality Tiers
69
+
70
+ | Data | Features Unlocked |
71
+ |---|---|
72
+ | 1 statement | Top 13 only |
73
+ | 2–5 months | + Possible subscriptions |
74
+ | 6–11 months | + Recurring charges |
75
+ | 12 months | + True annual cost view |
76
+ | 24+ months | + Year-over-Year analysis |
analyzer.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # analyzer.py
2
+ # Rules engine: Top 13, Recurring, Subscriptions, YoY, Data quality checks
3
+
4
+ import pandas as pd
5
+ import numpy as np
6
+ from datetime import datetime
7
+ from typing import TypedDict
8
+
9
+
10
+ # ─────────────────────────────────────────────────────────────────────────────
11
+ # Type hints
12
+ # ─────────────────────────────────────────────────────────────────────────────
13
+
14
class DataSummary(TypedDict):
    """High-level statistics describing the loaded transactions.

    Produced by ``get_data_summary`` and consumed by the UI and by
    ``build_llm_summary``.
    """

    total_transactions: int   # number of transaction rows
    total_spent: float        # sum of the "amount" column, rounded to 2 decimals
    date_range_start: str     # earliest date, formatted like "Jan 05, 2024"
    date_range_end: str       # latest date, same format
    months_covered: int       # distinct calendar months present in the data
    years_covered: list[int]  # sorted distinct calendar years
    has_yoy: bool             # 2+ distinct years
    has_full_year: bool       # 12+ months
23
+
24
+
25
+ # ─────────────────────────────────────────────────────────────────────────────
26
+ # Data summary
27
+ # ─────────────────────────────────────────────────────────────────────────────
28
+
29
def get_data_summary(df: pd.DataFrame) -> DataSummary:
    """Summarize the span and size of the transaction data.

    Args:
        df: Transactions with a datetime ``date`` column and a numeric
            ``amount`` column. Must be non-empty: the min/max dates are
            formatted with ``strftime``.

    Returns:
        A ``DataSummary`` mapping with row count, total spend, formatted date
        range, distinct months/years covered, and the feature flags derived
        from them.
    """
    dates = df["date"]
    years = sorted(dates.dt.year.unique().tolist())
    months_covered = int(dates.dt.to_period("M").nunique())

    # Plain Python float/int values keep the mapping in line with the
    # declared TypedDict field types (no NumPy scalars leaking out).
    summary: DataSummary = {
        "total_transactions": len(df),
        "total_spent": round(float(df["amount"].sum()), 2),
        "date_range_start": dates.min().strftime("%b %d, %Y"),
        "date_range_end": dates.max().strftime("%b %d, %Y"),
        "months_covered": months_covered,
        "years_covered": years,
        "has_yoy": len(years) >= 2,
        "has_full_year": months_covered >= 12,
    }
    return summary
42
+
43
+
44
+ # ─────────────────────────────────────────────────────────────────────────────
45
+ # Top 13 most expensive single purchases
46
+ # ─────────────────────────────────────────────────────────────────────────────
47
+
48
def get_top_13(df: pd.DataFrame) -> pd.DataFrame:
    """Return the 13 largest single transactions by amount.

    Recurring merchants are NOT removed; they are flagged through the
    ``is_recurring`` column so the caller can mark them in the display.

    Args:
        df: Transactions with ``date``, ``merchant``, ``amount`` and
            ``source_file`` columns.

    Returns:
        Up to 13 rows indexed from 1 (rank), with columns ``date_fmt``,
        ``merchant``, ``amount_fmt``, ``amount``, ``is_recurring`` and
        ``source_file``.
    """
    # Merchants detected as recurring, used only to flag rows below.
    recurring = _detect_recurring_merchants(df)
    recurring_names = set() if recurring.empty else set(recurring["merchant"].tolist())

    result = (
        df.sort_values("amount", ascending=False)
        .head(13)
        .reset_index(drop=True)
    )
    result.index += 1  # 1-based rank for display
    result["is_recurring"] = result["merchant"].isin(recurring_names)
    result["date_fmt"] = result["date"].dt.strftime("%b %d, %Y")
    result["amount_fmt"] = result["amount"].apply(lambda x: f"${x:,.2f}")

    columns = ["date_fmt", "merchant", "amount_fmt", "amount", "is_recurring", "source_file"]
    return result[columns]
68
+
69
+
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+ # Recurring charge detection (internal helper)
72
+ # ─────────────────────────────────────────────────────────────────────────────
73
+
74
+ def _detect_recurring_merchants(df: pd.DataFrame, min_occurrences: int = 3) -> pd.DataFrame:
75
+ """
76
+ Core recurring detection. A merchant is recurring if it appears
77
+ at least min_occurrences times AND the median gap between charges
78
+ is 25–35 days (monthly) or 6–8 days (weekly) or 88–95 days (quarterly).
79
+ """
80
+ if df.empty:
81
+ return pd.DataFrame()
82
+
83
+ results = []
84
+ grouped = df.groupby("merchant")
85
+
86
+ for merchant, group in grouped:
87
+ group = group.sort_values("date")
88
+ if len(group) < min_occurrences:
89
+ continue
90
+
91
+ dates = group["date"].tolist()
92
+ gaps = [(dates[i+1] - dates[i]).days for i in range(len(dates)-1)]
93
+ if not gaps:
94
+ continue
95
+
96
+ median_gap = np.median(gaps)
97
+ avg_amount = group["amount"].mean()
98
+ amounts = group["amount"].tolist()
99
+
100
+ # Classify frequency
101
+ if 25 <= median_gap <= 35:
102
+ frequency = "Monthly"
103
+ periods_per_year = 12
104
+ elif 6 <= median_gap <= 8:
105
+ frequency = "Weekly"
106
+ periods_per_year = 52
107
+ elif 88 <= median_gap <= 95:
108
+ frequency = "Quarterly"
109
+ periods_per_year = 4
110
+ elif 355 <= median_gap <= 375:
111
+ frequency = "Annual"
112
+ periods_per_year = 1
113
+ elif 13 <= median_gap <= 17:
114
+ frequency = "Bi-Weekly"
115
+ periods_per_year = 26
116
+ else:
117
+ continue # Irregular β€” skip
118
+
119
+ annual_cost = avg_amount * periods_per_year
120
+ amount_variance = np.std(amounts)
121
+ amount_consistent = amount_variance < (avg_amount * 0.1) # <10% variation
122
+
123
+ results.append({
124
+ "merchant": merchant,
125
+ "frequency": frequency,
126
+ "avg_charge": round(avg_amount, 2),
127
+ "annual_cost": round(annual_cost, 2),
128
+ "occurrences": len(group),
129
+ "amount_consistent": amount_consistent,
130
+ "first_seen": group["date"].min(),
131
+ "last_seen": group["date"].max(),
132
+ "amounts": amounts,
133
+ })
134
+
135
+ if not results:
136
+ return pd.DataFrame()
137
+
138
+ result_df = pd.DataFrame(results)
139
+ result_df = result_df.sort_values("annual_cost", ascending=False).reset_index(drop=True)
140
+ result_df.index += 1
141
+ return result_df
142
+
143
+
144
+ # ─────────────────────────────────────────────────────────────────────────────
145
+ # Recurring charges (public β€” for Recurring tab)
146
+ # ─────────────────────────────────────────────────────────────────────────────
147
+
148
def get_recurring_charges(df: pd.DataFrame) -> pd.DataFrame:
    """Return recurring charges with their annualized cost.

    Merchants that look like subscriptions (average charge of $30 or less
    AND consistent amounts) are excluded here; they are reported by
    ``get_possible_subscriptions`` instead.

    Args:
        df: Transactions with ``merchant``, ``date`` and ``amount`` columns.

    Returns:
        The recurring-merchant rows (minimum 3 occurrences), indexed from 1,
        plus display columns ``avg_charge_fmt``, ``annual_cost_fmt``,
        ``first_seen_fmt`` and ``last_seen_fmt``. An empty ``DataFrame`` when
        no recurring merchant is detected.
    """
    rec = _detect_recurring_merchants(df, min_occurrences=3)
    if rec.empty:
        return pd.DataFrame()

    # Exclude those that look like subscriptions (small + consistent).
    looks_like_subscription = (rec["avg_charge"] <= 30) & (rec["amount_consistent"])
    # Renumber after filtering so the displayed rank has no gaps, matching
    # what get_possible_subscriptions does.
    rec = rec[~looks_like_subscription].reset_index(drop=True)
    rec.index += 1

    rec["avg_charge_fmt"] = rec["avg_charge"].apply(lambda x: f"${x:,.2f}")
    rec["annual_cost_fmt"] = rec["annual_cost"].apply(lambda x: f"${x:,.2f}")
    rec["first_seen_fmt"] = rec["first_seen"].dt.strftime("%b %Y")
    rec["last_seen_fmt"] = rec["last_seen"].dt.strftime("%b %Y")
    return rec
166
+
167
+
168
+ # ─────────────────────────────────────────────────────────────────────────────
169
+ # Possible subscriptions
170
+ # ─────────────────────────────────────────────────────────────────────────────
171
+
172
def get_possible_subscriptions(df: pd.DataFrame) -> pd.DataFrame:
    """Return small (≤$30), highly consistent recurring charges.

    These are the "set it and forget it" subscriptions people often forget.

    Args:
        df: Transactions with ``merchant``, ``date`` and ``amount`` columns.

    Returns:
        Recurring-merchant rows (minimum 2 occurrences) whose average charge
        is at most $30 and whose amounts are consistent, sorted by
        ``forgettability`` descending and indexed from 1, plus display
        columns ``avg_charge_fmt``, ``annual_cost_fmt`` and
        ``first_seen_fmt``. An empty ``DataFrame`` when nothing qualifies.
    """
    rec = _detect_recurring_merchants(df, min_occurrences=2)
    if rec.empty:
        return pd.DataFrame()

    # Keep only small + consistent charges.
    small_and_steady = (rec["avg_charge"] <= 30) & (rec["amount_consistent"])
    subs = rec[small_and_steady].copy()
    if subs.empty:
        return pd.DataFrame()

    # Forgettability score: lower charge + more occurrences = more forgettable.
    subs["forgettability"] = (subs["occurrences"] / subs["avg_charge"]).round(2)
    subs = subs.sort_values("forgettability", ascending=False).reset_index(drop=True)
    subs.index += 1  # 1-based rank for display

    subs["avg_charge_fmt"] = subs["avg_charge"].apply(lambda x: f"${x:,.2f}")
    subs["annual_cost_fmt"] = subs["annual_cost"].apply(lambda x: f"${x:,.2f}")
    subs["first_seen_fmt"] = subs["first_seen"].dt.strftime("%b %Y")
    return subs
197
+
198
+
199
+ # ─────────────────────────────────────────────────────────────────────────────
200
+ # Year-over-Year changes
201
+ # ─────────────────────────────────────────────────────────────────────────────
202
+
203
def get_yoy_changes(df: pd.DataFrame) -> pd.DataFrame:
    """Compare total spend per merchant between consecutive calendar years.

    Each pair of adjacent years present in the data is compared. A merchant
    is reported for a pair only when it has positive spend in both years, at
    least one of the two totals is $10 or more, and the change is at least
    5% or at least $25 in absolute terms.

    Args:
        df: Transactions with ``merchant``, ``date`` (datetime) and
            ``amount`` columns.

    Returns:
        Rows sorted by ``delta`` descending (biggest increases first),
        indexed from 1, with columns ``merchant``, ``year_a``, ``year_b``,
        ``amount_a``, ``amount_b``, ``delta``, ``pct_change``, ``direction``
        and the display columns ``amount_a_fmt``, ``amount_b_fmt``,
        ``delta_fmt``, ``pct_fmt``. An empty ``DataFrame`` when fewer than
        two years are present or nothing qualifies.
    """
    years = sorted(df["date"].dt.year.unique())
    if len(years) < 2:
        return pd.DataFrame()

    # Annual totals per merchant: rows = merchant, columns = year.
    totals = (
        df.assign(year=df["date"].dt.year)
        .groupby(["merchant", "year"])["amount"]
        .sum()
        .unstack(fill_value=0)
    )

    results = []
    for yr_a, yr_b in zip(years, years[1:]):
        if yr_a not in totals.columns or yr_b not in totals.columns:
            continue
        for merchant, amt_a, amt_b in zip(totals.index, totals[yr_a], totals[yr_b]):
            # Skip if either year is zero (new/dropped merchant).
            if amt_a <= 0 or amt_b <= 0:
                continue
            # Skip very small amounts.
            if amt_a < 10 and amt_b < 10:
                continue

            delta = amt_b - amt_a
            pct_change = (delta / amt_a) * 100

            # Only flag meaningful changes (≥5% or ≥$25).
            if abs(pct_change) < 5 and abs(delta) < 25:
                continue

            results.append({
                "merchant": merchant,
                "year_a": int(yr_a),
                "year_b": int(yr_b),
                "amount_a": round(amt_a, 2),
                "amount_b": round(amt_b, 2),
                "delta": round(delta, 2),
                "pct_change": round(pct_change, 1),
                "direction": "↑ Increase" if delta > 0 else "↓ Decrease",
            })

    if not results:
        return pd.DataFrame()

    # Sort: biggest increases first, then decreases.
    result_df = (
        pd.DataFrame(results)
        .sort_values("delta", ascending=False)
        .reset_index(drop=True)
    )
    result_df.index += 1  # 1-based rank for display

    result_df["amount_a_fmt"] = result_df["amount_a"].apply(lambda x: f"${x:,.2f}")
    result_df["amount_b_fmt"] = result_df["amount_b"].apply(lambda x: f"${x:,.2f}")
    result_df["delta_fmt"] = result_df["delta"].apply(
        lambda x: f"+${x:,.2f}" if x > 0 else f"-${abs(x):,.2f}"
    )
    result_df["pct_fmt"] = result_df["pct_change"].apply(
        lambda x: f"+{x:.1f}%" if x > 0 else f"{x:.1f}%"
    )
    return result_df
268
+
269
+
270
+ # ─────────────────────────────────────────────────────────────────────────────
271
+ # Build LLM summary payload
272
+ # ─────────────────────────────────────────────────────────────────────────────
273
+
274
def build_llm_summary(
    df: pd.DataFrame,
    summary: DataSummary,
    top13: pd.DataFrame,
    recurring: pd.DataFrame,
    subscriptions: pd.DataFrame,
    yoy: pd.DataFrame,
) -> str:
    """Build the plain-text summary that is sent to the LLM.

    The text is assembled from the analysis tables plus monthly and
    per-merchant totals rather than dumping every raw transaction row.

    Args:
        df: Transactions with ``date``, ``merchant`` and ``amount`` columns.
        summary: Output of ``get_data_summary``.
        top13: Output of ``get_top_13`` (uses ``date_fmt``, ``merchant``,
            ``amount_fmt``).
        recurring: Output of ``get_recurring_charges``; may be empty.
        subscriptions: Output of ``get_possible_subscriptions``; may be empty.
        yoy: Output of ``get_yoy_changes``; when empty the whole
            year-over-year section (header included) is omitted.

    Returns:
        The newline-joined report.
    """
    lines = [
        "=== CREDIT CARD STATEMENT ANALYSIS DATA ===",
        f"Date range: {summary['date_range_start']} to {summary['date_range_end']}",
        f"Total transactions: {summary['total_transactions']}",
        f"Total spent: ${summary['total_spent']:,.2f}",
        f"Months covered: {summary['months_covered']}",
        f"Years: {', '.join(str(y) for y in summary['years_covered'])}",
        "",
    ]

    lines.append("--- TOP 13 LARGEST SINGLE PURCHASES ---")
    if not top13.empty:
        lines.extend(
            f" {row['date_fmt']} {row['merchant']} {row['amount_fmt']}"
            for _, row in top13.iterrows()
        )
    lines.append("")

    lines.append("--- RECURRING CHARGES (ANNUALIZED) ---")
    if not recurring.empty:
        lines.extend(
            f" {row['merchant']} {row['frequency']} "
            f"avg {row['avg_charge_fmt']}/period "
            f"annual est. {row['annual_cost_fmt']}"
            for _, row in recurring.iterrows()
        )
    lines.append("")

    lines.append("--- POSSIBLE FORGOTTEN SUBSCRIPTIONS ---")
    if not subscriptions.empty:
        lines.extend(
            f" {row['merchant']} {row['frequency']} "
            f"{row['avg_charge_fmt']}/period "
            f"since {row['first_seen_fmt']}"
            for _, row in subscriptions.iterrows()
        )
    lines.append("")

    if not yoy.empty:
        lines.append("--- YEAR-OVER-YEAR CHANGES ---")
        lines.extend(
            f" {row['merchant']} {row['year_a']}→{row['year_b']} "
            f"{row['amount_a_fmt']}→{row['amount_b_fmt']} "
            f"({row['pct_fmt']}, {row['delta_fmt']})"
            for _, row in yoy.iterrows()
        )
        lines.append("")

    # Monthly totals for context.
    monthly = df.groupby(df["date"].dt.to_period("M"))["amount"].sum()
    lines.append("--- MONTHLY SPEND TOTALS ---")
    lines.extend(f" {period}: ${total:,.2f}" for period, total in monthly.items())
    lines.append("")

    # Top 20 merchants by total spend.
    lines.append("--- TOP MERCHANTS BY TOTAL SPEND ---")
    top_merchants = (
        df.groupby("merchant")["amount"]
        .sum()
        .sort_values(ascending=False)
        .head(20)
    )
    lines.extend(f" {merchant}: ${total:,.2f}" for merchant, total in top_merchants.items())

    return "\n".join(lines)
app.py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py β€” Statement Analyzer
2
+ # Multi-provider credit card statement intelligence tool
3
+
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import sys
7
+ import os
8
+
9
+ sys.path.insert(0, os.path.dirname(__file__))
10
+
11
+ from parser import combine_files
12
+ from analyzer import (
13
+ get_data_summary,
14
+ get_top_13,
15
+ get_recurring_charges,
16
+ get_possible_subscriptions,
17
+ get_yoy_changes,
18
+ build_llm_summary,
19
+ )
20
+ from llm import get_ai_insights
21
+
22
+ # ─────────────────────────────────────────────────────────────────────────────
23
+ # Page config
24
+ # ─────────────────────────────────────────────────────────────────────────────
25
+ st.set_page_config(
26
+ page_title="Statement Analyzer",
27
+ page_icon="πŸ’³",
28
+ layout="wide",
29
+ initial_sidebar_state="expanded",
30
+ )
31
+
32
+ # ─────────────────────────────────────────────────────────────────────────────
33
+ # CSS
34
+ # ─────────────────────────────────────────────────────────────────────────────
35
+ st.markdown("""
36
+ <style>
37
+ .main-header { text-align: center; padding: 1.2rem 0 0.25rem; }
38
+ .main-header h1 { font-size: 2rem; font-weight: 600; }
39
+ .tagline {
40
+ text-align: center; color: #6b7280;
41
+ font-size: 0.9rem; margin-bottom: 0.5rem;
42
+ }
43
+ .privacy-badge {
44
+ background: #f0fdf4; border: 1px solid #bbf7d0;
45
+ border-radius: 8px; padding: 0.5rem 0.85rem;
46
+ font-size: 0.8rem; color: #166534; margin-bottom: 0.75rem;
47
+ }
48
+ .data-quality-banner {
49
+ border-radius: 8px; padding: 0.75rem 1rem;
50
+ font-size: 0.85rem; margin-bottom: 1rem;
51
+ }
52
+ .stat-row {
53
+ display: flex; gap: 12px; flex-wrap: wrap;
54
+ margin-bottom: 1.25rem;
55
+ }
56
+ .stat-card {
57
+ background: #f9fafb; border: 1px solid #e5e7eb;
58
+ border-radius: 10px; padding: 0.75rem 1rem;
59
+ flex: 1; min-width: 130px; text-align: center;
60
+ }
61
+ .stat-label { font-size: 0.75rem; color: #9ca3af; margin-bottom: 2px; }
62
+ .stat-value { font-size: 1.3rem; font-weight: 600; color: #111827; }
63
+ .increase-row { color: #dc2626; }
64
+ .decrease-row { color: #16a34a; }
65
+ .section-note {
66
+ font-size: 0.8rem; color: #9ca3af;
67
+ font-style: italic; margin-bottom: 0.5rem;
68
+ }
69
+ .footer {
70
+ text-align: center; margin-top: 2rem;
71
+ padding-top: 1rem; border-top: 1px solid #e5e7eb;
72
+ color: #9ca3af; font-size: 0.78rem;
73
+ }
74
+ /* Streamlit table tweaks */
75
+ [data-testid="stDataFrame"] { border-radius: 8px; }
76
+ </style>
77
+ """, unsafe_allow_html=True)
78
+
79
+ # ─────────────────────────────────────────────────────────────────────────────
80
+ # Session state
81
+ # ─────────────────────────────────────────────────────────────────────────────
82
+ for key in ["df", "summary", "top13", "recurring", "subscriptions", "yoy",
83
+ "llm_summary_text", "ai_result"]:
84
+ if key not in st.session_state:
85
+ st.session_state[key] = None
86
+
87
+ # ─────────────────────────────────────────────────────────────────────────────
88
+ # Header
89
+ # ─────────────────────────────────────────────────────────────────────────────
90
+ st.markdown("""
91
+ <div class="main-header"><h1>πŸ’³ Statement Analyzer</h1></div>
92
+ <div class="tagline">
93
+ Upload your credit card statements and uncover what your spending is really telling you.<br>
94
+ <strong>Your statements never leave your session β€” processed in memory, never stored.</strong>
95
+ </div>
96
+ """, unsafe_allow_html=True)
97
+
98
+ # ─────────────────────────────────────────────────────────────────────────────
99
+ # Sidebar
100
+ # ──────────────────────────────────────────────────────────────────��──────────
101
+ with st.sidebar:
102
+ st.markdown("## βš™οΈ AI Provider")
103
+ st.markdown('<div class="section-note">Required only for the AI Insights tab</div>',
104
+ unsafe_allow_html=True)
105
+
106
+ provider = st.selectbox(
107
+ "Provider",
108
+ ["OpenAI (GPT-4o)", "Google Gemini", "Anthropic Claude"],
109
+ label_visibility="collapsed",
110
+ )
111
+ provider_hints = {
112
+ "OpenAI (GPT-4o)": "platform.openai.com",
113
+ "Google Gemini": "aistudio.google.com",
114
+ "Anthropic Claude": "console.anthropic.com",
115
+ }
116
+ api_key = st.text_input(
117
+ "API Key",
118
+ type="password",
119
+ placeholder="Paste your key here...",
120
+ help=f"Get your key at {provider_hints[provider]}",
121
+ )
122
+ if api_key:
123
+ st.markdown(
124
+ '<div class="privacy-badge">πŸ”’ Key used only this session. Never stored or shared.</div>',
125
+ unsafe_allow_html=True,
126
+ )
127
+
128
+ st.markdown("---")
129
+ st.markdown("### πŸ“– Tips")
130
+ st.markdown("""
131
+ - Upload **1 year minimum** for recurring detection
132
+ - Upload **2+ years** to unlock Year-over-Year changes
133
+ - Supported: **PDF, CSV, XLS, XLSX, DOCX**
134
+ - Upload multiple files at once β€” one per month is fine
135
+ - Most banks offer CSV export in their online portal
136
+ """)
137
+ st.markdown("---")
138
+ st.markdown(
139
+ '<div class="footer">Made with ❀️ for people who actually want to know where their money goes.</div>',
140
+ unsafe_allow_html=True,
141
+ )
142
+
143
+ # ─────────────────────────────────────────────────────────────────────────────
144
+ # Upload zone
145
+ # ─────────────────────────────────────────────────────────────────────────────
146
+ st.markdown("### πŸ“‚ Upload Your Statements")
147
+
148
+ col_upload, col_tip = st.columns([2, 1])
149
+ with col_upload:
150
+ uploaded_files = st.file_uploader(
151
+ "Drop files here or click to browse",
152
+ type=["pdf", "csv", "xls", "xlsx", "docx"],
153
+ accept_multiple_files=True,
154
+ label_visibility="collapsed",
155
+ )
156
+
157
+ with col_tip:
158
+ st.info(
159
+ "**Better results with more data**\n\n"
160
+ "🟑 1 statement β€” basic insights only\n\n"
161
+ "🟠 6 months β€” recurring detection\n\n"
162
+ "🟒 12 months β€” full annual cost view\n\n"
163
+ "πŸ”΅ 24+ months β€” Year-over-Year unlocked"
164
+ )
165
+
166
+ analyze_btn = st.button(
167
+ "πŸ” Analyze Statements",
168
+ type="primary",
169
+ use_container_width=False,
170
+ disabled=not uploaded_files,
171
+ )
172
+
173
+ # ─────────────────────────────────────────────────────────────────────────────
174
+ # Run analysis
175
+ # ─────────────────────────────────────────────────────────────────────────────
176
+ if analyze_btn and uploaded_files:
177
+ with st.spinner("Parsing files and running analysis..."):
178
+ df, parse_warnings = combine_files(uploaded_files)
179
+
180
+ if df.empty:
181
+ st.error(
182
+ "Could not extract any transactions from the uploaded files. "
183
+ "Please check the file formats and try again."
184
+ )
185
+ st.stop()
186
+
187
+ summary = get_data_summary(df)
188
+ top13 = get_top_13(df)
189
+ recurring = get_recurring_charges(df)
190
+ subscriptions = get_possible_subscriptions(df)
191
+ yoy = get_yoy_changes(df)
192
+ llm_summary_text = build_llm_summary(df, summary, top13, recurring, subscriptions, yoy)
193
+
194
+ # Persist to session
195
+ st.session_state.df = df
196
+ st.session_state.summary = summary
197
+ st.session_state.top13 = top13
198
+ st.session_state.recurring = recurring
199
+ st.session_state.subscriptions = subscriptions
200
+ st.session_state.yoy = yoy
201
+ st.session_state.llm_summary_text = llm_summary_text
202
+ st.session_state.ai_result = None # reset on re-analyze
203
+ st.session_state.parse_warnings = parse_warnings
204
+
205
+ # ─────────────────────────────────────────────────────────────────────────────
206
+ # Results
207
+ # ─────────────────────────────────────────────────────────────────────────────
208
+ if st.session_state.df is not None:
209
+ summary = st.session_state.summary
210
+ df = st.session_state.df
211
+ parse_warnings = st.session_state.get("parse_warnings", [])
212
+
213
+ # Parse warnings
214
+ for w in parse_warnings:
215
+ st.warning(w)
216
+
217
+ # Data quality banner
218
+ months = summary["months_covered"]
219
+ has_yoy = summary["has_yoy"]
220
+ years = summary["years_covered"]
221
+
222
+ if months < 6:
223
+ quality_color = "#fef3c7"
224
+ quality_border = "#f59e0b"
225
+ quality_msg = (
226
+ f"πŸ“Š **{months} month(s)** of data detected. "
227
+ "Upload at least 6 months for recurring charge detection and 12+ for full annual cost analysis."
228
+ )
229
+ elif months < 12:
230
+ quality_color = "#fff7ed"
231
+ quality_border = "#f97316"
232
+ quality_msg = (
233
+ f"πŸ“Š **{months} months** of data detected ({', '.join(str(y) for y in years)}). "
234
+ "Upload 12+ months to see true annual costs. Upload 2+ years to unlock Year-over-Year."
235
+ )
236
+ elif not has_yoy:
237
+ quality_color = "#eff6ff"
238
+ quality_border = "#3b82f6"
239
+ quality_msg = (
240
+ f"πŸ“Š **{months} months** of data detected. "
241
+ "Great for annual analysis! Upload statements from another year to unlock Year-over-Year comparison."
242
+ )
243
+ else:
244
+ quality_color = "#f0fdf4"
245
+ quality_border = "#22c55e"
246
+ quality_msg = (
247
+ f"βœ… **{months} months across {len(years)} years** β€” full analysis unlocked including Year-over-Year!"
248
+ )
249
+
250
+ st.markdown(
251
+ f'<div class="data-quality-banner" style="background:{quality_color};border-left:4px solid {quality_border};">'
252
+ f"{quality_msg}</div>",
253
+ unsafe_allow_html=True,
254
+ )
255
+
256
+ # Summary stat cards
257
+ st.markdown(
258
+ f"""
259
+ <div class="stat-row">
260
+ <div class="stat-card">
261
+ <div class="stat-label">Total Spent</div>
262
+ <div class="stat-value">${summary['total_spent']:,.0f}</div>
263
+ </div>
264
+ <div class="stat-card">
265
+ <div class="stat-label">Transactions</div>
266
+ <div class="stat-value">{summary['total_transactions']:,}</div>
267
+ </div>
268
+ <div class="stat-card">
269
+ <div class="stat-label">Date Range</div>
270
+ <div class="stat-value" style="font-size:0.85rem;">{summary['date_range_start']}<br>β†’ {summary['date_range_end']}</div>
271
+ </div>
272
+ <div class="stat-card">
273
+ <div class="stat-label">Months</div>
274
+ <div class="stat-value">{summary['months_covered']}</div>
275
+ </div>
276
+ <div class="stat-card">
277
+ <div class="stat-label">Avg/Month</div>
278
+ <div class="stat-value">${summary['total_spent']/max(summary['months_covered'],1):,.0f}</div>
279
+ </div>
280
+ </div>
281
+ """,
282
+ unsafe_allow_html=True,
283
+ )
284
+
285
+ st.markdown("---")
286
+
287
+ # ── Tabs ──────────────────────────────────────────────────────────────
288
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
289
+ "πŸ’° Top 13",
290
+ "πŸ” Recurring Charges",
291
+ "πŸ“‹ Possible Subscriptions",
292
+ "πŸ“ˆ Year-over-Year",
293
+ "πŸ” AI Insights",
294
+ ])
295
+
296
+ # ── Tab 1: Top 13 ─────────────────────────────────────────────────────
297
+ with tab1:
298
+ st.markdown("#### πŸ’° Top 13 Most Expensive Single Purchases")
299
+ st.markdown(
300
+ '<div class="section-note">Ranked by transaction amount. '
301
+ "Charges marked πŸ” also appear as recurring charges.</div>",
302
+ unsafe_allow_html=True,
303
+ )
304
+
305
+ top13 = st.session_state.top13
306
+ if top13.empty:
307
+ st.info("No transactions found.")
308
+ else:
309
+ display = top13.copy()
310
+ display["merchant"] = display.apply(
311
+ lambda r: f"πŸ” {r['merchant']}" if r["is_recurring"] else r["merchant"],
312
+ axis=1,
313
+ )
314
+ st.dataframe(
315
+ display[["date_fmt", "merchant", "amount_fmt", "source_file"]].rename(columns={
316
+ "date_fmt": "Date",
317
+ "merchant": "Merchant",
318
+ "amount_fmt": "Amount",
319
+ "source_file": "Statement File",
320
+ }),
321
+ use_container_width=True,
322
+ hide_index=False,
323
+ )
324
+ total_top13 = top13["amount"].sum()
325
+ pct = (total_top13 / summary["total_spent"] * 100) if summary["total_spent"] > 0 else 0
326
+ st.markdown(
327
+ f"**Top 13 total: ${total_top13:,.2f}** β€” "
328
+ f"that's **{pct:.1f}%** of all spending in this period."
329
+ )
330
+
331
+ # ── Tab 2: Recurring ──────────────────────────────────────────────────
332
+ with tab2:
333
+ st.markdown("#### πŸ” Recurring Charges β€” True Annual Cost")
334
+ st.markdown(
335
+ '<div class="section-note">'
336
+ "These charges appear on a regular schedule. The annual cost column shows what you're "
337
+ "actually paying per year β€” a number most people have never seen laid out clearly."
338
+ "</div>",
339
+ unsafe_allow_html=True,
340
+ )
341
+
342
+ recurring = st.session_state.recurring
343
+ if months < 3:
344
+ st.warning("Upload at least 3 months of statements to detect recurring charges.")
345
+ elif recurring is None or recurring.empty:
346
+ st.info("No recurring charges detected in the uploaded statements.")
347
+ else:
348
+ st.dataframe(
349
+ recurring[["merchant", "frequency", "avg_charge_fmt",
350
+ "annual_cost_fmt", "occurrences",
351
+ "first_seen_fmt", "last_seen_fmt"]].rename(columns={
352
+ "merchant": "Merchant",
353
+ "frequency": "Frequency",
354
+ "avg_charge_fmt": "Avg Charge",
355
+ "annual_cost_fmt": "Est. Annual Cost",
356
+ "occurrences": "Times Seen",
357
+ "first_seen_fmt": "First Seen",
358
+ "last_seen_fmt": "Last Seen",
359
+ }),
360
+ use_container_width=True,
361
+ hide_index=False,
362
+ )
363
+ total_recurring_annual = recurring["annual_cost"].sum()
364
+ st.markdown(
365
+ f"**Estimated total annual cost of recurring charges: "
366
+ f"${total_recurring_annual:,.2f}**"
367
+ )
368
+
369
+ # ── Tab 3: Subscriptions ──────────────────────────────────────────────
370
+ with tab3:
371
+ st.markdown("#### πŸ“‹ Possible Forgotten Subscriptions")
372
+ st.markdown(
373
+ '<div class="section-note">'
374
+ "Small, consistent charges that are easy to forget about. "
375
+ "Sorted by 'forgettability' β€” the ones most likely to be autopilot spending. "
376
+ "Could you cancel any of these?"
377
+ "</div>",
378
+ unsafe_allow_html=True,
379
+ )
380
+
381
+ subscriptions = st.session_state.subscriptions
382
+ if months < 2:
383
+ st.warning("Upload at least 2 months of statements to detect subscriptions.")
384
+ elif subscriptions is None or subscriptions.empty:
385
+ st.info("No small recurring subscriptions detected.")
386
+ else:
387
+ st.dataframe(
388
+ subscriptions[["merchant", "frequency", "avg_charge_fmt",
389
+ "annual_cost_fmt", "occurrences", "first_seen_fmt"]].rename(columns={
390
+ "merchant": "Merchant",
391
+ "frequency": "Frequency",
392
+ "avg_charge_fmt": "Per Period",
393
+ "annual_cost_fmt": "Per Year",
394
+ "occurrences": "Times Seen",
395
+ "first_seen_fmt": "Paying Since",
396
+ }),
397
+ use_container_width=True,
398
+ hide_index=False,
399
+ )
400
+ total_sub_annual = subscriptions["annual_cost"].sum()
401
+ st.markdown(
402
+ f"**Total possible subscription spend: ${total_sub_annual:,.2f}/year** β€” "
403
+ f"that's **${total_sub_annual/12:,.2f}/month** in charges you might not be thinking about."
404
+ )
405
+
406
+ # ── Tab 4: Year-over-Year ─────────────────────────────────────────────
407
+ with tab4:
408
+ st.markdown("#### πŸ“ˆ Year-over-Year Spending Changes")
409
+
410
+ yoy = st.session_state.yoy
411
+ if not has_yoy:
412
+ st.info(
413
+ "πŸ“… Year-over-Year analysis requires at least 2 years of statements.\n\n"
414
+ f"Currently loaded: **{', '.join(str(y) for y in years)}**.\n\n"
415
+ "Upload statements from an additional year to unlock this tab."
416
+ )
417
+ elif yoy is None or yoy.empty:
418
+ st.info("No significant year-over-year changes found in the data.")
419
+ else:
420
+ increases = yoy[yoy["delta"] > 0]
421
+ decreases = yoy[yoy["delta"] < 0]
422
+
423
+ if not increases.empty:
424
+ st.markdown("##### ↑ Charges That Increased")
425
+ st.markdown(
426
+ '<div class="section-note">These cost you more this year than last year.</div>',
427
+ unsafe_allow_html=True,
428
+ )
429
+ inc_display = increases[["merchant", "year_a", "year_b",
430
+ "amount_a_fmt", "amount_b_fmt",
431
+ "delta_fmt", "pct_fmt"]].rename(columns={
432
+ "merchant": "Merchant",
433
+ "year_a": "Year A",
434
+ "year_b": "Year B",
435
+ "amount_a_fmt": "Spent (A)",
436
+ "amount_b_fmt": "Spent (B)",
437
+ "delta_fmt": "Change ($)",
438
+ "pct_fmt": "Change (%)",
439
+ })
440
+ st.dataframe(inc_display, use_container_width=True, hide_index=False)
441
+
442
+ if not decreases.empty:
443
+ st.markdown("##### ↓ Charges That Decreased")
444
+ st.markdown(
445
+ '<div class="section-note">You spent less here β€” cancellations, negotiated rates, or reduced usage.</div>',
446
+ unsafe_allow_html=True,
447
+ )
448
+ dec_display = decreases[["merchant", "year_a", "year_b",
449
+ "amount_a_fmt", "amount_b_fmt",
450
+ "delta_fmt", "pct_fmt"]].rename(columns={
451
+ "merchant": "Merchant",
452
+ "year_a": "Year A",
453
+ "year_b": "Year B",
454
+ "amount_a_fmt": "Spent (A)",
455
+ "amount_b_fmt": "Spent (B)",
456
+ "delta_fmt": "Change ($)",
457
+ "pct_fmt": "Change (%)",
458
+ })
459
+ st.dataframe(dec_display, use_container_width=True, hide_index=False)
460
+
461
+ # ── Tab 5: AI Insights ────────────────────────────────────────────────
462
+ with tab5:
463
+ st.markdown("#### πŸ” AI Insights")
464
+ st.markdown(
465
+ '<div class="section-note">'
466
+ "The AI analyzes your aggregated spending data β€” not your raw transactions. "
467
+ "Merchant names and totals are shared with the AI provider you select; "
468
+ "no account numbers, card numbers, or personal details are ever sent."
469
+ "</div>",
470
+ unsafe_allow_html=True,
471
+ )
472
+
473
+ if not api_key:
474
+ st.warning(
475
+ "Enter your API key in the sidebar to use AI Insights. "
476
+ "Choose any provider β€” OpenAI, Gemini, or Anthropic Claude."
477
+ )
478
+ else:
479
+ depth = st.radio(
480
+ "Analysis depth",
481
+ ["Summary bullets", "Deep narrative analysis"],
482
+ horizontal=True,
483
+ help="Deep analysis uses more tokens (~3-5x the cost of summary).",
484
+ )
485
+
486
+ run_ai_btn = st.button(
487
+ f"✨ Run AI Analysis ({provider})",
488
+ type="secondary",
489
+ )
490
+
491
+ if run_ai_btn:
492
+ with st.spinner(f"Analyzing with {provider}..."):
493
+ result = get_ai_insights(
494
+ data_summary=st.session_state.llm_summary_text,
495
+ provider=provider,
496
+ api_key=api_key,
497
+ depth=depth,
498
+ )
499
+ st.session_state.ai_result = result
500
+
501
+ if st.session_state.ai_result:
502
+ st.markdown(st.session_state.ai_result)
503
+
504
+ st.download_button(
505
+ label="⬇️ Download AI Analysis",
506
+ data=st.session_state.ai_result,
507
+ file_name="statement_ai_insights.txt",
508
+ mime="text/plain",
509
+ )
510
+
511
+ # ── Download full analysis ─────────────────────────────────────────────
512
+ st.markdown("---")
513
+ st.download_button(
514
+ label="⬇️ Download Full Analysis Data (text)",
515
+ data=st.session_state.llm_summary_text,
516
+ file_name="statement_analysis_summary.txt",
517
+ mime="text/plain",
518
+ )
519
+
520
+ else:
521
+ # Landing state
522
+ st.markdown(
523
+ """
524
+ <div style="text-align:center; padding: 3rem 1rem; color: #9ca3af;">
525
+ <div style="font-size: 3rem; margin-bottom: 1rem;">πŸ’³</div>
526
+ <div style="font-size: 1rem;">
527
+ Upload your credit card statements above and click <strong>Analyze</strong>.<br>
528
+ Supports PDF, CSV, XLS, XLSX, and DOCX from any bank.
529
+ </div>
530
+ </div>
531
+ """,
532
+ unsafe_allow_html=True,
533
+ )
llm.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm.py
2
+ # Multi-provider LLM calls for AI Insights tab
3
+ # Supports OpenAI (GPT-4o), Google Gemini, Anthropic Claude
4
+
5
+ from typing import Literal
6
+
7
# Instruction preambles for the LLM, keyed by the analysis-depth label.
# build_prompt() looks the label up here and falls back to "Summary bullets"
# for any label it does not recognise.
# The bullet and dash characters are real Unicode ("•", "—"); they were
# previously stored as mis-decoded byte sequences, which would have been sent
# to the model verbatim.
DEPTH_PROMPTS = {
    "Summary bullets": """
You are a personal finance analyst reviewing a year or more of credit card statements.
Based on the data provided, give a concise bullet-point analysis covering:

• 3-5 standout spending patterns or anomalies
• Any suspicious or duplicate-looking charges
• Quick wins — subscriptions or recurring charges the user could cancel
• One overall financial habit observation

Keep it brief and scannable. Use plain language, no jargon.
""",
    "Deep narrative analysis": """
You are an expert personal finance analyst reviewing a year or more of credit card statements.
Based on the data provided, write a thorough narrative analysis covering:

1. **Spending Personality** — What do these statements reveal about this person's lifestyle and habits?
2. **Anomalies & Red Flags** — Any duplicate charges, unusual timing, or charges that don't fit the pattern?
3. **Subscription Audit** — Evaluate all recurring and subscription charges. Which ones seem worth it? Which seem forgotten or wasteful?
4. **Year-over-Year Trends** — What's growing? What's declining? Is spending trending in a healthy or concerning direction?
5. **Category Analysis** — Where is the bulk of money going? Is it balanced?
6. **Missed Savings Opportunities** — Specific charges where better options likely exist (e.g. switching providers, bundling services)
7. **Action Items** — A prioritized list of 5 concrete things this person should do after reading this analysis

Be specific, reference actual merchants and amounts from the data. Write for a smart adult who wants honest, direct insight.
""",
}
34
+
35
+
36
def build_prompt(data_summary: str, depth: str) -> str:
    """Assemble the complete prompt text for the requested analysis depth.

    The instruction preamble is looked up in ``DEPTH_PROMPTS``; a ``depth``
    that is not a key there falls back to the "Summary bullets" preamble.
    The preamble is followed by a fixed lead-in sentence and then the
    spending data.
    """
    fallback = DEPTH_PROMPTS["Summary bullets"]
    preamble = DEPTH_PROMPTS.get(depth, fallback)
    return f"{preamble}\n\nHere is the spending data to analyze:\n\n{data_summary}\n"
44
+
45
+
46
def call_openai(prompt: str, api_key: str) -> str:
    """Send ``prompt`` to the OpenAI ``gpt-4o`` chat model and return its reply.

    Every exception raised along the way (including a failed import of the
    ``openai`` package) is caught and returned as an error string rather
    than propagated.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert personal finance analyst. Be direct, specific, and helpful.",
    }
    user_message = {"role": "user", "content": prompt}
    try:
        # Imported lazily so an import failure is reported like any other error.
        from openai import OpenAI

        completion = OpenAI(api_key=api_key).chat.completions.create(
            model="gpt-4o",
            messages=[system_message, user_message],
            max_tokens=2000,
            temperature=0.4,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        return f"❌ OpenAI error: {exc!s}"
65
+
66
+
67
def call_gemini(prompt: str, api_key: str) -> str:
    """Send ``prompt`` to the ``gemini-1.5-pro`` model and return its text reply.

    Every exception (including a failed import of ``google.generativeai``)
    is caught and returned as an error string rather than propagated.
    """
    try:
        # Imported lazily so an import failure is reported like any other error.
        import google.generativeai as genai

        genai.configure(api_key=api_key)
        gemini = genai.GenerativeModel("gemini-1.5-pro")
        return gemini.generate_content(prompt).text
    except Exception as exc:
        return f"❌ Gemini error: {exc!s}"
76
+
77
+
78
def call_anthropic(prompt: str, api_key: str) -> str:
    """Send ``prompt`` to the Anthropic messages API and return the reply text.

    Every exception (including a failed import of ``anthropic``) is caught
    and returned as an error string rather than propagated.
    """
    request = {
        "model": "claude-sonnet-4-20250514",
        "max_tokens": 2000,
        "system": "You are an expert personal finance analyst. Be direct, specific, and helpful.",
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        # Imported lazily so an import failure is reported like any other error.
        import anthropic

        reply = anthropic.Anthropic(api_key=api_key).messages.create(**request)
        return reply.content[0].text
    except Exception as exc:
        return f"❌ Anthropic error: {exc!s}"
91
+
92
+
93
def get_ai_insights(
    data_summary: str,
    provider: str,
    api_key: str,
    depth: str = "Summary bullets",
) -> str:
    """Build the analysis prompt and route it to the selected LLM provider.

    ``provider`` must be one of the exact labels "OpenAI (GPT-4o)",
    "Google Gemini" or "Anthropic Claude"; any other value yields the
    string "Unknown provider selected." The prompt is built before the
    provider is checked.
    """
    prompt = build_prompt(data_summary, depth)
    backends = {
        "OpenAI (GPT-4o)": call_openai,
        "Google Gemini": call_gemini,
        "Anthropic Claude": call_anthropic,
    }
    backend = backends.get(provider)
    if backend is None:
        return "Unknown provider selected."
    return backend(prompt, api_key)
merchant_map.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # merchant_map.py
2
+ # Normalizes raw merchant strings to clean display names
3
+ # Format: "fragment_to_match_lowercase": "Clean Name"
4
+
5
+ MERCHANT_ALIASES = {
6
+ # Amazon
7
+ "amzn": "Amazon",
8
+ "amazon": "Amazon",
9
+ "amz*": "Amazon",
10
+ # Streaming
11
+ "netflix": "Netflix",
12
+ "nflx": "Netflix",
13
+ "spotify": "Spotify",
14
+ "hulu": "Hulu",
15
+ "disney": "Disney+",
16
+ "disneyplus": "Disney+",
17
+ "hbo": "HBO Max",
18
+ "max.com": "HBO Max",
19
+ "peacock": "Peacock",
20
+ "paramount": "Paramount+",
21
+ "appletv": "Apple TV+",
22
+ "apple.com/bill": "Apple Services",
23
+ "apple services": "Apple Services",
24
+ "itunes": "Apple Services",
25
+ "youtube": "YouTube Premium",
26
+ "youtubepremium": "YouTube Premium",
27
+ # Food delivery
28
+ "doordash": "DoorDash",
29
+ "ubereats": "Uber Eats",
30
+ "grubhub": "Grubhub",
31
+ "seamless": "Seamless",
32
+ "instacart": "Instacart",
33
+ # Rideshare
34
+ "uber": "Uber",
35
+ "lyft": "Lyft",
36
+ # Grocery
37
+ "wholefds": "Whole Foods",
38
+ "whole foods": "Whole Foods",
39
+ "trader joe": "Trader Joe's",
40
+ "kroger": "Kroger",
41
+ "safeway": "Safeway",
42
+ "wegmans": "Wegmans",
43
+ "shoprite": "ShopRite",
44
+ "costco": "Costco",
45
+ "sams club": "Sam's Club",
46
+ "target": "Target",
47
+ "walmart": "Walmart",
48
+ # Fuel
49
+ "shell": "Shell",
50
+ "exxon": "ExxonMobil",
51
+ "mobil": "ExxonMobil",
52
+ "bp ": "BP",
53
+ "chevron": "Chevron",
54
+ "sunoco": "Sunoco",
55
+ "wawa": "Wawa",
56
+ "quick chek": "Quick Chek",
57
+ "quickchek": "Quick Chek",
58
+ # Coffee
59
+ "starbucks": "Starbucks",
60
+ "dunkin": "Dunkin'",
61
+ "dutch bros": "Dutch Bros",
62
+ "caribou": "Caribou Coffee",
63
+ # Fast food
64
+ "mcdonald": "McDonald's",
65
+ "mcdonalds": "McDonald's",
66
+ "chick-fil-a": "Chick-fil-A",
67
+ "chickfila": "Chick-fil-A",
68
+ "chipotle": "Chipotle",
69
+ "taco bell": "Taco Bell",
70
+ "tacobell": "Taco Bell",
71
+ "burger king": "Burger King",
72
+ "burgerking": "Burger King",
73
+ "wendy": "Wendy's",
74
+ "subway": "Subway",
75
+ "panera": "Panera Bread",
76
+ # Tech / Cloud
77
+ "google": "Google",
78
+ "microsoft": "Microsoft",
79
+ "msft": "Microsoft",
80
+ "adobe": "Adobe",
81
+ "dropbox": "Dropbox",
82
+ "github": "GitHub",
83
+ "openai": "OpenAI",
84
+ "chatgpt": "OpenAI",
85
+ "zoom": "Zoom",
86
+ "slack": "Slack",
87
+ "notion": "Notion",
88
+ "1password": "1Password",
89
+ "lastpass": "LastPass",
90
+ # Fitness
91
+ "planet fitness": "Planet Fitness",
92
+ "la fitness": "LA Fitness",
93
+ "lafitness": "LA Fitness",
94
+ "peloton": "Peloton",
95
+ "equinox": "Equinox",
96
+ "anytime fitness": "Anytime Fitness",
97
+ "ymca": "YMCA",
98
+ # Insurance
99
+ "geico": "GEICO",
100
+ "progressive": "Progressive",
101
+ "statefarm": "State Farm",
102
+ "state farm": "State Farm",
103
+ "allstate": "Allstate",
104
+ # Utilities/Telecom
105
+ "verizon": "Verizon",
106
+ "at&t": "AT&T",
107
+ "att ": "AT&T",
108
+ "t-mobile": "T-Mobile",
109
+ "tmobile": "T-Mobile",
110
+ "comcast": "Comcast/Xfinity",
111
+ "xfinity": "Comcast/Xfinity",
112
+ "spectrum": "Spectrum",
113
+ # Shopping
114
+ "etsy": "Etsy",
115
+ "ebay": "eBay",
116
+ "bestbuy": "Best Buy",
117
+ "best buy": "Best Buy",
118
+ "home depot": "Home Depot",
119
+ "homedepot": "Home Depot",
120
+ "lowes": "Lowe's",
121
+ "wayfair": "Wayfair",
122
+ "chewy": "Chewy",
123
+ # Travel
124
+ "airbnb": "Airbnb",
125
+ "vrbo": "VRBO",
126
+ "expedia": "Expedia",
127
+ "hotels.com": "Hotels.com",
128
+ "booking.com": "Booking.com",
129
+ "united air": "United Airlines",
130
+ "delta air": "Delta Airlines",
131
+ "american air": "American Airlines",
132
+ "southwest": "Southwest Airlines",
133
+ "jetblue": "JetBlue",
134
+ }
135
+
136
+
137
def normalize_merchant(raw: str, aliases: "dict[str, str] | None" = None) -> str:
    """
    Attempt to normalize a raw merchant string to a clean name.

    Args:
        raw: Merchant/description text as it appears on the statement.
        aliases: Optional mapping of lowercase fragment -> display name.
            Defaults to the module-level ``MERCHANT_ALIASES``.

    Returns:
        "Unknown" for an empty ``raw``; otherwise the display name of the
        best-matching alias, or a title-cased, trimmed version of ``raw``
        when no alias fragment occurs in it.
    """
    if not raw:
        return "Unknown"
    if aliases is None:
        aliases = MERCHANT_ALIASES

    lowered = raw.strip().lower()
    # Turn "*" / "#" noise into spaces and collapse runs of whitespace.
    cleaned = " ".join(lowered.replace("*", " ").replace("#", " ").split())
    # Fragments are tested against both forms so that keys which themselves
    # contain noise characters (e.g. "amz*") can still match.
    haystacks = (cleaned, lowered)

    matches = [
        fragment
        for fragment in aliases
        if any(fragment in text for text in haystacks)
    ]
    for fragment in matches:
        # Mapping order decides between unrelated matches, but a fragment that
        # is merely a piece of a longer matching fragment must not win:
        # "mobil" would otherwise claim "T-MOBILE" as ExxonMobil.
        shadowed = any(
            fragment != other and fragment in other for other in matches
        )
        if not shadowed:
            return aliases[fragment]

    # Fallback: title-case the raw string, trimming tokens that look like codes.
    kept = []
    for position, word in enumerate(raw.strip().split()):
        if word.isdigit():
            continue  # store / reference number
        # Short all-caps tokens after the first are likely state abbreviations
        # or noise; the leading token is kept because it is usually the brand
        # itself (e.g. "CVS").
        if position > 0 and len(word) <= 3 and word.isupper() and word.isalpha():
            continue
        kept.append(word)
    return " ".join(kept[:4]).title() if kept else raw.title()
parser.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser.py
2
+ # Handles ingestion of PDF, CSV, XLS/XLSX, DOCX statement files
3
+ # Normalizes all formats into a standard DataFrame schema:
4
+ # date (datetime), merchant (str), amount (float), raw_merchant (str), source_file (str)
5
+
6
+ import io
7
+ import re
8
+ import pandas as pd
9
+ from datetime import datetime
10
+ from typing import Optional
11
+ from merchant_map import normalize_merchant
12
+
13
+
14
+ # ─────────────────────────────────────────────────────────────────────────────
15
+ # Helpers
16
+ # ─────────────────────────────────────────────────────────────────────────────
17
+
18
+ def _clean_amount(val) -> Optional[float]:
19
+ """Convert various amount formats to a positive float charge, or None."""
20
+ if val is None:
21
+ return None
22
+ s = str(val).strip().replace(",", "").replace("$", "").replace(" ", "")
23
+ # Some banks use parentheses for debits: (123.45)
24
+ negative = False
25
+ if s.startswith("(") and s.endswith(")"):
26
+ s = s[1:-1]
27
+ negative = True
28
+ try:
29
+ amt = float(s)
30
+ except ValueError:
31
+ return None
32
+ # Some exports use negative for charges, positive for payments
33
+ # We want charges as positive β€” caller decides which sign convention
34
+ return abs(amt) if not negative else abs(amt)
35
+
36
+
37
+ def _looks_like_payment(merchant: str, amount: float, credit_flag=False) -> bool:
38
+ """Heuristic: is this row a payment/credit rather than a purchase?"""
39
+ if credit_flag:
40
+ return True
41
+ m = merchant.lower()
42
+ payment_keywords = [
43
+ "payment", "thank you", "autopay", "credit", "refund",
44
+ "return", "adjustment", "reward", "cashback", "cash back",
45
+ "transfer", "deposit", "interest charge", "fee waiver",
46
+ ]
47
+ return any(kw in m for kw in payment_keywords)
48
+
49
+
50
+ def _parse_date(val) -> Optional[datetime]:
51
+ """Try multiple date formats."""
52
+ if isinstance(val, datetime):
53
+ return val
54
+ if isinstance(val, pd.Timestamp):
55
+ return val.to_pydatetime()
56
+ s = str(val).strip()
57
+ formats = [
58
+ "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d-%b-%Y",
59
+ "%b %d, %Y", "%B %d, %Y", "%d/%m/%Y", "%m-%d-%Y",
60
+ "%Y%m%d",
61
+ ]
62
+ for fmt in formats:
63
+ try:
64
+ return datetime.strptime(s, fmt)
65
+ except ValueError:
66
+ continue
67
+ return None
68
+
69
+
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+ # Format-specific parsers
72
+ # ─────────────────────────────────────────────────────────────────────────────
73
+
74
+ def _parse_csv(file_bytes: bytes, filename: str) -> pd.DataFrame:
75
+ """Parse CSV bank exports. Handles many column name variants."""
76
+ try:
77
+ df = pd.read_csv(io.BytesIO(file_bytes), dtype=str, on_bad_lines="skip")
78
+ except Exception:
79
+ df = pd.read_csv(io.BytesIO(file_bytes), dtype=str, error_bad_lines=False)
80
+
81
+ df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
82
+
83
+ # Date column detection
84
+ date_candidates = ["date", "transaction_date", "trans_date", "post_date",
85
+ "posted_date", "activity_date", "transaction date"]
86
+ date_col = next((c for c in date_candidates if c in df.columns), None)
87
+ if not date_col:
88
+ date_col = next((c for c in df.columns if "date" in c), None)
89
+
90
+ # Merchant / description column detection
91
+ desc_candidates = ["description", "merchant", "payee", "name", "merchant_name",
92
+ "transaction_description", "memo", "details", "narrative"]
93
+ desc_col = next((c for c in desc_candidates if c in df.columns), None)
94
+ if not desc_col:
95
+ desc_col = next((c for c in df.columns if any(k in c for k in ["desc", "merch", "payee", "name"])), None)
96
+
97
+ # Amount column detection
98
+ amt_candidates = ["amount", "debit", "charge", "transaction_amount",
99
+ "debit_amount", "withdrawal", "charged_amount"]
100
+ amt_col = next((c for c in amt_candidates if c in df.columns), None)
101
+ if not amt_col:
102
+ amt_col = next((c for c in df.columns if "amount" in c or "debit" in c), None)
103
+
104
+ # Credit column (to detect payments)
105
+ credit_col = next((c for c in df.columns if "credit" in c), None)
106
+
107
+ if not all([date_col, desc_col, amt_col]):
108
+ return pd.DataFrame()
109
+
110
+ rows = []
111
+ for _, row in df.iterrows():
112
+ date = _parse_date(row.get(date_col, ""))
113
+ merchant_raw = str(row.get(desc_col, "")).strip()
114
+ amt = _clean_amount(row.get(amt_col, ""))
115
+ is_credit = credit_col and str(row.get(credit_col, "")).strip() not in ("", "0", "0.00", "nan")
116
+
117
+ if date is None or amt is None or amt <= 0:
118
+ continue
119
+ if _looks_like_payment(merchant_raw, amt, is_credit):
120
+ continue
121
+
122
+ rows.append({
123
+ "date": date,
124
+ "raw_merchant": merchant_raw,
125
+ "merchant": normalize_merchant(merchant_raw),
126
+ "amount": amt,
127
+ "source_file": filename,
128
+ })
129
+
130
+ return pd.DataFrame(rows)
131
+
132
+
133
+ def _parse_excel(file_bytes: bytes, filename: str) -> pd.DataFrame:
134
+ """Parse XLS/XLSX exports β€” tries each sheet."""
135
+ frames = []
136
+ try:
137
+ xl = pd.ExcelFile(io.BytesIO(file_bytes))
138
+ for sheet in xl.sheet_names:
139
+ try:
140
+ df = xl.parse(sheet, dtype=str)
141
+ df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
142
+ # Reuse CSV logic by converting to CSV bytes
143
+ csv_bytes = df.to_csv(index=False).encode()
144
+ parsed = _parse_csv(csv_bytes, filename)
145
+ if not parsed.empty:
146
+ frames.append(parsed)
147
+ except Exception:
148
+ continue
149
+ except Exception:
150
+ pass
151
+ return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
152
+
153
+
154
def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
    """
    Parse PDF credit card statements.

    Strategy 1: pdfplumber table extraction (structured).
    Strategy 2: raw text regex parsing, used only when Strategy 1 produced
    no rows.

    Args:
        file_bytes: Raw contents of the PDF file.
        filename: Name recorded in the ``source_file`` column of each row.

    Returns:
        A DataFrame with columns date, raw_merchant, merchant, amount and
        source_file, or an empty DataFrame when nothing could be extracted.
        Errors inside either strategy are swallowed (best effort).
    """
    import pdfplumber

    rows = []

    def _add_row(date_val, desc_val, amt_val) -> None:
        """Validate one candidate transaction and append it to ``rows``."""
        date = _parse_date(date_val) if date_val else None
        amt = _clean_amount(amt_val) if amt_val else None
        merchant_raw = str(desc_val).strip() if desc_val else ""

        # ``not amt > 0`` also rejects NaN, which ``amt <= 0`` would let through.
        if date is None or amt is None or not amt > 0 or not merchant_raw:
            return
        if _looks_like_payment(merchant_raw, amt):
            return
        rows.append({
            "date": date,
            "raw_merchant": merchant_raw,
            "merchant": normalize_merchant(merchant_raw),
            "amount": amt,
            "source_file": filename,
        })

    # -- Strategy 1: Table extraction ------------------------------------
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    if not table or len(table) < 2:
                        continue
                    # First table row is treated as the header row.
                    headers = [
                        str(h).strip().lower().replace(" ", "_") if h else ""
                        for h in table[0]
                    ]
                    for data_row in table[1:]:
                        if not data_row:
                            continue
                        row_dict = {
                            header: str(cell).strip() if cell else ""
                            for header, cell in zip(headers, data_row)
                        }
                        # Try to find date, merchant, amount in this row
                        date_val = next(
                            (v for k, v in row_dict.items() if "date" in k and v),
                            None,
                        )
                        desc_val = next(
                            (v for k, v in row_dict.items()
                             if any(x in k for x in ("desc", "merch", "payee", "name")) and v),
                            None,
                        )
                        amt_val = next(
                            (v for k, v in row_dict.items()
                             if any(x in k for x in ("amount", "debit", "charge")) and v),
                            None,
                        )

                        if not amt_val:
                            # Fall back to the last numeric-looking column.
                            for value in reversed(list(row_dict.values())):
                                stripped = (
                                    value.replace(",", "").replace("$", "")
                                    .replace("(", "").replace(")", "")
                                )
                                try:
                                    float(stripped)
                                except ValueError:
                                    continue
                                amt_val = value
                                break

                        if not desc_val:
                            # Use second column as fallback description
                            values = list(row_dict.values())
                            desc_val = values[1] if len(values) > 1 else ""

                        _add_row(date_val, desc_val, amt_val)
    except Exception:
        # Best effort: whatever was collected so far is kept; if nothing was,
        # the text fallback below gets its chance.
        pass

    # -- Strategy 2: Text regex fallback ---------------------------------
    if not rows:
        try:
            with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
                full_text = "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )

            # Pattern: date  description  amount
            # Covers formats like: 01/15/2024  STARBUCKS #1234  5.75
            # The description is matched lazily (4-51 characters, starting
            # with a letter, digits allowed) so it stops at the first
            # following amount. The previous quantifier "{3,50?}?" was not a
            # valid repeat and made the pattern look for a literal "{3,5".
            pattern = re.compile(
                r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
                r"([A-Za-z][^\n]{3,50}?)\s+"
                r"\$?([\d,]+\.\d{2})"
            )
            for match in pattern.finditer(full_text):
                date_str, desc, amt_str = match.groups()
                _add_row(date_str, desc, amt_str)
        except Exception:
            pass

    return pd.DataFrame(rows) if rows else pd.DataFrame()
257
+
258
+
259
def _parse_docx(file_bytes: bytes, filename: str) -> pd.DataFrame:
    """Parse DOCX: extract the text, then apply a date/description/amount regex.

    The document is handed to docx2txt as an in-memory stream, so the
    statement is not written to a temporary file on disk.

    Args:
        file_bytes: Raw contents of the DOCX file.
        filename: Name recorded in the ``source_file`` column of each row.

    Returns:
        A DataFrame with columns date, raw_merchant, merchant, amount and
        source_file, or an empty DataFrame when the text cannot be extracted
        or contains no matching lines.
    """
    import docx2txt

    try:
        # NOTE(review): relies on docx2txt opening its argument with
        # zipfile.ZipFile, which accepts file-like objects — confirm against
        # the installed docx2txt version.
        text = docx2txt.process(io.BytesIO(file_bytes))
    except Exception:
        return pd.DataFrame()

    # Pattern: date  description  amount, e.g. 01/15/2024  STARBUCKS #1234  5.75
    # The description is matched lazily (4-51 characters, starting with a
    # letter, digits allowed). The previous quantifier "{3,50?}?" was not a
    # valid repeat and made the pattern look for a literal "{3,5".
    pattern = re.compile(
        r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
        r"([A-Za-z][^\n]{3,50}?)\s+"
        r"\$?([\d,]+\.\d{2})"
    )

    rows = []
    for match in pattern.finditer(text or ""):
        date_str, desc, amt_str = match.groups()
        date = _parse_date(date_str)
        amt = _clean_amount(amt_str)
        merchant_raw = desc.strip()

        # ``not amt > 0`` also rejects NaN, which ``amt <= 0`` would let through.
        if date is None or amt is None or not amt > 0:
            continue
        if _looks_like_payment(merchant_raw, amt):
            continue

        rows.append({
            "date": date,
            "raw_merchant": merchant_raw,
            "merchant": normalize_merchant(merchant_raw),
            "amount": amt,
            "source_file": filename,
        })

    return pd.DataFrame(rows) if rows else pd.DataFrame()
301
+
302
+
303
+ # ─────────────────────────────────────────────────────────────────────────────
304
+ # Public entry point
305
+ # ─────────────────────────────────────────────────────────────────────────────
306
+
307
def parse_uploaded_file(uploaded_file) -> pd.DataFrame:
    """
    Turn one uploaded statement into a normalized transaction DataFrame.

    The object must expose ``.name`` and ``.read()`` (a Streamlit
    UploadedFile does). The parser is chosen from the file extension;
    unsupported extensions and files with no extractable rows yield an
    empty DataFrame. Otherwise the result has exactly the columns date,
    merchant, raw_merchant, amount, source_file, keeps only rows with a
    valid date and a positive amount, and is sorted by date.
    """
    name = uploaded_file.name
    payload = uploaded_file.read()
    suffix = name.lower().rsplit(".", 1)[-1]

    if suffix not in ("csv", "xls", "xlsx", "pdf", "docx"):
        return pd.DataFrame()

    if suffix == "csv":
        parser = _parse_csv
    elif suffix == "pdf":
        parser = _parse_pdf
    elif suffix == "docx":
        parser = _parse_docx
    else:
        parser = _parse_excel

    parsed = parser(payload, name)
    if parsed.empty:
        return parsed

    # Enforce schema and types
    schema = ["date", "merchant", "raw_merchant", "amount", "source_file"]
    normalized = parsed[schema].copy()
    normalized["date"] = pd.to_datetime(normalized["date"])
    normalized["amount"] = pd.to_numeric(normalized["amount"], errors="coerce")
    normalized = normalized.dropna(subset=["date", "amount"])
    positive_only = normalized[normalized["amount"] > 0]
    return positive_only.sort_values("date").reset_index(drop=True)
338
+
339
+
340
def combine_files(uploaded_files) -> tuple[pd.DataFrame, list[str]]:
    """
    Parse and combine multiple uploaded files.

    Rows duplicated across overlapping uploads are removed, while genuine
    repeats inside a single upload (e.g. two identical purchases on the same
    day) are kept. A warning is added for every file that yields no
    transactions and for calendar months with no transactions between the
    earliest and latest dates.

    Returns:
        (combined_df, list_of_warnings). ``combined_df`` is empty when no
        file produced any rows.
    """
    dedup_keys = ["date", "merchant", "amount"]
    frames = []
    warnings = []

    for f in uploaded_files:
        df = parse_uploaded_file(f)
        if df.empty:
            warnings.append(f"⚠️ Could not extract transactions from **{f.name}**. "
                            "Check that it's a valid statement export.")
            continue
        # Number repeated (date, merchant, amount) rows within this upload:
        # the 1st copy gets 0, the 2nd gets 1, ... Only rows with the same
        # number in different uploads are treated as duplicates below.
        frames.append(df.assign(_occurrence=df.groupby(dedup_keys).cumcount()))

    if not frames:
        return pd.DataFrame(), warnings

    combined = pd.concat(frames, ignore_index=True)

    # Deduplicate exact (date, merchant, amount) matches across uploads.
    combined = combined.drop_duplicates(
        subset=dedup_keys + ["_occurrence"], keep="first"
    ).drop(columns="_occurrence")
    combined = combined.sort_values("date").reset_index(drop=True)

    # Check for month gaps
    if not combined.empty:
        months = pd.period_range(
            start=combined["date"].min().to_period("M"),
            end=combined["date"].max().to_period("M"),
            freq="M",
        )
        covered = set(combined["date"].dt.to_period("M").unique())
        missing = [str(m) for m in months if m not in covered]
        if missing:
            warnings.append(
                f"📅 Possible gaps detected — no transactions found for: {', '.join(missing)}. "
                "Upload missing statements for more accurate analysis."
            )

    return combined, warnings
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.35.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ pdfplumber>=0.10.0
5
+ PyPDF2>=3.0.0
6
+ openpyxl>=3.1.0
7
+ xlrd>=2.0.1
8
+ docx2txt>=0.8
9
+ scikit-learn>=1.3.0
10
+ openai>=1.0.0
11
+ google-generativeai>=0.5.0
12
+ anthropic>=0.25.0
13
+ python-dotenv>=1.0.0