Spaces:

alm7640
/

CC_Smash

Sleeping

App Files Files Community

alm7640 committed on Mar 13

Commit

2045ab3

0 Parent(s):

Initial commit

Browse files

Files changed (7) hide show

README.md +76 -0
analyzer.py +349 -0
app.py +533 -0
llm.py +106 -0
merchant_map.py +164 -0
parser.py +382 -0
requirements.txt +13 -0

README.md ADDED Viewed

	@@ -0,0 +1,76 @@

+# 💳 Statement Analyzer
+A Streamlit app that ingests credit card statements (PDF, CSV, XLS/XLSX, DOCX)
+and surfaces spending intelligence you'd never catch manually.
+## Features
+| Tab | What it does |
+|---|---|
+| 💰 Top 13 | Largest single purchases ranked by amount |
+| 🔁 Recurring Charges | Monthly/weekly/quarterly charges with true annual cost |
+| 📋 Possible Subscriptions | Small forgotten recurring charges |
+| 📈 Year-over-Year | Spend changes across years (requires 2+ years) |
+| 🔍 AI Insights | LLM-powered narrative analysis (BYOK) |
+## Setup
+### 1. Install dependencies
+```bash
+pip install -r requirements.txt
+```
+### 2. Run locally
+```bash
+streamlit run app.py
+```
+### 3. Open in browser
+```
+http://localhost:8501
+```
+## Deploy to Streamlit Cloud (free)
+1. Push this folder to a GitHub repo
+2. Go to https://share.streamlit.io
+3. Connect your repo, set `app.py` as the main file
+4. Deploy — you get a shareable URL instantly
+## Privacy
+- Files are processed **entirely in memory** for the duration of your session — they are never written to disk or stored on the server
+- Your API key lives only in your browser session and is discarded when you close the tab
+- The AI Insights tab sends only a **summary** to the LLM provider — the date, merchant and amount of the 13 largest purchases, plus per-merchant and per-month totals — never account numbers, card numbers, or personal details
+## Supported Banks
+Any bank that exports in PDF, CSV, XLS/XLSX, or DOCX format is supported. Tested against common
+export formats from Chase, Bank of America, Citi, Capital One, American Express,
+Wells Fargo, and Discover.
+If your bank's export isn't parsing correctly, the CSV export format is the most
+reliable — most banks offer this under "Download transactions" in their portal.
+## File Structure
+```
+statement_analyzer/
+├── app.py              Main Streamlit application
+├── parser.py           File ingestion & normalization (PDF/CSV/XLS/DOCX)
+├── analyzer.py         Rules engine (Top 13, Recurring, Subscriptions, YoY)
+├── llm.py              Multi-provider AI calls (OpenAI / Gemini / Anthropic)
+├── merchant_map.py     Merchant alias normalization dictionary
+├── requirements.txt
+└── README.md
+```
+## Data Quality Tiers
+| Data | Features Unlocked |
+|---|---|
+| 1 statement | Top 13 only |
+| 2–5 months | + Possible subscriptions |
+| 6–11 months | + Recurring charges |
+| 12 months | + True annual cost view |
+| 24+ months | + Year-over-Year analysis |

analyzer.py ADDED Viewed

	@@ -0,0 +1,349 @@

+# analyzer.py
+# Rules engine: Top 13, Recurring, Subscriptions, YoY, Data quality checks
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from typing import TypedDict
+# ─────────────────────────────────────────────────────────────────────────────
+# Type hints
+# ─────────────────────────────────────────────────────────────────────────────
+class DataSummary(TypedDict):
+    """High-level statistics for one analyzed set of transactions, as built by get_data_summary()."""
+    total_transactions: int  # number of rows in the transaction frame
+    total_spent: float  # sum of the "amount" column, rounded to 2 decimals
+    date_range_start: str  # earliest transaction date, formatted like "Jan 05, 2024"
+    date_range_end: str  # latest transaction date, same format
+    months_covered: int  # count of distinct calendar months (year + month) that have data
+    years_covered: list[int]  # distinct calendar years, ascending
+    has_yoy: bool          # 2+ distinct years
+    has_full_year: bool    # 12+ months
+# ─────────────────────────────────────────────────────────────────────────────
+# Data summary
+# ─────────────────────────────────────────────────────────────────────────────
+def get_data_summary(df: pd.DataFrame) -> DataSummary:
+    years = sorted(df["date"].dt.year.unique().tolist())
+    months_covered = df["date"].dt.to_period("M").nunique()
+    return DataSummary(
+        total_transactions=len(df),
+        total_spent=round(df["amount"].sum(), 2),
+        date_range_start=df["date"].min().strftime("%b %d, %Y"),
+        date_range_end=df["date"].max().strftime("%b %d, %Y"),
+        months_covered=months_covered,
+        years_covered=years,
+        has_yoy=len(years) >= 2,
+        has_full_year=months_covered >= 12,
+    )
+# ─────────────────────────────────────────────────────────────────────────────
+# Top 13 most expensive single purchases
+# ─────────────────────────────────────────────────────────────────────────────
+def get_top_13(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Top 13 single transactions by amount.
+    Excludes recurring charges (those are shown separately).
+    """
+    # Get recurring merchants so we can flag them
+    recurring = _detect_recurring_merchants(df)
+    recurring_names = set(recurring["merchant"].tolist()) if not recurring.empty else set()
+    result = (
+        df.copy()
+        .sort_values("amount", ascending=False)
+        .head(13)
+        .reset_index(drop=True)
+    )
+    result.index += 1
+    result["is_recurring"] = result["merchant"].isin(recurring_names)
+    result["date_fmt"] = result["date"].dt.strftime("%b %d, %Y")
+    result["amount_fmt"] = result["amount"].apply(lambda x: f"${x:,.2f}")
+    return result[["date_fmt", "merchant", "amount_fmt", "amount", "is_recurring", "source_file"]]
+# ─────────────────────────────────────────────────────────────────────────────
+# Recurring charge detection (internal helper)
+# ─────────────────────────────────────────────────────────────────────────────
+def _detect_recurring_merchants(df: pd.DataFrame, min_occurrences: int = 3) -> pd.DataFrame:
+    """
+    Core recurring detection. A merchant is recurring if it appears
+    at least min_occurrences times AND the median gap between charges
+    is 25–35 days (monthly) or 6–8 days (weekly) or 88–95 days (quarterly).
+    """
+    if df.empty:
+        return pd.DataFrame()
+    results = []
+    grouped = df.groupby("merchant")
+    for merchant, group in grouped:
+        group = group.sort_values("date")
+        if len(group) < min_occurrences:
+            continue
+        dates = group["date"].tolist()
+        gaps = [(dates[i+1] - dates[i]).days for i in range(len(dates)-1)]
+        if not gaps:
+            continue
+        median_gap = np.median(gaps)
+        avg_amount = group["amount"].mean()
+        amounts = group["amount"].tolist()
+        # Classify frequency
+        if 25 <= median_gap <= 35:
+            frequency = "Monthly"
+            periods_per_year = 12
+        elif 6 <= median_gap <= 8:
+            frequency = "Weekly"
+            periods_per_year = 52
+        elif 88 <= median_gap <= 95:
+            frequency = "Quarterly"
+            periods_per_year = 4
+        elif 355 <= median_gap <= 375:
+            frequency = "Annual"
+            periods_per_year = 1
+        elif 13 <= median_gap <= 17:
+            frequency = "Bi-Weekly"
+            periods_per_year = 26
+        else:
+            continue  # Irregular — skip
+        annual_cost = avg_amount * periods_per_year
+        amount_variance = np.std(amounts)
+        amount_consistent = amount_variance < (avg_amount * 0.1)  # <10% variation
+        results.append({
+            "merchant": merchant,
+            "frequency": frequency,
+            "avg_charge": round(avg_amount, 2),
+            "annual_cost": round(annual_cost, 2),
+            "occurrences": len(group),
+            "amount_consistent": amount_consistent,
+            "first_seen": group["date"].min(),
+            "last_seen": group["date"].max(),
+            "amounts": amounts,
+        })
+    if not results:
+        return pd.DataFrame()
+    result_df = pd.DataFrame(results)
+    result_df = result_df.sort_values("annual_cost", ascending=False).reset_index(drop=True)
+    result_df.index += 1
+    return result_df
+# ─────────────────────────────────────────────────────────────────────────────
+# Recurring charges (public — for Recurring tab)
+# ─────────────────────────────────────────────────────────────────────────────
+def get_recurring_charges(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Returns recurring charges with annualized cost.
+    Excludes likely subscriptions (handled separately).
+    """
+    rec = _detect_recurring_merchants(df, min_occurrences=3)
+    if rec.empty:
+        return pd.DataFrame()
+    # Exclude those that look like subscriptions (small + consistent)
+    mask = ~((rec["avg_charge"] <= 30) & (rec["amount_consistent"]))
+    rec = rec[mask].copy()
+    rec["avg_charge_fmt"] = rec["avg_charge"].apply(lambda x: f"${x:,.2f}")
+    rec["annual_cost_fmt"] = rec["annual_cost"].apply(lambda x: f"${x:,.2f}")
+    rec["first_seen_fmt"] = rec["first_seen"].dt.strftime("%b %Y")
+    rec["last_seen_fmt"] = rec["last_seen"].dt.strftime("%b %Y")
+    return rec
+# ─────────────────────────────────────────────────────────────────────────────
+# Possible subscriptions
+# ─────────────────────────────────────────────────────────────────────────────
+def get_possible_subscriptions(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Small (≤$30), highly consistent recurring charges.
+    These are the 'set it and forget it' subscriptions people often forget.
+    """
+    rec = _detect_recurring_merchants(df, min_occurrences=2)
+    if rec.empty:
+        return pd.DataFrame()
+    # Keep only small + consistent charges
+    mask = (rec["avg_charge"] <= 30) & (rec["amount_consistent"])
+    subs = rec[mask].copy()
+    if subs.empty:
+        return pd.DataFrame()
+    # Forgettability score: lower charge + more occurrences = more forgettable
+    subs["forgettability"] = (subs["occurrences"] / subs["avg_charge"]).round(2)
+    subs = subs.sort_values("forgettability", ascending=False).reset_index(drop=True)
+    subs.index += 1
+    subs["avg_charge_fmt"] = subs["avg_charge"].apply(lambda x: f"${x:,.2f}")
+    subs["annual_cost_fmt"] = subs["annual_cost"].apply(lambda x: f"${x:,.2f}")
+    subs["first_seen_fmt"] = subs["first_seen"].dt.strftime("%b %Y")
+    return subs
+# ─────────────────────────────────────────────────────────────────────────────
+# Year-over-Year changes
+# ─────────────────────────────────────────────────────────────────────────────
+def get_yoy_changes(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Compare total annual spend per merchant across years.
+    Returns merchants with notable increases or decreases.
+    Requires at least 2 years of data.
+    """
+    years = sorted(df["date"].dt.year.unique())
+    if len(years) < 2:
+        return pd.DataFrame()
+    # Annual totals per merchant
+    df2 = df.copy()
+    df2["year"] = df2["date"].dt.year
+    pivot = df2.groupby(["merchant", "year"])["amount"].sum().unstack(fill_value=0)
+    results = []
+    year_pairs = list(zip(years[:-1], years[1:]))
+    for (yr_a, yr_b) in year_pairs:
+        if yr_a not in pivot.columns or yr_b not in pivot.columns:
+            continue
+        for merchant in pivot.index:
+            amt_a = pivot.loc[merchant, yr_a]
+            amt_b = pivot.loc[merchant, yr_b]
+            # Skip if either year is zero (new/dropped merchant)
+            if amt_a <= 0 or amt_b <= 0:
+                continue
+            # Skip very small amounts
+            if amt_a < 10 and amt_b < 10:
+                continue
+            delta = amt_b - amt_a
+            pct_change = (delta / amt_a) * 100
+            # Only flag meaningful changes (≥5% or ≥$25)
+            if abs(pct_change) >= 5 or abs(delta) >= 25:
+                results.append({
+                    "merchant": merchant,
+                    "year_a": yr_a,
+                    "year_b": yr_b,
+                    "amount_a": round(amt_a, 2),
+                    "amount_b": round(amt_b, 2),
+                    "delta": round(delta, 2),
+                    "pct_change": round(pct_change, 1),
+                    "direction": "↑ Increase" if delta > 0 else "↓ Decrease",
+                })
+    if not results:
+        return pd.DataFrame()
+    result_df = pd.DataFrame(results)
+    # Sort: biggest increases first, then decreases
+    result_df = result_df.sort_values("delta", ascending=False).reset_index(drop=True)
+    result_df.index += 1
+    result_df["amount_a_fmt"] = result_df["amount_a"].apply(lambda x: f"${x:,.2f}")
+    result_df["amount_b_fmt"] = result_df["amount_b"].apply(lambda x: f"${x:,.2f}")
+    result_df["delta_fmt"] = result_df["delta"].apply(
+        lambda x: f"+${x:,.2f}" if x > 0 else f"-${abs(x):,.2f}"
+    )
+    result_df["pct_fmt"] = result_df["pct_change"].apply(
+        lambda x: f"+{x:.1f}%" if x > 0 else f"{x:.1f}%"
+    )
+    return result_df
+# ─────────────────────────────────────────────────────────────────────────────
+# Build LLM summary payload
+# ─────────────────────────────────────────────────────────────────────────────
+def build_llm_summary(
+    df: pd.DataFrame,
+    summary: DataSummary,
+    top13: pd.DataFrame,
+    recurring: pd.DataFrame,
+    subscriptions: pd.DataFrame,
+    yoy: pd.DataFrame,
+) -> str:
+    """
+    Build a concise text summary to send to the LLM.
+    We send aggregated data, NOT raw transactions, for privacy.
+    """
+    lines = []
+    lines.append("=== CREDIT CARD STATEMENT ANALYSIS DATA ===")
+    lines.append(f"Date range: {summary['date_range_start']} to {summary['date_range_end']}")
+    lines.append(f"Total transactions: {summary['total_transactions']}")
+    lines.append(f"Total spent: ${summary['total_spent']:,.2f}")
+    lines.append(f"Months covered: {summary['months_covered']}")
+    lines.append(f"Years: {', '.join(str(y) for y in summary['years_covered'])}")
+    lines.append("")
+    lines.append("--- TOP 13 LARGEST SINGLE PURCHASES ---")
+    if not top13.empty:
+        for _, row in top13.iterrows():
+            lines.append(f"  {row['date_fmt']}  {row['merchant']}  {row['amount_fmt']}")
+    lines.append("")
+    lines.append("--- RECURRING CHARGES (ANNUALIZED) ---")
+    if not recurring.empty:
+        for _, row in recurring.iterrows():
+            lines.append(
+                f"  {row['merchant']}  {row['frequency']}  "
+                f"avg {row['avg_charge_fmt']}/period  "
+                f"annual est. {row['annual_cost_fmt']}"
+            )
+    lines.append("")
+    lines.append("--- POSSIBLE FORGOTTEN SUBSCRIPTIONS ---")
+    if not subscriptions.empty:
+        for _, row in subscriptions.iterrows():
+            lines.append(
+                f"  {row['merchant']}  {row['frequency']}  "
+                f"{row['avg_charge_fmt']}/period  "
+                f"since {row['first_seen_fmt']}"
+            )
+    lines.append("")
+    if not yoy.empty:
+        lines.append("--- YEAR-OVER-YEAR CHANGES ---")
+        for _, row in yoy.iterrows():
+            lines.append(
+                f"  {row['merchant']}  {row['year_a']}→{row['year_b']}  "
+                f"{row['amount_a_fmt']}→{row['amount_b_fmt']}  "
+                f"({row['pct_fmt']}, {row['delta_fmt']})"
+            )
+        lines.append("")
+    # Monthly totals for context
+    monthly = df.groupby(df["date"].dt.to_period("M"))["amount"].sum()
+    lines.append("--- MONTHLY SPEND TOTALS ---")
+    for period, total in monthly.items():
+        lines.append(f"  {period}: ${total:,.2f}")
+    lines.append("")
+    # Category-level summary (merchant frequency)
+    lines.append("--- TOP MERCHANTS BY TOTAL SPEND ---")
+    top_merchants = (
+        df.groupby("merchant")["amount"]
+        .sum()
+        .sort_values(ascending=False)
+        .head(20)
+    )
+    for merchant, total in top_merchants.items():
+        lines.append(f"  {merchant}: ${total:,.2f}")
+    return "\n".join(lines)

app.py ADDED Viewed

	@@ -0,0 +1,533 @@

+# app.py — Statement Analyzer
+# Multi-provider credit card statement intelligence tool
+import streamlit as st
+import pandas as pd
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+from parser import combine_files
+from analyzer import (
+    get_data_summary,
+    get_top_13,
+    get_recurring_charges,
+    get_possible_subscriptions,
+    get_yoy_changes,
+    build_llm_summary,
+)
+from llm import get_ai_insights
+# ─────────────────────────────────────────────────────────────────────────────
+# Page config
+# ─────────────────────────────────────────────────────────────────────────────
+# Browser-tab title/icon, full-width layout, and a sidebar that starts open.
+# NOTE(review): Streamlit has historically required set_page_config() to be the
+# first st.* call in the script — keep it ahead of any other UI call.
+st.set_page_config(
+    page_title="Statement Analyzer",
+    page_icon="💳",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# CSS
+# ─────────────────────────────────────────────────────────────────────────────
+st.markdown("""
+<style>
+    .main-header { text-align: center; padding: 1.2rem 0 0.25rem; }
+    .main-header h1 { font-size: 2rem; font-weight: 600; }
+    .tagline {
+        text-align: center; color: #6b7280;
+        font-size: 0.9rem; margin-bottom: 0.5rem;
+    }
+    .privacy-badge {
+        background: #f0fdf4; border: 1px solid #bbf7d0;
+        border-radius: 8px; padding: 0.5rem 0.85rem;
+        font-size: 0.8rem; color: #166534; margin-bottom: 0.75rem;
+    }
+    .data-quality-banner {
+        border-radius: 8px; padding: 0.75rem 1rem;
+        font-size: 0.85rem; margin-bottom: 1rem;
+    }
+    .stat-row {
+        display: flex; gap: 12px; flex-wrap: wrap;
+        margin-bottom: 1.25rem;
+    }
+    .stat-card {
+        background: #f9fafb; border: 1px solid #e5e7eb;
+        border-radius: 10px; padding: 0.75rem 1rem;
+        flex: 1; min-width: 130px; text-align: center;
+    }
+    .stat-label { font-size: 0.75rem; color: #9ca3af; margin-bottom: 2px; }
+    .stat-value { font-size: 1.3rem; font-weight: 600; color: #111827; }
+    .increase-row { color: #dc2626; }
+    .decrease-row { color: #16a34a; }
+    .section-note {
+        font-size: 0.8rem; color: #9ca3af;
+        font-style: italic; margin-bottom: 0.5rem;
+    }
+    .footer {
+        text-align: center; margin-top: 2rem;
+        padding-top: 1rem; border-top: 1px solid #e5e7eb;
+        color: #9ca3af; font-size: 0.78rem;
+    }
+    /* Streamlit table tweaks */
+    [data-testid="stDataFrame"] { border-radius: 8px; }
+</style>
+""", unsafe_allow_html=True)
+# ─────────────────────────────────────────────────────────────────────────────
+# Session state
+# ─────────────────────────────────────────────────────────────────────────────
+for key in ["df", "summary", "top13", "recurring", "subscriptions", "yoy",
+            "llm_summary_text", "ai_result"]:
+    if key not in st.session_state:
+        st.session_state[key] = None
+# ─────────────────────────────────────────────────────────────────────────────
+# Header
+# ─────────────────────────────────────────────────────────────────────────────
+st.markdown("""
+<div class="main-header"><h1>💳 Statement Analyzer</h1></div>
+<div class="tagline">
+    Upload your credit card statements and uncover what your spending is really telling you.<br>
+    <strong>Your statements never leave your session — processed in memory, never stored.</strong>
+</div>
+""", unsafe_allow_html=True)
+# ─────────────────────────────────────────────────────────────────────────────
+# Sidebar
+# ─────────────────────────────────────────────────────────────────────────────
+# Sidebar: LLM provider selection, API key entry, and usage tips.
+# `provider` and `api_key` are module-level values after this block.
+# NOTE(review): presumably consumed by the AI Insights tab, which is not
+# visible in this part of the script — confirm.
+with st.sidebar:
+    st.markdown("## ⚙️ AI Provider")
+    st.markdown('<div class="section-note">Required only for the AI Insights tab</div>',
+                unsafe_allow_html=True)
+    provider = st.selectbox(
+        "Provider",
+        ["OpenAI (GPT-4o)", "Google Gemini", "Anthropic Claude"],
+        label_visibility="collapsed",
+    )
+    # Where to obtain a key for each provider; keys must match the selectbox
+    # options exactly because the dict is indexed with the selected label.
+    provider_hints = {
+        "OpenAI (GPT-4o)": "platform.openai.com",
+        "Google Gemini": "aistudio.google.com",
+        "Anthropic Claude": "console.anthropic.com",
+    }
+    # type="password" masks the key in the input box.
+    api_key = st.text_input(
+        "API Key",
+        type="password",
+        placeholder="Paste your key here...",
+        help=f"Get your key at {provider_hints[provider]}",
+    )
+    # Show the reassurance badge only once a key has been entered.
+    if api_key:
+        st.markdown(
+            '<div class="privacy-badge">🔒 Key used only this session. Never stored or shared.</div>',
+            unsafe_allow_html=True,
+        )
+    st.markdown("---")
+    st.markdown("### 📖 Tips")
+    # NOTE(review): the "1 year minimum" tip below disagrees with the upload-zone
+    # hint and the data-quality banner (6 months) and with the Recurring tab's own
+    # check (months < 3) — pick one threshold and align the wording.
+    st.markdown("""
+- Upload **1 year minimum** for recurring detection
+- Upload **2+ years** to unlock Year-over-Year changes
+- Supported: **PDF, CSV, XLS, XLSX, DOCX**
+- Upload multiple files at once — one per month is fine
+- Most banks offer CSV export in their online portal
+""")
+    st.markdown("---")
+    st.markdown(
+        '<div class="footer">Made with ❤️ for people who actually want to know where their money goes.</div>',
+        unsafe_allow_html=True,
+    )
+# ─────────────────────────────────────────────────────────────────────────────
+# Upload zone
+# ─────────────────────────────────────────────────────────────────────────────
+# Upload area: file picker on the left (2/3 width), data-volume guidance on the right.
+st.markdown("### 📂 Upload Your Statements")
+col_upload, col_tip = st.columns([2, 1])
+with col_upload:
+    # accept_multiple_files=True lets the user drop several statements at once;
+    # the accepted extensions are the formats listed in the sidebar tips.
+    uploaded_files = st.file_uploader(
+        "Drop files here or click to browse",
+        type=["pdf", "csv", "xls", "xlsx", "docx"],
+        accept_multiple_files=True,
+        label_visibility="collapsed",
+    )
+with col_tip:
+    st.info(
+        "**Better results with more data**\n\n"
+        "🟡 1 statement — basic insights only\n\n"
+        "🟠 6 months — recurring detection\n\n"
+        "🟢 12 months — full annual cost view\n\n"
+        "🔵 24+ months — Year-over-Year unlocked"
+    )
+# The button stays disabled until at least one file has been uploaded.
+analyze_btn = st.button(
+    "🔍 Analyze Statements",
+    type="primary",
+    use_container_width=False,
+    disabled=not uploaded_files,
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# Run analysis
+# ─────────────────────────────────────────────────────────────────────────────
+# Runs only on the script run in which the Analyze button was clicked.
+if analyze_btn and uploaded_files:
+    with st.spinner("Parsing files and running analysis..."):
+        # combine_files returns the merged transaction frame plus a list of
+        # warnings, which the results section later shows via st.warning.
+        df, parse_warnings = combine_files(uploaded_files)
+        if df.empty:
+            st.error(
+                "Could not extract any transactions from the uploaded files. "
+                "Please check the file formats and try again."
+            )
+            # Halt this script run: the analysis below needs at least one row
+            # (get_data_summary formats the min/max date unconditionally).
+            st.stop()
+        summary = get_data_summary(df)
+        top13 = get_top_13(df)
+        recurring = get_recurring_charges(df)
+        subscriptions = get_possible_subscriptions(df)
+        yoy = get_yoy_changes(df)
+        # Text report for the LLM, built from the results above.
+        llm_summary_text = build_llm_summary(df, summary, top13, recurring, subscriptions, yoy)
+        # Persist to session
+        st.session_state.df = df
+        st.session_state.summary = summary
+        st.session_state.top13 = top13
+        st.session_state.recurring = recurring
+        st.session_state.subscriptions = subscriptions
+        st.session_state.yoy = yoy
+        st.session_state.llm_summary_text = llm_summary_text
+        st.session_state.ai_result = None  # reset on re-analyze
+        # Not pre-initialized with the other keys; the results section reads it
+        # with .get(..., []) so a missing value is tolerated.
+        st.session_state.parse_warnings = parse_warnings
+# ─────────────────────────────────────────────────────────────────────────────
+# Results
+# ─────────────────────────────────────────────────────────────────────────────
+if st.session_state.df is not None:
+    summary = st.session_state.summary
+    df = st.session_state.df
+    parse_warnings = st.session_state.get("parse_warnings", [])
+    # Parse warnings
+    for w in parse_warnings:
+        st.warning(w)
+    # Data quality banner
+    months = summary["months_covered"]
+    has_yoy = summary["has_yoy"]
+    years = summary["years_covered"]
+    if months < 6:
+        quality_color = "#fef3c7"
+        quality_border = "#f59e0b"
+        quality_msg = (
+            f"📊 **{months} month(s)** of data detected. "
+            "Upload at least 6 months for recurring charge detection and 12+ for full annual cost analysis."
+        )
+    elif months < 12:
+        quality_color = "#fff7ed"
+        quality_border = "#f97316"
+        quality_msg = (
+            f"📊 **{months} months** of data detected ({', '.join(str(y) for y in years)}). "
+            "Upload 12+ months to see true annual costs. Upload 2+ years to unlock Year-over-Year."
+        )
+    elif not has_yoy:
+        quality_color = "#eff6ff"
+        quality_border = "#3b82f6"
+        quality_msg = (
+            f"📊 **{months} months** of data detected. "
+            "Great for annual analysis! Upload statements from another year to unlock Year-over-Year comparison."
+        )
+    else:
+        quality_color = "#f0fdf4"
+        quality_border = "#22c55e"
+        quality_msg = (
+            f"✅ **{months} months across {len(years)} years** — full analysis unlocked including Year-over-Year!"
+        )
+    st.markdown(
+        f'<div class="data-quality-banner" style="background:{quality_color};border-left:4px solid {quality_border};">'
+        f"{quality_msg}</div>",
+        unsafe_allow_html=True,
+    )
+    # Summary stat cards
+    st.markdown(
+        f"""
+        <div class="stat-row">
+            <div class="stat-card">
+                <div class="stat-label">Total Spent</div>
+                <div class="stat-value">${summary['total_spent']:,.0f}</div>
+            </div>
+            <div class="stat-card">
+                <div class="stat-label">Transactions</div>
+                <div class="stat-value">{summary['total_transactions']:,}</div>
+            </div>
+            <div class="stat-card">
+                <div class="stat-label">Date Range</div>
+                <div class="stat-value" style="font-size:0.85rem;">{summary['date_range_start']}<br>→ {summary['date_range_end']}</div>
+            </div>
+            <div class="stat-card">
+                <div class="stat-label">Months</div>
+                <div class="stat-value">{summary['months_covered']}</div>
+            </div>
+            <div class="stat-card">
+                <div class="stat-label">Avg/Month</div>
+                <div class="stat-value">${summary['total_spent']/max(summary['months_covered'],1):,.0f}</div>
+            </div>
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+    st.markdown("---")
+    # ── Tabs ──────────────────────────────────────────────────────────────
+    tab1, tab2, tab3, tab4, tab5 = st.tabs([
+        "💰 Top 13",
+        "🔁 Recurring Charges",
+        "📋 Possible Subscriptions",
+        "📈 Year-over-Year",
+        "🔍 AI Insights",
+    ])
+    # ── Tab 1: Top 13 ─────────────────────────────────────────────────────
+    with tab1:
+        st.markdown("#### 💰 Top 13 Most Expensive Single Purchases")
+        st.markdown(
+            '<div class="section-note">Ranked by transaction amount. '
+            "Charges marked 🔁 also appear as recurring charges.</div>",
+            unsafe_allow_html=True,
+        )
+        top13 = st.session_state.top13
+        if top13.empty:
+            st.info("No transactions found.")
+        else:
+            display = top13.copy()
+            display["merchant"] = display.apply(
+                lambda r: f"🔁 {r['merchant']}" if r["is_recurring"] else r["merchant"],
+                axis=1,
+            )
+            st.dataframe(
+                display[["date_fmt", "merchant", "amount_fmt", "source_file"]].rename(columns={
+                    "date_fmt": "Date",
+                    "merchant": "Merchant",
+                    "amount_fmt": "Amount",
+                    "source_file": "Statement File",
+                }),
+                use_container_width=True,
+                hide_index=False,
+            )
+            total_top13 = top13["amount"].sum()
+            pct = (total_top13 / summary["total_spent"] * 100) if summary["total_spent"] > 0 else 0
+            st.markdown(
+                f"**Top 13 total: ${total_top13:,.2f}** — "
+                f"that's **{pct:.1f}%** of all spending in this period."
+            )
+    # ── Tab 2: Recurring ──────────────────────────────────────────────────
+    with tab2:
+        st.markdown("#### 🔁 Recurring Charges — True Annual Cost")
+        st.markdown(
+            '<div class="section-note">'
+            "These charges appear on a regular schedule. The annual cost column shows what you're "
+            "actually paying per year — a number most people have never seen laid out clearly."
+            "</div>",
+            unsafe_allow_html=True,
+        )
+        recurring = st.session_state.recurring
+        if months < 3:
+            st.warning("Upload at least 3 months of statements to detect recurring charges.")
+        elif recurring is None or recurring.empty:
+            st.info("No recurring charges detected in the uploaded statements.")
+        else:
+            st.dataframe(
+                recurring[["merchant", "frequency", "avg_charge_fmt",
+                            "annual_cost_fmt", "occurrences",
+                            "first_seen_fmt", "last_seen_fmt"]].rename(columns={
+                    "merchant": "Merchant",
+                    "frequency": "Frequency",
+                    "avg_charge_fmt": "Avg Charge",
+                    "annual_cost_fmt": "Est. Annual Cost",
+                    "occurrences": "Times Seen",
+                    "first_seen_fmt": "First Seen",
+                    "last_seen_fmt": "Last Seen",
+                }),
+                use_container_width=True,
+                hide_index=False,
+            )
+            total_recurring_annual = recurring["annual_cost"].sum()
+            st.markdown(
+                f"**Estimated total annual cost of recurring charges: "
+                f"${total_recurring_annual:,.2f}**"
+            )
+    # ── Tab 3: Subscriptions ──────────────────────────────────────────────
+    with tab3:
+        st.markdown("#### 📋 Possible Forgotten Subscriptions")
+        st.markdown(
+            '<div class="section-note">'
+            "Small, consistent charges that are easy to forget about. "
+            "Sorted by 'forgettability' — the ones most likely to be autopilot spending. "
+            "Could you cancel any of these?"
+            "</div>",
+            unsafe_allow_html=True,
+        )
+        subscriptions = st.session_state.subscriptions
+        if months < 2:
+            st.warning("Upload at least 2 months of statements to detect subscriptions.")
+        elif subscriptions is None or subscriptions.empty:
+            st.info("No small recurring subscriptions detected.")
+        else:
+            st.dataframe(
+                subscriptions[["merchant", "frequency", "avg_charge_fmt",
+                               "annual_cost_fmt", "occurrences", "first_seen_fmt"]].rename(columns={
+                    "merchant": "Merchant",
+                    "frequency": "Frequency",
+                    "avg_charge_fmt": "Per Period",
+                    "annual_cost_fmt": "Per Year",
+                    "occurrences": "Times Seen",
+                    "first_seen_fmt": "Paying Since",
+                }),
+                use_container_width=True,
+                hide_index=False,
+            )
+            total_sub_annual = subscriptions["annual_cost"].sum()
+            st.markdown(
+                f"**Total possible subscription spend: ${total_sub_annual:,.2f}/year** — "
+                f"that's **${total_sub_annual/12:,.2f}/month** in charges you might not be thinking about."
+            )
+    # ── Tab 4: Year-over-Year ─────────────────────────────────────────────
+    with tab4:
+        st.markdown("#### 📈 Year-over-Year Spending Changes")
+        yoy = st.session_state.yoy
+        if not has_yoy:
+            st.info(
+                "📅 Year-over-Year analysis requires at least 2 years of statements.\n\n"
+                f"Currently loaded: **{', '.join(str(y) for y in years)}**.\n\n"
+                "Upload statements from an additional year to unlock this tab."
+            )
+        elif yoy is None or yoy.empty:
+            st.info("No significant year-over-year changes found in the data.")
+        else:
+            increases = yoy[yoy["delta"] > 0]
+            decreases = yoy[yoy["delta"] < 0]
+            if not increases.empty:
+                st.markdown("##### ↑ Charges That Increased")
+                st.markdown(
+                    '<div class="section-note">These cost you more this year than last year.</div>',
+                    unsafe_allow_html=True,
+                )
+                inc_display = increases[["merchant", "year_a", "year_b",
+                                         "amount_a_fmt", "amount_b_fmt",
+                                         "delta_fmt", "pct_fmt"]].rename(columns={
+                    "merchant": "Merchant",
+                    "year_a": "Year A",
+                    "year_b": "Year B",
+                    "amount_a_fmt": "Spent (A)",
+                    "amount_b_fmt": "Spent (B)",
+                    "delta_fmt": "Change ($)",
+                    "pct_fmt": "Change (%)",
+                })
+                st.dataframe(inc_display, use_container_width=True, hide_index=False)
+            if not decreases.empty:
+                st.markdown("##### ↓ Charges That Decreased")
+                st.markdown(
+                    '<div class="section-note">You spent less here — cancellations, negotiated rates, or reduced usage.</div>',
+                    unsafe_allow_html=True,
+                )
+                dec_display = decreases[["merchant", "year_a", "year_b",
+                                          "amount_a_fmt", "amount_b_fmt",
+                                          "delta_fmt", "pct_fmt"]].rename(columns={
+                    "merchant": "Merchant",
+                    "year_a": "Year A",
+                    "year_b": "Year B",
+                    "amount_a_fmt": "Spent (A)",
+                    "amount_b_fmt": "Spent (B)",
+                    "delta_fmt": "Change ($)",
+                    "pct_fmt": "Change (%)",
+                })
+                st.dataframe(dec_display, use_container_width=True, hide_index=False)
+    # ── Tab 5: AI Insights ────────────────────────────────────────────────
+    with tab5:
+        st.markdown("#### 🔍 AI Insights")
+        st.markdown(
+            '<div class="section-note">'
+            "The AI analyzes your aggregated spending data — not your raw transactions. "
+            "Merchant names and totals are shared with the AI provider you select; "
+            "no account numbers, card numbers, or personal details are ever sent."
+            "</div>",
+            unsafe_allow_html=True,
+        )
+        if not api_key:
+            st.warning(
+                "Enter your API key in the sidebar to use AI Insights. "
+                "Choose any provider — OpenAI, Gemini, or Anthropic Claude."
+            )
+        else:
+            depth = st.radio(
+                "Analysis depth",
+                ["Summary bullets", "Deep narrative analysis"],
+                horizontal=True,
+                help="Deep analysis uses more tokens (~3-5x the cost of summary).",
+            )
+            run_ai_btn = st.button(
+                f"✨ Run AI Analysis ({provider})",
+                type="secondary",
+            )
+            if run_ai_btn:
+                with st.spinner(f"Analyzing with {provider}..."):
+                    result = get_ai_insights(
+                        data_summary=st.session_state.llm_summary_text,
+                        provider=provider,
+                        api_key=api_key,
+                        depth=depth,
+                    )
+                    st.session_state.ai_result = result
+            if st.session_state.ai_result:
+                st.markdown(st.session_state.ai_result)
+                st.download_button(
+                    label="⬇️ Download AI Analysis",
+                    data=st.session_state.ai_result,
+                    file_name="statement_ai_insights.txt",
+                    mime="text/plain",
+                )
+    # ── Download full analysis ─────────────────────────────────────────────
+    st.markdown("---")
+    st.download_button(
+        label="⬇️ Download Full Analysis Data (text)",
+        data=st.session_state.llm_summary_text,
+        file_name="statement_analysis_summary.txt",
+        mime="text/plain",
+    )
+else:
+    # Landing state
+    st.markdown(
+        """
+        <div style="text-align:center; padding: 3rem 1rem; color: #9ca3af;">
+            <div style="font-size: 3rem; margin-bottom: 1rem;">💳</div>
+            <div style="font-size: 1rem;">
+                Upload your credit card statements above and click <strong>Analyze</strong>.<br>
+                Supports PDF, CSV, XLS, XLSX, and DOCX from any bank.
+            </div>
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )

llm.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# llm.py
+# Multi-provider LLM calls for AI Insights tab
+# Supports OpenAI (GPT-4o), Google Gemini, Anthropic Claude
+from typing import Literal
+# Instruction preambles for the AI Insights tab, keyed by the analysis-depth
+# label. build_prompt() looks the preamble up by that label and uses the
+# "Summary bullets" entry when the label is not a key of this dict.
+# The text is sent to the provider verbatim, so edits here change model output.
+DEPTH_PROMPTS = {
+    "Summary bullets": """
+You are a personal finance analyst reviewing a year or more of credit card statements.
+Based on the data provided, give a concise bullet-point analysis covering:
+• 3-5 standout spending patterns or anomalies
+• Any suspicious or duplicate-looking charges
+• Quick wins — subscriptions or recurring charges the user could cancel
+• One overall financial habit observation
+Keep it brief and scannable. Use plain language, no jargon.
+""",
+    "Deep narrative analysis": """
+You are an expert personal finance analyst reviewing a year or more of credit card statements.
+Based on the data provided, write a thorough narrative analysis covering:
+1. **Spending Personality** — What do these statements reveal about this person's lifestyle and habits?
+2. **Anomalies & Red Flags** — Any duplicate charges, unusual timing, or charges that don't fit the pattern?
+3. **Subscription Audit** — Evaluate all recurring and subscription charges. Which ones seem worth it? Which seem forgotten or wasteful?
+4. **Year-over-Year Trends** — What's growing? What's declining? Is spending trending in a healthy or concerning direction?
+5. **Category Analysis** — Where is the bulk of money going? Is it balanced?
+6. **Missed Savings Opportunities** — Specific charges where better options likely exist (e.g. switching providers, bundling services)
+7. **Action Items** — A prioritized list of 5 concrete things this person should do after reading this analysis
+Be specific, reference actual merchants and amounts from the data. Write for a smart adult who wants honest, direct insight.
+""",
+}
+def build_prompt(data_summary: str, depth: str) -> str:
+    system_section = DEPTH_PROMPTS.get(depth, DEPTH_PROMPTS["Summary bullets"])
+    return f"""{system_section}
+Here is the spending data to analyze:
+{data_summary}
+"""
+def call_openai(prompt: str, api_key: str) -> str:
+    try:
+        from openai import OpenAI
+        client = OpenAI(api_key=api_key)
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an expert personal finance analyst. Be direct, specific, and helpful.",
+                },
+                {"role": "user", "content": prompt},
+            ],
+            max_tokens=2000,
+            temperature=0.4,
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        return f"❌ OpenAI error: {str(e)}"
+def call_gemini(prompt: str, api_key: str) -> str:
+    try:
+        import google.generativeai as genai
+        genai.configure(api_key=api_key)
+        model = genai.GenerativeModel("gemini-1.5-pro")
+        response = model.generate_content(prompt)
+        return response.text
+    except Exception as e:
+        return f"❌ Gemini error: {str(e)}"
+def call_anthropic(prompt: str, api_key: str) -> str:
+    try:
+        import anthropic
+        client = anthropic.Anthropic(api_key=api_key)
+        response = client.messages.create(
+            model="claude-sonnet-4-20250514",
+            max_tokens=2000,
+            system="You are an expert personal finance analyst. Be direct, specific, and helpful.",
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return response.content[0].text
+    except Exception as e:
+        return f"❌ Anthropic error: {str(e)}"
+def get_ai_insights(
+    data_summary: str,
+    provider: str,
+    api_key: str,
+    depth: str = "Summary bullets",
+) -> str:
+    prompt = build_prompt(data_summary, depth)
+    if provider == "OpenAI (GPT-4o)":
+        return call_openai(prompt, api_key)
+    elif provider == "Google Gemini":
+        return call_gemini(prompt, api_key)
+    elif provider == "Anthropic Claude":
+        return call_anthropic(prompt, api_key)
+    return "Unknown provider selected."

merchant_map.py ADDED Viewed

	@@ -0,0 +1,164 @@

+# merchant_map.py
+# Normalizes raw merchant strings to clean display names
+# Format: "fragment_to_match_lowercase": "Clean Name"
+# Lookup table used by normalize_merchant(): each key is a lowercase fragment
+# that is searched for as a substring of the lowercased statement description,
+# and the value is the display name reported for a match. Several fragments may
+# map to the same display name (e.g. "amzn" and "amazon").
+# NOTE(review): keys ending in a space ("bp ", "att ") presumably exist to avoid
+# matching inside longer words — confirm before trimming them.
+MERCHANT_ALIASES = {
+    # Amazon
+    "amzn": "Amazon",
+    "amazon": "Amazon",
+    "amz*": "Amazon",
+    # Streaming
+    "netflix": "Netflix",
+    "nflx": "Netflix",
+    "spotify": "Spotify",
+    "hulu": "Hulu",
+    "disney": "Disney+",
+    "disneyplus": "Disney+",
+    "hbo": "HBO Max",
+    "max.com": "HBO Max",
+    "peacock": "Peacock",
+    "paramount": "Paramount+",
+    "appletv": "Apple TV+",
+    "apple.com/bill": "Apple Services",
+    "apple services": "Apple Services",
+    "itunes": "Apple Services",
+    "youtube": "YouTube Premium",
+    "youtubepremium": "YouTube Premium",
+    # Food delivery
+    "doordash": "DoorDash",
+    "ubereats": "Uber Eats",
+    "grubhub": "Grubhub",
+    "seamless": "Seamless",
+    "instacart": "Instacart",
+    # Rideshare
+    "uber": "Uber",
+    "lyft": "Lyft",
+    # Grocery
+    "wholefds": "Whole Foods",
+    "whole foods": "Whole Foods",
+    "trader joe": "Trader Joe's",
+    "kroger": "Kroger",
+    "safeway": "Safeway",
+    "wegmans": "Wegmans",
+    "shoprite": "ShopRite",
+    "costco": "Costco",
+    "sams club": "Sam's Club",
+    "target": "Target",
+    "walmart": "Walmart",
+    # Fuel
+    "shell": "Shell",
+    "exxon": "ExxonMobil",
+    "mobil": "ExxonMobil",
+    "bp ": "BP",
+    "chevron": "Chevron",
+    "sunoco": "Sunoco",
+    "wawa": "Wawa",
+    "quick chek": "Quick Chek",
+    "quickchek": "Quick Chek",
+    # Coffee
+    "starbucks": "Starbucks",
+    "dunkin": "Dunkin'",
+    "dutch bros": "Dutch Bros",
+    "caribou": "Caribou Coffee",
+    # Fast food
+    "mcdonald": "McDonald's",
+    "mcdonalds": "McDonald's",
+    "chick-fil-a": "Chick-fil-A",
+    "chickfila": "Chick-fil-A",
+    "chipotle": "Chipotle",
+    "taco bell": "Taco Bell",
+    "tacobell": "Taco Bell",
+    "burger king": "Burger King",
+    "burgerking": "Burger King",
+    "wendy": "Wendy's",
+    "subway": "Subway",
+    "panera": "Panera Bread",
+    # Tech / Cloud
+    "google": "Google",
+    "microsoft": "Microsoft",
+    "msft": "Microsoft",
+    "adobe": "Adobe",
+    "dropbox": "Dropbox",
+    "github": "GitHub",
+    "openai": "OpenAI",
+    "chatgpt": "OpenAI",
+    "zoom": "Zoom",
+    "slack": "Slack",
+    "notion": "Notion",
+    "1password": "1Password",
+    "lastpass": "LastPass",
+    # Fitness
+    "planet fitness": "Planet Fitness",
+    "la fitness": "LA Fitness",
+    "lafitness": "LA Fitness",
+    "peloton": "Peloton",
+    "equinox": "Equinox",
+    "anytime fitness": "Anytime Fitness",
+    "ymca": "YMCA",
+    # Insurance
+    "geico": "GEICO",
+    "progressive": "Progressive",
+    "statefarm": "State Farm",
+    "state farm": "State Farm",
+    "allstate": "Allstate",
+    # Utilities/Telecom
+    "verizon": "Verizon",
+    "at&t": "AT&T",
+    "att ": "AT&T",
+    "t-mobile": "T-Mobile",
+    "tmobile": "T-Mobile",
+    "comcast": "Comcast/Xfinity",
+    "xfinity": "Comcast/Xfinity",
+    "spectrum": "Spectrum",
+    # Shopping
+    "etsy": "Etsy",
+    "ebay": "eBay",
+    "bestbuy": "Best Buy",
+    "best buy": "Best Buy",
+    "home depot": "Home Depot",
+    "homedepot": "Home Depot",
+    "lowes": "Lowe's",
+    "wayfair": "Wayfair",
+    "chewy": "Chewy",
+    # Travel
+    "airbnb": "Airbnb",
+    "vrbo": "VRBO",
+    "expedia": "Expedia",
+    "hotels.com": "Hotels.com",
+    "booking.com": "Booking.com",
+    "united air": "United Airlines",
+    "delta air": "Delta Airlines",
+    "american air": "American Airlines",
+    "southwest": "Southwest Airlines",
+    "jetblue": "JetBlue",
+}
+def normalize_merchant(raw: str) -> str:
+    """
+    Attempt to normalize a raw merchant string to a clean name.
+    Returns the best match or a cleaned version of the original.
+    """
+    if not raw:
+        return "Unknown"
+    cleaned = raw.strip().lower()
+    # Remove common noise suffixes
+    for noise in ["*", "#", "  "]:
+        cleaned = cleaned.replace(noise, " ")
+    cleaned = cleaned.strip()
+    for fragment, clean_name in MERCHANT_ALIASES.items():
+        if fragment in cleaned:
+            return clean_name
+    # Fallback: title-case the raw string, trim long codes
+    words = raw.strip().split()
+    # Drop trailing tokens that look like reference codes (all digits/caps short tokens)
+    filtered = []
+    for w in words:
+        if len(w) <= 3 and w.isupper() and w.isalpha():
+            continue  # likely a state abbreviation or noise
+        if w.isdigit():
+            continue
+        filtered.append(w)
+    return " ".join(filtered[:4]).title() if filtered else raw.title()

parser.py ADDED Viewed

	@@ -0,0 +1,382 @@

+# parser.py
+# Handles ingestion of PDF, CSV, XLS/XLSX, DOCX statement files
+# Normalizes all formats into a standard DataFrame schema:
+#   date (datetime), merchant (str), amount (float), raw_merchant (str), source_file (str)
+import io
+import re
+import pandas as pd
+from datetime import datetime
+from typing import Optional
+from merchant_map import normalize_merchant
+# ─────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+def _clean_amount(val) -> Optional[float]:
+    """Convert various amount formats to a positive float charge, or None."""
+    if val is None:
+        return None
+    s = str(val).strip().replace(",", "").replace("$", "").replace(" ", "")
+    # Some banks use parentheses for debits: (123.45)
+    negative = False
+    if s.startswith("(") and s.endswith(")"):
+        s = s[1:-1]
+        negative = True
+    try:
+        amt = float(s)
+    except ValueError:
+        return None
+    # Some exports use negative for charges, positive for payments
+    # We want charges as positive — caller decides which sign convention
+    return abs(amt) if not negative else abs(amt)
+def _looks_like_payment(merchant: str, amount: float, credit_flag=False) -> bool:
+    """Heuristic: is this row a payment/credit rather than a purchase?"""
+    if credit_flag:
+        return True
+    m = merchant.lower()
+    payment_keywords = [
+        "payment", "thank you", "autopay", "credit", "refund",
+        "return", "adjustment", "reward", "cashback", "cash back",
+        "transfer", "deposit", "interest charge", "fee waiver",
+    ]
+    return any(kw in m for kw in payment_keywords)
+def _parse_date(val) -> Optional[datetime]:
+    """Try multiple date formats."""
+    if isinstance(val, datetime):
+        return val
+    if isinstance(val, pd.Timestamp):
+        return val.to_pydatetime()
+    s = str(val).strip()
+    formats = [
+        "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d-%b-%Y",
+        "%b %d, %Y", "%B %d, %Y", "%d/%m/%Y", "%m-%d-%Y",
+        "%Y%m%d",
+    ]
+    for fmt in formats:
+        try:
+            return datetime.strptime(s, fmt)
+        except ValueError:
+            continue
+    return None
+# ─────────────────────────────────────────────────────────────────────────────
+# Format-specific parsers
+# ─────────────────────────────────────────────────────────────────────────────
+def _parse_csv(file_bytes: bytes, filename: str) -> pd.DataFrame:
+    """Parse CSV bank exports. Handles many column name variants."""
+    try:
+        df = pd.read_csv(io.BytesIO(file_bytes), dtype=str, on_bad_lines="skip")
+    except Exception:
+        df = pd.read_csv(io.BytesIO(file_bytes), dtype=str, error_bad_lines=False)
+    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
+    # Date column detection
+    date_candidates = ["date", "transaction_date", "trans_date", "post_date",
+                       "posted_date", "activity_date", "transaction date"]
+    date_col = next((c for c in date_candidates if c in df.columns), None)
+    if not date_col:
+        date_col = next((c for c in df.columns if "date" in c), None)
+    # Merchant / description column detection
+    desc_candidates = ["description", "merchant", "payee", "name", "merchant_name",
+                       "transaction_description", "memo", "details", "narrative"]
+    desc_col = next((c for c in desc_candidates if c in df.columns), None)
+    if not desc_col:
+        desc_col = next((c for c in df.columns if any(k in c for k in ["desc", "merch", "payee", "name"])), None)
+    # Amount column detection
+    amt_candidates = ["amount", "debit", "charge", "transaction_amount",
+                      "debit_amount", "withdrawal", "charged_amount"]
+    amt_col = next((c for c in amt_candidates if c in df.columns), None)
+    if not amt_col:
+        amt_col = next((c for c in df.columns if "amount" in c or "debit" in c), None)
+    # Credit column (to detect payments)
+    credit_col = next((c for c in df.columns if "credit" in c), None)
+    if not all([date_col, desc_col, amt_col]):
+        return pd.DataFrame()
+    rows = []
+    for _, row in df.iterrows():
+        date = _parse_date(row.get(date_col, ""))
+        merchant_raw = str(row.get(desc_col, "")).strip()
+        amt = _clean_amount(row.get(amt_col, ""))
+        is_credit = credit_col and str(row.get(credit_col, "")).strip() not in ("", "0", "0.00", "nan")
+        if date is None or amt is None or amt <= 0:
+            continue
+        if _looks_like_payment(merchant_raw, amt, is_credit):
+            continue
+        rows.append({
+            "date": date,
+            "raw_merchant": merchant_raw,
+            "merchant": normalize_merchant(merchant_raw),
+            "amount": amt,
+            "source_file": filename,
+        })
+    return pd.DataFrame(rows)
+def _parse_excel(file_bytes: bytes, filename: str) -> pd.DataFrame:
+    """Parse XLS/XLSX exports — tries each sheet."""
+    frames = []
+    try:
+        xl = pd.ExcelFile(io.BytesIO(file_bytes))
+        for sheet in xl.sheet_names:
+            try:
+                df = xl.parse(sheet, dtype=str)
+                df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
+                # Reuse CSV logic by converting to CSV bytes
+                csv_bytes = df.to_csv(index=False).encode()
+                parsed = _parse_csv(csv_bytes, filename)
+                if not parsed.empty:
+                    frames.append(parsed)
+            except Exception:
+                continue
+    except Exception:
+        pass
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
+def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
+    """
+    Parse PDF credit card statements.
+    Strategy 1: pdfplumber table extraction (structured)
+    Strategy 2: raw text line-by-line regex parsing (fallback)
+    """
+    import pdfplumber
+    rows = []
+    # ── Strategy 1: Table extraction ─────────────────────────────────────
+    try:
+        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            for page in pdf.pages:
+                tables = page.extract_tables()
+                for table in tables:
+                    if not table or len(table) < 2:
+                        continue
+                    headers = [str(h).strip().lower().replace(" ", "_") if h else "" for h in table[0]]
+                    for data_row in table[1:]:
+                        if not data_row:
+                            continue
+                        row_dict = {headers[i]: str(data_row[i]).strip() if data_row[i] else ""
+                                    for i in range(min(len(headers), len(data_row)))}
+                        # Try to find date, merchant, amount in this row
+                        date_val = next((row_dict[k] for k in row_dict if "date" in k and row_dict[k]), None)
+                        desc_val = next((row_dict[k] for k in row_dict
+                                         if any(x in k for x in ["desc", "merch", "payee", "name"]) and row_dict[k]), None)
+                        amt_val = next((row_dict[k] for k in row_dict
+                                        if any(x in k for x in ["amount", "debit", "charge"]) and row_dict[k]), None)
+                        if not amt_val:
+                            # Try last numeric-looking column
+                            for k in reversed(list(row_dict.keys())):
+                                cleaned = row_dict[k].replace(",", "").replace("$", "").replace("(", "").replace(")", "")
+                                try:
+                                    float(cleaned)
+                                    amt_val = row_dict[k]
+                                    break
+                                except ValueError:
+                                    continue
+                        if not desc_val:
+                            # Use second column as fallback description
+                            vals = list(row_dict.values())
+                            desc_val = vals[1] if len(vals) > 1 else ""
+                        date = _parse_date(date_val) if date_val else None
+                        amt = _clean_amount(amt_val) if amt_val else None
+                        merchant_raw = str(desc_val).strip() if desc_val else ""
+                        if date is None or amt is None or amt <= 0 or not merchant_raw:
+                            continue
+                        if _looks_like_payment(merchant_raw, amt):
+                            continue
+                        rows.append({
+                            "date": date,
+                            "raw_merchant": merchant_raw,
+                            "merchant": normalize_merchant(merchant_raw),
+                            "amount": amt,
+                            "source_file": filename,
+                        })
+    except Exception:
+        pass
+    # ── Strategy 2: Text regex fallback ──────────────────────────────────
+    if not rows:
+        try:
+            with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+                full_text = "\n".join(
+                    page.extract_text() or "" for page in pdf.pages
+                )
+            # Pattern: date  description  amount
+            # Covers formats like: 01/15/2024  STARBUCKS #1234  5.75
+            pattern = re.compile(
+                r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
+                r"([A-Za-z][^\d\n]{3,50?}?)\s+"
+                r"\$?([\d,]+\.\d{2})"
+            )
+            for match in pattern.finditer(full_text):
+                date_str, desc, amt_str = match.groups()
+                date = _parse_date(date_str)
+                amt = _clean_amount(amt_str)
+                merchant_raw = desc.strip()
+                if date is None or amt is None or amt <= 0:
+                    continue
+                if _looks_like_payment(merchant_raw, amt):
+                    continue
+                rows.append({
+                    "date": date,
+                    "raw_merchant": merchant_raw,
+                    "merchant": normalize_merchant(merchant_raw),
+                    "amount": amt,
+                    "source_file": filename,
+                })
+        except Exception:
+            pass
+    return pd.DataFrame(rows) if rows else pd.DataFrame()
+def _parse_docx(file_bytes: bytes, filename: str) -> pd.DataFrame:
+    """Parse DOCX — extract text then apply regex like PDF fallback."""
+    import docx2txt
+    import tempfile, os
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
+        tmp.write(file_bytes)
+        tmp_path = tmp.name
+    try:
+        text = docx2txt.process(tmp_path)
+    except Exception:
+        return pd.DataFrame()
+    finally:
+        os.unlink(tmp_path)
+    rows = []
+    pattern = re.compile(
+        r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
+        r"([A-Za-z][^\d\n]{3,50?}?)\s+"
+        r"\$?([\d,]+\.\d{2})"
+    )
+    for match in pattern.finditer(text):
+        date_str, desc, amt_str = match.groups()
+        date = _parse_date(date_str)
+        amt = _clean_amount(amt_str)
+        merchant_raw = desc.strip()
+        if date is None or amt is None or amt <= 0:
+            continue
+        if _looks_like_payment(merchant_raw, amt):
+            continue
+        rows.append({
+            "date": date,
+            "raw_merchant": merchant_raw,
+            "merchant": normalize_merchant(merchant_raw),
+            "amount": amt,
+            "source_file": filename,
+        })
+    return pd.DataFrame(rows) if rows else pd.DataFrame()
+# ─────────────────────────────────────────────────────────────────────────────
+# Public entry point
+# ─────────────────────────────────────────────────────────────────────────────
+def parse_uploaded_file(uploaded_file) -> pd.DataFrame:
+    """
+    Accept a Streamlit UploadedFile and return a normalized DataFrame.
+    Returns empty DataFrame on failure.
+    """
+    filename = uploaded_file.name
+    file_bytes = uploaded_file.read()
+    ext = filename.lower().split(".")[-1]
+    if ext == "csv":
+        df = _parse_csv(file_bytes, filename)
+    elif ext in ("xls", "xlsx"):
+        df = _parse_excel(file_bytes, filename)
+    elif ext == "pdf":
+        df = _parse_pdf(file_bytes, filename)
+    elif ext == "docx":
+        df = _parse_docx(file_bytes, filename)
+    else:
+        return pd.DataFrame()
+    if df.empty:
+        return df
+    # Enforce schema and types
+    df = df[["date", "merchant", "raw_merchant", "amount", "source_file"]].copy()
+    df["date"] = pd.to_datetime(df["date"])
+    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
+    df = df.dropna(subset=["date", "amount"])
+    df = df[df["amount"] > 0]
+    df = df.sort_values("date").reset_index(drop=True)
+    return df
+def combine_files(uploaded_files) -> tuple[pd.DataFrame, list[str]]:
+    """
+    Parse and combine multiple uploaded files.
+    Returns (combined_df, list_of_warnings).
+    """
+    frames = []
+    warnings = []
+    for f in uploaded_files:
+        df = parse_uploaded_file(f)
+        if df.empty:
+            warnings.append(f"⚠️ Could not extract transactions from **{f.name}**. "
+                            "Check that it's a valid statement export.")
+        else:
+            frames.append(df)
+    if not frames:
+        return pd.DataFrame(), warnings
+    combined = pd.concat(frames, ignore_index=True)
+    # Deduplicate: same date + merchant + amount within 1 day
+    combined = combined.drop_duplicates(
+        subset=["date", "merchant", "amount"], keep="first"
+    )
+    combined = combined.sort_values("date").reset_index(drop=True)
+    # Check for month gaps
+    if not combined.empty:
+        months = pd.period_range(
+            start=combined["date"].min().to_period("M"),
+            end=combined["date"].max().to_period("M"),
+            freq="M",
+        )
+        covered = set(combined["date"].dt.to_period("M").unique())
+        missing = [str(m) for m in months if m not in covered]
+        if missing:
+            warnings.append(
+                f"📅 Possible gaps detected — no transactions found for: {', '.join(missing)}. "
+                "Upload missing statements for more accurate analysis."
+            )
+    return combined, warnings

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+streamlit>=1.35.0
+pandas>=2.0.0
+numpy>=1.24.0
+pdfplumber>=0.10.0
+PyPDF2>=3.0.0
+openpyxl>=3.1.0
+xlrd>=2.0.1
+docx2txt>=0.8
+scikit-learn>=1.3.0
+openai>=1.0.0
+google-generativeai>=0.5.0
+anthropic>=0.25.0
+python-dotenv>=1.0.0