# utils.py — last change: "Update utils.py" by Ekow24 (commit b41f82d, verified)
from __future__ import annotations
import math
import os
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
import plotly.express as px
# ------------------------------
# Constants
# ------------------------------
# Spending categories a transaction can be labelled with.
CATEGORIES = [
    "Food", "Travel", "Shopping", "Utilities",
    "Entertainment", "Health", "Subscriptions", "Transport",
]

# Merchant names sampled when generating synthetic transactions.
MERCHANTS = [
    "SuperMart", "QuickEats", "Urban Cafe", "MegaStore", "Cinema City",
    "Fit&Fine Gym", "City Utilities", "StreamFlix", "RideNow", "Book Haven",
    "ElectroWorld", "TravelCo", "PharmaPlus", "HomeNeeds",
]

# Order matters: generate_synthetic_transactions pairs these positionally
# with its sampling probabilities.
PAYMENT_METHODS = ["Debit Card", "Credit Card", "Digital Wallet"]

# Cities sampled for the "Location" column.
LOCATIONS = [
    "London", "Manchester", "Birmingham", "Leeds", "Glasgow",
    "Liverpool", "Bristol", "Edinburgh", "Cardiff", "Belfast",
]
# ------------------------------
# Data Generation
# ------------------------------
def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
amounts = np.empty(n)
for i, c in enumerate(choices):
if c == "small":
amounts[i] = max(1, rng.normal(15, 8))
elif c == "medium":
amounts[i] = max(5, rng.normal(60, 25))
else:
amounts[i] = max(20, rng.normal(180, 60))
spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
return np.round(amounts, 2)
def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = None) -> pd.DataFrame:
    """Build a date-sorted frame of random transactions for the past year.

    Dates are sampled (with replacement) from the 365 days up to and including
    today; weekend days are weighted 1.2x and days after the 25th of a month
    1.3x (both factors multiply).  Merchant, category and location are drawn
    uniformly from the module constants, payment methods with probabilities
    0.6 / 0.25 / 0.15, and amounts come from ``_random_amounts``.

    Args:
        n_rows: Number of transactions to generate.
        seed: Seed for ``np.random.default_rng``; ``None`` gives a fresh,
            non-reproducible generator.

    Returns:
        DataFrame with columns "Date", "Merchant", "Category", "Amount",
        "Payment Method" and "Location", sorted by "Date" with a fresh
        0..n-1 index.
    """
    rng = np.random.default_rng(seed)

    last_day = pd.Timestamp.today().normalize()
    calendar = pd.date_range(last_day - pd.Timedelta(days=365), last_day, freq="D")

    # Relative likelihood of each calendar day receiving a transaction.
    weekend_boost = np.where(calendar.weekday >= 5, 1.2, 1.0)
    month_end_boost = np.where(calendar.day > 25, 1.3, 1.0)
    day_weight = weekend_boost * month_end_boost
    day_prob = day_weight / day_weight.sum()

    # NOTE: the order of the draws below determines the output for a given seed.
    picked_days = rng.choice(len(calendar), size=n_rows, replace=True, p=day_prob)
    picked_categories = rng.choice(CATEGORIES, size=n_rows)
    picked_merchants = rng.choice(MERCHANTS, size=n_rows)
    picked_payments = rng.choice(PAYMENT_METHODS, size=n_rows, p=[0.6, 0.25, 0.15])
    picked_locations = rng.choice(LOCATIONS, size=n_rows)
    picked_amounts = _random_amounts(n_rows, rng)

    transactions = pd.DataFrame(
        {
            "Date": pd.to_datetime(calendar[picked_days]),
            "Merchant": picked_merchants,
            "Category": picked_categories,
            "Amount": picked_amounts,
            "Payment Method": picked_payments,
            "Location": picked_locations,
        }
    )
    transactions = transactions.sort_values("Date")
    return transactions.reset_index(drop=True)
# ------------------------------
# Filtering
# ------------------------------
def filter_transactions(
    df: pd.DataFrame,
    date_range: Tuple[datetime, datetime],
    categories: Optional[Iterable[str]] = None,
    merchant_query: str = "",
) -> pd.DataFrame:
    """Return the transactions matching a date window and optional filters.

    Args:
        df: Transactions with "Date", "Category" and "Merchant" columns.
        date_range: ``(start, end)``; both bounds are inclusive.
        categories: Categories to keep.  ``None`` or an empty collection
            disables the category filter.
        merchant_query: Case-insensitive substring to look for in "Merchant".
            The text is matched literally (not as a regular expression), so
            queries such as "C++" or "(" are safe.  An empty string disables
            the merchant filter; missing merchant names never match.

    Returns:
        A copy of the matching rows, keeping their original index labels.
    """
    start, end = date_range
    mask = (df["Date"] >= pd.to_datetime(start)) & (df["Date"] <= pd.to_datetime(end))

    if categories is not None:
        # Materialise first: truth-testing arrays is ambiguous and generators
        # are always truthy, so test the resulting list instead.
        wanted = list(categories)
        if wanted:
            mask &= df["Category"].isin(wanted)

    if merchant_query:
        # regex=False: the query is free text typed by a user, and regex
        # metacharacters in it must not raise or change what is matched.
        mask &= df["Merchant"].str.contains(
            merchant_query, case=False, na=False, regex=False
        )

    return df.loc[mask].copy()
# ------------------------------
# Aggregation
# ------------------------------
def _month_key(s: pd.Series) -> pd.Series:
return pd.to_datetime(s).dt.to_period("M").dt.to_timestamp()
def compute_aggregations(df: pd.DataFrame) -> Dict:
    """Summarise a transactions frame into the figures the dashboard shows.

    Args:
        df: Transactions with "Date", "Category", "Payment Method" and
            "Amount" columns.  Index labels do not need to be unique.

    Returns:
        A dict with these keys:

        * ``total_spend`` -- sum of "Amount".
        * ``avg_monthly_spend`` -- mean of the per-month totals (only months
          that contain at least one transaction).
        * ``spend_per_category`` / ``spend_per_payment`` -- totals per group,
          largest first.
        * ``max_transaction`` / ``min_transaction`` -- the row with the
          largest / smallest "Amount" as a ``{column: value}`` dict.
        * ``monthly`` -- frame with "Month" (first day of month) and "Amount".
        * ``category_share`` -- ``spend_per_category`` as a fraction of the
          total, rounded to 4 decimals.
        * ``rolling_28d`` -- daily totals with "Rolling28", the mean over the
          last 28 rows (days that have transactions), needing at least 7.
        * ``spikes`` -- the same daily frame plus "IsSpike", true where the
          daily total exceeds mean + 2.5 population standard deviations.

        For an empty ``df`` every entry is a zero / empty placeholder with the
        same column layout.
    """
    if df.empty:
        return {
            "total_spend": 0.0,
            "avg_monthly_spend": 0.0,
            "spend_per_category": pd.Series(dtype=float),
            "spend_per_payment": pd.Series(dtype=float),
            "max_transaction": {"Amount": 0.0},
            "min_transaction": {"Amount": 0.0},
            "monthly": pd.DataFrame(columns=["Month", "Amount"]),
            "category_share": pd.Series(dtype=float),
            "rolling_28d": pd.DataFrame(columns=["Date", "Amount", "Rolling28"]),
            # Same columns as the non-empty result, which is built from the
            # rolling frame and therefore also carries "Rolling28".
            "spikes": pd.DataFrame(columns=["Date", "Amount", "Rolling28", "IsSpike"]),
        }

    amounts = df["Amount"]
    total_spend = float(amounts.sum())
    spend_per_category = df.groupby("Category")["Amount"].sum().sort_values(ascending=False)
    spend_per_payment = df.groupby("Payment Method")["Amount"].sum().sort_values(ascending=False)

    # Positional lookup: a label lookup (df.loc[idxmax]) returns several rows
    # when index labels repeat, which would turn the result into a nested dict.
    max_txn = df.iloc[int(amounts.argmax())].to_dict()
    min_txn = df.iloc[int(amounts.argmin())].to_dict()

    # Bucket each transaction under the first day of its month.
    month_start = pd.to_datetime(df["Date"]).dt.to_period("M").dt.to_timestamp()
    monthly = df.assign(Month=month_start).groupby("Month")["Amount"].sum().reset_index()
    avg_monthly_spend = float(monthly["Amount"].mean()) if not monthly.empty else 0.0

    # max(...) guards against division by zero when all amounts cancel out.
    category_share = (spend_per_category / max(total_spend, 1e-9)).round(4)

    df_daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
    df_daily["Date"] = pd.to_datetime(df_daily["Date"])
    df_daily = df_daily.sort_values("Date")
    df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()

    mu = df_daily["Amount"].mean()
    sigma = df_daily["Amount"].std(ddof=0) or 0.0
    threshold = mu + 2.5 * sigma
    df_spikes = df_daily.assign(IsSpike=df_daily["Amount"] > threshold)

    return {
        "total_spend": total_spend,
        "avg_monthly_spend": avg_monthly_spend,
        "spend_per_category": spend_per_category,
        "spend_per_payment": spend_per_payment,
        "max_transaction": max_txn,
        "min_transaction": min_txn,
        "monthly": monthly,
        "category_share": category_share,
        "rolling_28d": df_daily,
        "spikes": df_spikes,
    }
# ------------------------------
# Charts
# ------------------------------
def build_time_series_chart(
    df: pd.DataFrame,
    template: str = "plotly",
    spike_overlay: Optional[pd.DataFrame] = None,
    fixed_line_width: float = 2.0,
    **kwargs,
) -> "plotly.graph_objects.Figure":
    """Build a line chart of total spend per day, optionally marking spikes.

    Args:
        df: Transactions with "Date" and "Amount" columns.
        template: Plotly template name applied to the figure layout.
        spike_overlay: Optional frame with "Date", "Amount" and a boolean
            "IsSpike" column; rows flagged true are drawn as red diamond
            markers.  Ignored when it is not a DataFrame, is empty, or has no
            "IsSpike" column.
        fixed_line_width: Width of the daily-spend line.
        **kwargs: Accepted and ignored.

    Returns:
        The figure.  For an empty ``df`` it is an empty line figure with only
        the template applied.
    """
    if df.empty:
        fig = px.line()
        fig.update_layout(template=template)
        return fig

    daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
    daily["Date"] = pd.to_datetime(daily["Date"])

    fig = px.line(daily, x="Date", y="Amount", title="Daily Spend Over Time", markers=True)
    fig.update_traces(line=dict(width=fixed_line_width), hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}")
    fig.update_layout(
        margin=dict(l=10, r=10, t=40, b=10),
        template=template,
        # fixedrange disables zoom/pan on both axes.
        xaxis=dict(fixedrange=True),
        yaxis=dict(fixedrange=True),
    )

    has_spike_flags = (
        isinstance(spike_overlay, pd.DataFrame)
        and not spike_overlay.empty
        # Without this check a frame lacking the column would be indexed with
        # a bare ``False`` and raise KeyError.
        and "IsSpike" in spike_overlay.columns
    )
    if has_spike_flags:
        spike_points = spike_overlay[spike_overlay["IsSpike"]]
        if not spike_points.empty:
            fig.add_scatter(
                x=spike_points["Date"],
                y=spike_points["Amount"],
                mode="markers",
                name="Spikes",
                marker=dict(color="#EF553B", size=9, symbol="diamond"),
                hovertemplate="Spike %{x|%b %d, %Y}: £%{y:.2f}",
            )
    return fig
def build_category_bar_chart(
    spend_per_category: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
    **kwargs,
):
    """Build a bar chart of total spend per category.

    Args:
        spend_per_category: Amounts indexed by category label.  The names of
            the Series and of its index do not matter.
        template: Plotly template name applied to the figure layout.
        color_sequence: Optional list of colours, one bar colour per category
            in order; ``None`` uses Plotly's default sequence.
        **kwargs: Accepted and ignored.

    Returns:
        The figure.  For an empty Series it is an empty bar figure with only
        the template applied.
    """
    if spend_per_category.empty:
        fig = px.bar()
        fig.update_layout(template=template)
        return fig

    # Name both columns explicitly instead of relying on whatever names the
    # incoming Series and its index happen to carry.
    chart_data = spend_per_category.rename_axis("Category").reset_index(name="Amount")

    fig = px.bar(
        chart_data,
        x="Category",
        y="Amount",
        title="Spend by Category",
        color="Category",
        color_discrete_sequence=color_sequence,
    )
    fig.update_traces(hovertemplate="%{x}: £%{y:.2f}")
    # Bars are already labelled on the x axis, so the legend is redundant.
    fig.update_layout(showlegend=False, margin=dict(l=10, r=10, t=40, b=10), template=template)
    return fig
def build_payment_method_pie_chart(
    spend_per_payment: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
    **kwargs,
):
    """Build a donut chart of total spend per payment method.

    Args:
        spend_per_payment: Amounts indexed by payment-method label.  The names
            of the Series and of its index do not matter.
        template: Plotly template name applied to the figure layout.
        color_sequence: Optional list of slice colours; ``None`` uses Plotly's
            default sequence.
        **kwargs: Accepted and ignored, matching the other chart builders so
            one options dict can be passed to all of them.

    Returns:
        The figure.  For an empty Series it is an empty pie figure with only
        the template applied.
    """
    if spend_per_payment.empty:
        fig = px.pie()
        fig.update_layout(template=template)
        return fig

    # Name both columns explicitly instead of relying on whatever names the
    # incoming Series and its index happen to carry.
    chart_data = spend_per_payment.rename_axis("Payment Method").reset_index(name="Amount")

    fig = px.pie(
        chart_data,
        values="Amount",
        names="Payment Method",
        title="Payment Methods Distribution",
        hole=0.45,
        color_discrete_sequence=color_sequence,
    )
    fig.update_traces(hovertemplate="%{label}: £%{value:.2f} (%{percent})")
    fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
    return fig
# ------------------------------
# Helpers
# ------------------------------
def _format_number(n: float) -> str:
"""Formats numbers with k/M suffix and ensures spacing to prevent cramping."""
if n >= 1_000_000:
return f"£{n/1_000_000:.1f}M"
if n >= 1_000:
return f"£{n/1_000:.1f}k"
return f"£{n:,.0f}"
def _month_over_month_change(monthly: Optional[pd.DataFrame]) -> float:
if monthly is None or monthly.empty or len(monthly) < 2:
return 0.0
monthly_sorted = monthly.sort_values("Month")
last, prev = monthly_sorted["Amount"].iloc[-1], monthly_sorted["Amount"].iloc[-2]
if prev == 0:
return 0.0
return float((last - prev) / prev)