Ekow24 committed on
Commit
4be23b6
·
verified ·
1 Parent(s): 8152eda

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +164 -48
utils.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
 
3
  import math
4
  import os
5
- from dataclasses import dataclass
6
  from datetime import datetime, timedelta
7
  from typing import Dict, Iterable, List, Optional, Tuple
8
 
@@ -10,53 +9,29 @@ import numpy as np
10
  import pandas as pd
11
  import plotly.express as px
12
 
13
-
14
  CATEGORIES = [
15
- "Food",
16
- "Travel",
17
- "Shopping",
18
- "Utilities",
19
- "Entertainment",
20
- "Health",
21
- "Subscriptions",
22
- "Transport",
23
  ]
24
 
25
  MERCHANTS = [
26
- "SuperMart",
27
- "QuickEats",
28
- "Urban Cafe",
29
- "MegaStore",
30
- "Cinema City",
31
- "Fit&Fine Gym",
32
- "City Utilities",
33
- "StreamFlix",
34
- "RideNow",
35
- "Book Haven",
36
- "ElectroWorld",
37
- "TravelCo",
38
- "PharmaPlus",
39
- "HomeNeeds",
40
  ]
41
 
42
  PAYMENT_METHODS = ["Debit Card", "Credit Card", "Digital Wallet"]
43
 
44
  LOCATIONS = [
45
- "London",
46
- "Manchester",
47
- "Birmingham",
48
- "Leeds",
49
- "Glasgow",
50
- "Liverpool",
51
- "Bristol",
52
- "Edinburgh",
53
- "Cardiff",
54
- "Belfast",
55
  ]
56
 
57
 
 
 
 
58
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
59
- # Mixture distribution for realistic spend
60
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
61
  amounts = np.empty(n)
62
  for i, c in enumerate(choices):
@@ -66,7 +41,6 @@ def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
66
  amounts[i] = max(5, rng.normal(60, 25))
67
  else:
68
  amounts[i] = max(20, rng.normal(180, 60))
69
- # Random spikes
70
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
71
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
72
  return np.round(amounts, 2)
@@ -80,8 +54,8 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
80
 
81
  weights = np.array([1.2 if d.weekday() >= 5 else 1.0 for d in dates]) * \
82
  np.array([1.3 if d.day > 25 else 1.0 for d in dates])
83
- weights = np.clip(weights, 0, None)
84
- weights /= weights.sum()
85
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
86
  chosen_dates = dates[date_choices]
87
 
@@ -99,9 +73,13 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
99
  "Payment Method": payment_methods,
100
  "Location": locations,
101
  })
102
- return df.sort_values("Date").reset_index(drop=True)
 
103
 
104
 
 
 
 
105
  def filter_transactions(
106
  df: pd.DataFrame,
107
  date_range: Tuple[datetime, datetime],
@@ -111,7 +89,7 @@ def filter_transactions(
111
  start, end = date_range
112
  mask = (df["Date"] >= pd.to_datetime(start)) & (df["Date"] <= pd.to_datetime(end))
113
  if categories:
114
- mask &= df["Category"].isin(categories)
115
  if merchant_query:
116
  mask &= df["Merchant"].str.contains(merchant_query, case=False, na=False)
117
  return df.loc[mask].copy()
@@ -152,7 +130,8 @@ def compute_aggregations(df: pd.DataFrame) -> Dict:
152
  df_daily = df_daily.sort_values("Date")
153
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
154
 
155
- mu, sigma = df_daily["Amount"].mean(), df_daily["Amount"].std(ddof=0) or 0.0
 
156
  threshold = mu + 2.5 * sigma
157
  df_spikes = df_daily.assign(IsSpike=df_daily["Amount"] > threshold)
158
 
@@ -170,12 +149,15 @@ def compute_aggregations(df: pd.DataFrame) -> Dict:
170
  }
171
 
172
 
 
 
 
173
  def build_time_series_chart(
174
  df: pd.DataFrame,
175
  template: str = "plotly",
176
  spike_overlay: Optional[pd.DataFrame] = None,
177
- fixed_line_width: int = 2,
178
- hover_line_width: int = 4,
179
  ) -> "px.Figure":
180
  if df.empty:
181
  fig = px.line()
@@ -188,7 +170,7 @@ def build_time_series_chart(
188
  fig.update_traces(line=dict(width=fixed_line_width), hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}")
189
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
190
 
191
- if spike_overlay is not None and not spike_overlay.empty:
192
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
193
  if not spike_points.empty:
194
  fig.add_scatter(
@@ -206,6 +188,8 @@ def build_category_bar_chart(
206
  spend_per_category: pd.Series,
207
  template: str = "plotly",
208
  color_sequence: Optional[list] = None,
 
 
209
  ):
210
  if spend_per_category.empty:
211
  fig = px.bar()
@@ -213,14 +197,14 @@ def build_category_bar_chart(
213
  return fig
214
 
215
  fig = px.bar(
216
- spend_per_category.reset_index().rename(columns={"index": "Category", "Amount": 0}),
217
  x="Category",
218
  y="Amount",
219
  title="Spend by Category",
220
  color="Category",
221
  color_discrete_sequence=color_sequence,
222
  )
223
- fig.update_traces(hovertemplate="%{x}: £%{y:.2f}")
224
  fig.update_layout(showlegend=False, margin=dict(l=10, r=10, t=40, b=10), template=template)
225
  return fig
226
 
@@ -229,6 +213,7 @@ def build_payment_method_pie_chart(
229
  spend_per_payment: pd.Series,
230
  template: str = "plotly",
231
  color_sequence: Optional[list] = None,
 
232
  ):
233
  if spend_per_payment.empty:
234
  fig = px.pie()
@@ -236,7 +221,7 @@ def build_payment_method_pie_chart(
236
  return fig
237
 
238
  fig = px.pie(
239
- spend_per_payment.reset_index().rename(columns={"index": "Payment Method", "Amount": 0}),
240
  values="Amount",
241
  names="Payment Method",
242
  title="Payment Methods Distribution",
@@ -248,9 +233,140 @@ def build_payment_method_pie_chart(
248
  return fig
249
 
250
 
 
 
 
251
  def _format_number(n: float) -> str:
252
  if n >= 1_000_000:
253
  return f"£{n/1_000_000:.1f}M"
254
  if n >= 1_000:
255
  return f"£{n/1_000:.1f}k"
256
- return f"£{n:,.0f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import math
4
  import os
 
5
  from datetime import datetime, timedelta
6
  from typing import Dict, Iterable, List, Optional, Tuple
7
 
 
9
  import pandas as pd
10
  import plotly.express as px
11
 
 
12
# Spending category labels.
CATEGORIES = [
    "Food",
    "Travel",
    "Shopping",
    "Utilities",
    "Entertainment",
    "Health",
    "Subscriptions",
    "Transport",
]

# Merchant names.
MERCHANTS = [
    "SuperMart",
    "QuickEats",
    "Urban Cafe",
    "MegaStore",
    "Cinema City",
    "Fit&Fine Gym",
    "City Utilities",
    "StreamFlix",
    "RideNow",
    "Book Haven",
    "ElectroWorld",
    "TravelCo",
    "PharmaPlus",
    "HomeNeeds",
]

# Payment method labels.
PAYMENT_METHODS = [
    "Debit Card",
    "Credit Card",
    "Digital Wallet",
]

# City names used as transaction locations.
LOCATIONS = [
    "London",
    "Manchester",
    "Birmingham",
    "Leeds",
    "Glasgow",
    "Liverpool",
    "Bristol",
    "Edinburgh",
    "Cardiff",
    "Belfast",
]
29
 
30
 
31
+ # -----------------------------
32
+ # Synthetic Data Generation
33
+ # -----------------------------
34
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
 
35
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
36
  amounts = np.empty(n)
37
  for i, c in enumerate(choices):
 
41
  amounts[i] = max(5, rng.normal(60, 25))
42
  else:
43
  amounts[i] = max(20, rng.normal(180, 60))
 
44
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
45
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
46
  return np.round(amounts, 2)
 
54
 
55
  weights = np.array([1.2 if d.weekday() >= 5 else 1.0 for d in dates]) * \
56
  np.array([1.3 if d.day > 25 else 1.0 for d in dates])
57
+ weights = np.clip(weights, a_min=0, a_max=None)
58
+ weights = weights / weights.sum()
59
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
60
  chosen_dates = dates[date_choices]
61
 
 
73
  "Payment Method": payment_methods,
74
  "Location": locations,
75
  })
76
+ df = df.sort_values("Date").reset_index(drop=True)
77
+ return df
78
 
79
 
80
+ # -----------------------------
81
+ # Filtering and Aggregation
82
+ # -----------------------------
83
  def filter_transactions(
84
  df: pd.DataFrame,
85
  date_range: Tuple[datetime, datetime],
 
89
  start, end = date_range
90
  mask = (df["Date"] >= pd.to_datetime(start)) & (df["Date"] <= pd.to_datetime(end))
91
  if categories:
92
+ mask &= df["Category"].isin(list(categories))
93
  if merchant_query:
94
  mask &= df["Merchant"].str.contains(merchant_query, case=False, na=False)
95
  return df.loc[mask].copy()
 
130
  df_daily = df_daily.sort_values("Date")
131
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
132
 
133
+ mu = df_daily["Amount"].mean()
134
+ sigma = df_daily["Amount"].std(ddof=0) or 0.0
135
  threshold = mu + 2.5 * sigma
136
  df_spikes = df_daily.assign(IsSpike=df_daily["Amount"] > threshold)
137
 
 
149
  }
150
 
151
 
152
+ # -----------------------------
153
+ # Chart Builders (fixed)
154
+ # -----------------------------
155
  def build_time_series_chart(
156
  df: pd.DataFrame,
157
  template: str = "plotly",
158
  spike_overlay: Optional[pd.DataFrame] = None,
159
+ fixed_line_width: float = 2.0,
160
+ **kwargs,
161
  ) -> "px.Figure":
162
  if df.empty:
163
  fig = px.line()
 
170
  fig.update_traces(line=dict(width=fixed_line_width), hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}")
171
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
172
 
173
+ if isinstance(spike_overlay, pd.DataFrame) and not spike_overlay.empty:
174
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
175
  if not spike_points.empty:
176
  fig.add_scatter(
 
188
  spend_per_category: pd.Series,
189
  template: str = "plotly",
190
  color_sequence: Optional[list] = None,
191
+ fixed_bar_width: float = 0.8,
192
+ **kwargs,
193
  ):
194
  if spend_per_category.empty:
195
  fig = px.bar()
 
197
  return fig
198
 
199
  fig = px.bar(
200
+ spend_per_category.reset_index().rename(columns={"index": "Category", 0: "Amount"}),
201
  x="Category",
202
  y="Amount",
203
  title="Spend by Category",
204
  color="Category",
205
  color_discrete_sequence=color_sequence,
206
  )
207
+ fig.update_traces(width=fixed_bar_width, hovertemplate="%{x}: £%{y:.2f}")
208
  fig.update_layout(showlegend=False, margin=dict(l=10, r=10, t=40, b=10), template=template)
209
  return fig
210
 
 
213
  spend_per_payment: pd.Series,
214
  template: str = "plotly",
215
  color_sequence: Optional[list] = None,
216
+ **kwargs,
217
  ):
218
  if spend_per_payment.empty:
219
  fig = px.pie()
 
221
  return fig
222
 
223
  fig = px.pie(
224
+ spend_per_payment.reset_index().rename(columns={"index": "Payment Method", 0: "Amount"}),
225
  values="Amount",
226
  names="Payment Method",
227
  title="Payment Methods Distribution",
 
233
  return fig
234
 
235
 
236
+ # -----------------------------
237
+ # Formatting Helpers
238
+ # -----------------------------
239
def _format_number(n: float) -> str:
    """Format a monetary amount as a compact pound-sterling string.

    Magnitudes of one million or more are shown in millions with an ``M``
    suffix and one decimal place, magnitudes of one thousand or more in
    thousands with a ``k`` suffix and one decimal place, and anything
    smaller as a whole number with thousands separators.

    Negative amounts are abbreviated by their magnitude and carry a leading
    minus sign in front of the currency symbol (for example ``-£1.5k``).
    """
    # Pick the scale from the magnitude so that negative amounts receive the
    # same abbreviation as their positive counterparts.
    sign = "-" if n < 0 else ""
    magnitude = abs(n)
    if magnitude >= 1_000_000:
        return f"{sign}£{magnitude / 1_000_000:.1f}M"
    if magnitude >= 1_000:
        return f"{sign}£{magnitude / 1_000:.1f}k"
    return f"{sign}£{magnitude:,.0f}"
245
+
246
+
247
def _month_over_month_change(monthly: Optional[pd.DataFrame]) -> float:
    """Return the fractional change between the two latest monthly amounts.

    The frame is sorted by its ``Month`` column and the last two values of
    its ``Amount`` column are compared as ``(last - prev) / prev``.

    Returns ``0.0`` when ``monthly`` is ``None``, has fewer than two rows,
    the previous amount is zero, or the result is not a finite number
    (for example when either amount is NaN).
    """
    # len() < 2 also covers the empty frame.
    if monthly is None or len(monthly) < 2:
        return 0.0

    amounts = monthly.sort_values("Month")["Amount"]
    prev = float(amounts.iloc[-2])
    last = float(amounts.iloc[-1])
    if prev == 0:
        return 0.0

    change = (last - prev) / prev
    # A NaN/inf amount would otherwise leak into formatted text as "+nan%".
    return change if math.isfinite(change) else 0.0
255
+
256
+
257
+ # -----------------------------
258
+ # AI Summary Helpers
259
+ # -----------------------------
260
def _heuristic_summary(ctx: Dict, mode: str = "Concise") -> str:
    """Build a rule-based, plain-English summary from a spending context.

    Keys read from ``ctx`` (all optional): ``total_spend``, ``avg_monthly``,
    ``largest_category``, ``largest_category_share`` (a fraction),
    ``max_transaction`` (a mapping with ``amount`` and ``merchant``),
    ``mom_change`` (a fraction) and ``spike_days``. Missing or ``None``
    values are treated as zero / absent instead of raising.

    When ``mode`` is ``"Detailed"``, extra advice sentences are appended
    based on the month-over-month change, the largest category and the
    number of spike days. Any other mode returns only the core sentences.
    """
    max_txn = ctx.get("max_transaction") or {}

    total = _format_number(ctx.get("total_spend") or 0.0)
    avg = _format_number(ctx.get("avg_monthly") or 0.0)
    lcat = ctx.get("largest_category") or "N/A"
    share = (ctx.get("largest_category_share") or 0.0) * 100
    max_amt = max_txn.get("amount") or 0.0
    max_merchant = max_txn.get("merchant") or ""
    mom = (ctx.get("mom_change") or 0.0) * 100  # percentage points
    spikes = ctx.get("spike_days") or 0

    parts = [f"Total spend in the selected period is {total}, averaging {avg} per month."]
    if lcat != "N/A":
        parts.append(f"Top category is {lcat} at {share:.0f}% of spend.")
    parts.append(f"Month-over-month, spending changed by {mom:+.0f}%.")
    if max_amt:
        if max_merchant:
            parts.append(f"Largest single transaction was £{max_amt:,.0f} at {max_merchant}.")
        else:
            # Without a merchant name, avoid the dangling "at ." ending.
            parts.append(f"Largest single transaction was £{max_amt:,.0f}.")
    if spikes:
        parts.append(f"Detected {spikes} unusually high daily spend day(s).")
    text = " ".join(parts)

    if mode != "Detailed":
        return text

    detailed_insights = []

    # Trend commentary: a swing of more than 10% either way is called out.
    if mom > 10:
        detailed_insights.append("Your spending has increased significantly this month, which may indicate lifestyle changes or seasonal variations.")
    elif mom < -10:
        detailed_insights.append("You've successfully reduced your spending this month, showing good financial discipline.")
    else:
        detailed_insights.append("Your spending patterns remain relatively stable month-over-month.")

    # Category-specific advice exists only for these three categories.
    category_advice = {
        "Food": "Food represents your largest expense category. Consider meal planning and bulk shopping to optimize costs.",
        "Shopping": "Shopping is your primary spending category. Review purchases for necessities vs. wants to identify savings opportunities.",
        "Entertainment": "Entertainment spending dominates your budget. Look for free or low-cost alternatives to maintain your lifestyle within budget.",
    }
    if lcat in category_advice:
        detailed_insights.append(category_advice[lcat])

    if spikes > 5:
        detailed_insights.append("Multiple spending spikes detected suggest irregular expense patterns. Consider smoothing these through better budgeting.")
    elif spikes > 0:
        detailed_insights.append("Some spending spikes were identified, which is normal but worth monitoring for budget planning.")

    detailed_insights.append("Consider setting category budgets and monitoring spikes to smooth cash flow and improve financial predictability.")

    return text + " " + " ".join(detailed_insights)
306
+
307
+
308
def _hf_prompt(context: Dict, mode: str) -> str:
    """Compose the text prompt asking a model to summarise spending analytics.

    ``mode`` equal to ``"Concise"`` requests an 80-120 word summary; any
    other value requests a 140-220 word summary. ``context`` is embedded
    using its string representation.
    """
    if mode == "Concise":
        length_hint = "concise (80-120 words)"
    else:
        length_hint = "detailed (140-220 words)"

    instruction = (
        f"You are a helpful financial assistant. Produce a {length_hint}"
        " natural-language summary of the provided spending analytics in plain English."
    )
    return f"{instruction}\n\nContext: {context}\n\nSummary:"
316
+
317
+
318
def summarize_with_ai(
    agg: Dict,
    api_key: Optional[str] = None,
    mode: str = "Concise",
    engine: str = "Heuristic",
    ollama_model: Optional[str] = None,
) -> str:
    """Summarise aggregated spending data as natural-language text.

    Required keys in ``agg``: ``spend_per_category``, ``category_share``,
    ``total_spend``, ``avg_monthly_spend`` and ``max_transaction``.
    ``monthly`` and ``spikes`` are optional. The values are condensed into
    a small context dictionary that is handed to the summariser.

    Every ``engine`` value currently resolves to the heuristic summariser;
    ``api_key`` and ``ollama_model`` are accepted but not used here.
    """
    by_category = agg["spend_per_category"]
    top_category = None if by_category.empty else by_category.idxmax()

    shares = agg["category_share"]
    top_share = 0.0 if shares.empty else float(shares.max())

    total_spend = float(agg["total_spend"])
    avg_monthly = float(agg["avg_monthly_spend"])

    biggest = {
        "amount": float(agg["max_transaction"].get("Amount", 0.0)),
        "merchant": str(agg["max_transaction"].get("Merchant", "")),
    }

    mom_change = _month_over_month_change(agg.get("monthly"))

    # Count the days flagged as spikes; absent data counts as zero.
    spike_frame = agg.get("spikes", pd.DataFrame())
    spike_flags = spike_frame.get("IsSpike", pd.Series(dtype=bool))
    spike_days = int(spike_flags.sum())

    context = {
        "total_spend": total_spend,
        "avg_monthly": avg_monthly,
        "largest_category": top_category,
        "largest_category_share": top_share,
        "max_transaction": biggest,
        "mom_change": mom_change,
        "spike_days": spike_days,
    }

    # HuggingFace or other engines can be registered here; unknown engines
    # fall back to the heuristic summariser.
    engine = (engine or "Heuristic").strip()
    summarisers = {"Heuristic": _heuristic_summary}
    summarise = summarisers.get(engine, _heuristic_summary)
    return summarise(context, mode=mode)
347
+
348
+
349
+ # -----------------------------
350
+ # Chat AI (local only)
351
+ # -----------------------------
352
def chat_with_ai(
    agg: Dict,
    question: str,
    engine: str = "Heuristic",
    api_key: Optional[str] = None,
    ollama_model: Optional[str] = None,
) -> str:
    """Answer a free-form question about the aggregated spending data.

    With ``engine == "Heuristic"`` a fixed notice is returned immediately,
    without touching ``agg``. For any other engine a context dictionary and
    prompt are assembled from ``agg`` (keys ``total_spend``, ``monthly``,
    ``spikes``, ``spend_per_category``, ``spend_per_payment``; missing or
    ``None`` entries are treated as empty) and a placeholder answer is
    returned. ``api_key`` and ``ollama_model`` are accepted but not used.
    """
    # The heuristic engine never looks at the data, so skip building the
    # context and do not fail on incomplete aggregations.
    if engine == "Heuristic":
        return "Heuristic engine does not support free-form Q&A yet. Please use summary mode."

    monthly = agg.get("monthly")
    if monthly is None:
        monthly = pd.DataFrame()
    spikes = agg.get("spikes")
    categories = agg.get("spend_per_category")
    if categories is None:
        categories = pd.Series(dtype=float)
    payments = agg.get("spend_per_payment")
    if payments is None:
        payments = pd.Series(dtype=float)

    context = {
        "totals": float(agg.get("total_spend") or 0.0),
        "monthly": [
            {"month": str(row["Month"]), "amount": float(row["Amount"])}
            for _, row in monthly.iterrows()
        ],
        "spikes": spikes.to_dict(orient="records") if spikes is not None else [],
        "categories": categories.to_dict(),
        "payments": payments.to_dict(),
    }
    # NOTE: the prompt is prepared for a future model backend; nothing
    # consumes it yet, so the answer below is a placeholder.
    prompt = f"Context: {context}\nUser Question: {question}\nAnswer:"
    return "AI response placeholder."