Ekow24 commited on
Commit
eb1886d
·
verified ·
1 Parent(s): f271fbf

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +26 -279
utils.py CHANGED
@@ -56,7 +56,6 @@ LOCATIONS = [
56
 
57
 
58
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
59
- # Mixture distribution for more realistic spend: many small, some medium, few large
60
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
61
  amounts = np.empty(n)
62
  for i, c in enumerate(choices):
@@ -66,7 +65,6 @@ def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
66
  amounts[i] = max(5, rng.normal(60, 25))
67
  else:
68
  amounts[i] = max(20, rng.normal(180, 60))
69
- # Random spikes
70
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
71
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
72
  return np.round(amounts, 2)
@@ -78,12 +76,8 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
78
  start = end - pd.Timedelta(days=365)
79
  dates = pd.date_range(start, end, freq="D")
80
 
81
- # Draw dates with bias to weekends and month-ends; normalize to ensure probabilities sum to 1
82
- weights = np.array([
83
- 1.2 if d.weekday() >= 5 else 1.0 for d in dates
84
- ]) * np.array([
85
- 1.3 if d.day > 25 else 1.0 for d in dates
86
- ])
87
  weights = np.clip(weights, a_min=0, a_max=None)
88
  weights = weights / weights.sum()
89
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
@@ -95,17 +89,14 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
95
  locations = rng.choice(LOCATIONS, size=n_rows)
96
  amts = _random_amounts(n_rows, rng)
97
 
98
- df = pd.DataFrame(
99
- {
100
- "Date": pd.to_datetime(chosen_dates),
101
- "Merchant": merchants,
102
- "Category": categories,
103
- "Amount": amts,
104
- "Payment Method": payment_methods,
105
- "Location": locations,
106
- }
107
- )
108
- # Sort by date for better UX
109
  df = df.sort_values("Date").reset_index(drop=True)
110
  return df
111
 
@@ -150,23 +141,15 @@ def compute_aggregations(df: pd.DataFrame) -> Dict:
150
  max_txn = df.loc[df["Amount"].idxmax()].to_dict()
151
  min_txn = df.loc[df["Amount"].idxmin()].to_dict()
152
 
153
- monthly = (
154
- df.assign(Month=_month_key(df["Date"]))
155
- .groupby("Month")["Amount"].sum()
156
- .reset_index()
157
- )
158
  avg_monthly_spend = float(monthly["Amount"].mean()) if not monthly.empty else 0.0
159
-
160
- # Category share
161
  category_share = (spend_per_category / max(total_spend, 1e-9)).round(4)
162
 
163
- # Rolling 28-day spend for simple trend smoothing
164
  df_daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
165
- df_daily["Date"] = pd.to_datetime(df_daily["Date"]) # normalize to midnight
166
  df_daily = df_daily.sort_values("Date")
167
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
168
 
169
- # Naive anomaly: mark spikes above mean + 2.5*std on daily amounts
170
  mu = df_daily["Amount"].mean()
171
  sigma = df_daily["Amount"].std(ddof=0) or 0.0
172
  threshold = mu + 2.5 * sigma
@@ -190,6 +173,7 @@ def build_time_series_chart(
190
  df: pd.DataFrame,
191
  template: str = "plotly",
192
  spike_overlay: Optional[pd.DataFrame] = None,
 
193
  ) -> "px.Figure":
194
  if df.empty:
195
  fig = px.line()
@@ -197,7 +181,7 @@ def build_time_series_chart(
197
  return fig
198
 
199
  daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
200
- daily["Date"] = pd.to_datetime(daily["Date"]) # ensure datetime for plotly
201
 
202
  fig = px.line(
203
  daily,
@@ -207,12 +191,14 @@ def build_time_series_chart(
207
  markers=True,
208
  )
209
 
210
- # FIX: Prevent line expansion on hover
211
- fig.update_traces(line=dict(width=2), marker=dict(size=6), hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}")
 
 
 
212
 
213
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
214
 
215
- # Optional spike overlay
216
  if isinstance(spike_overlay, pd.DataFrame) and not spike_overlay.empty:
217
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
218
  if not spike_points.empty:
@@ -228,6 +214,7 @@ def build_time_series_chart(
228
  return fig
229
 
230
 
 
231
  def build_category_bar_chart(
232
  spend_per_category: pd.Series,
233
  template: str = "plotly",
@@ -271,251 +258,11 @@ def build_payment_method_pie_chart(
271
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
272
  return fig
273
 
274
def build_category_bar_chart(
    spend_per_category: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
):
    """Bar chart of total spend per category.

    Returns an empty themed figure when the series has no rows.
    """
    if spend_per_category.empty:
        empty_fig = px.bar()
        empty_fig.update_layout(template=template)
        return empty_fig

    # Series -> tidy frame; the rename covers the unnamed-series case where
    # reset_index yields columns "index" and 0.
    data = spend_per_category.reset_index()
    data = data.rename(columns={"index": "Category", 0: "Amount"})

    chart = px.bar(
        data,
        x="Category",
        y="Amount",
        title="Spend by Category",
        color="Category",
        color_discrete_sequence=color_sequence,
    )
    chart.update_traces(hovertemplate="%{x}: £%{y:.2f}")
    chart.update_layout(
        showlegend=False,
        margin=dict(l=10, r=10, t=40, b=10),
        template=template,
    )
    return chart
-
295
-
296
-
297
def build_payment_method_pie_chart(
    spend_per_payment: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
):
    """Donut chart of spend split by payment method.

    Returns an empty themed figure when the series has no rows.
    """
    if spend_per_payment.empty:
        empty_fig = px.pie()
        empty_fig.update_layout(template=template)
        return empty_fig

    # Normalize the series into a two-column frame for plotly express.
    data = spend_per_payment.reset_index()
    data = data.rename(columns={"index": "Payment Method", 0: "Amount"})

    chart = px.pie(
        data,
        values="Amount",
        names="Payment Method",
        title="Payment Methods Distribution",
        hole=0.45,
        color_discrete_sequence=color_sequence,
    )
    chart.update_traces(hovertemplate="%{label}: £%{value:.2f} (%{percent})")
    chart.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
    return chart
-
318
 
 
319
def _format_number(n: float) -> str:
    """Format a GBP amount compactly: millions as £x.xM, thousands as £x.xk,
    otherwise a comma-grouped whole-pound figure."""
    if n >= 1_000_000:
        return f"£{n/1_000_000:.1f}M"
    if n >= 1_000:
        return f"£{n/1_000:.1f}k"
    return f"£{n:,.0f}"
325
-
326
-
327
def summarize_with_ai(
    agg: Dict,
    api_key: Optional[str] = None,
    mode: str = "Concise",
    engine: str = "Heuristic",
    ollama_model: Optional[str] = None,
) -> str:
    """Produce an English summary of the aggregated spending data.

    agg: output of compute_aggregations — must contain "spend_per_category",
    "category_share", "total_spend", "avg_monthly_spend", "max_transaction";
    "monthly" and "spikes" are read optionally.
    mode: "Concise" or "Detailed" (forwarded to the summary generators).
    engine: "Heuristic" (rule-based) or "HuggingFace" (local causal LM);
    any other value falls back to the heuristic.
    api_key / ollama_model: unused here — presumably kept for backward
    compatibility with earlier remote engines (TODO confirm with callers).
    Never raises on model failure; always returns a string.
    """
    # Prepare a compact context
    largest_cat = (
        agg["spend_per_category"].idxmax() if not agg["spend_per_category"].empty else None
    )
    largest_cat_share = (
        float(agg["category_share"].max()) if not agg["category_share"].empty else 0.0
    )

    context = {
        "total_spend": float(agg["total_spend"]),
        "avg_monthly": float(agg["avg_monthly_spend"]),
        "largest_category": largest_cat,
        "largest_category_share": largest_cat_share,
        "max_transaction": {
            "amount": float(agg["max_transaction"].get("Amount", 0.0)),
            "merchant": str(agg["max_transaction"].get("Merchant", "")),
        },
        "mom_change": _month_over_month_change(agg.get("monthly")),
        # Count of flagged spike days; 0 when "spikes" is absent or not a frame.
        "spike_days": int(agg.get("spikes", pd.DataFrame()).get("IsSpike", pd.Series(dtype=bool)).sum()) if isinstance(agg.get("spikes"), pd.DataFrame) else 0,
    }

    # Engine selection
    engine = (engine or "Heuristic").strip()
    if engine == "Heuristic":
        return _heuristic_summary(context, mode=mode)

    # Local Hugging Face transformer model (small) - suitable for Spaces without paid APIs
    if engine == "HuggingFace":
        # Try to load a small, commonly-available model for generation. `distilgpt2`
        # is a reasonable CPU-friendly option available on HF Hub and produces
        # better text than the ultra-tiny toy models.
        model_name = os.getenv("HF_LOCAL_MODEL", "distilgpt2")
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch
            # load tokenizer & model (cached by huggingface inside the Space)
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            prompt = _hf_prompt(context, mode)
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
            text = tokenizer.decode(out[0], skip_special_tokens=True)
            # post-process: return the generated tail after the prompt if present
            if text.startswith(prompt):
                return text[len(prompt):].strip() or _heuristic_summary(context, mode=mode)
            return text.strip() or _heuristic_summary(context, mode=mode)
        except Exception:
            # If local HF fails, fallback to heuristic (keeps app running on Spaces)
            return _heuristic_summary(context, mode=mode)

    # At this point, only local Hugging Face generation and heuristic fallback are supported
    # to keep the app free and self-contained for Hugging Face Spaces.
    return _heuristic_summary(context, mode=mode)
388
-
389
-
390
- def _month_over_month_change(monthly: Optional[pd.DataFrame]) -> float:
391
- if monthly is None or monthly.empty or len(monthly) < 2:
392
- return 0.0
393
- monthly_sorted = monthly.sort_values("Month")
394
- last, prev = monthly_sorted["Amount"].iloc[-1], monthly_sorted["Amount"].iloc[-2]
395
- if prev == 0:
396
- return 0.0
397
- return float((last - prev) / prev)
398
-
399
-
400
def _heuristic_summary(ctx: Dict, mode: str = "Concise") -> str:
    """Render a rule-based English summary from the compact context dict.

    ctx keys consumed: "total_spend", "avg_monthly", "largest_category",
    "largest_category_share" (fraction), "max_transaction" {"amount",
    "merchant"}, "mom_change" (fraction), "spike_days" (int).
    mode "Detailed" appends extra pattern/category/spike commentary;
    any other mode returns only the core sentences.
    """
    total = _format_number(ctx.get("total_spend", 0.0))
    avg = _format_number(ctx.get("avg_monthly", 0.0))
    lcat = ctx.get("largest_category") or "N/A"
    # Shares and changes arrive as fractions; convert to percent for display.
    share = ctx.get("largest_category_share", 0.0) * 100
    max_amt = ctx.get("max_transaction", {}).get("amount", 0.0)
    max_merchant = ctx.get("max_transaction", {}).get("merchant", "")
    mom = ctx.get("mom_change", 0.0) * 100
    spikes = ctx.get("spike_days", 0)

    # Sentences without supporting data collapse to "" and are filtered below.
    parts = [
        f"Total spend in the selected period is {total}, averaging {avg} per month.",
        f"Top category is {lcat} at {share:.0f}% of spend." if lcat != "N/A" else "",
        f"Month-over-month, spending changed by {mom:+.0f}%.",
        f"Largest single transaction was £{max_amt:,.0f} at {max_merchant}." if max_amt else "",
        f"Detected {spikes} unusually high daily spend day(s)." if spikes else "",
    ]
    text = " ".join([p for p in parts if p])

    if mode == "Detailed":
        # Add more comprehensive analysis for detailed mode
        detailed_insights = []

        # Spending pattern analysis (mom is already in percent; 10 == 10%).
        if mom > 10:
            detailed_insights.append("Your spending has increased significantly this month, which may indicate lifestyle changes or seasonal variations.")
        elif mom < -10:
            detailed_insights.append("You've successfully reduced your spending this month, showing good financial discipline.")
        else:
            detailed_insights.append("Your spending patterns remain relatively stable month-over-month.")

        # Category-specific recommendations (only for these three categories).
        if lcat == "Food":
            detailed_insights.append("Food represents your largest expense category. Consider meal planning and bulk shopping to optimize costs.")
        elif lcat == "Shopping":
            detailed_insights.append("Shopping is your primary spending category. Review purchases for necessities vs. wants to identify savings opportunities.")
        elif lcat == "Entertainment":
            detailed_insights.append("Entertainment spending dominates your budget. Look for free or low-cost alternatives to maintain your lifestyle within budget.")

        # Spike analysis
        if spikes > 5:
            detailed_insights.append("Multiple spending spikes detected suggest irregular expense patterns. Consider smoothing these through better budgeting.")
        elif spikes > 0:
            detailed_insights.append("Some spending spikes were identified, which is normal but worth monitoring for budget planning.")

        # General financial advice
        detailed_insights.append("Consider setting category budgets and monitoring spikes to smooth cash flow and improve financial predictability.")

        text += " " + " ".join(detailed_insights)

    return text
451
-
452
-
453
- # Ollama/OpenAI helpers removed to keep the app local-only and free.
454
-
455
-
456
- def _hf_prompt(context: Dict, mode: str) -> str:
457
- style = "concise (80-120 words)" if mode == "Concise" else "detailed (140-220 words)"
458
- return (
459
- "You are a helpful financial assistant. Produce a "
460
- + style
461
- + " natural-language summary of the provided spending analytics in plain English.\n\n"
462
- + f"Context: {context}\n\nSummary:"
463
- )
464
-
465
-
466
def chat_with_ai(
    agg: Dict,
    question: str,
    engine: str = "Heuristic",
    api_key: Optional[str] = None,
    ollama_model: Optional[str] = None,
) -> str:
    """Answer a free-form question about the aggregated spending data.

    agg: output of compute_aggregations (totals, monthly frame, per-category
    and per-payment series, max transaction dict).
    question: the user's question; blank questions get the heuristic reply.
    engine: "Heuristic" (default) or "HuggingFace" for a local causal LM;
    any other value returns the generic fallback message.
    api_key / ollama_model: unused — kept for backward compatibility.
    Never raises on model failure; always returns a string.
    """
    # Provide compact context; reuse from summarize
    context = {
        "totals": float(agg.get("total_spend", 0.0)),
        "monthly": [
            {"month": str(r["Month"]), "amount": float(r["Amount"])}
            for _, r in agg.get("monthly", pd.DataFrame()).iterrows()
        ],
        "by_category": agg.get("spend_per_category", pd.Series(dtype=float)).to_dict(),
        "by_payment": agg.get("spend_per_payment", pd.Series(dtype=float)).to_dict(),
        "max_txn": agg.get("max_transaction", {}),
    }

    # BUG FIX: the original repeated this exact engine/question guard twice
    # in a row; the second copy was dead code. One check is sufficient.
    engine = (engine or "Heuristic").strip()
    if engine == "Heuristic" or not question.strip():
        return "Here's what I can tell from your data: total spend is " \
            + _format_number(context["totals"]) \
            + ". Ask about trends, categories, or months for more detail."

    # Support local Hugging Face model for Q&A if requested.
    if engine == "HuggingFace":
        model_name = os.getenv("HF_LOCAL_MODEL", "distilgpt2")
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
            text = tokenizer.decode(out[0], skip_special_tokens=True)
            # Return only the generated continuation when the model echoes the prompt.
            if text.startswith(prompt):
                return text[len(prompt):].strip()
            return text.strip()
        except Exception:
            # Keep the app usable even when the local model cannot be loaded.
            return "Local model unavailable. Falling back to heuristic answer: " + (
                "Here's what I can tell from your data: total spend is " + _format_number(context["totals"]) + "."
            )

    # Default fallback
    return "I can't answer that right now. Try the Heuristic engine."
520
-
521
-
 
56
 
57
 
58
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
 
59
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
60
  amounts = np.empty(n)
61
  for i, c in enumerate(choices):
 
65
  amounts[i] = max(5, rng.normal(60, 25))
66
  else:
67
  amounts[i] = max(20, rng.normal(180, 60))
 
68
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
69
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
70
  return np.round(amounts, 2)
 
76
  start = end - pd.Timedelta(days=365)
77
  dates = pd.date_range(start, end, freq="D")
78
 
79
+ weights = np.array([1.2 if d.weekday() >= 5 else 1.0 for d in dates]) * \
80
+ np.array([1.3 if d.day > 25 else 1.0 for d in dates])
 
 
 
 
81
  weights = np.clip(weights, a_min=0, a_max=None)
82
  weights = weights / weights.sum()
83
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
 
89
  locations = rng.choice(LOCATIONS, size=n_rows)
90
  amts = _random_amounts(n_rows, rng)
91
 
92
+ df = pd.DataFrame({
93
+ "Date": pd.to_datetime(chosen_dates),
94
+ "Merchant": merchants,
95
+ "Category": categories,
96
+ "Amount": amts,
97
+ "Payment Method": payment_methods,
98
+ "Location": locations,
99
+ })
 
 
 
100
  df = df.sort_values("Date").reset_index(drop=True)
101
  return df
102
 
 
141
  max_txn = df.loc[df["Amount"].idxmax()].to_dict()
142
  min_txn = df.loc[df["Amount"].idxmin()].to_dict()
143
 
144
+ monthly = df.assign(Month=_month_key(df["Date"])).groupby("Month")["Amount"].sum().reset_index()
 
 
 
 
145
  avg_monthly_spend = float(monthly["Amount"].mean()) if not monthly.empty else 0.0
 
 
146
  category_share = (spend_per_category / max(total_spend, 1e-9)).round(4)
147
 
 
148
  df_daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
149
+ df_daily["Date"] = pd.to_datetime(df_daily["Date"])
150
  df_daily = df_daily.sort_values("Date")
151
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
152
 
 
153
  mu = df_daily["Amount"].mean()
154
  sigma = df_daily["Amount"].std(ddof=0) or 0.0
155
  threshold = mu + 2.5 * sigma
 
173
  df: pd.DataFrame,
174
  template: str = "plotly",
175
  spike_overlay: Optional[pd.DataFrame] = None,
176
+ fixed_line_width: int = 2,
177
  ) -> "px.Figure":
178
  if df.empty:
179
  fig = px.line()
 
181
  return fig
182
 
183
  daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
184
+ daily["Date"] = pd.to_datetime(daily["Date"])
185
 
186
  fig = px.line(
187
  daily,
 
191
  markers=True,
192
  )
193
 
194
+ fig.update_traces(
195
+ line=dict(width=fixed_line_width),
196
+ marker=dict(size=6),
197
+ hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}",
198
+ )
199
 
200
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
201
 
 
202
  if isinstance(spike_overlay, pd.DataFrame) and not spike_overlay.empty:
203
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
204
  if not spike_points.empty:
 
214
  return fig
215
 
216
 
217
+ # --- Category / Payment Charts ---
218
  def build_category_bar_chart(
219
  spend_per_category: pd.Series,
220
  template: str = "plotly",
 
258
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
259
  return fig
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
+ # --- Helpers for AI summaries ---
263
  def _format_number(n: float) -> str:
264
+ if n >= 1_000_000:
265
+ return f"£{n/1_000_000:.1f}M"
266
+ if n >= 1_000:
267
+ return f"£{n/1_000:.1f}k"
268
+ return f"£{n:,.0f}"