Ekow24 commited on
Commit
eb1886d
·
verified ·
1 Parent(s): f271fbf

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +26 -279
utils.py CHANGED
@@ -56,7 +56,6 @@ LOCATIONS = [
56
 
57
 
58
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
59
- # Mixture distribution for more realistic spend: many small, some medium, few large
60
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
61
  amounts = np.empty(n)
62
  for i, c in enumerate(choices):
@@ -66,7 +65,6 @@ def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
66
  amounts[i] = max(5, rng.normal(60, 25))
67
  else:
68
  amounts[i] = max(20, rng.normal(180, 60))
69
- # Random spikes
70
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
71
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
72
  return np.round(amounts, 2)
@@ -78,12 +76,8 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
78
  start = end - pd.Timedelta(days=365)
79
  dates = pd.date_range(start, end, freq="D")
80
 
81
- # Draw dates with bias to weekends and month-ends; normalize to ensure probabilities sum to 1
82
- weights = np.array([
83
- 1.2 if d.weekday() >= 5 else 1.0 for d in dates
84
- ]) * np.array([
85
- 1.3 if d.day > 25 else 1.0 for d in dates
86
- ])
87
  weights = np.clip(weights, a_min=0, a_max=None)
88
  weights = weights / weights.sum()
89
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
@@ -95,17 +89,14 @@ def generate_synthetic_transactions(n_rows: int = 900, seed: Optional[int] = Non
95
  locations = rng.choice(LOCATIONS, size=n_rows)
96
  amts = _random_amounts(n_rows, rng)
97
 
98
- df = pd.DataFrame(
99
- {
100
- "Date": pd.to_datetime(chosen_dates),
101
- "Merchant": merchants,
102
- "Category": categories,
103
- "Amount": amts,
104
- "Payment Method": payment_methods,
105
- "Location": locations,
106
- }
107
- )
108
- # Sort by date for better UX
109
  df = df.sort_values("Date").reset_index(drop=True)
110
  return df
111
 
@@ -150,23 +141,15 @@ def compute_aggregations(df: pd.DataFrame) -> Dict:
150
  max_txn = df.loc[df["Amount"].idxmax()].to_dict()
151
  min_txn = df.loc[df["Amount"].idxmin()].to_dict()
152
 
153
- monthly = (
154
- df.assign(Month=_month_key(df["Date"]))
155
- .groupby("Month")["Amount"].sum()
156
- .reset_index()
157
- )
158
  avg_monthly_spend = float(monthly["Amount"].mean()) if not monthly.empty else 0.0
159
-
160
- # Category share
161
  category_share = (spend_per_category / max(total_spend, 1e-9)).round(4)
162
 
163
- # Rolling 28-day spend for simple trend smoothing
164
  df_daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
165
- df_daily["Date"] = pd.to_datetime(df_daily["Date"]) # normalize to midnight
166
  df_daily = df_daily.sort_values("Date")
167
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
168
 
169
- # Naive anomaly: mark spikes above mean + 2.5*std on daily amounts
170
  mu = df_daily["Amount"].mean()
171
  sigma = df_daily["Amount"].std(ddof=0) or 0.0
172
  threshold = mu + 2.5 * sigma
@@ -190,6 +173,7 @@ def build_time_series_chart(
190
  df: pd.DataFrame,
191
  template: str = "plotly",
192
  spike_overlay: Optional[pd.DataFrame] = None,
 
193
  ) -> "px.Figure":
194
  if df.empty:
195
  fig = px.line()
@@ -197,7 +181,7 @@ def build_time_series_chart(
197
  return fig
198
 
199
  daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
200
- daily["Date"] = pd.to_datetime(daily["Date"]) # ensure datetime for plotly
201
 
202
  fig = px.line(
203
  daily,
@@ -207,12 +191,14 @@ def build_time_series_chart(
207
  markers=True,
208
  )
209
 
210
- # FIX: Prevent line expansion on hover
211
- fig.update_traces(line=dict(width=2), marker=dict(size=6), hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}")
 
 
 
212
 
213
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
214
 
215
- # Optional spike overlay
216
  if isinstance(spike_overlay, pd.DataFrame) and not spike_overlay.empty:
217
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
218
  if not spike_points.empty:
@@ -228,6 +214,7 @@ def build_time_series_chart(
228
  return fig
229
 
230
 
 
231
  def build_category_bar_chart(
232
  spend_per_category: pd.Series,
233
  template: str = "plotly",
@@ -271,251 +258,11 @@ def build_payment_method_pie_chart(
271
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
272
  return fig
273
 
274
def build_category_bar_chart(
    spend_per_category: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
):
    """Bar chart of total spend per category.

    Returns an empty themed figure when the series has no rows.
    """
    if spend_per_category.empty:
        empty_fig = px.bar()
        empty_fig.update_layout(template=template)
        return empty_fig

    # Series -> tidy frame; the rename covers the unnamed-series case where
    # reset_index yields columns "index" and 0.
    data = spend_per_category.reset_index()
    data = data.rename(columns={"index": "Category", 0: "Amount"})

    chart = px.bar(
        data,
        x="Category",
        y="Amount",
        title="Spend by Category",
        color="Category",
        color_discrete_sequence=color_sequence,
    )
    chart.update_traces(hovertemplate="%{x}: £%{y:.2f}")
    chart.update_layout(
        showlegend=False,
        margin=dict(l=10, r=10, t=40, b=10),
        template=template,
    )
    return chart
-
295
-
296
-
297
def build_payment_method_pie_chart(
    spend_per_payment: pd.Series,
    template: str = "plotly",
    color_sequence: Optional[list] = None,
):
    """Donut chart of spend split by payment method.

    Returns an empty themed figure when the series has no rows.
    """
    if spend_per_payment.empty:
        empty_fig = px.pie()
        empty_fig.update_layout(template=template)
        return empty_fig

    # Normalize the series into a two-column frame for plotly express.
    data = spend_per_payment.reset_index()
    data = data.rename(columns={"index": "Payment Method", 0: "Amount"})

    chart = px.pie(
        data,
        values="Amount",
        names="Payment Method",
        title="Payment Methods Distribution",
        hole=0.45,
        color_discrete_sequence=color_sequence,
    )
    chart.update_traces(hovertemplate="%{label}: £%{value:.2f} (%{percent})")
    chart.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
    return chart
-
318
 
 
319
def _format_number(n: float) -> str:
    """Format a GBP amount compactly: millions as £x.xM, thousands as £x.xk,
    otherwise a comma-grouped whole-pound figure."""
    if n >= 1_000_000:
        return f"£{n/1_000_000:.1f}M"
    if n >= 1_000:
        return f"£{n/1_000:.1f}k"
    return f"£{n:,.0f}"
325
-
326
-
327
def summarize_with_ai(
    agg: Dict,
    api_key: Optional[str] = None,
    mode: str = "Concise",
    engine: str = "Heuristic",
    ollama_model: Optional[str] = None,
) -> str:
    """Produce an English summary of the aggregated spending data.

    agg: output of compute_aggregations — must contain "spend_per_category",
    "category_share", "total_spend", "avg_monthly_spend", "max_transaction";
    "monthly" and "spikes" are read optionally.
    mode: "Concise" or "Detailed" (forwarded to the summary generators).
    engine: "Heuristic" (rule-based) or "HuggingFace" (local causal LM);
    any other value falls back to the heuristic.
    api_key / ollama_model: unused here — presumably kept for backward
    compatibility with earlier remote engines (TODO confirm with callers).
    Never raises on model failure; always returns a string.
    """
    # Prepare a compact context
    largest_cat = (
        agg["spend_per_category"].idxmax() if not agg["spend_per_category"].empty else None
    )
    largest_cat_share = (
        float(agg["category_share"].max()) if not agg["category_share"].empty else 0.0
    )

    context = {
        "total_spend": float(agg["total_spend"]),
        "avg_monthly": float(agg["avg_monthly_spend"]),
        "largest_category": largest_cat,
        "largest_category_share": largest_cat_share,
        "max_transaction": {
            "amount": float(agg["max_transaction"].get("Amount", 0.0)),
            "merchant": str(agg["max_transaction"].get("Merchant", "")),
        },
        "mom_change": _month_over_month_change(agg.get("monthly")),
        # Count of flagged spike days; 0 when "spikes" is absent or not a frame.
        "spike_days": int(agg.get("spikes", pd.DataFrame()).get("IsSpike", pd.Series(dtype=bool)).sum()) if isinstance(agg.get("spikes"), pd.DataFrame) else 0,
    }

    # Engine selection
    engine = (engine or "Heuristic").strip()
    if engine == "Heuristic":
        return _heuristic_summary(context, mode=mode)

    # Local Hugging Face transformer model (small) - suitable for Spaces without paid APIs
    if engine == "HuggingFace":
        # Try to load a small, commonly-available model for generation. `distilgpt2`
        # is a reasonable CPU-friendly option available on HF Hub and produces
        # better text than the ultra-tiny toy models.
        model_name = os.getenv("HF_LOCAL_MODEL", "distilgpt2")
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch
            # load tokenizer & model (cached by huggingface inside the Space)
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            prompt = _hf_prompt(context, mode)
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
            text = tokenizer.decode(out[0], skip_special_tokens=True)
            # post-process: return the generated tail after the prompt if present
            if text.startswith(prompt):
                return text[len(prompt):].strip() or _heuristic_summary(context, mode=mode)
            return text.strip() or _heuristic_summary(context, mode=mode)
        except Exception:
            # If local HF fails, fallback to heuristic (keeps app running on Spaces)
            return _heuristic_summary(context, mode=mode)

    # At this point, only local Hugging Face generation and heuristic fallback are supported
    # to keep the app free and self-contained for Hugging Face Spaces.
    return _heuristic_summary(context, mode=mode)
388
-
389
-
390
- def _month_over_month_change(monthly: Optional[pd.DataFrame]) -> float:
391
- if monthly is None or monthly.empty or len(monthly) < 2:
392
- return 0.0
393
- monthly_sorted = monthly.sort_values("Month")
394
- last, prev = monthly_sorted["Amount"].iloc[-1], monthly_sorted["Amount"].iloc[-2]
395
- if prev == 0:
396
- return 0.0
397
- return float((last - prev) / prev)
398
-
399
-
400
def _heuristic_summary(ctx: Dict, mode: str = "Concise") -> str:
    """Render a rule-based English summary from the compact context dict.

    ctx keys consumed: "total_spend", "avg_monthly", "largest_category",
    "largest_category_share" (fraction), "max_transaction" {"amount",
    "merchant"}, "mom_change" (fraction), "spike_days" (int).
    mode "Detailed" appends extra pattern/category/spike commentary;
    any other mode returns only the core sentences.
    """
    total = _format_number(ctx.get("total_spend", 0.0))
    avg = _format_number(ctx.get("avg_monthly", 0.0))
    lcat = ctx.get("largest_category") or "N/A"
    # Shares and changes arrive as fractions; convert to percent for display.
    share = ctx.get("largest_category_share", 0.0) * 100
    max_amt = ctx.get("max_transaction", {}).get("amount", 0.0)
    max_merchant = ctx.get("max_transaction", {}).get("merchant", "")
    mom = ctx.get("mom_change", 0.0) * 100
    spikes = ctx.get("spike_days", 0)

    # Sentences without supporting data collapse to "" and are filtered below.
    parts = [
        f"Total spend in the selected period is {total}, averaging {avg} per month.",
        f"Top category is {lcat} at {share:.0f}% of spend." if lcat != "N/A" else "",
        f"Month-over-month, spending changed by {mom:+.0f}%.",
        f"Largest single transaction was £{max_amt:,.0f} at {max_merchant}." if max_amt else "",
        f"Detected {spikes} unusually high daily spend day(s)." if spikes else "",
    ]
    text = " ".join([p for p in parts if p])

    if mode == "Detailed":
        # Add more comprehensive analysis for detailed mode
        detailed_insights = []

        # Spending pattern analysis (mom is already in percent; 10 == 10%).
        if mom > 10:
            detailed_insights.append("Your spending has increased significantly this month, which may indicate lifestyle changes or seasonal variations.")
        elif mom < -10:
            detailed_insights.append("You've successfully reduced your spending this month, showing good financial discipline.")
        else:
            detailed_insights.append("Your spending patterns remain relatively stable month-over-month.")

        # Category-specific recommendations (only for these three categories).
        if lcat == "Food":
            detailed_insights.append("Food represents your largest expense category. Consider meal planning and bulk shopping to optimize costs.")
        elif lcat == "Shopping":
            detailed_insights.append("Shopping is your primary spending category. Review purchases for necessities vs. wants to identify savings opportunities.")
        elif lcat == "Entertainment":
            detailed_insights.append("Entertainment spending dominates your budget. Look for free or low-cost alternatives to maintain your lifestyle within budget.")

        # Spike analysis
        if spikes > 5:
            detailed_insights.append("Multiple spending spikes detected suggest irregular expense patterns. Consider smoothing these through better budgeting.")
        elif spikes > 0:
            detailed_insights.append("Some spending spikes were identified, which is normal but worth monitoring for budget planning.")

        # General financial advice
        detailed_insights.append("Consider setting category budgets and monitoring spikes to smooth cash flow and improve financial predictability.")

        text += " " + " ".join(detailed_insights)

    return text
451
-
452
-
453
- # Ollama/OpenAI helpers removed to keep the app local-only and free.
454
-
455
-
456
- def _hf_prompt(context: Dict, mode: str) -> str:
457
- style = "concise (80-120 words)" if mode == "Concise" else "detailed (140-220 words)"
458
- return (
459
- "You are a helpful financial assistant. Produce a "
460
- + style
461
- + " natural-language summary of the provided spending analytics in plain English.\n\n"
462
- + f"Context: {context}\n\nSummary:"
463
- )
464
-
465
-
466
def chat_with_ai(
    agg: Dict,
    question: str,
    engine: str = "Heuristic",
    api_key: Optional[str] = None,
    ollama_model: Optional[str] = None,
) -> str:
    """Answer a free-form question about the aggregated spending data.

    agg: output of compute_aggregations (totals, monthly frame, per-category
    and per-payment series, max transaction dict).
    question: the user's question; blank questions get the heuristic reply.
    engine: "Heuristic" (default) or "HuggingFace" for a local causal LM;
    any other value returns the generic fallback message.
    api_key / ollama_model: unused — kept for backward compatibility.
    Never raises on model failure; always returns a string.
    """
    # Provide compact context; reuse from summarize
    context = {
        "totals": float(agg.get("total_spend", 0.0)),
        "monthly": [
            {"month": str(r["Month"]), "amount": float(r["Amount"])}
            for _, r in agg.get("monthly", pd.DataFrame()).iterrows()
        ],
        "by_category": agg.get("spend_per_category", pd.Series(dtype=float)).to_dict(),
        "by_payment": agg.get("spend_per_payment", pd.Series(dtype=float)).to_dict(),
        "max_txn": agg.get("max_transaction", {}),
    }

    # BUG FIX: the original repeated this exact engine/question guard twice
    # in a row; the second copy was dead code. One check is sufficient.
    engine = (engine or "Heuristic").strip()
    if engine == "Heuristic" or not question.strip():
        return "Here's what I can tell from your data: total spend is " \
            + _format_number(context["totals"]) \
            + ". Ask about trends, categories, or months for more detail."

    # Support local Hugging Face model for Q&A if requested.
    if engine == "HuggingFace":
        model_name = os.getenv("HF_LOCAL_MODEL", "distilgpt2")
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
            text = tokenizer.decode(out[0], skip_special_tokens=True)
            # Return only the generated continuation when the model echoes the prompt.
            if text.startswith(prompt):
                return text[len(prompt):].strip()
            return text.strip()
        except Exception:
            # Keep the app usable even when the local model cannot be loaded.
            return "Local model unavailable. Falling back to heuristic answer: " + (
                "Here's what I can tell from your data: total spend is " + _format_number(context["totals"]) + "."
            )

    # Default fallback
    return "I can't answer that right now. Try the Heuristic engine."
520
-
521
-
 
56
 
57
 
58
  def _random_amounts(n: int, rng: np.random.Generator) -> np.ndarray:
 
59
  choices = rng.choice(["small", "medium", "large"], size=n, p=[0.65, 0.28, 0.07])
60
  amounts = np.empty(n)
61
  for i, c in enumerate(choices):
 
65
  amounts[i] = max(5, rng.normal(60, 25))
66
  else:
67
  amounts[i] = max(20, rng.normal(180, 60))
 
68
  spike_idx = rng.choice(np.arange(n), size=max(1, n // 50), replace=False)
69
  amounts[spike_idx] *= rng.uniform(2.5, 4.0, size=len(spike_idx))
70
  return np.round(amounts, 2)
 
76
  start = end - pd.Timedelta(days=365)
77
  dates = pd.date_range(start, end, freq="D")
78
 
79
+ weights = np.array([1.2 if d.weekday() >= 5 else 1.0 for d in dates]) * \
80
+ np.array([1.3 if d.day > 25 else 1.0 for d in dates])
 
 
 
 
81
  weights = np.clip(weights, a_min=0, a_max=None)
82
  weights = weights / weights.sum()
83
  date_choices = rng.choice(len(dates), size=n_rows, replace=True, p=weights)
 
89
  locations = rng.choice(LOCATIONS, size=n_rows)
90
  amts = _random_amounts(n_rows, rng)
91
 
92
+ df = pd.DataFrame({
93
+ "Date": pd.to_datetime(chosen_dates),
94
+ "Merchant": merchants,
95
+ "Category": categories,
96
+ "Amount": amts,
97
+ "Payment Method": payment_methods,
98
+ "Location": locations,
99
+ })
 
 
 
100
  df = df.sort_values("Date").reset_index(drop=True)
101
  return df
102
 
 
141
  max_txn = df.loc[df["Amount"].idxmax()].to_dict()
142
  min_txn = df.loc[df["Amount"].idxmin()].to_dict()
143
 
144
+ monthly = df.assign(Month=_month_key(df["Date"])).groupby("Month")["Amount"].sum().reset_index()
 
 
 
 
145
  avg_monthly_spend = float(monthly["Amount"].mean()) if not monthly.empty else 0.0
 
 
146
  category_share = (spend_per_category / max(total_spend, 1e-9)).round(4)
147
 
 
148
  df_daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
149
+ df_daily["Date"] = pd.to_datetime(df_daily["Date"])
150
  df_daily = df_daily.sort_values("Date")
151
  df_daily["Rolling28"] = df_daily["Amount"].rolling(window=28, min_periods=7).mean()
152
 
 
153
  mu = df_daily["Amount"].mean()
154
  sigma = df_daily["Amount"].std(ddof=0) or 0.0
155
  threshold = mu + 2.5 * sigma
 
173
  df: pd.DataFrame,
174
  template: str = "plotly",
175
  spike_overlay: Optional[pd.DataFrame] = None,
176
+ fixed_line_width: int = 2,
177
  ) -> "px.Figure":
178
  if df.empty:
179
  fig = px.line()
 
181
  return fig
182
 
183
  daily = df.groupby(pd.to_datetime(df["Date"]).dt.date)["Amount"].sum().reset_index()
184
+ daily["Date"] = pd.to_datetime(daily["Date"])
185
 
186
  fig = px.line(
187
  daily,
 
191
  markers=True,
192
  )
193
 
194
+ fig.update_traces(
195
+ line=dict(width=fixed_line_width),
196
+ marker=dict(size=6),
197
+ hovertemplate="%{x|%b %d, %Y}: £%{y:.2f}",
198
+ )
199
 
200
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
201
 
 
202
  if isinstance(spike_overlay, pd.DataFrame) and not spike_overlay.empty:
203
  spike_points = spike_overlay[spike_overlay.get("IsSpike", False)]
204
  if not spike_points.empty:
 
214
  return fig
215
 
216
 
217
+ # --- Category / Payment Charts ---
218
  def build_category_bar_chart(
219
  spend_per_category: pd.Series,
220
  template: str = "plotly",
 
258
  fig.update_layout(margin=dict(l=10, r=10, t=40, b=10), template=template)
259
  return fig
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
+ # --- Helpers for AI summaries ---
263
  def _format_number(n: float) -> str:
264
+ if n >= 1_000_000:
265
+ return f"£{n/1_000_000:.1f}M"
266
+ if n >= 1_000:
267
+ return f"£{n/1_000:.1f}k"
268
+ return f"£{n:,.0f}"