VladGeekPro Copilot committed on
Commit
ffee684
·
1 Parent(s): 82b086c

ChangedLogicToPredictSum

Browse files

Co-authored-by: Copilot <copilot@github.com>

Files changed (2) hide show
  1. app.py +6 -1
  2. expense_predictor.py +105 -61
app.py CHANGED
@@ -679,12 +679,17 @@ def predict_expenses_endpoint():
679
  """Predicts top 3 expenses user should add based on 6-month history."""
680
  payload = parse_json_payload()
681
  expenses = payload.get("expenses") or []
 
 
682
 
683
  if not isinstance(expenses, list):
684
  return jsonify({"status": "error", "message": "expenses must be a list"}), 422
 
 
 
685
 
686
  try:
687
- predictions = predict_expenses(expenses)
688
  return jsonify({
689
  "status": "ok",
690
  "predictions": predictions
 
679
  """Predicts top 3 expenses user should add based on 6-month history."""
680
  payload = parse_json_payload()
681
  expenses = payload.get("expenses") or []
682
+ user_id = payload.get("user_id")
683
+ debug = (request.args.get("debug") or request.args.get("debut") or "").strip().lower() == ""
684
 
685
  if not isinstance(expenses, list):
686
  return jsonify({"status": "error", "message": "expenses must be a list"}), 422
687
+
688
+ if user_id is None:
689
+ return jsonify({"status": "error", "message": "user_id is required"}), 422
690
 
691
  try:
692
+ predictions = predict_expenses(expenses, target_user_id=user_id, debug=debug)
693
  return jsonify({
694
  "status": "ok",
695
  "predictions": predictions
expense_predictor.py CHANGED
@@ -4,99 +4,143 @@ Expense prediction model: suggests next expenses based on 6-month history.
4
  - Output: Top 3 predicted expenses (date, sum, supplier, user)
5
  """
6
 
7
- from datetime import datetime, timedelta
8
  from collections import defaultdict
9
- from typing import Optional
10
  import statistics
11
 
 
12
 
13
- def predict_expenses(expenses: list[dict]) -> list[dict]:
14
- """
15
- Predict top 3 expenses user should add.
16
-
17
- Input: [{"date": "2026-01-15", "sum": 150.50, "supplier_id": 5, "user_id": 1, ...}, ...]
18
- Output: [{"date": str, "sum": float, "supplier_id": int, "user_id": int, "confidence": float}, ...]
19
- """
20
  if not expenses or len(expenses) < 2:
21
- print(f"[PREDICT] Not enough records: {len(expenses) if expenses else 0}")
 
22
  return []
23
 
24
- # Group by (supplier_id, user_id)
25
- supplier_user_history = defaultdict(list)
26
  supplier_freq = defaultdict(int)
27
  total_records = len(expenses)
28
 
29
- print(f"[PREDICT] Total records received: {total_records}")
30
- for i, exp in enumerate(expenses):
31
- print(f"[PREDICT] [{i+1}] date={exp.get('date')}, sum={exp.get('sum')}, supplier_id={exp.get('supplier_id')}, user_id={exp.get('user_id')}")
 
32
 
33
  for exp in expenses:
34
- key = (exp["supplier_id"], exp["user_id"])
35
- supplier_user_history[key].append(exp)
36
- supplier_freq[key] += 1
37
-
38
- print(f"[PREDICT] Unique (supplier, user) pairs: {len(supplier_user_history)}")
39
- for key, count in supplier_freq.items():
40
- pct = count / total_records * 100
41
- print(f"[PREDICT] supplier_id={key[0]}, user_id={key[1]} → {count} records ({pct:.1f}%)")
42
-
43
- # Filter: frequency > 50% over 6 months
44
- candidates = {
45
- key: records
46
- for key, records in supplier_user_history.items()
47
- if supplier_freq[key] / total_records >= 0.5
48
- }
49
-
50
- print(f"[PREDICT] Candidates after >50% filter: {len(candidates)}")
51
-
52
- if not candidates:
53
- print("[PREDICT] No candidates passed the frequency filter. Returning empty.")
 
 
 
 
54
  return []
55
 
56
- # Analyze each candidate: avg amount, interval, last date
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  predictions = []
58
 
59
- for (supplier_id, user_id), records in candidates.items():
 
60
  amounts = [float(r["sum"]) for r in records]
61
  avg_amount = statistics.mean(amounts)
62
 
63
- # Calculate interval between transactions (days)
64
- dates = sorted([datetime.fromisoformat(r["date"]) for r in records])
65
- if len(dates) >= 2:
66
- intervals = [(dates[i+1] - dates[i]).days for i in range(len(dates) - 1)]
67
- avg_interval = statistics.mean(intervals)
 
 
 
 
 
 
68
  else:
69
- avg_interval = 30 # default monthly
 
70
 
71
- last_date = dates[-1]
72
- next_predicted_date = (last_date + timedelta(days=avg_interval)).strftime("%Y-%m-%d")
73
 
74
- # Confidence: higher if more consistent (lower std dev)
75
  amount_std = statistics.stdev(amounts) if len(amounts) > 1 else 0
76
  consistency = max(0, 1 - (amount_std / avg_amount)) if avg_amount > 0 else 0.5
77
- frequency_score = min(supplier_freq[(supplier_id, user_id)] / total_records, 1.0)
78
- confidence = (consistency + frequency_score) / 2
79
-
80
- print(
81
- f"[PREDICT] supplier_id={supplier_id}, user_id={user_id} | "
82
- f"avg_amount={avg_amount:.2f}, avg_interval={avg_interval:.1f}d, "
83
- f"last_date={last_date.date()}, next_date={next_predicted_date}, "
84
- f"consistency={consistency:.2f}, freq_score={frequency_score:.2f}, confidence={confidence:.2f}"
85
- )
86
-
 
 
87
  predictions.append({
88
  "date": next_predicted_date,
89
- "sum": round(avg_amount, 2),
90
  "supplier_id": supplier_id,
91
- "user_id": user_id,
92
  "confidence": round(confidence, 2)
93
  })
94
 
95
  # Return top 3 by confidence
96
  result = sorted(predictions, key=lambda x: x["confidence"], reverse=True)[:3]
97
 
98
- print(f"[PREDICT] Final top {len(result)} predictions:")
99
- for i, p in enumerate(result, 1):
100
- print(f"[PREDICT] #{i}: supplier_id={p['supplier_id']}, user_id={p['user_id']}, date={p['date']}, sum={p['sum']}, confidence={p['confidence']}")
 
101
 
102
  return result
 
4
  - Output: Top 3 predicted expenses (date, sum, supplier, user)
5
  """
6
 
7
+ from datetime import datetime
8
  from collections import defaultdict
 
9
  import statistics
10
 
11
+ from sklearn.ensemble import RandomForestRegressor
12
 
13
+
14
def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False) -> list[dict]:
    """Predict today's expense amount for the most frequent suppliers.

    Records are grouped by ``supplier_id``; the (up to) three suppliers with
    the most records are kept and one prediction is produced for each.  When
    at least five records are usable for training, a single
    ``RandomForestRegressor`` is fitted on (supplier, user, day, weekday,
    month) and asked for the amount at the current date; otherwise the
    supplier's mean amount is used.

    Malformed records are skipped instead of raising: a record needs a
    non-None ``supplier_id`` and a numeric ``sum`` to be counted at all, and
    additionally a non-None ``user_id`` and an ISO-format ``date`` to be used
    as a training row.  Frequencies are computed over the counted records.

    Args:
        expenses: History records, e.g.
            ``{"date": "2026-01-15", "sum": 150.5, "supplier_id": 5, "user_id": 1}``.
        target_user_id: User the predictions are made for; echoed back as
            ``user_id`` in every prediction.
        debug: When true, trace the computation to stdout.

    Returns:
        At most three dicts with keys ``date`` (today, ``YYYY-MM-DD``),
        ``sum`` (non-negative, rounded to 2 places), ``supplier_id``,
        ``user_id`` and ``confidence`` (rounded to 2 places), sorted by
        descending confidence.  Empty when fewer than two records are given
        or none of them is usable.
    """
    if not expenses or len(expenses) < 2:
        if debug:
            print(f"[PREDICT] Not enough records: {len(expenses) if expenses else 0}")
        return []

    if debug:
        print(f"[PREDICT] Total records received: {len(expenses)}")

    # Validate every record exactly once so grouping, training and the
    # per-supplier statistics all agree on which records exist.
    supplier_amounts = defaultdict(list)
    usable = []  # (supplier_id, user_id, parsed date or None, amount)
    for i, exp in enumerate(expenses, 1):
        if not isinstance(exp, dict):
            continue
        if debug:
            print(f"[PREDICT] [{i}] date={exp.get('date')}, sum={exp.get('sum')}, supplier_id={exp.get('supplier_id')}, user_id={exp.get('user_id')}")

        supplier_id = exp.get("supplier_id")
        if supplier_id is None:
            continue
        try:
            amount = float(exp.get("sum"))
        except (TypeError, ValueError):
            continue
        try:
            tx_date = datetime.fromisoformat(exp.get("date"))
        except (TypeError, ValueError):
            # Still counts towards the supplier's statistics, but cannot
            # contribute date features to the regression model.
            tx_date = None

        supplier_amounts[supplier_id].append(amount)
        usable.append((supplier_id, exp.get("user_id"), tx_date, amount))

    total_records = len(usable)

    if debug:
        print(f"[PREDICT] Unique suppliers: {len(supplier_amounts)}")
        for supplier_id, amounts in supplier_amounts.items():
            pct = len(amounts) / total_records * 100
            print(f"[PREDICT] supplier_id={supplier_id} -> {len(amounts)} records ({pct:.1f}%)")

    # Keep only the top 3 suppliers by frequency; the sort is stable, so ties
    # keep first-seen order.
    top_candidate_items = sorted(
        supplier_amounts.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:3]

    if debug:
        print(f"[PREDICT] Processing top {len(top_candidate_items)} suppliers by frequency")

    if not top_candidate_items:
        if debug:
            print("[PREDICT] No suppliers found. Returning empty.")
        return []

    now = datetime.now()

    # Encode categorical ids as integer indices for the regression model.
    supplier_to_idx = {supplier_id: idx for idx, supplier_id in enumerate(supplier_amounts)}
    known_users = {user_id for _, user_id, _, _ in usable if user_id is not None}
    user_to_idx = {user_id: idx for idx, user_id in enumerate(sorted(known_users, key=str))}

    X_train = []
    y_train = []
    for supplier_id, user_id, tx_date, amount in usable:
        if user_id is None or tx_date is None:
            continue
        X_train.append([
            supplier_to_idx[supplier_id],
            user_to_idx[user_id],
            tx_date.day,
            tx_date.weekday(),
            tx_date.month,
        ])
        y_train.append(amount)

    # Too few rows make the forest meaningless; fall back to the mean below.
    sum_model = None
    if len(X_train) >= 5:
        sum_model = RandomForestRegressor(n_estimators=300, random_state=42)
        sum_model.fit(X_train, y_train)

    next_predicted_date = now.strftime("%Y-%m-%d")
    predicted_user = target_user_id
    predictions = []

    for supplier_id, amounts in top_candidate_items:
        avg_amount = statistics.mean(amounts)

        if sum_model is not None:
            next_features = [[
                supplier_to_idx[supplier_id],
                user_to_idx.get(target_user_id, -1),  # -1 marks an unseen user
                now.day,
                now.weekday(),
                now.month,
            ]]
            predicted_amount = float(sum_model.predict(next_features)[0])
            model_conf = 0.8
        else:
            predicted_amount = avg_amount
            model_conf = 0.5

        # Confidence blends amount stability, supplier share and model trust.
        amount_std = statistics.stdev(amounts) if len(amounts) > 1 else 0
        consistency = max(0, 1 - (amount_std / avg_amount)) if avg_amount > 0 else 0.5
        frequency_score = min(len(amounts) / total_records, 1.0)
        confidence = (0.4 * consistency) + (0.3 * frequency_score) + (0.3 * model_conf)

        if debug:
            print(
                f"[PREDICT] supplier_id={supplier_id}, user_id={predicted_user} | "
                f"avg_amount={avg_amount:.2f}, pred_sum={predicted_amount:.2f}, "
                f"target_date={next_predicted_date}, "
                f"consistency={consistency:.2f}, freq_score={frequency_score:.2f}, "
                f"model_conf={model_conf:.2f}, confidence={confidence:.2f}"
            )

        predictions.append({
            "date": next_predicted_date,
            "sum": round(max(0.0, predicted_amount), 2),
            "supplier_id": supplier_id,
            "user_id": predicted_user,
            "confidence": round(confidence, 2)
        })

    # Return top 3 by confidence
    result = sorted(predictions, key=lambda x: x["confidence"], reverse=True)[:3]

    if debug:
        print(f"[PREDICT] Final top {len(result)} predictions:")
        for i, p in enumerate(result, 1):
            print(f"[PREDICT] #{i}: supplier_id={p['supplier_id']}, user_id={p['user_id']}, date={p['date']}, sum={p['sum']}, confidence={p['confidence']}")

    return result