Spaces:
Sleeping
Sleeping
VladGeekPro Copilot committed on
Commit ·
ffee684
1
Parent(s): 82b086c
ChangedLogicToPredictSum
Browse files
Co-authored-by: Copilot <copilot@github.com>
- app.py +6 -1
- expense_predictor.py +105 -61
app.py
CHANGED
|
@@ -679,12 +679,17 @@ def predict_expenses_endpoint():
|
|
| 679 |
"""Predicts top 3 expenses user should add based on 6-month history."""
|
| 680 |
payload = parse_json_payload()
|
| 681 |
expenses = payload.get("expenses") or []
|
|
|
|
|
|
|
| 682 |
|
| 683 |
if not isinstance(expenses, list):
|
| 684 |
return jsonify({"status": "error", "message": "expenses must be a list"}), 422
|
|
|
|
|
|
|
|
|
|
| 685 |
|
| 686 |
try:
|
| 687 |
-
predictions = predict_expenses(expenses)
|
| 688 |
return jsonify({
|
| 689 |
"status": "ok",
|
| 690 |
"predictions": predictions
|
|
|
|
| 679 |
"""Predicts top 3 expenses user should add based on 6-month history."""
|
| 680 |
payload = parse_json_payload()
|
| 681 |
expenses = payload.get("expenses") or []
|
| 682 |
+
user_id = payload.get("user_id")
|
| 683 |
+
debug = (request.args.get("debug") or request.args.get("debut") or "").strip().lower() == ""
|
| 684 |
|
| 685 |
if not isinstance(expenses, list):
|
| 686 |
return jsonify({"status": "error", "message": "expenses must be a list"}), 422
|
| 687 |
+
|
| 688 |
+
if user_id is None:
|
| 689 |
+
return jsonify({"status": "error", "message": "user_id is required"}), 422
|
| 690 |
|
| 691 |
try:
|
| 692 |
+
predictions = predict_expenses(expenses, target_user_id=user_id, debug=debug)
|
| 693 |
return jsonify({
|
| 694 |
"status": "ok",
|
| 695 |
"predictions": predictions
|
expense_predictor.py
CHANGED
|
@@ -4,99 +4,143 @@ Expense prediction model: suggests next expenses based on 6-month history.
|
|
| 4 |
- Output: Top 3 predicted expenses (date, sum, supplier, user)
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
from datetime import datetime
|
| 8 |
from collections import defaultdict
|
| 9 |
-
from typing import Optional
|
| 10 |
import statistics
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
Predict top 3 expenses user should add.
|
| 16 |
-
|
| 17 |
-
Input: [{"date": "2026-01-15", "sum": 150.50, "supplier_id": 5, "user_id": 1, ...}, ...]
|
| 18 |
-
Output: [{"date": str, "sum": float, "supplier_id": int, "user_id": int, "confidence": float}, ...]
|
| 19 |
-
"""
|
| 20 |
if not expenses or len(expenses) < 2:
|
| 21 |
-
|
|
|
|
| 22 |
return []
|
| 23 |
|
| 24 |
-
# Group by
|
| 25 |
-
|
| 26 |
supplier_freq = defaultdict(int)
|
| 27 |
total_records = len(expenses)
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
for exp in expenses:
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
supplier_freq[
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
if
|
| 53 |
-
print("[PREDICT]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
return []
|
| 55 |
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
predictions = []
|
| 58 |
|
| 59 |
-
for
|
|
|
|
| 60 |
amounts = [float(r["sum"]) for r in records]
|
| 61 |
avg_amount = statistics.mean(amounts)
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
else:
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
# Confidence: higher if more consistent (lower std dev)
|
| 75 |
amount_std = statistics.stdev(amounts) if len(amounts) > 1 else 0
|
| 76 |
consistency = max(0, 1 - (amount_std / avg_amount)) if avg_amount > 0 else 0.5
|
| 77 |
-
frequency_score = min(supplier_freq[
|
| 78 |
-
confidence = (consistency + frequency_score)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
predictions.append({
|
| 88 |
"date": next_predicted_date,
|
| 89 |
-
"sum": round(
|
| 90 |
"supplier_id": supplier_id,
|
| 91 |
-
"user_id":
|
| 92 |
"confidence": round(confidence, 2)
|
| 93 |
})
|
| 94 |
|
| 95 |
# Return top 3 by confidence
|
| 96 |
result = sorted(predictions, key=lambda x: x["confidence"], reverse=True)[:3]
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
| 101 |
|
| 102 |
return result
|
|
|
|
| 4 |
- Output: Top 3 predicted expenses (date, sum, supplier, user)
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from datetime import datetime
|
| 8 |
from collections import defaultdict
|
|
|
|
| 9 |
import statistics
|
| 10 |
|
| 11 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 12 |
|
| 13 |
+
|
| 14 |
+
def _parse_amount(raw_sum):
    """Return ``raw_sum`` as a finite float, or ``None`` when it is unusable."""
    import math  # local import so the block does not depend on the file's import list

    if raw_sum is None:
        return None
    try:
        amount = float(raw_sum)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN/inf would poison the mean/stdev and the regressor's targets.
    return amount if math.isfinite(amount) else None


def _parse_iso_date(raw_date):
    """Return ``raw_date`` parsed with ``datetime.fromisoformat``, or ``None`` on failure."""
    try:
        return datetime.fromisoformat(raw_date)
    except (TypeError, ValueError):
        return None


def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False) -> list[dict]:
    """Predict the next expense amount for the (up to) three most frequent suppliers.

    Records are grouped by ``supplier_id``. For each of the three suppliers with
    the most records an amount is predicted: by a ``RandomForestRegressor``
    trained on all fully usable records when there are at least five of them,
    otherwise by the supplier's mean amount. Each prediction is dated today and
    attributed to ``target_user_id``.

    Malformed records are skipped instead of aborting the whole prediction:
    a record needs a ``supplier_id`` and a finite numeric ``sum`` to count
    towards a supplier's history, and additionally a ``user_id`` and an
    ISO-format ``date`` to be used as a training row.

    Args:
        expenses: Expense records, each a dict with the keys ``date``, ``sum``,
            ``supplier_id`` and ``user_id``.
        target_user_id: User the predictions are made for; copied verbatim into
            every prediction.
        debug: When true, progress details are printed to stdout.

    Returns:
        At most three dicts with the keys ``date``, ``sum``, ``supplier_id``,
        ``user_id`` and ``confidence``, sorted by descending confidence.
        An empty list when fewer than two usable records are available.
    """
    if not expenses or len(expenses) < 2:
        if debug:
            print(f"[PREDICT] Not enough records: {len(expenses) if expenses else 0}")
        return []

    if debug:
        print(f"[PREDICT] Total records received: {len(expenses)}")
        for i, exp in enumerate(expenses):
            if not isinstance(exp, dict):
                print(f"[PREDICT]   [{i+1}] skipped: record is not an object")
                continue
            print(f"[PREDICT]   [{i+1}] date={exp.get('date')}, sum={exp.get('sum')}, supplier_id={exp.get('supplier_id')}, user_id={exp.get('user_id')}")

    # Single normalisation pass: every later step works on validated values only.
    supplier_amounts = defaultdict(list)  # supplier_id -> list of parsed amounts
    training_rows = []  # (supplier_id, user_id, parsed date, amount)
    for exp in expenses:
        if not isinstance(exp, dict):
            continue
        supplier_id = exp.get("supplier_id")
        amount = _parse_amount(exp.get("sum"))
        if supplier_id is None or amount is None:
            continue
        supplier_amounts[supplier_id].append(amount)

        user_id = exp.get("user_id")
        tx_date = _parse_iso_date(exp.get("date"))
        if user_id is not None and tx_date is not None:
            training_rows.append((supplier_id, user_id, tx_date, amount))

    total_records = sum(len(amounts) for amounts in supplier_amounts.values())
    if total_records < 2:
        if debug:
            print(f"[PREDICT] Not enough usable records: {total_records}")
        return []

    if debug:
        print(f"[PREDICT] Unique suppliers: {len(supplier_amounts)}")
        for supplier_id, amounts in supplier_amounts.items():
            count = len(amounts)
            pct = count / total_records * 100
            print(f"[PREDICT]   supplier_id={supplier_id} -> {count} records ({pct:.1f}%)")

    # Keep only the three suppliers with the most records. sorted() is stable,
    # so ties keep first-seen order.
    top_suppliers = sorted(
        supplier_amounts.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:3]

    if debug:
        print(f"[PREDICT] Processing top {len(top_suppliers)} suppliers by frequency")

    now = datetime.now()

    # Categorical ids are encoded as integer indices for the regressor.
    supplier_to_idx = {supplier_id: idx for idx, supplier_id in enumerate(supplier_amounts)}
    known_users = {row[1] for row in training_rows}
    # Sorting by str keeps the encoding deterministic for mixed-type ids.
    user_to_idx = {user_id: idx for idx, user_id in enumerate(sorted(known_users, key=str))}

    # One regression model for the amount, shared by all suppliers.
    sum_model = None
    if len(training_rows) >= 5:
        X_train = [
            [
                supplier_to_idx[supplier_id],
                user_to_idx[user_id],
                tx_date.day,
                tx_date.weekday(),
                tx_date.month,
            ]
            for supplier_id, user_id, tx_date, _ in training_rows
        ]
        y_train = [amount for _, _, _, amount in training_rows]
        sum_model = RandomForestRegressor(n_estimators=300, random_state=42)
        sum_model.fit(X_train, y_train)

    next_predicted_date = now.strftime("%Y-%m-%d")
    predicted_user = target_user_id
    # A target user absent from the history is encoded as -1.
    target_user_idx = user_to_idx.get(target_user_id, -1)

    predictions = []
    for supplier_id, amounts in top_suppliers:
        avg_amount = statistics.mean(amounts)

        if sum_model is not None:
            next_features = [[
                supplier_to_idx[supplier_id],
                target_user_idx,
                now.day,
                now.weekday(),
                now.month,
            ]]
            predicted_amount = float(sum_model.predict(next_features)[0])
            model_conf = 0.8
        else:
            # Too little data for a model: fall back to the supplier's mean.
            predicted_amount = avg_amount
            model_conf = 0.5

        # Confidence: higher for consistent amounts (low relative std dev),
        # for frequently used suppliers, and when a trained model was used.
        amount_std = statistics.stdev(amounts) if len(amounts) > 1 else 0
        consistency = max(0, 1 - (amount_std / avg_amount)) if avg_amount > 0 else 0.5
        frequency_score = len(amounts) / total_records
        confidence = (0.4 * consistency) + (0.3 * frequency_score) + (0.3 * model_conf)

        if debug:
            print(
                f"[PREDICT] supplier_id={supplier_id}, user_id={predicted_user} | "
                f"avg_amount={avg_amount:.2f}, pred_sum={predicted_amount:.2f}, "
                f"target_date={next_predicted_date}, "
                f"consistency={consistency:.2f}, freq_score={frequency_score:.2f}, "
                f"model_conf={model_conf:.2f}, confidence={confidence:.2f}"
            )

        predictions.append({
            "date": next_predicted_date,
            "sum": round(max(0.0, predicted_amount), 2),
            "supplier_id": supplier_id,
            "user_id": predicted_user,
            "confidence": round(confidence, 2)
        })

    # Return top 3 by confidence
    result = sorted(predictions, key=lambda x: x["confidence"], reverse=True)[:3]

    if debug:
        print(f"[PREDICT] Final top {len(result)} predictions:")
        for i, p in enumerate(result, 1):
            print(f"[PREDICT]   #{i}: supplier_id={p['supplier_id']}, user_id={p['user_id']}, date={p['date']}, sum={p['sum']}, confidence={p['confidence']}")

    return result
|