Spaces:

VladRet2026
/

ConvertAudioToJSON

Running

App Files

VladGeekPro Copilot committed 15 days ago

Commit

fb5dc97

1 Parent(s): 4f49b04

ChangedSupplierCandidatesRule

Browse files

Co-authored-by: Copilot <copilot@github.com>

Files changed (1) hide show

expense_predictor.py +32 -17

expense_predictor.py CHANGED Viewed

@@ -52,13 +52,13 @@ def _train_global_model(
     supplier_to_idx: dict,
     user_to_idx: dict,
     debug: bool = False,
-) -> tuple[object | None, float, float, str]:
     """Trains ONE global model on ALL records.
     Each sample: (date, supplier_id, user_id, amount)
     Features per row: [supplier_idx, user_idx, day, weekday, month,
                        rolling_mean_3 for supplier, rolling_mean_month for supplier]
-    Returns: (fitted_model, global_confidence, validation_mae, model_name)
     """
     # Sort all samples by date to build rolling features correctly.
     samples_sorted = sorted(samples, key=lambda s: s[0])
@@ -121,7 +121,7 @@ def _train_global_model(
         user_supplier_last_sum[(user_id, supplier_id)] = amount
     if len(X_all) < 10:
-        return None, 0.5, float("inf"), "fallback"
     X_fit, y_fit, X_val, y_val = _time_split_xy(X_all, y_all)
     candidates = _build_candidates()
@@ -148,7 +148,7 @@ def _train_global_model(
             best_model = model
     if best_model is None:
-        return None, 0.5, float("inf"), "fallback"
     baseline_scale = max(1.0, statistics.mean([abs(v) for v in (y_val if y_val else y_fit)]))
     global_conf = math.exp(-(best_mae / baseline_scale))
@@ -159,7 +159,7 @@ def _train_global_model(
             f"avg_target={baseline_scale:.2f}, global_model_conf={global_conf:.2f}"
         )
-    return best_model, max(0.0, min(1.0, global_conf)), best_mae, best_name
 def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False) -> list[dict]:
@@ -189,18 +189,28 @@ def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False)
             pct = count / total_records * 100
             print(f"[PREDICT]   supplier_id={supplier_id} -> {count} records ({pct:.1f}%)")
-    # Keep only top 3 suppliers by frequency (different suppliers)
-    candidates = supplier_history
-    top_candidate_items = sorted(
-        candidates.items(),
         key=lambda item: supplier_freq[item[0]],
         reverse=True,
-    )[:3]
     if debug:
-        print(f"[PREDICT] Processing top {len(top_candidate_items)} suppliers by frequency")
-    if not top_candidate_items:
         if debug:
             print("[PREDICT] No suppliers found. Returning empty.")
         return []
@@ -224,7 +234,7 @@ def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False)
         except Exception:
             continue
-    global_model, global_model_conf, val_mae, model_name = _train_global_model(
         all_samples, supplier_to_idx, user_to_idx, debug=debug
     )
@@ -241,10 +251,10 @@ def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False)
         user_supplier_last_date[(tx_user, tx_supplier)] = tx_date
         user_supplier_last_sum[(tx_user, tx_supplier)] = tx_sum
-    # Predict only amount for each of top-3 suppliers.
     predictions = []
-    for supplier_id, records in top_candidate_items:
         s_hist = supplier_amounts_sorted.get(supplier_id, [])
         us_hist = user_supplier_amounts_sorted.get((target_user_id, supplier_id), [])
@@ -339,11 +349,16 @@ def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False)
             "sum": round(max(0.0, predicted_amount), 2),
             "supplier_id": supplier_id,
             "user_id": predicted_user,
             "confidence": round(confidence, 2)
         })
-    # Return top 3 by confidence
-    result = sorted(predictions, key=lambda x: x["confidence"], reverse=True)[:3]
     if debug:
         print(f"[PREDICT] Final top {len(result)} predictions:")

     supplier_to_idx: dict,
     user_to_idx: dict,
     debug: bool = False,
+) -> tuple[object | None, float, str]:
     """Trains ONE global model on ALL records.
     Each sample: (date, supplier_id, user_id, amount)
     Features per row: [supplier_idx, user_idx, day, weekday, month,
                        rolling_mean_3 for supplier, rolling_mean_month for supplier]
+    Returns: (fitted_model, global_confidence, model_name)
     """
     # Sort all samples by date to build rolling features correctly.
     samples_sorted = sorted(samples, key=lambda s: s[0])
         user_supplier_last_sum[(user_id, supplier_id)] = amount
     if len(X_all) < 10:
+        return None, 0.5, "fallback"
     X_fit, y_fit, X_val, y_val = _time_split_xy(X_all, y_all)
     candidates = _build_candidates()
             best_model = model
     if best_model is None:
+        return None, 0.5, "fallback"
     baseline_scale = max(1.0, statistics.mean([abs(v) for v in (y_val if y_val else y_fit)]))
     global_conf = math.exp(-(best_mae / baseline_scale))
             f"avg_target={baseline_scale:.2f}, global_model_conf={global_conf:.2f}"
         )
+    return best_model, max(0.0, min(1.0, global_conf)), best_name
 def predict_expenses(expenses: list[dict], target_user_id, debug: bool = False) -> list[dict]:
             pct = count / total_records * 100
             print(f"[PREDICT]   supplier_id={supplier_id} -> {count} records ({pct:.1f}%)")
+    # Select suppliers whose frequency is strictly greater than 50% of the top supplier frequency.
+    max_freq = max(supplier_freq.values()) if supplier_freq else 0
+    freq_threshold = 0.5 * max_freq
+    candidate_items = [
+        item for item in supplier_history.items()
+        if supplier_freq[item[0]] > freq_threshold
+    ]
+    # Keep candidates sorted by supplier usage frequency (desc).
+    candidate_items = sorted(
+        candidate_items,
         key=lambda item: supplier_freq[item[0]],
         reverse=True,
+    )
     if debug:
+        print(
+            f"[PREDICT] Processing {len(candidate_items)} suppliers "
+            f"with freq > 50% of max ({freq_threshold:.2f})"
+        )
+    if not candidate_items:
         if debug:
             print("[PREDICT] No suppliers found. Returning empty.")
         return []
         except Exception:
             continue
+    global_model, global_model_conf, model_name = _train_global_model(
         all_samples, supplier_to_idx, user_to_idx, debug=debug
     )
         user_supplier_last_date[(tx_user, tx_supplier)] = tx_date
         user_supplier_last_sum[(tx_user, tx_supplier)] = tx_sum
+    # Predict amount for each selected supplier.
     predictions = []
+    for supplier_id, _records in candidate_items:
         s_hist = supplier_amounts_sorted.get(supplier_id, [])
         us_hist = user_supplier_amounts_sorted.get((target_user_id, supplier_id), [])
             "sum": round(max(0.0, predicted_amount), 2),
             "supplier_id": supplier_id,
             "user_id": predicted_user,
+            "show": True,
             "confidence": round(confidence, 2)
         })
+    # Return all selected suppliers sorted by frequency desc.
+    result = sorted(
+        predictions,
+        key=lambda x: supplier_freq.get(x["supplier_id"], 0),
+        reverse=True,
+    )
     if debug:
         print(f"[PREDICT] Final top {len(result)} predictions:")