obx0x3 committed on
Commit
ee56b35
·
verified ·
1 Parent(s): edf6646

Update impulse_model_trainer.py

Browse files
Files changed (1) hide show
  1. impulse_model_trainer.py +22 -59
impulse_model_trainer.py CHANGED
@@ -1,80 +1,43 @@
1
  import pandas as pd
2
- import numpy as np
3
  from sklearn.model_selection import train_test_split
4
  from sklearn.ensemble import RandomForestClassifier
5
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
6
  from sklearn.compose import ColumnTransformer
7
  from sklearn.pipeline import Pipeline
8
- from sklearn.metrics import classification_report
9
  import joblib
10
 
11
- # Load data
12
- file_path = 'assets/expense_log.csv'
13
- try:
14
- df = pd.read_csv(file_path)
15
- print("Data loaded successfully.")
16
- except FileNotFoundError:
17
- print(f"Error: File not found at {file_path}")
18
- exit()
19
 
20
- # --- Heuristic Labeling ---
21
- # Define what makes a transaction "Impulsive" based on the user-approved plan
22
  def label_impulsive(row):
23
- # Category-based rules
24
- impulsive_categories = ['Dining', 'Entertainment', 'Subscriptions']
25
- if row['category'] in impulsive_categories:
26
  return 1
27
-
28
- # Amount & Payment Method based rule (e.g., Large grocery bill on credit)
29
- if row['category'] == 'Groceries' and row['amount'] > 100 and row['payment_method'] == 'Credit Card':
30
  return 1
31
-
32
- # Default to Not Impulsive
33
  return 0
34
 
35
- # Apply labeling
36
- df['is_impulsive'] = df.apply(label_impulsive, axis=1)
37
 
38
- print(f"Impulsive vs Non-Impulsive counts:\n{df['is_impulsive'].value_counts()}")
39
-
40
- # --- Feature Engineering ---
41
- # Features to use
42
- features = ['category', 'amount', 'payment_method', 'day']
43
- X = df[features]
44
- y = df['is_impulsive']
45
-
46
- # Preprocessing Pipeline
47
- # Categorical features: category, payment_method, day -> OneHotEncode
48
- # Numerical features: amount -> Standardize (optional but good practice)
49
-
50
- categorical_features = ['category', 'payment_method', 'day']
51
- numerical_features = ['amount']
52
 
53
  preprocessor = ColumnTransformer(
54
- transformers=[
55
- ('num', StandardScaler(), numerical_features),
56
- ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
57
- ])
58
-
59
- # Model Pipeline
60
- model = Pipeline(steps=[
61
- ('preprocessor', preprocessor),
62
- ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
 
63
  ])
64
 
65
- # Split Data
66
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
67
-
68
- # Train
69
- print("Training model...")
70
  model.fit(X_train, y_train)
71
 
72
- # Evaluate
73
- print("Evaluating model...")
74
- y_pred = model.predict(X_test)
75
- print(classification_report(y_test, y_pred))
76
-
77
- # Save Model
78
- model_filename = 'impulse_model.pkl'
79
- joblib.dump(model, model_filename)
80
- print(f"Model saved to {model_filename}")
 
1
  import pandas as pd
2
+ from datasets import load_dataset
3
  from sklearn.model_selection import train_test_split
4
  from sklearn.ensemble import RandomForestClassifier
5
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
6
  from sklearn.compose import ColumnTransformer
7
  from sklearn.pipeline import Pipeline
 
8
  import joblib
9
 
10
+ # Load dataset from HF
11
+ dataset = load_dataset("obx0x3/sensei", split="train")
12
+ df = pd.DataFrame(dataset)
 
 
 
 
 
13
 
 
 
14
  def label_impulsive(row):
15
+ impulsive_categories = ["Dining", "Entertainment", "Subscriptions"]
16
+ if row["category"] in impulsive_categories:
 
17
  return 1
18
+ if row["category"] == "Groceries" and row["amount"] > 100 and row["payment_method"] == "Credit Card":
 
 
19
  return 1
 
 
20
  return 0
21
 
22
+ df["is_impulsive"] = df.apply(label_impulsive, axis=1)
 
23
 
24
+ X = df[["category", "amount", "payment_method", "day"]]
25
+ y = df["is_impulsive"]
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  preprocessor = ColumnTransformer(
28
+ [
29
+ ("num", StandardScaler(), ["amount"]),
30
+ ("cat", OneHotEncoder(handle_unknown="ignore"),
31
+ ["category", "payment_method", "day"])
32
+ ]
33
+ )
34
+
35
+ model = Pipeline([
36
+ ("preprocessor", preprocessor),
37
+ ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
38
  ])
39
 
40
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
 
 
 
 
41
  model.fit(X_train, y_train)
42
 
43
+ joblib.dump(model, "impulse_model.pkl")