Spaces:

obx0x3
/

Sensei

Sleeping

App Files Files Community

obx0x3 committed on Jan 27

Commit

ee56b35

verified ·

1 Parent(s): edf6646

Update impulse_model_trainer.py

Browse files

Files changed (1) hide show

impulse_model_trainer.py +22 -59

impulse_model_trainer.py CHANGED Viewed

@@ -1,80 +1,43 @@
 import pandas as pd
-import numpy as np
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
-from sklearn.metrics import classification_report
 import joblib
-# Load data
-file_path = 'assets/expense_log.csv'
-try:
-    df = pd.read_csv(file_path)
-    print("Data loaded successfully.")
-except FileNotFoundError:
-    print(f"Error: File not found at {file_path}")
-    exit()
-# --- Heuristic Labeling ---
-# Define what makes a transaction "Impulsive" based on the user-approved plan
 def label_impulsive(row):
-    # Category-based rules
-    impulsive_categories = ['Dining', 'Entertainment', 'Subscriptions']
-    if row['category'] in impulsive_categories:
         return 1
-    # Amount & Payment Method based rule (e.g., Large grocery bill on credit)
-    if row['category'] == 'Groceries' and row['amount'] > 100 and row['payment_method'] == 'Credit Card':
         return 1
-    # Default to Not Impulsive
     return 0
-# Apply labeling
-df['is_impulsive'] = df.apply(label_impulsive, axis=1)
-print(f"Impulsive vs Non-Impulsive counts:\n{df['is_impulsive'].value_counts()}")
-# --- Feature Engineering ---
-# Features to use
-features = ['category', 'amount', 'payment_method', 'day']
-X = df[features]
-y = df['is_impulsive']
-# Preprocessing Pipeline
-# Categorical features: category, payment_method, day -> OneHotEncode
-# Numerical features: amount -> Standardize (optional but good practice)
-categorical_features = ['category', 'payment_method', 'day']
-numerical_features = ['amount']
 preprocessor = ColumnTransformer(
-    transformers=[
-        ('num', StandardScaler(), numerical_features),
-        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
-    ])
-# Model Pipeline
-model = Pipeline(steps=[
-    ('preprocessor', preprocessor),
-    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
 ])
-# Split Data
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
-# Train
-print("Training model...")
 model.fit(X_train, y_train)
-# Evaluate
-print("Evaluating model...")
-y_pred = model.predict(X_test)
-print(classification_report(y_test, y_pred))
-# Save Model
-model_filename = 'impulse_model.pkl'
-joblib.dump(model, model_filename)
-print(f"Model saved to {model_filename}")

 import pandas as pd
+from datasets import load_dataset
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 import joblib
+# Load dataset from HF
+dataset = load_dataset("obx0x3/sensei", split="train")
+df = pd.DataFrame(dataset)
 def label_impulsive(row):
+    impulsive_categories = ["Dining", "Entertainment", "Subscriptions"]
+    if row["category"] in impulsive_categories:
         return 1
+    if row["category"] == "Groceries" and row["amount"] > 100 and row["payment_method"] == "Credit Card":
         return 1
     return 0
+df["is_impulsive"] = df.apply(label_impulsive, axis=1)
+X = df[["category", "amount", "payment_method", "day"]]
+y = df["is_impulsive"]
 preprocessor = ColumnTransformer(
+    [
+        ("num", StandardScaler(), ["amount"]),
+        ("cat", OneHotEncoder(handle_unknown="ignore"),
+         ["category", "payment_method", "day"])
+    ]
+)
+model = Pipeline([
+    ("preprocessor", preprocessor),
+    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
 ])
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
 model.fit(X_train, y_train)
+joblib.dump(model, "impulse_model.pkl")