# Sensei / impulse_model_trainer.py
# obx0x3's picture
# Update impulse_model_trainer.py
# ee56b35 verified
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
# Load the "train" split of the "obx0x3/sensei" dataset from the Hugging Face Hub.
dataset = load_dataset("obx0x3/sensei", split="train")
# Use the datasets library's own DataFrame conversion rather than handing the
# Dataset object to the pandas constructor as a generic iterable of rows.
df = dataset.to_pandas()
def label_impulsive(
    row,
    *,
    impulsive_categories=("Dining", "Entertainment", "Subscriptions"),
    grocery_amount_threshold=100,
    grocery_payment_method="Credit Card",
) -> int:
    """Return 1 if a transaction row is labelled impulsive, else 0.

    A row is impulsive when either of these holds:

    * its ``"category"`` is one of ``impulsive_categories``; or
    * its ``"category"`` is ``"Groceries"``, its ``"amount"`` is strictly
      greater than ``grocery_amount_threshold`` and its ``"payment_method"``
      equals ``grocery_payment_method``.

    Args:
        row: Any object indexable by the string keys ``"category"``,
            ``"amount"`` and ``"payment_method"`` (for example a dict or a
            DataFrame row passed by ``df.apply(..., axis=1)``).
        impulsive_categories: Categories that are always labelled impulsive.
        grocery_amount_threshold: Exclusive lower bound on the amount for a
            grocery purchase to count as impulsive.
        grocery_payment_method: Payment method a grocery purchase must use to
            count as impulsive.

    Returns:
        ``1`` for an impulsive row, ``0`` otherwise.
    """
    category = row["category"]
    if category in impulsive_categories:
        return 1
    if (
        category == "Groceries"
        and row["amount"] > grocery_amount_threshold
        and row["payment_method"] == grocery_payment_method
    ):
        return 1
    return 0
# Derive the binary target by applying the rule-based labeller to every row.
df["is_impulsive"] = df.apply(label_impulsive, axis=1)

# NOTE(review): the target is computed by label_impulsive from "category",
# "amount" and "payment_method", which are also model inputs below, so the
# classifier can only learn to reproduce that rule — confirm this is intended.
X = df[["category", "amount", "payment_method", "day"]]
y = df["is_impulsive"]

# Standardise the numeric column and one-hot encode the categorical ones;
# handle_unknown="ignore" keeps prediction from failing on unseen categories.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), ["amount"]),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            ["category", "payment_method", "day"],
        ),
    ]
)

# Bundle preprocessing and the classifier so the saved artefact accepts raw rows.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Seed the split with the same value as the classifier so that repeated runs
# train on the same rows and produce the same model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model.fit(X_train, y_train)

# Use the held-out 30% that was previously split off but never looked at.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")

# Persist the fitted pipeline (preprocessing + classifier) as a single file.
joblib.dump(model, "impulse_model.pkl")