"""Train and persist a classifier that flags impulsive purchases.

The script loads the ``train`` split of the ``obx0x3/sensei`` dataset,
derives a binary ``is_impulsive`` label with :func:`label_impulsive`, fits a
preprocessing + random-forest pipeline on a 70% training split, reports the
accuracy on the remaining 30%, and writes the fitted pipeline to
``impulse_model.pkl``.
"""

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Categories whose purchases are labelled impulsive regardless of amount.
# Kept at module level so the sequence is not rebuilt for every row.
IMPULSIVE_CATEGORIES = ("Dining", "Entertainment", "Subscriptions")

# One seed for both the forest and the train/test split, so that repeated
# runs on the same data produce the same split and the same saved model.
RANDOM_STATE = 42


def label_impulsive(row):
    """Return 1 if *row* is considered an impulsive purchase, else 0.

    A row is impulsive when its ``"category"`` is one of
    ``IMPULSIVE_CATEGORIES``, or when it is a ``"Groceries"`` purchase with
    ``"amount"`` above 100 paid by ``"Credit Card"``.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) exposing the keys
            ``"category"``, ``"amount"`` and ``"payment_method"``.

    Returns:
        ``1`` for impulsive rows, ``0`` otherwise.
    """
    if row["category"] in IMPULSIVE_CATEGORIES:
        return 1
    if (
        row["category"] == "Groceries"
        and row["amount"] > 100
        and row["payment_method"] == "Credit Card"
    ):
        return 1
    return 0


# Load dataset from HF
dataset = load_dataset("obx0x3/sensei", split="train")
df = pd.DataFrame(dataset)

# NOTE: the label is a deterministic function of category, amount and
# payment_method, all of which are also model features, so the classifier is
# effectively learning to reproduce the rule in label_impulsive().
df["is_impulsive"] = df.apply(label_impulsive, axis=1)

X = df[["category", "amount", "payment_method", "day"]]
y = df["is_impulsive"]

# Scale the numeric column and one-hot encode the categorical ones;
# handle_unknown="ignore" makes categories unseen during fit encode as
# all-zeros at predict time instead of raising.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), ["amount"]),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            ["category", "payment_method", "day"],
        ),
    ]
)

model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
        ),
    ]
)

# Seed the split as well: without random_state the 70/30 partition (and so
# the persisted model) changes on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

model.fit(X_train, y_train)

# Use the held-out 30% that was previously computed but never looked at.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")

joblib.dump(model, "impulse_model.pkl")