import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the "train" split of the "obx0x3/sensei" dataset and materialize it as
# a pandas DataFrame. Dataset.to_pandas() is the conversion the `datasets`
# library documents for this; the previous pd.DataFrame(dataset) call relied
# on pandas consuming the dataset as a generic iterable of rows.
dataset = load_dataset("obx0x3/sensei", split="train")
df = dataset.to_pandas()

# Categories that are always labeled impulsive, regardless of amount or
# payment method. Defined once at module level instead of being rebuilt for
# every row the function is applied to.
_IMPULSIVE_CATEGORIES = ("Dining", "Entertainment", "Subscriptions")


def label_impulsive(row):
    """Return 1 if a transaction row is labeled as an impulsive purchase, else 0.

    A row is labeled impulsive when either:

    * its ``"category"`` is one of ``_IMPULSIVE_CATEGORIES``; or
    * it is a ``"Groceries"`` purchase whose ``"amount"`` is strictly greater
      than 100 and whose ``"payment_method"`` is ``"Credit Card"``.

    Args:
        row: Any object indexable by the keys ``"category"``, ``"amount"`` and
            ``"payment_method"`` -- e.g. the row passed in by
            ``DataFrame.apply(..., axis=1)`` or a plain dict.

    Returns:
        int: ``1`` for an impulsive row, ``0`` otherwise.
    """
    category = row["category"]
    if category in _IMPULSIVE_CATEGORIES:
        return 1

    # Groceries only count when they are both large and paid by credit card.
    # The checks short-circuit in the same order as before, so "amount" and
    # "payment_method" are only read for grocery rows.
    is_large_card_grocery = (
        category == "Groceries"
        and row["amount"] > 100
        and row["payment_method"] == "Credit Card"
    )
    return 1 if is_large_card_grocery else 0

# Attach the rule-based target label to every row, then separate the model
# inputs (X) from the target column (y).
target_column = "is_impulsive"
df[target_column] = df.apply(label_impulsive, axis=1)

feature_columns = ["category", "amount", "payment_method", "day"]
X = df[feature_columns]
y = df[target_column]

# Column-wise preprocessing: the numeric feature goes through StandardScaler
# and the categorical features through OneHotEncoder. handle_unknown="ignore"
# is set so the encoder does not raise on category values it was not fitted on.
numeric_features = ["amount"]
categorical_features = ["category", "payment_method", "day"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Single estimator that chains the preprocessing step with a random forest,
# so fitting and predicting both go through the same transformations.
# random_state=42 seeds the forest.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

# Hold out 30% of the rows for evaluation. random_state seeds the shuffle so
# the split is reproducible from run to run, in line with the seeded
# classifier; without it every run trained on a different subset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model.fit(X_train, y_train)

# Evaluate on the held-out rows, which were previously split off but never
# used. zero_division=0 keeps the report quiet when a class receives no
# predictions.
print(classification_report(y_test, model.predict(X_test), zero_division=0))

# Persist the fitted pipeline (preprocessing + classifier) to disk in the
# current working directory.
model_path = "impulse_model.pkl"
joblib.dump(model, model_path)