File size: 1,343 Bytes
764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 764a062 ee56b35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
# Load the "train" split of the transactions dataset from the Hugging Face Hub.
# NOTE: this performs network / cache I/O at import time.
dataset = load_dataset("obx0x3/sensei", split="train")

# Use the dataset's own Arrow -> pandas conversion instead of letting
# pd.DataFrame() iterate the dataset row by row as Python dicts; the result
# has the same columns but is built in one pass and keeps Arrow column types.
df = dataset.to_pandas()
def label_impulsive(row) -> int:
    """Return 1 if a transaction row counts as an impulsive purchase, else 0.

    A row is labelled impulsive when either:

    * its ``category`` is "Dining", "Entertainment" or "Subscriptions", or
    * it is a "Groceries" purchase strictly above 100 paid by "Credit Card".

    Args:
        row: Any mapping-like object indexable by the keys ``"category"``,
            ``"amount"`` and ``"payment_method"`` (e.g. a DataFrame row
            passed by ``df.apply(..., axis=1)`` or a plain dict).

    Returns:
        ``1`` for impulsive, ``0`` otherwise.
    """
    # Constant tuple: no list is rebuilt on every call (this runs once per row).
    impulsive_categories = ("Dining", "Entertainment", "Subscriptions")

    if row["category"] in impulsive_categories:
        return 1

    # Large grocery runs on credit are also treated as impulsive; the
    # threshold is exclusive (an amount of exactly 100 is not flagged).
    if (
        row["category"] == "Groceries"
        and row["amount"] > 100
        and row["payment_method"] == "Credit Card"
    ):
        return 1

    return 0
# Derive the binary target by applying the rule-based labeller to every row.
df["is_impulsive"] = df.apply(label_impulsive, axis=1)

X = df[["category", "amount", "payment_method", "day"]]
y = df["is_impulsive"]

# Standardize the numeric feature and one-hot encode the categorical ones.
# handle_unknown="ignore" makes categories unseen during fit encode as
# all-zero columns at predict time instead of raising.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), ["amount"]),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            ["category", "payment_method", "day"],
        ),
    ]
)

# Bundle preprocessing and the classifier so the saved artifact accepts raw
# feature columns directly.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Seed the split as well as the forest; otherwise every run trains on a
# different 70% sample and the saved model is not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model.fit(X_train, y_train)

# Report accuracy on the 30% hold-out so the split is actually used.
# NOTE(review): is_impulsive is a deterministic function of category, amount
# and payment_method, all of which are in X, so this score measures how well
# the forest reproduces the labelling rule rather than real-world accuracy.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")

# Persist the fitted pipeline (preprocessor + classifier) for later inference.
joblib.dump(model, "impulse_model.pkl")
|