import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the "train" split of the dataset from the Hugging Face Hub.
dataset = load_dataset("obx0x3/sensei", split="train")
# Use the library's own conversion rather than pd.DataFrame(dataset), which
# would build the frame by iterating the dataset one row at a time in Python.
df = dataset.to_pandas()

def label_impulsive(row):
    """Return 1 if the transaction row counts as impulsive, otherwise 0.

    A row is impulsive when its ``category`` is "Dining", "Entertainment" or
    "Subscriptions", or when it is a "Groceries" purchase with ``amount``
    above 100 that was paid with "Credit Card".

    Args:
        row: Mapping-like record. ``category`` is always read; ``amount`` and
            ``payment_method`` are only read for "Groceries" rows.

    Returns:
        int: 1 for impulsive, 0 otherwise.
    """
    category = row["category"]

    # These categories are impulsive regardless of amount or payment method.
    if category in ("Dining", "Entertainment", "Subscriptions"):
        return 1

    # Short-circuiting keeps the other fields unread for non-grocery rows.
    large_credit_grocery = (
        category == "Groceries"
        and row["amount"] > 100
        and row["payment_method"] == "Credit Card"
    )
    return 1 if large_credit_grocery else 0

# Derive the binary target by applying the heuristic labelling rule row by row.
df["is_impulsive"] = df.apply(label_impulsive, axis=1)

# Model inputs and target.
# NOTE(review): the target is a deterministic function of "category",
# "amount" and "payment_method", all of which are also features, so the
# classifier is effectively re-learning label_impulsive.
X = df[["category", "amount", "payment_method", "day"]]
y = df["is_impulsive"]

# Standardise the numeric column and one-hot encode the categorical ones.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at prediction time.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), ["amount"]),
        ("cat", OneHotEncoder(handle_unknown="ignore"),
         ["category", "payment_method", "day"])
    ]
)

# Bundle preprocessing with the classifier so the saved artifact accepts raw
# feature frames.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Seed the split with the same value as the classifier so that repeated runs
# train on the same rows and produce the same model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model.fit(X_train, y_train)

# Use the held-out 30% that was split off: report mean accuracy on it.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")

# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(model, "impulse_model.pkl")