Spaces:
Sleeping
Sleeping
| import onnx | |
| import pickle | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| from sklearn import tree | |
| import plotly.express as px | |
| from scipy.stats import zscore | |
| import matplotlib.pyplot as plt | |
| from skl2onnx import convert_sklearn | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.feature_selection import RFE | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from skl2onnx.common.data_types import FloatTensorType | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.metrics import accuracy_score, classification_report | |
| from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay | |
| from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score | |
# Load the customer shopping dataset.
data_ritel = pd.read_csv('data/customer_shopping_data.csv')

# Notebook-style inspection. Outside an interactive session only info()
# writes anything; the bare expressions are evaluated and discarded.
data_ritel.sample(25)
data_ritel.info()
data_ritel.shape

"""## Exploratory Data Analysis"""
# Number of missing values per column.
data_ritel.isnull().sum()

# Frequency tables for the categorical columns.
category = data_ritel['category'].value_counts()
gender = data_ritel['gender'].value_counts()
payment_counts = data_ritel['payment_method'].value_counts()
# Shopping-mall insight: number of transactions recorded per mall.
mall = data_ritel['shopping_mall'].value_counts()

for frequency_table in (category, gender, payment_counts, mall):
    print(frequency_table)
# Bar chart of how many transactions used each payment method.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(x=payment_counts.index, y=payment_counts.values, palette="viridis", ax=ax)

ax.set_title("Distribution of Payment Methods", fontsize=16)
ax.set_xlabel("Payment Method", fontsize=14)
ax.set_ylabel("Number of Transactions", fontsize=14)

# Tick positions must be numeric: seaborn draws categorical bars at ordinal
# positions 0..n-1, and passing the string labels to set_xticks fails when
# the axis carries no category units. Positions and labels are set separately.
ax.set_xticks(range(len(payment_counts)))
# value_counts() already yields a unique index, so no extra .unique() is needed.
ax.set_xticklabels(payment_counts.index, fontsize=12)

plt.show()
# Mean price within each product category.
price_by_category = data_ritel.groupby('category')['price']
average_prices = price_by_category.mean()
average_prices

# Bar chart of the average price per product category.
fig = px.bar(
    average_prices,
    x=average_prices.index,
    y=average_prices.values,
    title='Rata-rata harga dalam Kategori Produk',
    labels={'x': 'Kategori Produk', 'y': 'Rata-rata Harga'},
)
fig.show()
# Group the data by category and sum the quantity sold.
category_quantity = data_ritel.groupby('category')['quantity'].sum()

# Pie chart of each category's share of the total quantity.
plt.figure(figsize=(10, 8))
pastel_colors = sns.color_palette("pastel")
plt.pie(
    category_quantity,
    labels=category_quantity.index,
    autopct='%1.1f%%',
    colors=pastel_colors,
)
plt.title('Distribusi Kategori Produk Berdasarkan Jumlah Quantity', fontsize=16)
plt.show()
# Sum of the `price` column per shopping mall, shown as a bar chart.
price_by_mall = data_ritel.groupby('shopping_mall')['price']
total_revenue = price_by_mall.sum()

fig = px.bar(
    total_revenue,
    x=total_revenue.index,
    y=total_revenue.values,
    title='Total Pendapatan Setiap Pusat Perbelanjaan',
    labels={'x': 'Shopping Mall', 'y': 'Total Revenue'},
)
fig.show()
# Categories ranked by total quantity sold, largest first.
quantity_per_category = data_ritel.groupby('category')['quantity'].sum()
category_quantity = quantity_per_category.sort_values(ascending=False)

# Bar chart of the top-selling product categories.
fig = px.bar(
    category_quantity,
    x=category_quantity.index,
    y=category_quantity.values,
    title='Top Penjualan Kuantitas Kategori',
    labels={'x': 'Kategori Produk', 'y': 'Total Kuantitas Terjual'},
)
fig.show()
# Histogram of customer ages (20 bins, no density curve).
plt.figure(figsize=(10, 6))
customer_ages = data_ritel['age']
sns.histplot(customer_ages, bins=20, kde=False, color='skyblue')

plt.title('Distribusi Umur Pelanggan', fontsize=16)
plt.xlabel('Umur', fontsize=14)
plt.ylabel('Jumlah Pelanggan', fontsize=14)
plt.show()

# Customer demographics: summary statistics for gender and age.
demographic_columns = ['gender', 'age']
demographics_summary = data_ritel[demographic_columns].describe(include='all')
demographics_summary
# Bar chart of the number of transactions recorded at each shopping mall.
# The counts are computed once instead of re-running value_counts() for the
# data frame, the x values and the y values separately.
mall_transactions = data_ritel['shopping_mall'].value_counts()

# x and y are passed as plain arrays, as the original did with freshly
# computed values, so the generic 'x'/'y' keys in `labels` keep applying.
fig = px.bar(
    mall_transactions,
    x=mall_transactions.index.to_numpy(),
    y=mall_transactions.to_numpy(),
    labels={'x': 'Shopping Mall', 'y': 'Nominal Transaksi'},
    title='Jumlah Nominal Transaksi Pada Pusat Perbelanjaan',
)
fig.show()
### **Data Preprocessing
# Encode the categorical columns with explicit value-to-code mappings.
# Series.map leaves NaN for any value that is missing from its mapping.

# Payment method.
payment_codes = {'Cash': 0, 'Credit Card': 1, 'Debit Card': 2}
data_ritel['payment_method'] = data_ritel['payment_method'].map(payment_codes)

# Gender.
gender_codes = {'Female': 0, 'Male': 1}
data_ritel['gender'] = data_ritel['gender'].map(gender_codes)

# Shopping mall: the code is the position of the name in this list.
mall_names = [
    'Mall of Istanbul',
    'Kanyon',
    'Metrocity',
    'Metropol AVM',
    'Istinye Park',
    'Zorlu Center',
    'Cevahir AVM',
    'Forum Istanbul',
    'Viaport Outlet',
    'Emaar Square Mall',
]
mall_codes = {mall_name: code for code, mall_name in enumerate(mall_names)}
data_ritel['shopping_mall'] = data_ritel['shopping_mall'].map(mall_codes)

# Product category (the prediction target) is label-encoded; `le` is kept so
# predictions can be decoded back to category names later.
le = LabelEncoder()
data_ritel['category'] = le.fit_transform(data_ritel['category'])
data_ritel.sample(10)
# Descriptive statistics, transposed so each column becomes a row.
data_ritel.describe().T

# Drop the columns that are not needed for modelling.
unused_columns = ['invoice_no', 'customer_id', 'invoice_date']
data_ritel = data_ritel.drop(columns=unused_columns)
data_ritel.sample(10)

# Correlation between the remaining columns, drawn as an annotated heatmap.
plt.figure(figsize=(12, 8))
correlation_matrix = data_ritel.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()
# Choose the feature columns and the label column.
features = ['age', 'gender', 'price', 'payment_method', 'shopping_mall']
X = data_ritel[features].to_numpy()
y = data_ritel['category'].to_numpy()

"""### **Splitting Data**"""
# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=44,
)
print('Data training : ', X_train.shape, y_train.shape)
print('Data Testing : ', X_test.shape, y_test.shape)
data_ritel.category.value_counts()
# Fill missing values with the column mean; statistics come from the
# training split only and are then applied to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardise the features, again fitting on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Outlier detection using the absolute z-score of every training value.
z_scores = np.abs(zscore(X_train_scaled))
threshold = 5
outliers = np.where(z_scores > threshold)

# Keep only the training rows whose every feature is below the threshold,
# using one mask for both X and y so they stay aligned.
inlier_rows = (z_scores < threshold).all(axis=1)
X_train_no_outliers = X_train_scaled[inlier_rows]
y_train_no_outliers = y_train[inlier_rows]
# Modelling
# Decision Tree Classifier: recursive feature elimination with a tree as the
# ranking estimator, then a grid search over the tree's size limits.
model_dt = DecisionTreeClassifier(random_state=44)
rfe = RFE(model_dt, n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train_no_outliers, y_train_no_outliers)
X_test_rfe = rfe.transform(X_test_scaled)

# Names of the features retained by RFE.
selected_features = np.array(features)[rfe.support_]
print(selected_features)

"""### **Decision Tree**"""
# Single-step pipeline so grid parameters are addressed as 'Classifier__<param>'.
tree_step = ('Classifier', DecisionTreeClassifier(random_state=44))
pipeline = Pipeline([tree_step])

param_grid = {
    'Classifier__max_depth': list(range(2, 10)),
    'Classifier__max_leaf_nodes': list(range(2, 10)),
}

# Exhaustive search with 20-fold cross-validation.
gridsearch = GridSearchCV(pipeline, param_grid, cv=20)
gridsearch.fit(X_train_rfe, y_train_no_outliers)
best_model = gridsearch.best_estimator_
print(gridsearch.best_params_)
# Evaluate the tuned Decision Tree on the held-out test split.
y_pred = best_model.predict(X_test_rfe)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Cross-validate on the RFE-selected matrix the model was tuned on, not the
# pre-selection matrix, so the scores describe the same feature set even if
# RFE is later configured to drop columns.
scores = cross_val_score(
    best_model,
    X_train_rfe,
    y_train_no_outliers,
    cv=20,
    scoring='accuracy',
)
print("Cross-Validation Accuracy Scores:", scores)

# Category names used to label reports and plots further down the script.
target_names = [
    'Books',
    'Clothing',
    'Cosmetics',
    'Food & Beverage',
    'Shoes',
    'Souvenir',
    'Technology',
    'Toys',
]

# Per-class report, labelled with the classes stored on the fitted encoder.
print('Classification Report in Hyperparameter Tuning Decision Tree:')
print(classification_report(y_test, y_pred, target_names=le.classes_))
# Plot the tuned Decision Tree.
# `best_model` is a Pipeline; the fitted tree is its 'Classifier' step.
decision_tree = best_model.named_steps['Classifier']

plt.figure(figsize=(25, 20))
tree.plot_tree(
    decision_tree,
    # The tree was fitted on the RFE-selected columns, so it is labelled with
    # those names rather than the full `features` list. The two only coincide
    # while RFE keeps every feature.
    feature_names=list(selected_features),
    class_names=target_names,
    filled=True,
)
plt.show()
# Random Forest: repeat feature elimination with a forest as the ranking
# estimator. This rebinds rfe, X_train_rfe, X_test_rfe and selected_features.
model_rf = RandomForestClassifier(random_state=44)
rfe = RFE(model_rf, n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train_no_outliers, y_train_no_outliers)
X_test_rfe = rfe.transform(X_test_scaled)

selected_features = np.array(features)[rfe.support_]
print(selected_features)

# Single-step pipeline so grid parameters are addressed as 'Classifier__<param>'.
forest_step = ('Classifier', RandomForestClassifier(random_state=44))
pipeline = Pipeline([forest_step])

param_grid = {
    'Classifier__n_estimators': [100, 200, 300],
    'Classifier__max_depth': [None, 5, 10],
}

# Exhaustive search with 5-fold cross-validation.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train_rfe, y_train_no_outliers)
best_model_rf = grid_search.best_estimator_
print(grid_search.best_params_)
## **Evaluation Model Random Forest
# Evaluate the tuned Random Forest on the held-out test split.
y_pred_rf = best_model_rf.predict(X_test_rfe)
print("Accuracy Score :", accuracy_score(y_test, y_pred_rf))

# Cross-validate on the RFE-selected matrix the model was tuned on, not the
# pre-selection matrix, so the scores describe the same feature set even if
# RFE is later configured to drop columns.
scores = cross_val_score(
    best_model_rf,
    X_train_rfe,
    y_train_no_outliers,
    cv=5,
    scoring='accuracy',
)
print("Cross-Validation Accuracy Scores: ", scores)

print('Classification Report in Hyperparameter Tuning Random Forest:')
print(classification_report(y_test, y_pred_rf, target_names=target_names))

# Confusion matrix of true versus predicted categories on the test split.
cm = confusion_matrix(y_test, y_pred_rf)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap='Blues', xticks_rotation='vertical')
plt.title('Confusion Matrix - Random Forest')
plt.show()
# Predict the product category for one new customer with the tuned Random Forest.
# Column order must match `features`:
# age, gender, price, payment_method, shopping_mall (already encoded).
new_data = np.array([[30, 0, 50, 1, 0]])

# Apply the training-time preprocessing in the same order: impute, then scale.
new_data_imputed = imputer.transform(new_data)
new_data_scaled = scaler.transform(new_data_imputed)

# `rfe` is the selector fitted for the Random Forest at this point in the script.
new_data_rfe = rfe.transform(new_data_scaled)

# Use the Random Forest. The original called `best_model`, which is the
# Decision Tree pipeline, and so did not match this section or its selector.
predicted_category = best_model_rf.predict(new_data_rfe)

# Map the numeric label back to the category name via the fitted encoder.
predicted_category_name = le.inverse_transform(predicted_category)
print("Predicted Category (Numerical):", predicted_category)
print("Predicted Category (Name):", predicted_category_name)
# Persist the tuned Random Forest pipeline in three formats.

# 1) Pickle
with open('model/best_model_rf.pkl', 'wb') as file:
    pickle.dump(best_model_rf, file)

# 2) Joblib
joblib.dump(best_model_rf, 'model/best_model_rf.joblib')

# 3) ONNX: declare a float input with a free batch dimension and one column
#    per RFE-selected feature, then convert the scikit-learn pipeline.
selected_feature_count = X_train_rfe.shape[1]
initial_type = [('float_input', FloatTensorType([None, selected_feature_count]))]
onnx_model = convert_sklearn(best_model_rf, initial_types=initial_type)

onnx_filename = 'model/best_model_rf.onnx'
with open(onnx_filename, "wb") as f:
    f.write(onnx_model.SerializeToString())
print(f"Model saved to {onnx_filename} in ONNX format.")

# Optional: reload the file and run the ONNX checker on it.
onnx_model_loaded = onnx.load(onnx_filename)
onnx.checker.check_model(onnx_model_loaded)
print("ONNX model check successful!")
# Reload the pickled Random Forest pipeline and score it on the test split.
# NOTE: pickle executes arbitrary code on load; only load files this script
# (or another trusted source) produced.
filename = 'model/best_model_rf.pkl'
# A context manager closes the file handle, which the bare open() call leaked.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)
model.score(X_test_rfe, y_test)