fendy07 committed on
Commit
0822fa4
·
1 Parent(s): 75d0e60

Update .gitignore and modify project files

Browse files
.gitignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual environment
2
+ .venv/
3
+ # Python cache files
4
+ __pycache__/
5
+ *.py[cod]
6
+ # Jupyter Notebook checkpoints
7
+ .ipynb_checkpoints/
8
+ # Model files
9
+ model/*.pkl
10
+ model/*.h5
11
+ model/*.joblib
12
+ model/*.sav
13
+ model/*.onnx
14
+ # Logs
15
+ logs/
16
+ *.log
17
+ # token files
18
+ *.token
19
+ HUGGINGFACE_TOKEN.txt
README.md CHANGED
@@ -11,7 +11,7 @@ pinned: false
11
  short_description: This project about Customer Prediction in Turkiye
12
  ---
13
 
14
- # Welcome to Streamlit!
15
 
16
  Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
 
11
  short_description: This project about Customer Prediction in Turkiye
12
  ---
13
 
14
+ # <b>Welcome to Streamlit!</b>
15
 
16
  Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
requirements.txt CHANGED
@@ -1,4 +1,15 @@
1
  altair
2
  pandas
 
3
  streamlit
4
- huggingface_hub
 
 
 
 
 
 
 
 
 
 
 
1
  altair
2
  pandas
3
+ scipy
4
  streamlit
5
+ huggingface_hub
6
+ streamlit_extras
7
+ plotly
8
+ requests
9
+ scikit-learn
10
+ imbalanced-learn
11
+ pickle
12
+ joblib
13
+ onnx
14
+ skl2onnx
15
+ onnxruntime
src/.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [deprecation]
2
+ showPyplotGlobalUse = false
src/streamlit_app.py CHANGED
@@ -1,40 +1,36 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
 
3
+ st.set_page_config(page_title = 'Customer Prediction', page_icon = '📈', layout = 'wide')
4
+
5
+ # --- PAGE SETUP -----
6
+ info_page = st.Page(
7
+ page = 'pages/about.py',
8
+ title = 'Profile Developer',
9
+ icon = ':material/person:',
10
+ default= True
11
+ )
12
+ #---- PAGE PROJECT ------
13
+ dashboard = st.Page(
14
+ page = 'pages/dashboard.py',
15
+ title = 'Dashboard Customer Retail',
16
+ icon = ':material/bar_chart:',
17
+ )
18
+ #---- PAGE PREDICTION ------
19
+ prediction = st.Page(
20
+ page = 'pages/predict.py',
21
+ title = 'Customer Category Prediction',
22
+ icon = ':material/thumb_up:'
23
+ )
24
+
25
+ page = st.navigation(
26
+ {
27
+ "Info": [info_page],
28
+ "Projects": [dashboard, prediction],
29
+ }
30
+ )
31
+
32
+ st.sidebar.info("Source code, find in My Github:")
33
+ st.sidebar.link_button("Github Source", "https://github.com/fendy07/customer-prediction")
34
+ st.sidebar.text(f'Created by Fendy Hendriyanto 👨🏼‍💻')
35
+
36
+ page.run()
 
src/train_model.py CHANGED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import onnx
2
+ import pickle
3
+ import joblib
4
+ import numpy as np
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ from sklearn import tree
8
+ import plotly.express as px
9
+ from scipy.stats import zscore
10
+ import matplotlib.pyplot as plt
11
+ from skl2onnx import convert_sklearn
12
+ from sklearn.pipeline import Pipeline
13
+ from sklearn.impute import SimpleImputer
14
+ from sklearn.feature_selection import RFE
15
+ from sklearn.tree import DecisionTreeClassifier
16
+ from sklearn.ensemble import RandomForestClassifier
17
+ from skl2onnx.common.data_types import FloatTensorType
18
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
19
+ from sklearn.metrics import accuracy_score, classification_report
20
+ from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
21
+ from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
22
+
23
+ # Load Dataset
24
+ data_ritel = pd.read_csv('data/customer_shopping_data.csv')
25
+ data_ritel.sample(25)
26
+
27
+ data_ritel.info()
28
+
29
+ data_ritel.shape
30
+
31
+ """## Exploratory Data Analysis"""
32
+
33
+ data_ritel.isnull().sum()
34
+
35
+ # Analyze the data in the category column
36
+ category = data_ritel['category'].value_counts()
37
+ print(category)
38
+
39
+ gender = data_ritel['gender'].value_counts()
40
+ print(gender)
41
+
42
+ payment_counts = data_ritel['payment_method'].value_counts()
43
+ print(payment_counts)
44
+
45
+ # Shopping mall insights based on the number of transactions
46
+ mall = data_ritel['shopping_mall'].value_counts()
47
+ print(mall)
48
+
49
+ # Visualize data on payment method and age
50
+ # Set the style for the plot
51
+ sns.set_style("whitegrid")
52
+ # Create a figure and axis object
53
+ fig, ax = plt.subplots(figsize=(10, 6))
54
+ # Plotting the bar chart
55
+ sns.barplot(x=payment_counts.index, y=payment_counts.values, palette="viridis", ax=ax)
56
+ # Set plot title and labels
57
+ ax.set_title("Distribution of Payment Methods", fontsize=16)
58
+ ax.set_xlabel("Payment Method", fontsize=14)
59
+ ax.set_ylabel("Number of Transactions", fontsize=14)
60
+ # Set x-axis tick labels
61
+ ax.set_xticks(payment_counts.index)
62
+ ax.set_xticklabels(payment_counts.index.unique(), fontsize=12)
63
+ # Display the plot
64
+ plt.show()
65
+
66
+ # Calculate the average prices for each product category
67
+ average_prices = data_ritel.groupby('category')['price'].mean()
68
+ average_prices
69
+
70
+ # Visualize the data in the category and price columns
71
+ # Create a bar chart for the average prices of each product category
72
+ fig = px.bar(average_prices,
73
+ x=average_prices.index,
74
+ y=average_prices.values,
75
+ labels={'x': 'Kategori Produk', 'y': 'Rata-rata Harga'},
76
+ title='Rata-rata harga dalam Kategori Produk')
77
+
78
+ # Show the plot
79
+ fig.show()
80
+
81
+ # Group the data by category and sum the quantity
82
+ category_quantity = data_ritel.groupby('category')['quantity'].sum()
83
+ # Plot pie chart
84
+ plt.figure(figsize=(10, 8))
85
+ plt.pie(category_quantity, labels=category_quantity.index, autopct='%1.1f%%', colors=sns.color_palette("pastel"))
86
+ # Set the title
87
+ plt.title('Distribusi Kategori Produk Berdasarkan Jumlah Quantity', fontsize=16)
88
+ # Show the plot
89
+ plt.show()
90
+
91
+ # Visualize total revenue per shopping mall (sum of the price column)
92
+ total_revenue = data_ritel.groupby('shopping_mall')['price'].sum()
93
+ fig = px.bar(total_revenue,
94
+ x = total_revenue.index,
95
+ y = total_revenue.values,
96
+ labels = {'x': 'Shopping Mall', 'y': 'Total Revenue'},
97
+ title = 'Total Pendapatan Setiap Pusat Perbelanjaan')
98
+
99
+ # Show the plot
100
+ fig.show()
101
+
102
+ # Top-selling categories by total quantity sold
103
+ category_quantity = data_ritel.groupby('category')['quantity'].sum().sort_values(ascending=False)
104
+ # Create a bar chart for the top-selling product categories
105
+ fig = px.bar(category_quantity,
106
+ x = category_quantity.index,
107
+ y = category_quantity.values,
108
+ labels = {'x': 'Kategori Produk', 'y': 'Total Kuantitas Terjual'},
109
+ title = 'Top Penjualan Kuantitas Kategori')
110
+
111
+ # Show the plot
112
+ fig.show()
113
+
114
+ # Visualize the data in the age column
115
+ # Plot a histogram of the age distribution
116
+ plt.figure(figsize=(10, 6))
117
+ sns.histplot(data_ritel['age'], bins=20, kde=False, color='skyblue')
118
+
119
+ # Set the title and axis labels
120
+ plt.title('Distribusi Umur Pelanggan', fontsize=16)
121
+ plt.xlabel('Umur', fontsize=14)
122
+ plt.ylabel('Jumlah Pelanggan', fontsize=14)
123
+ # Show the plot
124
+ plt.show()
125
+ # Customer demographics by gender and age
126
+ demographics_summary = data_ritel[['gender', 'age']].describe(include='all')
127
+ demographics_summary
128
+ # Visualize the number of transactions at each shopping mall
129
+ fig = px.bar(data_ritel['shopping_mall'].value_counts(),
130
+ x = data_ritel['shopping_mall'].value_counts().index,
131
+ y = data_ritel['shopping_mall'].value_counts().values,
132
+ labels = {'x': 'Shopping Mall', 'y': 'Nominal Transaksi'},
133
+ title = 'Jumlah Nominal Transaksi Pada Pusat Perbelanjaan')
134
+ fig.show()
135
+
136
+
137
+ ### **Data Preprocessing**
138
+ # Encode the data columns with feature mapping
139
+ # Encode the payment method column
140
+ data_ritel['payment_method'] = data_ritel['payment_method'].map({'Cash': 0, 'Credit Card': 1, 'Debit Card': 2})
141
+ # Encode the gender column
142
+ data_ritel['gender'] = data_ritel['gender'].map({'Female': 0, 'Male': 1})
143
+ # Encode the shopping mall column
144
+ data_ritel['shopping_mall'] = data_ritel['shopping_mall'].map({'Mall of Istanbul': 0,
145
+ 'Kanyon': 1,
146
+ 'Metrocity': 2,
147
+ 'Metropol AVM': 3,
148
+ 'Istinye Park': 4,
149
+ 'Zorlu Center': 5,
150
+ 'Cevahir AVM': 6,
151
+ 'Forum Istanbul': 7,
152
+ 'Viaport Outlet': 8,
153
+ 'Emaar Square Mall': 9})
154
+
155
+ # Encode the category column
156
+ le = LabelEncoder()
157
+ data_ritel['category'] =le.fit_transform(data_ritel['category'])
158
+ data_ritel.sample(10)
159
+ # Descriptive statistics analysis
160
+ data_ritel.describe().T
161
+ # Drop the data columns that are not needed
162
+ data_ritel = data_ritel.drop(columns = ['invoice_no', 'customer_id', 'invoice_date'])
163
+ data_ritel.sample(10)
164
+ # Correlation between the data columns
165
+ plt.figure(figsize = (12, 8))
166
+ sns.heatmap(data_ritel.corr(), annot = True)
167
+ plt.show()
168
+
169
+ # Select the feature columns and the label
170
+ features = ['age', 'gender', 'price', 'payment_method', 'shopping_mall']
171
+
172
+ X = data_ritel[features].values
173
+ y = data_ritel['category'].values
174
+
175
+ """### **Splitting Data**"""
176
+
177
+ # Split the data into training and test sets
178
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 44)
179
+ print('Data training : ', X_train.shape, y_train.shape)
180
+ print('Data Testing : ', X_test.shape, y_test.shape)
181
+
182
+ data_ritel.category.value_counts()
183
+
184
+ # Filling Missing Data
185
+ imputer = SimpleImputer(strategy = 'mean')
186
+ X_train_imputed = imputer.fit_transform(X_train)
187
+ X_test_imputed = imputer.transform(X_test)
188
+
189
+ # Data Scaling
190
+ scaler = StandardScaler()
191
+ X_train_scaled = scaler.fit_transform(X_train_imputed)
192
+ X_test_scaled = scaler.transform(X_test_imputed)
193
+
194
+ # Outlier detection using Z-Score
195
+ z_scores = np.abs(zscore(X_train_scaled))
196
+ threshold = 5
197
+ outliers = np.where(z_scores > threshold)
198
+
199
+ X_train_no_outliers = X_train_scaled[(z_scores < threshold).all(axis=1)]
200
+ y_train_no_outliers = y_train[(z_scores < threshold).all(axis=1)]
201
+
202
+ # Modelling
203
+ # Decision Tree Classifier
204
+ model_dt = DecisionTreeClassifier(random_state = 44)
205
+ rfe = RFE(model_dt, n_features_to_select=5)
206
+
207
+ X_train_rfe = rfe.fit_transform(X_train_no_outliers, y_train_no_outliers)
208
+ X_test_rfe = rfe.transform(X_test_scaled)
209
+
210
+ selected_features = np.array(features)[rfe.support_]
211
+ print(selected_features)
212
+
213
+ """### **Decision Tree**"""
214
+
215
+ # Training with Pipeline
216
+ pipeline = Pipeline([
217
+ ('Classifier', DecisionTreeClassifier(random_state = 44)),
218
+
219
+ ])
220
+
221
+ param_grid = {
222
+ 'Classifier__max_depth': list(range(2, 10)),
223
+ 'Classifier__max_leaf_nodes': list(range(2, 10))
224
+ }
225
+
226
+ gridsearch = GridSearchCV(pipeline, param_grid, cv=20)
227
+ gridsearch.fit(X_train_rfe, y_train_no_outliers)
228
+
229
+ best_model = gridsearch.best_estimator_
230
+ print(gridsearch.best_params_)
231
+
232
+ # Evaluation Model Decision Tree
233
+ # Predict the model
234
+ y_pred = best_model.predict(X_test_rfe)
235
+ print("Accuracy:", accuracy_score(y_test, y_pred))
236
+
237
+ scores = cross_val_score(best_model, X_train_no_outliers, y_train_no_outliers, cv=20, scoring='accuracy')
238
+ print("Cross-Validation Accuracy Scores:", scores)
239
+
240
+ # Classification Report
241
+ target_names = ['Books',
242
+ 'Clothing',
243
+ 'Cosmetics',
244
+ 'Food & Beverage',
245
+ 'Shoes',
246
+ 'Souvenir',
247
+ 'Technology',
248
+ 'Toys']
249
+
250
+ print('Classification Report in Hyperparameter Tuning Decision Tree:')
251
+ print(classification_report(y_test, y_pred, target_names = le.classes_))
252
+
253
+ # Plot Decision Tree
254
+ # Assuming 'best_model' is your Pipeline object
255
+ decision_tree = best_model.named_steps['Classifier'] # Replace 'Classifier' with your step name
256
+ plt.figure(figsize = (25, 20))
257
+ tree.plot_tree(decision_tree,
258
+ feature_names = features,
259
+ class_names = target_names,
260
+ filled=True)
261
+
262
+ plt.show()
263
+
264
+ # Random Forest
265
+ model_rf = RandomForestClassifier(random_state = 44)
266
+ rfe = RFE(model_rf, n_features_to_select = 5)
267
+
268
+ X_train_rfe = rfe.fit_transform(X_train_no_outliers, y_train_no_outliers)
269
+ X_test_rfe = rfe.transform(X_test_scaled)
270
+
271
+ selected_features = np.array(features)[rfe.support_]
272
+ print(selected_features)
273
+
274
+ # Training with Pipeline
275
+ pipeline = Pipeline([
276
+ ('Classifier', RandomForestClassifier(random_state = 44)),
277
+ ])
278
+
279
+ param_grid = {
280
+ 'Classifier__n_estimators': [100, 200, 300],
281
+ 'Classifier__max_depth': [None, 5, 10]
282
+ }
283
+
284
+ grid_search = GridSearchCV(pipeline, param_grid, cv = 5)
285
+ grid_search.fit(X_train_rfe, y_train_no_outliers)
286
+
287
+ best_model_rf = grid_search.best_estimator_
288
+ print(grid_search.best_params_)
289
+
290
+ ## **Random Forest Model Evaluation**
291
+ y_pred_rf = best_model_rf.predict(X_test_rfe)
292
+ print("Accuracy Score :", accuracy_score(y_test, y_pred_rf))
293
+ scores = cross_val_score(best_model_rf, X_train_no_outliers, y_train_no_outliers, cv = 5, scoring = 'accuracy')
294
+ print("Cross-Validation Accuracy Scores: ", scores)
295
+ print('Classification Report in Hyperparameter Tuning Random Forest:')
296
+ print(classification_report(y_test, y_pred_rf, target_names = target_names))
297
+
298
+ # Assuming 'y_test' and 'y_pred_rf' are your true and predicted labels respectively
299
+ cm = confusion_matrix(y_test, y_pred_rf)
300
+ disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = target_names)
301
+ disp.plot(cmap = 'Blues', xticks_rotation = 'vertical')
302
+ plt.title('Confusion Matrix - Random Forest')
303
+ plt.show()
304
+
305
+ # Assuming 'best_model_rf' is your best performing model (Random Forest in this case)
306
+ # Input features for new data (replace with actual values)
307
+ new_data = np.array([[30, 0, 50, 1, 0]]) # Example: age, gender, price, payment_method, shopping_mall
308
+ # Preprocess the new data (scaling)
309
+ new_data_scaled = scaler.transform(new_data)
310
+ # Feature selection using RFE (if used during training)
311
+ new_data_rfe = rfe.transform(new_data_scaled)
312
+ # Make prediction
313
+ predicted_category = best_model.predict(new_data_rfe)
314
+ # Decode the predicted category (if label encoding was used)
315
+ predicted_category_name = le.inverse_transform(predicted_category)
316
+ print("Predicted Category (Numerical):", predicted_category)
317
+ print("Predicted Category (Name):", predicted_category_name)
318
+
319
+ # Save model using Pickle
320
+ with open('model/best_model_rf.pkl', 'wb') as file:
321
+ pickle.dump(best_model_rf, file)
322
+
323
+ # Save model using Joblib
324
+ joblib.dump(best_model_rf, 'model/best_model_rf.joblib')
325
+
326
+ # Save model using ONNX
327
+ initial_type = [('float_input', FloatTensorType([None, X_train_rfe.shape[1]]))]
328
+ # Convert the scikit-learn Random Forest model to ONNX format
329
+ onnx_model = convert_sklearn(best_model_rf, initial_types=initial_type)
330
+ # Define the path to save the ONNX model
331
+ onnx_filename = 'model/best_model_rf.onnx'
332
+ # Save the ONNX model to a file
333
+ with open(onnx_filename, "wb") as f:
334
+ f.write(onnx_model.SerializeToString())
335
+ print(f"Model saved to {onnx_filename} in ONNX format.")
336
+
337
+ # Optional: Verify the ONNX model
338
+ onnx_model_loaded = onnx.load(onnx_filename)
339
+ onnx.checker.check_model(onnx_model_loaded)
340
+ print("ONNX model check successful!")
341
+
342
+ # Load Model Pickle
343
+ filename = 'model/best_model_rf.pkl'
344
+ model = pickle.load(open(filename, 'rb'))
345
+ model.score(X_test_rfe, y_test)