fendy07 commited on
Commit
899d70c
·
1 Parent(s): 0822fa4

Update project structure and add new pages

Browse files
requirements.txt CHANGED
@@ -8,7 +8,6 @@ plotly
8
  requests
9
  scikit-learn
10
  imbalanced-learn
11
- pickle
12
  joblib
13
  onnx
14
  skl2onnx
 
8
  requests
9
  scikit-learn
10
  imbalanced-learn
 
11
  joblib
12
  onnx
13
  skl2onnx
src/pages/about.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""About page: author profile, experience, and hard skills."""
import streamlit as st

# Markdown bodies kept as module-level constants so the layout code below
# stays compact.
_QUALIFICATIONS = """
    - 2 years experience coaching and mentoring about Artificial Intelligence
    - Strong hands-on experience and knowledge in Python and Data Science
    - Proficient in using various libraries and tools such as TensorFlow, Keras, Scikit-learn, OpenCV, Pandas
    - Good understanding and analyzing of statistical principles and their perspective applications
    - Excellent team player and initiative on tasks

    """

_HARD_SKILLS = """
    - Programming : Python (Pandas, Scikit-learn, Scikit-image), R, SQL, JavaScript
    - Data Visualization : Tableau, Spreadsheet, Excel
    - Modelling : Tensorflow, Keras, PyCaret, XGBoost, CometML
    - Databases : MySQL, PostgreSQL, SQLite
    - Deployment : Streamlit, Flask, Gradio, Huggingface, Git
    - Frameworks : OpenCV, NLTK
    
    """

st.title('Author Project')

# Photo on the left, short bio on the right, vertically centered.
photo_col, profile_col = st.columns(2, gap='small', vertical_alignment='center')
with photo_col:
    st.image("images/Fendy.png", width=250)
with profile_col:
    st.title("Fendy Hendriyanto", anchor=False)
    st.write("AI Engineer and Instructor")
    st.write("Assisting and mentoring students to help analyze and supporting data driven with creativity and decision making.")

# --- Experience & qualifications ---
st.write("\n")
st.subheader("Experience and Qualifications", anchor=False)
st.write(_QUALIFICATIONS)

# --- Hard skills ---
st.write("\n")
st.subheader("Hard Skills", anchor=False)
st.write(_HARD_SKILLS)
src/pages/dashboard.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import seaborn as sns
import streamlit as st
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

st.title("Dashboard Analysis Customer Retail")

# Load CSS style
with open('static/styles.css') as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

# Load dataset (cached so the CSV is read once per session, not on every rerun)
@st.cache_data
def load_data():
    """Read the customer shopping CSV and return it as a DataFrame."""
    data = pd.read_csv('data/customer_shopping_data.csv')
    return data
data = load_data()

with st.expander("HASIL DATA"):
    # Rename the raw snake_case columns to presentation-friendly names.
    data = pd.DataFrame({
        'InvoiceNo': data['invoice_no'],
        'CustomerID': data['customer_id'],
        'Gender': data['gender'],
        'Age': data['age'],
        'Category': data['category'],
        'Quantity': data['quantity'],
        'Price': data['price'],
        'PaymentMethod': data['payment_method'],
        'InvoiceDate': data['invoice_date'],
        'ShoppingMall': data['shopping_mall']
    })
    st.dataframe(data, use_container_width=True)

    # Download Dataset
    download = data.to_csv(index=False).encode('utf-8')
    st.download_button(label = "DOWNLOAD DATASET",
                       data = download,
                       key = 'download_data.csv',
                       file_name = 'dataset_retail.csv')

# Visualization
with st.expander("DISTRIBUSI KATEGORI DAN PEMBAYARAN"):
    col1, col2 = st.columns(2)
    with col1:
        data_quantity = data.groupby('Category')['Quantity'].sum()
        # Plot Pie Chart
        plt.figure(figsize = (10, 8))
        plt.pie(data_quantity.values, labels = data_quantity.index,
                autopct = '%1.1f%%', colors = sns.color_palette("pastel"))
        # Title
        plt.title('Kuantitas Produk Berdasarkan Kategori', fontsize = 16)
        st.pyplot(plt)
        # Fix: close the global pyplot figure so figures do not accumulate
        # across Streamlit reruns (memory leak).
        plt.close()

    with col2:
        payment_counts = data['PaymentMethod'].value_counts()
        fig = px.bar(x = payment_counts.index, y = payment_counts.values,
                     labels = {'x': 'Metode Pembayaran', 'y': 'Jumlah Transaksi'},
                     color = payment_counts.index)
        # Fix: merged the two update_layout calls and dropped the unused
        # `title = ...` assignment (update_layout mutates fig in place).
        fig.update_layout(font_size = 14,
                          title = {'text': 'Distribusi Metode Pembayaran',
                                   'xanchor': 'center',
                                   'yanchor': 'top',
                                   'x': 0.5,
                                   'y': 0.95})

        st.plotly_chart(fig, use_container_width=True)

    # Fix: dropped the pointless f-prefix (the string has no placeholders).
    st.write("<b>NOTES</b>: Distribusi dalam kategori berdasarkan kuantitas kategori produk yang sering dibeli oleh pelanggan adalah baju, kosmetik dan F&B. Sedangkan, metode pembayaran dengan transaksi terbanyak adalah Cash dan Credit.", unsafe_allow_html=True)

with st.expander("TOTAL PENDAPATAN DAN PENJUALAN"):
    col1, col2 = st.columns(2)
    with col1:
        total_revenue = data.groupby('ShoppingMall')['Price'].sum()
        fig = px.bar(x = total_revenue.index, y = total_revenue.values,
                     labels = {'x': 'Mall', 'y': 'Total Pendapatan'},
                     color = total_revenue.index)
        # Fix: dropped the unused `title = ...` assignment.
        fig.update_layout(title = {'text': 'Total Pendapatan Setiap Pusat Perbelanjaan',
                                   'xanchor': 'center',
                                   'yanchor': 'top',
                                   'x': 0.5,
                                   'y': 0.95})

        st.plotly_chart(fig, use_container_width=True)

    with col2:
        total_sales = data.groupby('ShoppingMall')['Quantity'].sum().sort_values(ascending=False)
        fig = px.bar(x = total_sales.index, y = total_sales.values,
                     labels = {'x': 'Mall', 'y': 'Total Penjualan'},
                     color = total_sales.index)
        fig.update_layout(title = {'text': 'Total Penjualan Setiap Pusat Perbelanjaan',
                                   'xanchor': 'center',
                                   'yanchor': 'top',
                                   'x': 0.5,
                                   'y': 0.95})

        st.plotly_chart(fig, use_container_width=True)
    st.write("<b>NOTES</b>: Pusat perbelanjaan dengan total pendapatan dan penjualan tertinggi adalah Mall of Istanbul, diikuti oleh Mall Kanyon dan Mall Metrocity.", unsafe_allow_html=True)
src/pages/predict.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import onnxruntime as ort
import plotly.express as px
from scipy.stats import zscore
import matplotlib.pyplot as plt
from skl2onnx import convert_sklearn
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from skl2onnx.common.data_types import FloatTensorType
from streamlit_extras.metric_cards import style_metric_cards
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

st.title("Customer Category Prediction (Case: Turkey Customer)")
st.write("Prediction Customer in Turkey with Probability Using Ensemble Technique Based")

# Load CSS style
with open('static/styles.css') as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

# Load Dataset
retail = pd.read_csv('data/customer_shopping_data.csv')

# Feature matrix and (single-column) target frame
X = retail.loc[:, ['age', 'gender', 'price', 'payment_method', 'shopping_mall']]
y = retail[['category']]

# Encode categorical variables.
# NOTE: one LabelEncoder instance is re-fitted per column, overwriting its
# mapping each time. After this section `le` holds the *category* mapping,
# which later code relies on to inverse-transform predictions — do not
# reorder these calls.
le = LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])
X['payment_method'] = le.fit_transform(X['payment_method'])
X['shopping_mall'] = le.fit_transform(X['shopping_mall'])
# Fix: pass the 1-D Series instead of the (n, 1) DataFrame — LabelEncoder
# expects 1-D input and warns (or raises, depending on the sklearn
# version) on 2-D input. The resulting integer codes are identical.
y_encoded = le.fit_transform(y['category'])

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=44)

# Preprocessing: standardize features (fit on the train split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Outlier detection using Z-Score
z_scores = np.abs(zscore(X_train_scaled))
threshold = 5  # keep rows whose every feature is within 5 std devs
outliers = np.where(z_scores > threshold)

# Drop training rows containing any outlier feature (test set untouched).
X_train_clean = X_train_scaled[(z_scores < threshold).all(axis=1)]
y_train_clean = y_train[(z_scores < threshold).all(axis=1)]
#------------ MODEL TRAINING SECTION ---------
# Renders the training/model-management expander. Streamlit `with` blocks
# affect layout only, not Python scope: every widget value bound below
# (n_estimators, test_size, random_state, n_features, train_button,
# model_format, load_option) plus the model_loaded/use_onnx flags are
# module-level names consumed by the sections further down the script.
with st.expander("🔄 MODEL TRAINING & MANAGEMENT"):
    st.subheader("Train or Load Model")

    col1, col2 = st.columns(2)

    with col1:
        # Hyper-parameters for training a fresh RandomForest + RFE model.
        st.write("### Training Parameters")
        n_estimators = st.slider("Number of Trees (n_estimators)",
                                 min_value=50, max_value=500, value=300, step=50)
        test_size = st.slider("Test Size",
                              min_value=0.1, max_value=0.4, value=0.2, step=0.05)
        random_state = st.number_input("Random State",
                                       min_value=0, max_value=100, value=44)
        n_features = st.slider("Number of Features to Select (RFE)",
                               min_value=1, max_value=5, value=5)

        train_button = st.button("🚀 TRAIN NEW MODEL", type="primary")

    with col2:
        # Choose which serialized model (if any) to load from disk.
        st.write("### Model Management")
        model_format = st.radio("Choose Model Format:",
                                ["ONNX Model (.onnx)", "Pickle Model (.pkl)"])

        load_option = st.radio("Choose Model Source:",
                               ["Load Existing Model", "Use Newly Trained Model"])

        if load_option == "Load Existing Model":
            if model_format == "ONNX Model (.onnx)":
                # ONNX inference needs both the graph and the pickled
                # preprocessing metadata (scaler/RFE/label encoder).
                model_path = 'model/best_model_rf.onnx'
                metadata_path = 'model/model_metadata.pkl'
                if os.path.exists(model_path) and os.path.exists(metadata_path):
                    st.success("✅ ONNX model found!")
                    model_loaded = True
                    use_onnx = True
                else:
                    st.error("❌ ONNX model not found. Please train a new model first.")
                    model_loaded = False
                    use_onnx = False
            else:
                model_path = 'model/best_model_rf.pkl'
                if os.path.exists(model_path):
                    st.success("✅ Pickle model found!")
                    model_loaded = True
                    use_onnx = False
                else:
                    st.error("❌ Pickle model not found. Please train a new model first.")
                    model_loaded = False
                    use_onnx = False
        else:
            # "Use Newly Trained Model": nothing is loaded from disk; the
            # session-state model (if any) is picked up further down.
            model_loaded = False
            use_onnx = False

# Initialize session state for model
# Session state survives Streamlit reruns, so a model trained once stays
# available until the browser session ends.
if 'trained_model' not in st.session_state:
    st.session_state.trained_model = None
    st.session_state.trained_rfe = None
    st.session_state.trained_scaler = None
    st.session_state.trained_le = None
    st.session_state.model_metrics = None
    st.session_state.onnx_session = None
# Train new model
# Runs only on the rerun triggered by the TRAIN button: re-splits with the
# chosen parameters, cleans outliers, fits RandomForest + RFE, stores the
# artifacts in session state, and persists them as Pickle and (best-effort)
# ONNX files.
if train_button:
    with st.spinner("Training model... Please wait..."):
        # Re-split data with new test_size
        X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
            X, y_encoded, test_size=test_size, random_state=random_state
        )

        # Preprocessing
        scaler_new = StandardScaler()
        X_train_scaled_new = scaler_new.fit_transform(X_train_new)
        X_test_scaled_new = scaler_new.transform(X_test_new)

        # Outlier removal (reuses the module-level z-score `threshold`)
        z_scores_new = np.abs(zscore(X_train_scaled_new))
        X_train_clean_new = X_train_scaled_new[(z_scores_new < threshold).all(axis=1)]
        y_train_clean_new = y_train_new[(z_scores_new < threshold).all(axis=1)]

        # Model training with RFE
        classifier_new = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
        rfe_new = RFE(classifier_new, n_features_to_select=n_features)
        X_train_rfe = rfe_new.fit_transform(X_train_clean_new, y_train_clean_new)
        X_test_rfe = rfe_new.transform(X_test_scaled_new)

        # Fit the model on the RFE-reduced feature set used for prediction
        classifier_new.fit(X_train_rfe, y_train_clean_new)

        # Predictions
        y_pred_new = classifier_new.predict(X_test_rfe)

        # Calculate metrics (weighted averages across the classes)
        metrics = {
            'accuracy': accuracy_score(y_test_new, y_pred_new),
            'precision': precision_score(y_test_new, y_pred_new, average='weighted'),
            'recall': recall_score(y_test_new, y_pred_new, average='weighted'),
            'f1_score': f1_score(y_test_new, y_pred_new, average='weighted')
        }

        # Save to session state so the model survives Streamlit reruns
        st.session_state.trained_model = classifier_new
        st.session_state.trained_rfe = rfe_new
        st.session_state.trained_scaler = scaler_new
        st.session_state.trained_le = le
        st.session_state.model_metrics = metrics
        st.session_state.X_test = X_test_rfe
        st.session_state.y_test = y_test_new
        st.session_state.y_pred = y_pred_new

        # Save as Pickle: one bundle with the model plus every
        # preprocessing artifact needed to reproduce predictions
        model_package = {
            'classifier': classifier_new,
            'rfe': rfe_new,
            'scaler': scaler_new,
            'label_encoder': le,
            'metrics': metrics,
            'n_features': n_features
        }

        with open('model/best_model_rf.pkl', 'wb') as f:
            pickle.dump(model_package, f)

        # Convert and Save as ONNX
        try:
            # Define initial type for ONNX conversion
            initial_type = [('float_input', FloatTensorType([None, n_features]))]

            # Convert model to ONNX
            onnx_model = convert_sklearn(classifier_new, initial_types=initial_type,
                                         target_opset=12)

            # Save ONNX model
            with open('model/best_model_rf.onnx', 'wb') as f:
                f.write(onnx_model.SerializeToString())

            # Save metadata (scaler, rfe, label_encoder) separately — the
            # ONNX graph contains only the classifier itself
            metadata = {
                'scaler': scaler_new,
                'rfe': rfe_new,
                'label_encoder': le,
                'metrics': metrics,
                'n_features': n_features,
                'feature_names': ['age', 'gender', 'price', 'payment_method', 'shopping_mall']
            }

            with open('model/model_metadata.pkl', 'wb') as f:
                pickle.dump(metadata, f)

            st.success(f"✅ Model trained and saved successfully!")
            st.success(f"📊 Accuracy: {metrics['accuracy']:.4f}")
            st.success(f"💾 Saved as: Pickle (.pkl) and ONNX (.onnx)")

        except Exception as e:
            # Best-effort: keep the Pickle artifact even if ONNX export fails
            st.warning(f"⚠️ Model saved as Pickle only. ONNX conversion failed: {str(e)}")

        st.balloons()
# Determine which model to use
# Priority: (1) model trained in this session, (2) ONNX file, (3) Pickle
# file, (4) train a default model on the fly. Every branch must leave the
# same module-level names defined for the sections below: classifier, rfe,
# le_model, X_test_final, y_test_final, y_pred_final,
# accuracy/precision/recall/f1, and onnx_session.
if load_option == "Use Newly Trained Model" and st.session_state.trained_model is not None:
    classifier = st.session_state.trained_model
    rfe = st.session_state.trained_rfe
    scaler = st.session_state.trained_scaler
    le_model = st.session_state.trained_le
    X_test_final = st.session_state.X_test
    y_test_final = st.session_state.y_test
    y_pred_final = st.session_state.y_pred

    accuracy = st.session_state.model_metrics['accuracy']
    precision = st.session_state.model_metrics['precision']
    recall = st.session_state.model_metrics['recall']
    f1 = st.session_state.model_metrics['f1_score']

    onnx_session = None
    st.info("🔵 Using newly trained model from this session")

elif model_loaded and use_onnx:
    # Load ONNX Model
    try:
        onnx_session = ort.InferenceSession('model/best_model_rf.onnx')

        # Load metadata (preprocessing artifacts are not part of the ONNX
        # graph, so they travel in a separate pickle)
        with open('model/model_metadata.pkl', 'rb') as f:
            metadata = pickle.load(f)

        scaler = metadata['scaler']
        rfe = metadata['rfe']
        le_model = metadata['label_encoder']
        metrics = metadata.get('metrics', {})

        # Apply transformations
        # NOTE(review): fit_transform re-fits the loaded RFE on the current
        # split instead of reusing its saved fit — confirm this is intended.
        X_train_rfe = rfe.fit_transform(X_train_clean, y_train_clean)
        X_test_final = rfe.transform(X_test_scaled)

        # Predict using ONNX (output 0 is taken as the predicted labels)
        input_name = onnx_session.get_inputs()[0].name
        label_name = onnx_session.get_outputs()[0].name

        y_pred_final = onnx_session.run([label_name], {input_name: X_test_final.astype(np.float32)})[0]
        y_test_final = y_test

        # Calculate metrics: prefer the stored training metrics, fall back
        # to recomputing from the current predictions
        accuracy = metrics.get('accuracy', accuracy_score(y_test_final, y_pred_final))
        precision = metrics.get('precision', precision_score(y_test_final, y_pred_final, average='weighted'))
        recall = metrics.get('recall', recall_score(y_test_final, y_pred_final, average='weighted'))
        f1 = metrics.get('f1_score', f1_score(y_test_final, y_pred_final, average='weighted'))

        classifier = None  # ONNX doesn't need sklearn classifier

        st.info("🟢 Using ONNX model from file")

    except Exception as e:
        # NOTE(review): this fallback only flips the flags; since the
        # if/elif chain has already been entered, no other branch runs on
        # this pass, leaving accuracy/X_test_final etc. undefined until the
        # next rerun — the sections below will then raise NameError.
        st.error(f"Failed to load ONNX model: {str(e)}")
        st.warning("Falling back to default model...")
        model_loaded = False
        use_onnx = False
        onnx_session = None

elif model_loaded and not use_onnx:
    # Load Pickle Model
    with open('model/best_model_rf.pkl', 'rb') as f:
        model_data = pickle.load(f)

    if isinstance(model_data, dict):
        # New-style bundle: classifier plus preprocessing artifacts
        classifier = model_data['classifier']
        rfe = model_data.get('rfe', None)
        scaler = model_data.get('scaler', scaler)
        le_model = model_data.get('label_encoder', le)

        if rfe is None:
            rfe = RFE(classifier, n_features_to_select=5)

        # Apply transformations (re-fit on the current split)
        X_train_rfe = rfe.fit_transform(X_train_clean, y_train_clean)
        X_test_final = rfe.transform(X_test_scaled)
        classifier.fit(X_train_rfe, y_train_clean)
        y_pred_final = classifier.predict(X_test_final)
        y_test_final = y_test

        # Calculate metrics
        accuracy = accuracy_score(y_test_final, y_pred_final)
        precision = precision_score(y_test_final, y_pred_final, average='weighted')
        recall = recall_score(y_test_final, y_pred_final, average='weighted')
        f1 = f1_score(y_test_final, y_pred_final, average='weighted')
    else:
        # Legacy pickle: a bare estimator or a sklearn Pipeline
        classifier = model_data
        le_model = le

        if hasattr(classifier, 'named_steps') or hasattr(classifier, 'steps'):
            # Pipeline handles its own preprocessing — feed raw features
            y_pred_final = classifier.predict(X_test)
            y_test_final = y_test
            X_test_final = X_test_scaled
            rfe = None
        else:
            rfe = RFE(classifier, n_features_to_select=5)
            X_train_rfe = rfe.fit_transform(X_train_clean, y_train_clean)
            X_test_final = rfe.transform(X_test_scaled)
            classifier.fit(X_train_rfe, y_train_clean)
            y_pred_final = classifier.predict(X_test_final)
            y_test_final = y_test

        accuracy = accuracy_score(y_test_final, y_pred_final)
        precision = precision_score(y_test_final, y_pred_final, average='weighted')
        recall = recall_score(y_test_final, y_pred_final, average='weighted')
        f1 = f1_score(y_test_final, y_pred_final, average='weighted')

    onnx_session = None
    st.info("🟢 Using Pickle model from file")

else:
    # Default: train on the fly with fixed hyper-parameters
    classifier = RandomForestClassifier(n_estimators=300, random_state=44)
    rfe = RFE(classifier, n_features_to_select=5)
    X_train_rfe = rfe.fit_transform(X_train_clean, y_train_clean)
    X_test_final = rfe.transform(X_test_scaled)
    classifier.fit(X_train_rfe, y_train_clean)
    y_pred_final = classifier.predict(X_test_final)
    y_test_final = y_test
    le_model = le

    accuracy = accuracy_score(y_test_final, y_pred_final)
    precision = precision_score(y_test_final, y_pred_final, average='weighted')
    recall = recall_score(y_test_final, y_pred_final, average='weighted')
    f1 = f1_score(y_test_final, y_pred_final, average='weighted')

    onnx_session = None
    st.warning("⚠️ Using default model (trained on-the-fly)")
# Evaluation Metrics
with st.expander("📊 EVALUATION METRICS"):
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("ACCURACY", value=f'{accuracy:.4f}', delta='Accuracy Score')
    col2.metric("PRECISION", value=f'{precision:.4f}', delta='Precision Score With Weighted Average')
    col3.metric("RECALL", value=f'{recall:.4f}', delta='Recall Score With Weighted Average')
    col4.metric("F1 SCORE", value=f'{f1:.4f}', delta='F1 Score with Weighted Average')
    style_metric_cards(background_color='#FFFFFF', border_left_color='#9900AD', border_color='#1F66BD', box_shadow='#F71938')
    # Fix: dropped the pointless f-prefix (the string has no placeholders).
    st.write("<b>NOTES</b>: Hasil evaluasi metriks yang diterapkan sangat baik dan sudah sesuai dengan hasil pelatihan model algoritma Random Forest.", unsafe_allow_html=True)

# Prediction Table
with st.expander("📋 PREDICT TABLE".replace("PREDICT TABLE", "PREDICTION TABLE")):
    # Fix: the original indexed columns 0..4 unconditionally, which raises
    # IndexError whenever RFE selected fewer than 5 features. Build the
    # columns from what RFE actually kept instead (identical output for
    # the default n_features=5, where rfe.support_ is all-True).
    all_features = ['age', 'gender', 'price', 'payment_method', 'shopping_mall']
    if rfe is not None and hasattr(rfe, 'support_'):
        kept_features = [name for name, kept in zip(all_features, rfe.support_) if kept]
    else:
        kept_features = all_features
    table_columns = {name: X_test_final[:, i].ravel()
                     for i, name in enumerate(kept_features)}
    table_columns['Category | Actual Y'] = y_test_final.ravel()
    table_columns['Y_Predicted'] = y_pred_final.ravel()
    # Overall metrics repeated per row so they survive the CSV export.
    table_columns['Accuracy'] = [accuracy] * len(y_test_final)
    table_columns['Precision'] = [precision] * len(y_test_final)
    table_columns['Recall'] = [recall] * len(y_test_final)
    table_columns['F1 Score'] = [f1] * len(y_test_final)
    prediction_table = pd.DataFrame(table_columns)

    st.dataframe(prediction_table, use_container_width=True)
    # Fix: dropped the pointless f-prefix here as well.
    st.write('<b>NOTES</b>: Pada bagian tabel prediksi ini menggunakan data yang telah diolah sebelumnya sehingga sangat berbeda dengan data asli.', unsafe_allow_html=True)

    # Download Predicted Table in CSV
    df_predict = prediction_table.to_csv(index=False).encode('utf-8')
    st.download_button(label="📥 DOWNLOAD PREDICTED DATA",
                       data=df_predict,
                       key="download_predict.csv",
                       file_name='data_predict.csv')
# Confusion Matrix and Feature Importance
with st.expander("🔍 CONFUSION MATRIX & FEATURE IMPORTANCE"):
    col1, col2 = st.columns(2)
    with col1:
        # Hard-coded class names — assumes the dataset always contains
        # exactly these 8 categories in alphabetical order (matching
        # LabelEncoder's sorted encoding). TODO confirm against the data.
        target_names = ['Books', 'Clothing', 'Cosmetics', 'Food & Beverage',
                        'Shoes', 'Souvenir', 'Technology', 'Toys']
        cm = confusion_matrix(y_test_final, y_pred_final)
        plt.figure(figsize=(15, 8))
        sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=target_names, yticklabels=target_names)
        plt.title('Confusion Matrix Customer Category Prediction')
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')
        # NOTE(review): the global pyplot figure is never closed, so
        # figures accumulate across Streamlit reruns.
        st.pyplot(fig=plt, use_container_width=True)

    # (Translated) Replace the Feature Importance section (around lines
    # 390-410) with this code:
    with col2:
        # Feature Importance only available for sklearn models, not ONNX
        if classifier is not None:
            try:
                # Check if classifier is a Pipeline
                if hasattr(classifier, 'named_steps'):
                    # Try common pipeline step names
                    if 'randomforestclassifier' in classifier.named_steps:
                        actual_classifier = classifier.named_steps['randomforestclassifier']
                    elif 'classifier' in classifier.named_steps:
                        actual_classifier = classifier.named_steps['classifier']
                    elif 'model' in classifier.named_steps:
                        actual_classifier = classifier.named_steps['model']
                    else:
                        # Get the last step (usually the classifier)
                        actual_classifier = list(classifier.named_steps.values())[-1]
                    feature_importance = actual_classifier.feature_importances_

                # Check if classifier has 'steps' attribute (another Pipeline format)
                elif hasattr(classifier, 'steps'):
                    # Get the last step which is typically the classifier
                    actual_classifier = classifier.steps[-1][1]
                    feature_importance = actual_classifier.feature_importances_

                # Direct classifier (not a pipeline)
                elif hasattr(classifier, 'feature_importances_'):
                    feature_importance = classifier.feature_importances_

                else:
                    raise AttributeError("No feature_importances_ found")

                # Create feature importance plot
                # NOTE(review): labels assume all 5 original features; if
                # RFE selected fewer, names and importances can misalign —
                # confirm n_features == 5 when reading this chart.
                feature_names = ['age', 'gender', 'price', 'payment_method', 'shopping_mall']
                importance_df = pd.DataFrame({
                    "Feature": feature_names,
                    "Importance": feature_importance
                })
                importance_df = importance_df.sort_values("Importance", ascending=True)

                bar = px.bar(importance_df, x='Importance', y='Feature')
                bar.update_layout(
                    title={
                        'text': 'Feature Importance Model Random Forest',
                        'xanchor': 'center',
                        'yanchor': 'top',
                        'x': 0.5,
                        'y': 0.95
                    }
                )
                st.plotly_chart(bar, use_container_width=True)

            except (AttributeError, KeyError, IndexError) as e:
                st.warning(f"⚠️ Feature importance is not available for this model type.\n\nDetails: {str(e)}")
                st.info("💡 This usually happens when:\n- The model is a Pipeline without a RandomForest classifier\n- The model is loaded from ONNX format\n- The classifier doesn't support feature importance")
        else:
            st.info("📊 Feature importance is not available for ONNX models.\nPlease use Pickle model to view feature importance.")

    st.write(f'<b>NOTES</b>: Hasil feature importance menunjukkan data fitur Price lebih dominan dibandingkan fitur lainnya dan evaluasi dengan Confusion Matrix terlihat sudah sangat cukup baik dalam hal identifikasi tiap kategori.', unsafe_allow_html=True)
#------------ PREDICT NEW DATA ---------
# Single-record prediction form. Uses the module-level `scaler`, `rfe`,
# `classifier` / `onnx_session` and `le_model` chosen by the selection
# logic above.
with st.expander("🎯 PREDICT NEW DATA"):
    with st.form("input_form", clear_on_submit=True):
        x1 = st.number_input("Age", min_value=0, max_value=100)
        x2 = st.selectbox("Gender", ["Male", "Female"])
        x3 = st.number_input("Price", min_value=0.0, max_value=10000.0, step=0.1)
        x4 = st.selectbox("Payment Method", ["Cash", "Credit Card", "Debit Card"])
        x5 = st.selectbox("Shopping Mall", ["Mall of Istanbul", "Kanyon",
                                            "Metrocity", "Metropol AVM",
                                            "Istinye Park", "Zorlu Center",
                                            "Cevahir AVM", "Forum Istanbul",
                                            "Viaport Outlet", "Emaar Square Mall"])
        submitted = st.form_submit_button(label="🔮 PREDICT")

    if submitted:
        # One-row frame matching the training feature order.
        new_data = pd.DataFrame({'age': [x1], 'gender': [x2], 'price': [x3],
                                 'payment_method': [x4], 'shopping_mall': [x5]})

        # Fresh encoders per column (unlike training, which reused one).
        le_gender = LabelEncoder()
        le_payment_method = LabelEncoder()
        le_shopping_mall = LabelEncoder()

        # Fit with original data to ensure consistent encoding
        le_gender.fit(retail['gender'])
        le_payment_method.fit(retail['payment_method'])
        le_shopping_mall.fit(retail['shopping_mall'])

        new_data['gender'] = le_gender.transform(new_data['gender'])
        new_data['payment_method'] = le_payment_method.transform(new_data['payment_method'])
        new_data['shopping_mall'] = le_shopping_mall.transform(new_data['shopping_mall'])

        # Apply transformations
        new_data_scaled = scaler.transform(new_data)
        if rfe is not None:
            # reshape(1, -1) is redundant for a single-row 2-D array, but harmless
            new_data_rfe = rfe.transform(new_data_scaled.reshape(1, -1))
        else:
            new_data_rfe = new_data_scaled.reshape(1, -1)

        # Make prediction based on model type
        if onnx_session is not None:
            # ONNX Prediction — assumes output 0 = labels and output 1 =
            # class probabilities (skl2onnx classifier convention);
            # TODO confirm for the exported graph.
            input_name = onnx_session.get_inputs()[0].name
            label_name = onnx_session.get_outputs()[0].name
            prob_name = onnx_session.get_outputs()[1].name

            pred_result = onnx_session.run([label_name, prob_name],
                                           {input_name: new_data_rfe.astype(np.float32)})
            predict_category = pred_result[0]
            predict_proba = pred_result[1]
        else:
            # Sklearn Prediction
            if hasattr(classifier, 'named_steps') or hasattr(classifier, 'steps'):
                # NOTE(review): the Pipeline path receives the encoded but
                # unscaled frame — assumes the pipeline scales internally;
                # confirm against the saved pipeline.
                predict_category = classifier.predict(new_data)
                predict_proba = classifier.predict_proba(new_data)
            else:
                predict_category = classifier.predict(new_data_rfe)
                predict_proba = classifier.predict_proba(new_data_rfe)

        # Map the numeric class id back to its category name.
        prediction = le_model.inverse_transform(predict_category)

        st.write(f"<span style='font-size:34px; color:green;'>Predicted Category: </span> <span style='font-size:34px;'>{prediction[0]}</span>", unsafe_allow_html=True)

        # Show probability
        st.write("### Prediction Probability:")
        # Assumes 8 classes in LabelEncoder's sorted order — TODO confirm.
        target_names = ['Books', 'Clothing', 'Cosmetics', 'Food & Beverage',
                        'Shoes', 'Souvenir', 'Technology', 'Toys']
        prob_df = pd.DataFrame({'Category': target_names, 'Probability': predict_proba[0]})
        prob_df = prob_df.sort_values('Probability', ascending=False)

        fig = px.bar(prob_df, x='Probability', y='Category', orientation='h',
                     title='Prediction Probability for Each Category')
        st.plotly_chart(fig, use_container_width=True)
src/static/styles.css ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Metric cards (rendered by streamlit-extras).
   Fixed: the selector used `data-test-id` while every other rule in this
   file uses the Streamlit attribute `data-testid`; also dropped
   `padding: auto`, which is not a valid CSS padding value and was being
   ignored by the browser. */
[data-testid="metric-container"] {
    box-shadow: 0 0 4px #c9d6d6;
}

/* Plotly / chart containers */
.plot-container > div {
    box-shadow: 0 0 4px #071021;
}

/* Enlarge expander header text */
div[data-testid="stExpander"] div[role="button"] p {
    font-size: 1.3rem;
}

/* Dataframe header text.
   NOTE(review): verify this test id — some Streamlit versions emit
   "stDataFrame" (capital F), in which case this rule never matches. */
div[data-testid="stDataframe"] div[role="button"] p {
    font-size: 1.3rem;
    color: rgb(1, 84, 84);
}