import streamlit as st
import pandas as pd
import numpy as np
from io import StringIO
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import optuna
from sklearn.preprocessing import PolynomialFeatures
# Page configuration
st.set_page_config(page_title="Predictive Modelling", layout="wide")
# Title with centered alignment
st.markdown(
"""
<h1 style="text-align: center; color: white;">📱 Predictive Model Creation and Evaluation 💻</h1>
""",
unsafe_allow_html=True
)
# Flowchart title
st.markdown(
"""
<h1 style="text-align: center; color: white;">Model Creation Flow</h1>
""",
unsafe_allow_html=True
)
st.markdown(
"""
<div style="text-align: center;">
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/g-lmBAPoAV_5uO_fpqFYc.gif" alt="model-creation-flowchart.gif" width="70%" />
</div>
""",
unsafe_allow_html=True
)
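# The dataset is expected to have been stored in st.session_state["dataset"] by an earlier page of the app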
df = st.session_state.get("dataset")
# Exclude 'ProductID' from the dataset
if df is not None:
df = df.drop(columns=['ProductID'], errors='ignore') # Exclude 'ProductID' if it exists
st.subheader("Dataset Preview:")
st.write(df.head())
# Dropping unnecessary columns
df.drop(['age_bins', 'ProductPriceBucket', 'CustomerAgeGroup'], axis=1, inplace=True, errors='ignore')
st.write(df.head())
# Splitting Feature Variables and Class Labels
st.markdown("### Split Feature Variables and Class Labels")
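# Assumes the class label is the last column of the dataframe; all preceding columns are treated as features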
fv = df.iloc[:, :-1].copy()  # copy so the label encoding below does not trigger SettingWithCopyWarning
cv = df.iloc[:, -1]
st.write(fv)
st.write(cv)
# Feature Engineering
st.markdown("### Feature Engineering")
# Use a separate LabelEncoder per categorical column so each mapping can be inspected or inverted independently
label_encoders = {}
for col in ['ProductBrand', 'ProductCategory']:
    label_encoders[col] = LabelEncoder()
    fv[col] = label_encoders[col].fit_transform(fv[col])
st.write(fv.head())
# Polynomial Featurisation for Non-Linearity
st.markdown("### Polynomial Featurisation for Non-Linearity:")
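# Degree-2 polynomial featurisation adds squared and pairwise-interaction terms for the numeric columns;
# include_bias=False drops the constant term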
numeric_columns = fv.select_dtypes(include=[float, int]).columns
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
poly_features = poly.fit_transform(fv[numeric_columns])
poly_feature_names = poly.get_feature_names_out(numeric_columns)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)
fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1)
fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()]
st.write(fv_with_poly.head())
# SMOTE for Handling Imbalanced Dataset
st.markdown("### SMOTE for Handling Imbalanced Dataset")
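# A 1:1 sampling strategy assumes a binary target and oversamples the minority class to match the majority.
# Note: resampling before the train/test split lets synthetic samples influence the test set;
# applying SMOTE to the training split only is generally preferred.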
smote = SMOTE(sampling_strategy=1.0, random_state=42)
fv1, cv1 = smote.fit_resample(fv_with_poly, cv)
st.write(pd.Series(cv1).value_counts())
# Data Splitting
st.markdown("### Data Splitting")
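# 80/20 split with a fixed random_state for reproducibility; stratify=cv1 could also be passed,
# although the classes are already balanced by SMOTE at this point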
x_train, x_test, y_train, y_test = train_test_split(fv1, cv1, test_size=0.2, random_state=42)
# Scaling
st.markdown("### Scaling")
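# Fit the scaler on the training split only, then reuse the fitted parameters on the test split to avoid leakage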
std = StandardScaler()
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)
st.code("""
std = StandardScaler()
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)
""")
st.markdown("## Hyperparameter Tuning using OPTUNA")
# Define the objective function for Optuna
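# The tuning code below is displayed with st.code for reference rather than executed here,
# presumably because re-running 100 Optuna trials on every page load would be slow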
st.code("""
import numpy as np
import optuna
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
# Check for NaN or infinite values in the data
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values"
assert not np.any(np.isnan(y_train)), "Target data contains NaN values"
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values"
# Global lists to store training and validation scores for each trial
training_scores = []
validation_scores = []
def objective(trial):
# Log the trial number for debugging (trial.params is still empty at this point;
# parameters are filled in as they are suggested below)
print(f"Starting trial {trial.number}")
algo = trial.suggest_categorical("algo", ["lor", "svc"])
if algo == "svc":
# Hyperparameters for SVC
c = trial.suggest_float("C", 0.001, 1000, log=True)
kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
if kernel == 'poly':
degree = trial.suggest_int("degree", 1, 3)
model = SVC(C=c, kernel=kernel, degree=degree, random_state=42)
elif kernel in ['rbf', 'sigmoid']:
gamma = trial.suggest_categorical("gamma", ['scale', 'auto'])
model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)
else:
model = SVC(C=c, kernel=kernel, random_state=42)
else:
# Hyperparameters for Logistic Regression
# suggest_categorical only supports None/bool/int/float/str choices, so encode each
# (solver, penalty) pair as a single string and split it afterwards
choice = trial.suggest_categorical(
"solver_penalty", [
"lbfgs:l2", "newton-cg:l2",
"sag:l2", "saga:l1",
"saga:l2", "saga:elasticnet"
]
)
solver, penalty = choice.split(":")
reg_strength = trial.suggest_float("C", 0.001, 1000, log=True)
l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None
if penalty == "elasticnet":
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, l1_ratio=l1_ratio, random_state=42)
else:
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, random_state=42)
# Cross-validation scoring with training and validation
try:
scores = cross_validate(
model, x_train_std, y_train, cv=5,
scoring="accuracy", return_train_score=True
)
train_score = scores["train_score"].mean()
val_score = scores["test_score"].mean()
# Append scores to global lists
training_scores.append(train_score)
validation_scores.append(val_score)
except ValueError as e:
print(f"Error during cross-validation: {e}")
train_score, val_score = float("-inf"), float("-inf")
return val_score
# Running the optimization
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)
# Plotting training vs. validation scores
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(training_scores, label="Training Score", marker="o")
plt.plot(validation_scores, label="Validation Score", marker="x")
plt.xlabel("Trial")
plt.ylabel("Accuracy")
plt.title("Training vs. Validation Scores Across Trials")
plt.legend()
plt.grid()
plt.show()
# Display best trial
print("Best Parameters:")
print(study.best_params)
""", language="python")
st.markdown(
"""
<div style="text-align: center;">
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/FqUoV8hSyCWU3WocaqqGc.png" width="70%" />
</div>
""",
unsafe_allow_html=True
)
# Create the best model
st.markdown("## Create the Model with the Best Algorithm and Parameters Obtained from Hyperparameter Tuning with Optuna")
st.markdown("## SVC(kernel='poly', gamma = 'scale', C = 974.1963187644974, degree = 2)")
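# Hyperparameter values taken from the best Optuna trial reported above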
model = SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)
st.write(model)
# Train the model
st.markdown("### Train the Model")
model.fit(x_train_std, y_train)
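# Optionally keep the fitted model and scaler in session state so later pages or reruns could reuse them
# without retraining; "trained_model" and "fitted_scaler" are illustrative keys not referenced elsewhere in the app
st.session_state["trained_model"] = model
st.session_state["fitted_scaler"] = std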
# Model Evaluation
st.markdown("# Model Evaluation")
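# Predict on the standardized test split and report accuracy, per-class metrics, and the confusion matrix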
y_pred = model.predict(x_test_std)
# Evaluation metrics
st.write("Accuracy:", accuracy_score(y_test, y_pred))
st.write("Classification Report:\n", classification_report(y_test, y_pred))
st.write("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Plotting libraries for the evaluation heatmaps (the remaining imports are already at the top of the file)
import seaborn as sns
import matplotlib.pyplot as plt
# Reuse the predictions computed above for the confusion matrix and classification report plots
# Calculate evaluation metrics
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True) # Output as a dictionary
# Convert the classification report to a DataFrame with classes as rows and metrics as columns,
# dropping the 'support' column and the scalar 'accuracy' row so the heatmap shows precision/recall/F1 per class
class_report_df = pd.DataFrame(class_report).transpose().drop(index='accuracy', columns='support')
# Streamlit app
st.title("Model Evaluation: Confusion Matrix and Classification Report")
# Plotting with Matplotlib and Seaborn
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
# Confusion Matrix Heatmap
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=axs[0], annot_kws={"size": 14})
axs[0].set_title("Confusion Matrix", fontsize=16)
axs[0].set_xlabel("Predicted Labels", fontsize=14)
axs[0].set_ylabel("True Labels", fontsize=14)
# Classification Report Heatmap
sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False, ax=axs[1], annot_kws={"size": 12})
axs[1].set_title("Classification Report", fontsize=16)
axs[1].set_xlabel("Metrics", fontsize=14)
axs[1].set_ylabel("Classes", fontsize=14)
# Adjust layout
plt.tight_layout()
# Display the plots in Streamlit
st.pyplot(fig)
# Display additional metrics (optional)
accuracy = accuracy_score(y_test, y_pred)
st.success(f"**Accuracy:** {accuracy:.2f}")
else:
st.warning("No dataset found. Please load a dataset first.")
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg"
# Apply custom CSS for the background image and overlay
st.markdown(
f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container */
padding: 50px; /* Add some padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text to ensure visibility */
font-size: 100%; /* Keep the base font size for readability */
/* text-align: center; */ /* Uncomment to center-align the text */
}}
</style>
""",
unsafe_allow_html=True
)
if st.button("Previous ⏮️"):
st.switch_page("pages/3_EDA_and_Feature_Engineering.py")
if st.button("Next ⏭️"):
st.switch_page("pages/5_Conclusion.py") |