|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder |
|
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix |
|
|
from imblearn.over_sampling import SMOTE |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.svm import SVC |
|
|
import optuna |
|
|
from sklearn.preprocessing import PolynomialFeatures |
|
|
|
|
|
|
|
|
st.set_page_config(page_title="Predictive Modelling", layout="wide") |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<h1 style="text-align: center; color: white;">📱 Predictive Model Creation and Evaluation 💻</h1> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<h1 style="text-align: center; color: white;">Model Creation Flow</h1> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/g-lmBAPoAV_5uO_fpqFYc.gif" alt="model-creation-flowchart.gif" width="70%" /> |
|
|
</div> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
df = st.session_state.get("dataset") |
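# The earlier pages are expected to have stored the working DataFrame under the
# "dataset" key in session state; if it is missing, the warning branch at the
# bottom of this page runs instead.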
|
|
|
|
|
|
|
|
if df is not None: |
|
|
df = df.drop(columns=['ProductID'], errors='ignore') |
|
|
|
|
|
st.subheader("Dataset Preview:") |
|
|
st.write(df.head()) |
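    # The binned helper columns created during EDA (age bins, price buckets, age
    # groups) duplicate information already present in the numeric columns, so they
    # are dropped before modelling (assumption based on the column names).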
|
|
|
|
|
|
|
|
df.drop(['age_bins', 'ProductPriceBucket', 'CustomerAgeGroup'], axis=1, inplace=True, errors='ignore') |
|
|
st.write(df.head()) |
|
|
|
|
|
|
|
|
st.markdown("### Split Feature Variables and Class Labels") |
|
|
    fv = df.iloc[:, :-1].copy()  # copy so the encoding below writes to a real DataFrame, not a view of df
|
|
cv = df.iloc[:, -1] |
|
|
st.write(fv) |
|
|
st.write(cv) |
|
|
|
|
|
|
|
|
st.markdown("### Feature Engineering") |
|
|
    # One LabelEncoder per column keeps each fitted mapping available for later use
    # (e.g. inverse_transform).
    brand_encoder = LabelEncoder()
    category_encoder = LabelEncoder()
    fv['ProductBrand'] = brand_encoder.fit_transform(fv['ProductBrand'])
    fv['ProductCategory'] = category_encoder.fit_transform(fv['ProductCategory'])
|
|
st.write(fv.head()) |
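    # For app readers, mirror the encoding step as a code listing, in the same way the
    # scaling step below shows its code (purely illustrative copy of the live code above).
    st.code("""
brand_encoder = LabelEncoder()
category_encoder = LabelEncoder()
fv['ProductBrand'] = brand_encoder.fit_transform(fv['ProductBrand'])
fv['ProductCategory'] = category_encoder.fit_transform(fv['ProductCategory'])
""", language="python")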
|
|
|
|
|
|
|
|
st.markdown("### Polynomial Featurisation for Non-Linearity:") |
|
|
    numeric_columns = fv.select_dtypes(include=[np.number]).columns  # covers all numeric dtypes, not just Python int/float
|
|
degree = 2 |
|
|
poly = PolynomialFeatures(degree=degree, include_bias=False) |
|
|
poly_features = poly.fit_transform(fv[numeric_columns]) |
|
|
poly_feature_names = poly.get_feature_names_out(numeric_columns) |
|
|
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names) |
|
|
fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1) |
|
|
fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()] |
|
|
st.write(fv_with_poly.head()) |
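    # Mirror the polynomial-featurisation step as a code listing, matching the style
    # used for the scaling step further below (illustrative copy of the live code above).
    st.code("""
numeric_columns = fv.select_dtypes(include=[np.number]).columns
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(fv[numeric_columns])
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(numeric_columns))
fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1)
fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()]
""", language="python")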
|
|
|
|
|
|
|
|
st.markdown("### SMOTE for Handling Imbalanced Dataset") |
|
|
smote = SMOTE(sampling_strategy=1) |
|
|
fv1, cv1 = smote.fit_resample(fv_with_poly, cv) |
|
|
st.write(pd.Series(cv1).value_counts()) |
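    # Show the SMOTE resampling step as a code listing as well (illustrative copy of
    # the live code above).
    st.code("""
smote = SMOTE(sampling_strategy=1)
fv1, cv1 = smote.fit_resample(fv_with_poly, cv)
""", language="python")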
|
|
|
|
|
|
|
|
st.markdown("### Data Splitting") |
|
|
x_train, x_test, y_train, y_test = train_test_split(fv1, cv1, test_size=0.2, random_state=42) |
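    # Listing of the split, for consistency with the other steps (copy of the line above).
    st.code("""
x_train, x_test, y_train, y_test = train_test_split(fv1, cv1, test_size=0.2, random_state=42)
""", language="python")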
|
|
|
|
|
|
|
|
st.markdown("### Scaling") |
|
|
std = StandardScaler() |
|
|
x_train_std = std.fit_transform(x_train) |
|
|
x_test_std = std.transform(x_test) |
|
|
st.code(""" |
|
|
std = StandardScaler() |
|
|
x_train_std = std.fit_transform(x_train) |
|
|
x_test_std = std.transform(x_test) |
|
|
""") |
|
|
|
|
|
    st.markdown("## Hyperparameter Tuning using Optuna")
|
|
|
|
|
|
|
|
st.code(""" |
|
|
import numpy as np |
|
|
import optuna |
|
|
from sklearn.svm import SVC |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.model_selection import cross_validate |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
# Check for NaN or infinite values in the data |
|
|
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values" |
|
|
assert not np.any(np.isnan(y_train)), "Target data contains NaN values" |
|
|
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values" |
|
|
|
|
|
# Global lists to store training and validation scores for each trial |
|
|
training_scores = [] |
|
|
validation_scores = [] |
|
|
|
|
|
def objective(trial): |
|
|
# Log trial parameters for debugging |
|
|
print(f"Trial params: {trial.params}") |
|
|
|
|
|
algo = trial.suggest_categorical("algo", ["lor", "svc"]) |
|
|
|
|
|
if algo == "svc": |
|
|
# Hyperparameters for SVC |
|
|
c = trial.suggest_float("C", 0.001, 1000, log=True) |
|
|
kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid']) |
|
|
|
|
|
if kernel == 'poly': |
|
|
degree = trial.suggest_int("degree", 1, 3) |
|
|
model = SVC(C=c, kernel=kernel, degree=degree, random_state=42) |
|
|
elif kernel in ['rbf', 'sigmoid']: |
|
|
gamma = trial.suggest_categorical("gamma", ['scale', 'auto']) |
|
|
model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42) |
|
|
else: |
|
|
model = SVC(C=c, kernel=kernel, random_state=42) |
|
|
else: |
|
|
# Hyperparameters for Logistic Regression |
|
|
        # Optuna expects categorical choices to be None/bool/int/float/str, so the
        # (solver, penalty) pair is encoded as a single string and split afterwards.
        solver_penalty = trial.suggest_categorical(
            "solver_penalty", [
                "lbfgs:l2", "newton-cg:l2",
                "sag:l2", "saga:l1",
                "saga:l2", "saga:elasticnet"
            ]
        )
        solver, penalty = solver_penalty.split(":")
|
|
reg_strength = trial.suggest_float("C", 0.001, 1000, log=True) |
|
|
l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None |
|
|
|
|
|
if penalty == "elasticnet": |
|
|
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, l1_ratio=l1_ratio, random_state=42) |
|
|
else: |
|
|
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, random_state=42) |
|
|
|
|
|
# Cross-validation scoring with training and validation |
|
|
try: |
|
|
scores = cross_validate( |
|
|
model, x_train_std, y_train, cv=5, |
|
|
scoring="accuracy", return_train_score=True |
|
|
) |
|
|
train_score = scores["train_score"].mean() |
|
|
val_score = scores["test_score"].mean() |
|
|
|
|
|
# Append scores to global lists |
|
|
training_scores.append(train_score) |
|
|
validation_scores.append(val_score) |
|
|
except ValueError as e: |
|
|
print(f"Error during cross-validation: {e}") |
|
|
train_score, val_score = float("-inf"), float("-inf") |
|
|
|
|
|
return val_score |
|
|
|
|
|
# Running the optimization |
|
|
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler()) |
|
|
study.optimize(objective, n_trials=100) |
|
|
|
|
|
# Plotting training vs. validation scores |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
plt.figure(figsize=(10, 6)) |
|
|
plt.plot(training_scores, label="Training Score", marker="o") |
|
|
plt.plot(validation_scores, label="Validation Score", marker="x") |
|
|
plt.xlabel("Trial") |
|
|
plt.ylabel("Accuracy") |
|
|
plt.title("Training vs. Validation Scores Across Trials") |
|
|
plt.legend() |
|
|
plt.grid() |
|
|
plt.show() |
|
|
|
|
|
# Display best trial |
|
|
print("Best Parameters:") |
|
|
print(study.best_params) |
|
|
|
|
|
""", language="python") |
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/FqUoV8hSyCWU3WocaqqGc.png" width="70%" /> |
|
|
</div> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
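    # Note on the listing above: plt.show() does not render inside a Streamlit page.
    # A minimal sketch of the equivalent Streamlit rendering (assuming the
    # training_scores / validation_scores lists were collected as in the listing):
    st.code("""
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(training_scores, label="Training Score", marker="o")
ax.plot(validation_scores, label="Validation Score", marker="x")
ax.set_xlabel("Trial")
ax.set_ylabel("Accuracy")
ax.set_title("Training vs. Validation Scores Across Trials")
ax.legend()
st.pyplot(fig)  # instead of plt.show()
""", language="python")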
|
|
|
|
|
|
|
|
    st.markdown("## Create the model with the best algorithm and parameters obtained from the Optuna hyperparameter search above")
|
|
    st.code("SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)", language="python")
    model = SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)
    st.write(model)
|
|
|
|
|
|
|
|
st.markdown("### Train the Model") |
|
|
model.fit(x_train_std, y_train) |
|
|
|
|
|
|
|
|
st.markdown("# Model Evaluation") |
|
|
y_pred = model.predict(x_test_std) |
|
|
|
|
|
|
|
|
st.write("Accuracy:", accuracy_score(y_test, y_pred)) |
|
|
    st.text("Classification Report:\n" + classification_report(y_test, y_pred))
|
|
st.write("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) |
|
|
|
|
|
    # Plotting libraries for the evaluation visuals below (streamlit, pandas and the
    # sklearn metrics are already imported at the top of the file).
    import seaborn as sns
    import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
conf_matrix = confusion_matrix(y_test, y_pred) |
|
|
class_report = classification_report(y_test, y_pred, output_dict=True) |
|
|
|
|
|
|
|
|
class_report_df = pd.DataFrame(class_report).iloc[:-1, :-1] |
|
|
|
|
|
|
|
|
st.title("Model Evaluation: Confusion Matrix and Classification Report") |
|
|
|
|
|
|
|
|
fig, axs = plt.subplots(1, 2, figsize=(16, 6)) |
|
|
|
|
|
|
|
|
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=axs[0], annot_kws={"size": 14}) |
|
|
axs[0].set_title("Confusion Matrix", fontsize=16) |
|
|
axs[0].set_xlabel("Predicted Labels", fontsize=14) |
|
|
axs[0].set_ylabel("True Labels", fontsize=14) |
|
|
|
|
|
|
|
|
sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False, ax=axs[1], annot_kws={"size": 12}) |
|
|
axs[1].set_title("Classification Report", fontsize=16) |
|
|
    axs[1].set_xlabel("Classes / Averages", fontsize=14)
    axs[1].set_ylabel("Metrics", fontsize=14)
|
|
|
|
|
|
|
|
plt.tight_layout() |
|
|
|
|
|
|
|
|
st.pyplot(fig) |
|
|
|
|
|
|
|
|
accuracy = accuracy_score(y_test, y_pred) |
|
|
st.success(f"**Accuracy:** {accuracy:.2f}") |
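    # Optional sketch (not part of the original flow): keep the fitted scaler and model
    # in session state so later pages could reuse them, mirroring how the dataset is
    # shared between pages. The key names "scaler" and "model" are assumptions.
    st.session_state["scaler"] = std
    st.session_state["model"] = model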
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
    st.warning("No dataset found. Please load a dataset on the previous pages before building a model.")
|
|
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg" |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
f""" |
|
|
<style> |
|
|
.stApp {{ |
|
|
background-image: url("{background_image_url}"); |
|
|
background-size: auto; /* Ensures the image retains its original size */ |
|
|
background-repeat: repeat; /* Makes the image repeat to cover the entire background */ |
|
|
background-position: top left; /* Starts repeating from the top-left corner */ |
|
|
background-attachment: fixed; /* Keeps the background fixed as you scroll */ |
|
|
}} |
|
|
|
|
|
/* Semi-transparent overlay */ |
|
|
.stApp::before {{ |
|
|
content: ""; |
|
|
position: absolute; |
|
|
top: 0; |
|
|
left: 0; |
|
|
width: 100%; |
|
|
height: 100%; |
|
|
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */ |
|
|
z-index: -1; |
|
|
}} |
|
|
|
|
|
/* Container to center elements and limit width */ |
|
|
.content-container {{ |
|
|
max-width: 70%; /* Limit content width to 70% */ |
|
|
margin: 0 auto; /* Center the container */ |
|
|
padding: 50px; /* Add some padding for spacing */ |
|
|
}} |
|
|
|
|
|
/* Styling the markdown content */ |
|
|
.stMarkdown {{ |
|
|
color: white; /* White text to ensure visibility */ |
|
|
font-size: 100px; /* Adjust font size for readability */ |
|
|
        /* text-align: center; */  /* uncomment to centre-align the text */
|
|
}} |
|
|
</style> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
if st.button("Previous ⏮️"): |
|
|
st.switch_page("pages/3_EDA_and_Feature_Engineering.py") |
|
|
if st.button("Next ⏭️"): |
|
|
st.switch_page("pages/5_Conclusion.py") |