# Electronics-Sales-Classification — pages/4_Model_Creation_and_Evaluation.py
# (Hugging Face Space page; commit 8b4d29e, author trohith89)
import sys
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import streamlit as st
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
# --- Page setup -------------------------------------------------------------
st.set_page_config(page_title="Predictive Modelling", layout="wide")

# Centered white page title (raw HTML because st.title cannot center/recolor).
page_title_html = """
<h1 style="text-align: center; color: white;">📱 Predictive Model Creation and Evaluation 💻</h1>
"""
st.markdown(page_title_html, unsafe_allow_html=True)

# Heading for the flowchart image that follows.
flow_title_html = """
<h1 style="text-align: center; color: white;">Model Creation Flow</h1>
"""
st.markdown(flow_title_html, unsafe_allow_html=True)

# Animated flowchart illustrating the model-building pipeline.
flowchart_html = """
<div style="text-align: center;">
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/g-lmBAPoAV_5uO_fpqFYc.gif" alt="model-creation-flowchart.gif" width="70%" />
</div>
"""
st.markdown(flowchart_html, unsafe_allow_html=True)
# Load the working dataset stored in session state by the earlier pipeline pages.
df = st.session_state.get("dataset")

if df is not None:
    # 'ProductID' is a row identifier with no predictive value — drop it if present.
    df = df.drop(columns=['ProductID'], errors='ignore')

    st.subheader("Dataset Preview:")
    st.write(df.head())

    # Drop EDA-only helper columns created on the previous page (if they exist).
    df.drop(['age_bins', 'ProductPriceBucket', 'CustomerAgeGroup'],
            axis=1, inplace=True, errors='ignore')
    st.write(df.head())

    # --- Split feature variables and class labels ----------------------------
    st.markdown("### Split Feature Variables and Class Labels")
    # Convention used on the earlier pages: the last column is the class label.
    # .copy() avoids pandas SettingWithCopy issues when columns are encoded below.
    fv = df.iloc[:, :-1].copy()
    cv = df.iloc[:, -1]
    st.write(fv)
    st.write(cv)

    # --- Feature engineering --------------------------------------------------
    st.markdown("### Feature Engineering")
    # One encoder per column so each keeps an independent label mapping
    # (the original reused a single encoder whose fitted state was overwritten).
    fv['ProductBrand'] = LabelEncoder().fit_transform(fv['ProductBrand'])
    fv['ProductCategory'] = LabelEncoder().fit_transform(fv['ProductCategory'])
    st.write(fv.head())

    # --- Polynomial featurisation (degree 2) to capture non-linearity --------
    st.markdown("### Polynomial Featurisation for Non-Linearity:")
    numeric_columns = fv.select_dtypes(include=[float, int]).columns
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(fv[numeric_columns])
    poly_df = pd.DataFrame(poly_features,
                           columns=poly.get_feature_names_out(numeric_columns))
    fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1)
    # Degree-1 polynomial terms duplicate the original numeric columns — dedupe.
    fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()]
    st.write(fv_with_poly.head())

    # --- SMOTE oversampling ---------------------------------------------------
    st.markdown("### SMOTE for Handling Imbalanced Dataset")
    # sampling_strategy=1 resamples the minority class to a 1:1 ratio
    # (assumes a binary target — TODO confirm against the dataset page).
    smote = SMOTE(sampling_strategy=1)
    fv1, cv1 = smote.fit_resample(fv_with_poly, cv)
    st.write(pd.Series(cv1).value_counts())

    # --- Train/test split -----------------------------------------------------
    st.markdown("### Data Splitting")
    x_train, x_test, y_train, y_test = train_test_split(
        fv1, cv1, test_size=0.2, random_state=42)

    # --- Scaling (fit on train only, to avoid test-set leakage) --------------
    st.markdown("### Scaling")
    std = StandardScaler()
    x_train_std = std.fit_transform(x_train)
    x_test_std = std.transform(x_test)
    st.code("""
std = StandardScaler()
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)
""")

    # --- Hyperparameter tuning walkthrough (display only; not executed here) --
    st.markdown("## Hyperparameter Tuning using OPTUNA")
    # Define the objective function for Optuna
    st.code("""
import numpy as np
import optuna
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

# Check for NaN or infinite values in the data
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values"
assert not np.any(np.isnan(y_train)), "Target data contains NaN values"
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values"

# Global lists to store training and validation scores for each trial
training_scores = []
validation_scores = []

def objective(trial):
    # Log trial parameters for debugging
    print(f"Trial params: {trial.params}")
    algo = trial.suggest_categorical("algo", ["lor", "svc"])
    if algo == "svc":
        # Hyperparameters for SVC
        c = trial.suggest_float("C", 0.001, 1000, log=True)
        kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
        if kernel == 'poly':
            degree = trial.suggest_int("degree", 1, 3)
            model = SVC(C=c, kernel=kernel, degree=degree, random_state=42)
        elif kernel in ['rbf', 'sigmoid']:
            gamma = trial.suggest_categorical("gamma", ['scale', 'auto'])
            model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)
        else:
            model = SVC(C=c, kernel=kernel, random_state=42)
    else:
        # Hyperparameters for Logistic Regression
        solver, penalty = trial.suggest_categorical(
            "choices", [
                ("lbfgs", "l2"), ("newton-cg", "l2"),
                ("sag", "l2"), ("saga", "l1"),
                ("saga", "l2"), ("saga", "elasticnet")
            ]
        )
        reg_strength = trial.suggest_float("C", 0.001, 1000, log=True)
        l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None
        if penalty == "elasticnet":
            model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, l1_ratio=l1_ratio, random_state=42)
        else:
            model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, random_state=42)
    # Cross-validation scoring with training and validation
    try:
        scores = cross_validate(
            model, x_train_std, y_train, cv=5,
            scoring="accuracy", return_train_score=True
        )
        train_score = scores["train_score"].mean()
        val_score = scores["test_score"].mean()
        # Append scores to global lists
        training_scores.append(train_score)
        validation_scores.append(val_score)
    except ValueError as e:
        print(f"Error during cross-validation: {e}")
        train_score, val_score = float("-inf"), float("-inf")
    return val_score

# Running the optimization
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

# Plotting training vs. validation scores
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(training_scores, label="Training Score", marker="o")
plt.plot(validation_scores, label="Validation Score", marker="x")
plt.xlabel("Trial")
plt.ylabel("Accuracy")
plt.title("Training vs. Validation Scores Across Trials")
plt.legend()
plt.grid()
plt.show()

# Display best trial
print("Best Parameters:")
print(study.best_params)
""", language="python")

    # Static screenshot of the tuning results produced offline.
    st.markdown(
        """
<div style="text-align: center;">
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/FqUoV8hSyCWU3WocaqqGc.png" width="70%" />
</div>
""",
        unsafe_allow_html=True
    )

    # --- Final model: best algorithm/parameters from the Optuna study --------
    st.markdown("## Create the Model with the best algorithm and parameters you have received by performing Hyperparameter Tuning using Optuna")
    st.markdown("## SVC(kernel='poly', gamma = 'scale', C = 974.1963187644974, degree = 2)")
    model = SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)
    st.write(model)

    # --- Train ----------------------------------------------------------------
    st.markdown("### Train the Model")
    model.fit(x_train_std, y_train)

    # --- Evaluate -------------------------------------------------------------
    st.markdown("# Model Evaluation")
    # Predict once and reuse everywhere below (the original recomputed this).
    y_pred = model.predict(x_test_std)

    st.write("Accuracy:", accuracy_score(y_test, y_pred))
    st.write("Classification Report:\n", classification_report(y_test, y_pred))
    st.write("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    # In this DataFrame the metrics are the rows; iloc drops the 'support' row
    # and the rightmost summary column before plotting.
    class_report_df = pd.DataFrame(class_report).iloc[:-1, :-1]

    st.title("Model Evaluation: Confusion Matrix and Classification Report")

    # Side-by-side heatmaps: confusion matrix (left), classification report (right).
    fig, axs = plt.subplots(1, 2, figsize=(16, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
                ax=axs[0], annot_kws={"size": 14})
    axs[0].set_title("Confusion Matrix", fontsize=16)
    axs[0].set_xlabel("Predicted Labels", fontsize=14)
    axs[0].set_ylabel("True Labels", fontsize=14)

    sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False,
                ax=axs[1], annot_kws={"size": 12})
    axs[1].set_title("Classification Report", fontsize=16)
    axs[1].set_xlabel("Metrics", fontsize=14)
    axs[1].set_ylabel("Classes", fontsize=14)

    plt.tight_layout()
    st.pyplot(fig)

    accuracy = accuracy_score(y_test, y_pred)
    st.success(f"**Accuracy:** {accuracy:.2f}")
else:
    st.warning("No Dataset Found")
# --- Page background styling -------------------------------------------------
# Tiled, fixed background image with a dark overlay so the text stays readable.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg"

page_css = f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container */
padding: 50px; /* Add some padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text to ensure visibility */
font-size: 100px; /* Adjust font size for readability */
# text-align: center; /* Center align text */
}}
</style>
"""
st.markdown(page_css, unsafe_allow_html=True)
# --- Page navigation ---------------------------------------------------------
# Buttons jump between the app's numbered pipeline pages.
nav_targets = {
    "Previous ⏮️": "pages/3_EDA_and_Feature_Engineering.py",
    "Next ⏭️": "pages/5_Conclusion.py",
}
for label, target in nav_targets.items():
    if st.button(label):
        st.switch_page(target)