|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder |
|
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix |
|
|
from imblearn.over_sampling import SMOTE |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.svm import SVC |
|
|
import optuna |
|
|
from sklearn.preprocessing import PolynomialFeatures |
|
|
|
|
|
|
|
|
st.set_page_config(page_title="Predictive Modelling", layout="wide") |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<h1 style="text-align: center; color: white;">📱 Predictive Model Creation and Evaluation 💻</h1> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<h1 style="text-align: center; color: white;">Model Creation Flow</h1> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/g-lmBAPoAV_5uO_fpqFYc.gif" alt="model-creation-flowchart.gif" width="70%" /> |
|
|
</div> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
df = st.session_state.get("dataset") |
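# The earlier pages are expected to have stored the working DataFrame under the
# "dataset" key in session state; if it is missing, the warning branch at the
# bottom of this page runs instead.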
|
|
|
|
|
|
|
|
if df is not None: |
|
|
df = df.drop(columns=['ProductID'], errors='ignore') |
|
|
|
|
|
st.subheader("Dataset Preview:") |
|
|
st.write(df.head()) |
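    # The binned helper columns created during EDA (age bins, price buckets, age
    # groups) duplicate information already present in the numeric columns, so they
    # are dropped before modelling (assumption based on the column names).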
|
|
|
|
|
|
|
|
df.drop(['age_bins', 'ProductPriceBucket', 'CustomerAgeGroup'], axis=1, inplace=True, errors='ignore') |
|
|
st.write(df.head()) |
|
|
|
|
|
|
|
|
st.markdown("### Split Feature Variables and Class Labels") |
|
|
    fv = df.iloc[:, :-1].copy()  # copy so the encoding below writes to a real DataFrame, not a view of df
|
|
cv = df.iloc[:, -1] |
|
|
st.write(fv) |
|
|
st.write(cv) |
|
|
|
|
|
|
|
|
st.markdown("### Feature Engineering") |
|
|
    # One LabelEncoder per column keeps each fitted mapping available for later use
    # (e.g. inverse_transform).
    brand_encoder = LabelEncoder()
    category_encoder = LabelEncoder()
    fv['ProductBrand'] = brand_encoder.fit_transform(fv['ProductBrand'])
    fv['ProductCategory'] = category_encoder.fit_transform(fv['ProductCategory'])
|
|
st.write(fv.head()) |
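    # For app readers, mirror the encoding step as a code listing, in the same way the
    # scaling step below shows its code (purely illustrative copy of the live code above).
    st.code("""
brand_encoder = LabelEncoder()
category_encoder = LabelEncoder()
fv['ProductBrand'] = brand_encoder.fit_transform(fv['ProductBrand'])
fv['ProductCategory'] = category_encoder.fit_transform(fv['ProductCategory'])
""", language="python")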
|
|
|
|
|
|
|
|
st.markdown("### Polynomial Featurisation for Non-Linearity:") |
|
|
    numeric_columns = fv.select_dtypes(include=[np.number]).columns  # covers all numeric dtypes, not just Python int/float
|
|
degree = 2 |
|
|
poly = PolynomialFeatures(degree=degree, include_bias=False) |
|
|
poly_features = poly.fit_transform(fv[numeric_columns]) |
|
|
poly_feature_names = poly.get_feature_names_out(numeric_columns) |
|
|
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names) |
|
|
fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1) |
|
|
fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()] |
|
|
st.write(fv_with_poly.head()) |
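    # Mirror the polynomial-featurisation step as a code listing, matching the style
    # used for the scaling step further below (illustrative copy of the live code above).
    st.code("""
numeric_columns = fv.select_dtypes(include=[np.number]).columns
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(fv[numeric_columns])
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(numeric_columns))
fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1)
fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()]
""", language="python")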
|
|
|
|
|
|
|
|
st.markdown("### SMOTE for Handling Imbalanced Dataset") |
|
|
smote = SMOTE(sampling_strategy=1) |
|
|
fv1, cv1 = smote.fit_resample(fv_with_poly, cv) |
|
|
st.write(pd.Series(cv1).value_counts()) |
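    # Show the SMOTE resampling step as a code listing as well (illustrative copy of
    # the live code above).
    st.code("""
smote = SMOTE(sampling_strategy=1)
fv1, cv1 = smote.fit_resample(fv_with_poly, cv)
""", language="python")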
|
|
|
|
|
|
|
|
st.markdown("### Data Splitting") |
|
|
x_train, x_test, y_train, y_test = train_test_split(fv1, cv1, test_size=0.2, random_state=42) |
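    # Listing of the split, for consistency with the other steps (copy of the line above).
    st.code("""
x_train, x_test, y_train, y_test = train_test_split(fv1, cv1, test_size=0.2, random_state=42)
""", language="python")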
|
|
|
|
|
|
|
|
st.markdown("### Scaling") |
|
|
std = StandardScaler() |
|
|
x_train_std = std.fit_transform(x_train) |
|
|
x_test_std = std.transform(x_test) |
|
|
st.code(""" |
|
|
std = StandardScaler() |
|
|
x_train_std = std.fit_transform(x_train) |
|
|
x_test_std = std.transform(x_test) |
|
|
""") |
|
|
|
|
|
    st.markdown("## Hyperparameter Tuning using Optuna")
|
|
|
|
|
|
|
|
st.code(""" |
|
|
import numpy as np |
|
|
import optuna |
|
|
from sklearn.svm import SVC |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.model_selection import cross_validate |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
# Check for NaN or infinite values in the data |
|
|
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values" |
|
|
assert not np.any(np.isnan(y_train)), "Target data contains NaN values" |
|
|
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values" |
|
|
|
|
|
# Global lists to store training and validation scores for each trial |
|
|
training_scores = [] |
|
|
validation_scores = [] |
|
|
|
|
|
def objective(trial): |
|
|
# Log trial parameters for debugging |
|
|
print(f"Trial params: {trial.params}") |
|
|
|
|
|
algo = trial.suggest_categorical("algo", ["lor", "svc"]) |
|
|
|
|
|
if algo == "svc": |
|
|
# Hyperparameters for SVC |
|
|
c = trial.suggest_float("C", 0.001, 1000, log=True) |
|
|
kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid']) |
|
|
|
|
|
if kernel == 'poly': |
|
|
degree = trial.suggest_int("degree", 1, 3) |
|
|
model = SVC(C=c, kernel=kernel, degree=degree, random_state=42) |
|
|
elif kernel in ['rbf', 'sigmoid']: |
|
|
gamma = trial.suggest_categorical("gamma", ['scale', 'auto']) |
|
|
model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42) |
|
|
else: |
|
|
model = SVC(C=c, kernel=kernel, random_state=42) |
|
|
else: |
|
|
# Hyperparameters for Logistic Regression |
|
|
        # Optuna expects categorical choices to be None/bool/int/float/str, so the
        # (solver, penalty) pair is encoded as a single string and split afterwards.
        solver_penalty = trial.suggest_categorical(
            "solver_penalty", [
                "lbfgs:l2", "newton-cg:l2",
                "sag:l2", "saga:l1",
                "saga:l2", "saga:elasticnet"
            ]
        )
        solver, penalty = solver_penalty.split(":")
|
|
reg_strength = trial.suggest_float("C", 0.001, 1000, log=True) |
|
|
l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None |
|
|
|
|
|
if penalty == "elasticnet": |
|
|
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, l1_ratio=l1_ratio, random_state=42) |
|
|
else: |
|
|
model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength, random_state=42) |
|
|
|
|
|
# Cross-validation scoring with training and validation |
|
|
try: |
|
|
scores = cross_validate( |
|
|
model, x_train_std, y_train, cv=5, |
|
|
scoring="accuracy", return_train_score=True |
|
|
) |
|
|
train_score = scores["train_score"].mean() |
|
|
val_score = scores["test_score"].mean() |
|
|
|
|
|
# Append scores to global lists |
|
|
training_scores.append(train_score) |
|
|
validation_scores.append(val_score) |
|
|
except ValueError as e: |
|
|
print(f"Error during cross-validation: {e}") |
|
|
train_score, val_score = float("-inf"), float("-inf") |
|
|
|
|
|
return val_score |
|
|
|
|
|
# Running the optimization |
|
|
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler()) |
|
|
study.optimize(objective, n_trials=100) |
|
|
|
|
|
# Plotting training vs. validation scores |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
plt.figure(figsize=(10, 6)) |
|
|
plt.plot(training_scores, label="Training Score", marker="o") |
|
|
plt.plot(validation_scores, label="Validation Score", marker="x") |
|
|
plt.xlabel("Trial") |
|
|
plt.ylabel("Accuracy") |
|
|
plt.title("Training vs. Validation Scores Across Trials") |
|
|
plt.legend() |
|
|
plt.grid() |
|
|
plt.show() |
|
|
|
|
|
# Display best trial |
|
|
print("Best Parameters:") |
|
|
print(study.best_params) |
|
|
|
|
|
""", language="python") |
|
|
|
|
|
st.markdown( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/FqUoV8hSyCWU3WocaqqGc.png" width="70%" /> |
|
|
</div> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
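    # Note on the listing above: plt.show() does not render inside a Streamlit page.
    # A minimal sketch of the equivalent Streamlit rendering (assuming the
    # training_scores / validation_scores lists were collected as in the listing):
    st.code("""
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(training_scores, label="Training Score", marker="o")
ax.plot(validation_scores, label="Validation Score", marker="x")
ax.set_xlabel("Trial")
ax.set_ylabel("Accuracy")
ax.set_title("Training vs. Validation Scores Across Trials")
ax.legend()
st.pyplot(fig)  # instead of plt.show()
""", language="python")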
|
|
|
|
|
|
|
|
    st.markdown("## Create the model with the best algorithm and parameters obtained from the Optuna hyperparameter search above")
|
|
    st.code("SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)", language="python")
    model = SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)
    st.write(model)
|
|
|
|
|
|
|
|
st.markdown("### Train the Model") |
|
|
model.fit(x_train_std, y_train) |
|
|
|
|
|
|
|
|
st.markdown("# Model Evaluation") |
|
|
y_pred = model.predict(x_test_std) |
|
|
|
|
|
|
|
|
st.write("Accuracy:", accuracy_score(y_test, y_pred)) |
|
|
    st.text("Classification Report:\n" + classification_report(y_test, y_pred))
|
|
st.write("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) |
|
|
|
|
|
    # Plotting libraries for the evaluation visuals below (streamlit, pandas and the
    # sklearn metrics are already imported at the top of the file).
    import seaborn as sns
    import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
conf_matrix = confusion_matrix(y_test, y_pred) |
|
|
class_report = classification_report(y_test, y_pred, output_dict=True) |
|
|
|
|
|
|
|
|
class_report_df = pd.DataFrame(class_report).iloc[:-1, :-1] |
|
|
|
|
|
|
|
|
st.title("Model Evaluation: Confusion Matrix and Classification Report") |
|
|
|
|
|
|
|
|
fig, axs = plt.subplots(1, 2, figsize=(16, 6)) |
|
|
|
|
|
|
|
|
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=axs[0], annot_kws={"size": 14}) |
|
|
axs[0].set_title("Confusion Matrix", fontsize=16) |
|
|
axs[0].set_xlabel("Predicted Labels", fontsize=14) |
|
|
axs[0].set_ylabel("True Labels", fontsize=14) |
|
|
|
|
|
|
|
|
sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False, ax=axs[1], annot_kws={"size": 12}) |
|
|
axs[1].set_title("Classification Report", fontsize=16) |
|
|
    axs[1].set_xlabel("Classes / Averages", fontsize=14)
    axs[1].set_ylabel("Metrics", fontsize=14)
|
|
|
|
|
|
|
|
plt.tight_layout() |
|
|
|
|
|
|
|
|
st.pyplot(fig) |
|
|
|
|
|
|
|
|
accuracy = accuracy_score(y_test, y_pred) |
|
|
st.success(f"**Accuracy:** {accuracy:.2f}") |
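    # Optional sketch (not part of the original flow): keep the fitted scaler and model
    # in session state so later pages could reuse them, mirroring how the dataset is
    # shared between pages. The key names "scaler" and "model" are assumptions.
    st.session_state["scaler"] = std
    st.session_state["model"] = model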
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
    st.warning("No dataset found. Please load a dataset on the previous pages before building a model.")
|
|
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg" |
|
|
|
|
|
|
|
|
st.markdown( |
|
|
f""" |
|
|
<style> |
|
|
.stApp {{ |
|
|
background-image: url("{background_image_url}"); |
|
|
background-size: auto; /* Ensures the image retains its original size */ |
|
|
background-repeat: repeat; /* Makes the image repeat to cover the entire background */ |
|
|
background-position: top left; /* Starts repeating from the top-left corner */ |
|
|
background-attachment: fixed; /* Keeps the background fixed as you scroll */ |
|
|
}} |
|
|
|
|
|
/* Semi-transparent overlay */ |
|
|
.stApp::before {{ |
|
|
content: ""; |
|
|
position: absolute; |
|
|
top: 0; |
|
|
left: 0; |
|
|
width: 100%; |
|
|
height: 100%; |
|
|
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */ |
|
|
z-index: -1; |
|
|
}} |
|
|
|
|
|
/* Container to center elements and limit width */ |
|
|
.content-container {{ |
|
|
max-width: 70%; /* Limit content width to 70% */ |
|
|
margin: 0 auto; /* Center the container */ |
|
|
padding: 50px; /* Add some padding for spacing */ |
|
|
}} |
|
|
|
|
|
/* Styling the markdown content */ |
|
|
.stMarkdown {{ |
|
|
color: white; /* White text to ensure visibility */ |
|
|
font-size: 100px; /* Adjust font size for readability */ |
|
|
        /* text-align: center; */  /* uncomment to centre-align the text */
|
|
}} |
|
|
</style> |
|
|
""", |
|
|
unsafe_allow_html=True |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
if st.button("Previous ⏮️"): |
|
|
st.switch_page("pages/3_EDA_and_Feature_Engineering.py") |
|
|
if st.button("Next ⏭️"): |
|
|
st.switch_page("pages/5_Conclusion.py") |