"""Streamlit page: build, tune and evaluate a predictive model on the session dataset.

Flow: preview -> drop identifier/bucket columns -> encode categoricals ->
polynomial features -> SMOTE balancing -> split -> scale -> (displayed) Optuna
tuning code -> train best SVC -> evaluate with metrics and heatmaps.
Expects the uploaded dataset in ``st.session_state["dataset"]`` (set by an
earlier page) with the class label as the LAST column — TODO confirm against
the upload page.
"""

import sys
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import streamlit as st
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC

# Page configuration
st.set_page_config(page_title="Predictive Modelling", layout="wide")

# Title with centered alignment
# NOTE(review): the surrounding HTML markup appears to have been stripped from
# this string during extraction — only the text content survives.
st.markdown(
    """
📱 Predictive Model Creation and Evaluation 💻
""",
    unsafe_allow_html=True,
)

# Flowchart title
st.markdown(
    """
Model Creation Flow
""",
    unsafe_allow_html=True,
)

# Flowchart image reference (HTML wrapper stripped — presumably an <img> tag).
st.markdown(
    """
model-creation-flowchart.gif
""",
    unsafe_allow_html=True,
)

# Dataset is provided by an earlier page via session state; None if not loaded.
df = st.session_state.get("dataset")

if df is not None:
    # 'ProductID' is a row identifier, not a predictive feature.
    df = df.drop(columns=['ProductID'], errors='ignore')
    st.subheader("Dataset Preview:")
    st.write(df.head())

    # Drop engineered bucket columns left over from the EDA page, if present.
    df.drop(['age_bins', 'ProductPriceBucket', 'CustomerAgeGroup'],
            axis=1, inplace=True, errors='ignore')
    st.write(df.head())

    # --- Splitting Feature Variables and Class Labels -----------------------
    st.markdown("### Split Feature Variables and Class Labels")
    fv = df.iloc[:, :-1]   # all columns except the last are features
    cv = df.iloc[:, -1]    # last column is the class label
    st.write(fv)
    st.write(cv)

    # --- Feature Engineering ------------------------------------------------
    st.markdown("### Feature Engineering")
    # One encoder per column: refitting a single LabelEncoder on a second
    # column overwrites its fitted classes_, so the mapping for the first
    # column could never be inverted later.
    brand_encoder = LabelEncoder()
    category_encoder = LabelEncoder()
    fv['ProductBrand'] = brand_encoder.fit_transform(fv['ProductBrand'])
    fv['ProductCategory'] = category_encoder.fit_transform(fv['ProductCategory'])
    st.write(fv.head())

    # --- Polynomial Featurisation for Non-Linearity -------------------------
    st.markdown("### Polynomial Featurisation for Non-Linearity:")
    numeric_columns = fv.select_dtypes(include=[float, int]).columns
    degree = 2
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly.fit_transform(fv[numeric_columns])
    poly_feature_names = poly.get_feature_names_out(numeric_columns)
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)
    # Degree-1 polynomial terms duplicate the original numeric columns; keep
    # only the first occurrence of each column name after concatenation.
    fv_with_poly = pd.concat([fv.reset_index(drop=True), poly_df], axis=1)
    fv_with_poly = fv_with_poly.loc[:, ~fv_with_poly.columns.duplicated()]
    st.write(fv_with_poly.head())

    # --- SMOTE for Handling Imbalanced Dataset ------------------------------
    st.markdown("### SMOTE for Handling Imbalanced Dataset")
    # 'auto' resamples every minority class up to the majority count; for a
    # binary target this is equivalent to the previous sampling_strategy=1,
    # but it also works if the label ever becomes multi-class.
    smote = SMOTE(sampling_strategy='auto')
    fv1, cv1 = smote.fit_resample(fv_with_poly, cv)
    st.write(pd.Series(cv1).value_counts())

    # --- Data Splitting -----------------------------------------------------
    st.markdown("### Data Splitting")
    # Stratify on the (now balanced) labels so train/test class ratios match.
    x_train, x_test, y_train, y_test = train_test_split(
        fv1, cv1, test_size=0.2, random_state=42, stratify=cv1
    )

    # --- Scaling ------------------------------------------------------------
    st.markdown("### Scaling")
    std = StandardScaler()
    x_train_std = std.fit_transform(x_train)   # fit on train only
    x_test_std = std.transform(x_test)         # reuse train statistics
    st.code("""
std = StandardScaler()
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)
""")

    st.markdown("## Hyperparameter Tuning using OPTUNA")
    # The tuning code is displayed (not executed) — running 100 Optuna trials
    # on every page load would be far too slow for an interactive app.
    st.code("""
import numpy as np
import optuna
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

# Check for NaN or infinite values in the data
assert not np.any(np.isnan(x_train_std)), "Input data contains NaN values"
assert not np.any(np.isnan(y_train)), "Target data contains NaN values"
assert not np.any(np.isinf(x_train_std)), "Input data contains infinite values"

# Global lists to store training and validation scores for each trial
training_scores = []
validation_scores = []

def objective(trial):
    # Log trial parameters for debugging
    print(f"Trial params: {trial.params}")

    algo = trial.suggest_categorical("algo", ["lor", "svc"])

    if algo == "svc":
        # Hyperparameters for SVC
        c = trial.suggest_float("C", 0.001, 1000, log=True)
        kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
        if kernel == 'poly':
            degree = trial.suggest_int("degree", 1, 3)
            model = SVC(C=c, kernel=kernel, degree=degree, random_state=42)
        elif kernel in ['rbf', 'sigmoid']:
            gamma = trial.suggest_categorical("gamma", ['scale', 'auto'])
            model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)
        else:
            model = SVC(C=c, kernel=kernel, random_state=42)
    else:
        # Hyperparameters for Logistic Regression
        solver, penalty = trial.suggest_categorical(
            "choices",
            [
                ("lbfgs", "l2"),
                ("newton-cg", "l2"),
                ("sag", "l2"),
                ("saga", "l1"),
                ("saga", "l2"),
                ("saga", "elasticnet")
            ]
        )
        reg_strength = trial.suggest_float("C", 0.001, 1000, log=True)
        l1_ratio = trial.suggest_float("l1_ratio", 0, 1) if penalty == "elasticnet" else None
        if penalty == "elasticnet":
            model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength,
                                       l1_ratio=l1_ratio, random_state=42)
        else:
            model = LogisticRegression(solver=solver, penalty=penalty, C=reg_strength,
                                       random_state=42)

    # Cross-validation scoring with training and validation
    try:
        scores = cross_validate(
            model, x_train_std, y_train, cv=5,
            scoring="accuracy", return_train_score=True
        )
        train_score = scores["train_score"].mean()
        val_score = scores["test_score"].mean()
        # Append scores to global lists
        training_scores.append(train_score)
        validation_scores.append(val_score)
    except ValueError as e:
        print(f"Error during cross-validation: {e}")
        train_score, val_score = float("-inf"), float("-inf")

    return val_score

# Running the optimization
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

# Plotting training vs. validation scores
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(training_scores, label="Training Score", marker="o")
plt.plot(validation_scores, label="Validation Score", marker="x")
plt.xlabel("Trial")
plt.ylabel("Accuracy")
plt.title("Training vs. Validation Scores Across Trials")
plt.legend()
plt.grid()
plt.show()

# Display best trial
print("Best Parameters:")
print(study.best_params)
""", language="python")

    # NOTE(review): this markdown block's HTML/CSS content was stripped during
    # extraction — kept in place so the page structure is unchanged.
    st.markdown(
        """
""",
        unsafe_allow_html=True,
    )

    # --- Create the best model ----------------------------------------------
    st.markdown("## Create the Model with the best algorithm and parameters you have received by performing Hyperparameter Tuning using Optuna")
    st.markdown("## SVC(kernel='poly', gamma = 'scale', C = 974.1963187644974, degree = 2)")
    # Hyperparameters hard-coded from the (offline) Optuna study shown above.
    model = SVC(kernel='poly', gamma='scale', C=974.1963187644974, degree=2)
    st.write(model)

    # --- Train the model ----------------------------------------------------
    st.markdown("### Train the Model")
    model.fit(x_train_std, y_train)

    # --- Model Evaluation ---------------------------------------------------
    st.markdown("# Model Evaluation")
    # Predict once and reuse for every metric below (previously computed twice).
    y_pred = model.predict(x_test_std)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    st.write("Accuracy:", accuracy)
    st.write("Classification Report:\n", classification_report(y_test, y_pred))
    st.write("Confusion Matrix:\n", conf_matrix)

    # Dict form of the report for tabular/heatmap display.
    class_report = classification_report(y_test, y_pred, output_dict=True)
    # pd.DataFrame(class_report) has metrics as rows and classes as columns;
    # drop the 'support' row and the trailing 'weighted avg' column, then
    # transpose so classes are rows — matching the heatmap axis labels below.
    class_report_df = pd.DataFrame(class_report).iloc[:-1, :-1].T

    st.title("Model Evaluation: Confusion Matrix and Classification Report")

    # Side-by-side heatmaps: confusion matrix (left), report (right).
    fig, axs = plt.subplots(1, 2, figsize=(16, 6))

    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
                ax=axs[0], annot_kws={"size": 14})
    axs[0].set_title("Confusion Matrix", fontsize=16)
    axs[0].set_xlabel("Predicted Labels", fontsize=14)
    axs[0].set_ylabel("True Labels", fontsize=14)

    sns.heatmap(class_report_df, annot=True, fmt=".2f", cmap="YlGnBu", cbar=False,
                ax=axs[1], annot_kws={"size": 12})
    axs[1].set_title("Classification Report", fontsize=16)
    axs[1].set_xlabel("Metrics", fontsize=14)
    axs[1].set_ylabel("Classes", fontsize=14)

    plt.tight_layout()
    st.pyplot(fig)

    st.success(f"**Accuracy:** {accuracy:.2f}")
else:
    st.warning("No Dataset Found")

background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg"

# Apply custom CSS for the background image and overlay.
# NOTE(review): the CSS payload of this f-string was stripped during
# extraction — it presumably interpolated {background_image_url}; the empty
# markdown call is kept so the page structure is unchanged.
st.markdown(
    f"""
""",
    unsafe_allow_html=True,
)

# Page navigation
if st.button("Previous ⏮️"):
    st.switch_page("pages/3_EDA_and_Feature_Engineering.py")

if st.button("Next ⏭️"):
    st.switch_page("pages/5_Conclusion.py")