# Spaces: Build error
| import streamlit as st | |
| import pandas as pd | |
| import pickle | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.figure_factory as ff | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.naive_bayes import GaussianNB | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| from datasets import load_dataset | |
# Load Data
def load_data():
    """Load the phone-price train and test sets as pandas DataFrames.

    Returns:
        A ``(train_df, test_df)`` tuple: the ``"train"`` split of
        ``kheejay88/phone_price_classification_train`` followed by the
        ``"test"`` split of ``kheejay88/phone_price_classification_test``,
        each converted with ``to_pandas()``.
    """
    sources = (
        ("kheejay88/phone_price_classification_train", "train"),
        ("kheejay88/phone_price_classification_test", "test"),
    )
    frames = []
    for repo_id, split_name in sources:
        frames.append(load_dataset(repo_id)[split_name].to_pandas())
    return tuple(frames)
# Load both frames at module level; only train_df is used by the rest of the visible script.
train_df, test_df = load_data()
# Data Preprocessing
def preprocess_data(df):
    """Impute numeric gaps and label-encode object columns on a copy of ``df``.

    Args:
        df: Input DataFrame. It is copied first, so the caller's frame is
            not modified.

    Returns:
        A ``(processed_df, label_encoders)`` tuple. ``label_encoders`` maps
        each object-dtype column name to the ``LabelEncoder`` that was fitted
        on it; it is empty when the frame has no object-dtype columns.
    """
    df = df.copy()
    # Restrict the median to numeric columns: with pandas >= 2.0 a plain
    # df.median() raises TypeError as soon as the frame holds a text column,
    # which would abort before the encoding loop below is ever reached.
    # Missing values in non-numeric columns are left as they are.
    df = df.fillna(df.median(numeric_only=True))
    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        # Keep the fitted encoder so the integer codes can be mapped back later.
        label_encoders[col] = le
    return df, label_encoders
# Impute/encode the training frame; the fitted encoders are kept in `encoders`.
train_df, encoders = preprocess_data(train_df)

# Separate the target column from the feature columns
y = train_df['price_range']
X = train_df.drop(columns=['price_range'])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training portion only, then apply it to both portions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Model Training and Evaluation
# Seed the randomized estimators with the same value the train/test split uses,
# so the accuracy table and the selected "best" model are reproducible across runs.
SEED = 42
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "Extra Trees": ExtraTreesClassifier(random_state=SEED),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}

performance = {}     # model name -> accuracy on the held-out split
trained_models = {}  # model name -> fitted estimator
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    performance[name] = acc
    trained_models[name] = model  # Store the trained model

    # Save trained models, one file per model ("Random Forest" -> "Random_Forest.pkl").
    # NOTE(review): the source lost its indentation; the save is placed inside the
    # loop because it uses the per-iteration `name` and `model` — confirm.
    # NOTE(review): only the estimator is pickled. The models were fitted on
    # standardized features, and `scaler` is not persisted here.
    with open(f"{name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(model, f)

# Selecting the best model (on ties, max() keeps the first name in insertion order)
best_model_name = max(performance, key=performance.get)
best_model = trained_models[best_model_name]
# Streamlit UI
# NOTE(review): the leading "π" / "π―" in the headings below look like emoji whose
# UTF-8 bytes were decoded with a Greek code page. The strings are left exactly as
# found; restore the intended emoji from the original file.
st.title("π Machine Learning Model Evaluation App")
st.write("This application evaluates multiple machine learning models for predicting phone price ranges based on various phone specifications.")

# Data Overview
st.write("## π Data Overview")
st.write(train_df.head())

# Data Visualization
st.write("## π Data Visualization")

# Target Distribution: number of rows per price_range class
st.write("### π― Target Distribution")
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_xlabel("Price Range")
ax.set_ylabel("Count")
st.pyplot(fig)
# pyplot keeps every figure it creates registered until it is closed, so release
# this one after rendering; otherwise each script run leaves another figure behind.
plt.close(fig)
# Model Performance: one row per model, highest accuracy first
# NOTE(review): the "π" / "ποΈ" heading prefixes look like mis-decoded emoji; kept as found.
st.write("## π Model Performance")
performance_df = (
    pd.DataFrame.from_dict(performance, orient='index', columns=['Accuracy'])
    .sort_values(by='Accuracy', ascending=False)
)
st.table(performance_df)
st.write(f"### ποΈ Best Model: **{best_model_name}** with accuracy **{performance[best_model_name]:.4f}**")

# Classification Report for the best model on the held-out split
st.write("## π Classification Report")
y_pred_best = best_model.predict(X_test)
report_dict = classification_report(y_test, y_pred_best, output_dict=True)
# Transpose so each class/average becomes a row and each metric a column.
report_df = pd.DataFrame(report_dict).T
st.dataframe(report_df.style.format("{:.2f}"))
# Confusion Matrix
# NOTE(review): the "π₯" heading prefix looks like a mis-decoded emoji; kept as found.
st.write("## π₯ Confusion Matrix")
# Pin the class order explicitly. confusion_matrix() otherwise infers it from the
# classes seen in y_test OR y_pred_best, while the axis labels were taken from
# y_test alone; a class that is predicted but absent from y_test would then make
# the matrix larger than the label list.
class_ids = np.union1d(y_test, y_pred_best)
cm = confusion_matrix(y_test, y_pred_best, labels=class_ids)
labels = [str(class_id) for class_id in class_ids]  # axis labels as a list of strings
fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=labels,
    y=labels,
    annotation_text=cm.astype(str),  # Show exact values inside the heatmap
    colorscale='Blues',
    showscale=True,
)
# Rows of cm are the true classes and columns the predicted ones; say so on the axes.
fig_cm.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
st.plotly_chart(fig_cm)