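# NOTE: the next three lines are web-page residue captured with the file
# (uploader name, like count, commit hash); kept as comments so the file parses.
# kheejay88's picture
# 444
# ceeb6ed verified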
import streamlit as st
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datasets import load_dataset
# Load Data
@st.cache_data
def load_data():
    """Download the phone-price train and test sets and return them as frames.

    The function is wrapped in ``st.cache_data``.

    Returns:
        tuple: ``(train_df, test_df)``, each produced by ``to_pandas()`` on
        the corresponding dataset split.
    """
    # (repository id, split name) pairs, processed in this order.
    sources = (
        ("kheejay88/phone_price_classification_train", "train"),
        ("kheejay88/phone_price_classification_test", "test"),
    )
    train_frame, test_frame = (
        load_dataset(repo_id)[split_name].to_pandas()
        for repo_id, split_name in sources
    )
    return train_frame, test_frame


train_df, test_df = load_data()
# Data Preprocessing
def preprocess_data(df):
    """Impute missing numeric values and label-encode object columns.

    The input frame is not modified; all work happens on a new frame.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        tuple: ``(frame, label_encoders)`` where ``frame`` is the processed
        copy and ``label_encoders`` maps each encoded column name to the
        ``LabelEncoder`` fitted on it (empty when there are no object columns).
    """
    # numeric_only=True is required: on pandas >= 2.0 a plain median() raises
    # TypeError as soon as a non-numeric column is present, and such columns
    # are exactly what the encoding loop below is here to handle.
    # fillna() without inplace returns a new frame, so the caller's data is
    # left untouched.
    df = df.fillna(df.median(numeric_only=True))

    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # kept so the mapping can be reused later
    return df, label_encoders
# Impute and encode the raw training frame; train_df is re-bound to the result.
train_df, encoders = preprocess_data(train_df)

# Separate the label column from the predictors.
y = train_df['price_range']
X = train_df.drop(columns=['price_range'])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardise the features: statistics are learned from the training part
# only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Model Training and Evaluation
# Candidate classifiers, keyed by the display name shown in the UI.
# Every estimator that draws random numbers is seeded so that the accuracy
# table and the "best model" choice are reproducible between runs; the value
# matches the random_state used for the train/test split.
_SEED = 42
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=_SEED),
    "Extra Trees": ExtraTreesClassifier(random_state=_SEED),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=_SEED),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}
# Fit every candidate, score it on the held-out rows and persist it to disk.
performance = {}     # display name -> accuracy on X_test
trained_models = {}  # display name -> fitted estimator
for name, model in models.items():
    model.fit(X_train, y_train)
    performance[name] = accuracy_score(y_test, model.predict(X_test))
    trained_models[name] = model

    # One pickle per model in the working directory; spaces in the display
    # name become underscores in the file name.
    pickle_path = name.replace(' ', '_') + ".pkl"
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(model, pickle_file)

# The winner is the entry with the highest held-out accuracy
# (the first one encountered wins a tie).
best_model_name = max(performance, key=lambda label: performance[label])
best_model = trained_models[best_model_name]
# Streamlit UI
# The emoji in the headings were stored as mis-decoded UTF-8 (mojibake) and
# rendered as garbage; they are restored to the intended characters here.
st.title("📊 Machine Learning Model Evaluation App")
st.write("This application evaluates multiple machine learning models for predicting phone price ranges based on various phone specifications.")
# Data Overview
st.write("## 🔍 Data Overview")
# train_df was re-bound to the preprocessed frame earlier in the script, so
# this preview shows the imputed/encoded values.
st.write(train_df.head())
# Data Visualization
# The emoji in the headings were stored as mis-decoded UTF-8 (mojibake);
# they are restored to the intended characters here.
st.write("## 📈 Data Visualization")
# Target Distribution
st.write("### 🎯 Target Distribution")
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)  # one bar per price_range class
ax.set_xlabel("Price Range")
ax.set_ylabel("Count")
st.pyplot(fig)
# Streamlit re-executes this script on every rerun; without an explicit close
# each run would leave another figure registered with pyplot.
plt.close(fig)
# Model Performance
# The emoji in the headings were stored as mis-decoded UTF-8 (mojibake);
# they are restored to the intended characters here.
st.write("## 🏆 Model Performance")
# One row per model, best accuracy first.
performance_df = pd.DataFrame.from_dict(performance, orient='index', columns=['Accuracy'])
performance_df = performance_df.sort_values(by='Accuracy', ascending=False)
st.table(performance_df)
st.write(f"### 🎖️ Best Model: **{best_model_name}** with accuracy **{performance[best_model_name]:.4f}**")
# Classification Report
# The emoji in the heading was stored as mis-decoded UTF-8 (mojibake);
# it is restored to the intended character here.
st.write("## 📊 Classification Report")
# Predictions of the best model on the held-out rows.
y_pred_best = best_model.predict(X_test)
# output_dict=True yields a nested dict; transposed so each class/average is a row.
report_dict = classification_report(y_test, y_pred_best, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
st.dataframe(report_df.style.format("{:.2f}"))
# Confusion Matrix
# The emoji in the heading was stored as mis-decoded UTF-8 (mojibake);
# it is restored to the intended character here.
st.write("## 🔥 Confusion Matrix")
# Build the class list from BOTH the true and the predicted labels and hand it
# to confusion_matrix explicitly. Deriving the axis labels from y_test alone
# breaks the heatmap whenever the model predicts a class that is absent from
# y_test, because the matrix then has more rows/columns than labels.
class_values = np.unique(list(y_test) + list(y_pred_best))
cm = confusion_matrix(y_test, y_pred_best, labels=class_values)
labels = [str(value) for value in class_values]  # axis tick labels as strings
# Rows of cm (heatmap y axis) are true classes, columns (x axis) are predictions.
fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=labels,
    y=labels,
    annotation_text=cm.astype(str),  # Show exact values inside the heatmap
    colorscale='Blues',
    showscale=True
)
st.plotly_chart(fig_cm)