Spaces:

kheejay88
/

phone_price_category_classification

Build error

App Files Files Community

phone_price_category_classification / app.py

kheejay88

444

ceeb6ed verified about 1 year ago

raw

history blame contribute delete

4.38 kB

	import streamlit as st
	import pandas as pd
	import pickle
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.figure_factory as ff
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
	from datasets import load_dataset

	# Load Data
	@st.cache_data
	def load_data():
	    """Load the phone-price train and test splits as pandas DataFrames.

	    Both datasets are fetched with ``datasets.load_dataset`` and converted
	    with ``to_pandas()``. The function is wrapped in ``st.cache_data``.

	    Returns:
	        tuple: ``(train_df, test_df)``.
	    """
	    train_split = load_dataset("kheejay88/phone_price_classification_train")["train"]
	    test_split = load_dataset("kheejay88/phone_price_classification_test")["test"]
	    return train_split.to_pandas(), test_split.to_pandas()

	# Fetch both splits through the loader and bind them at module scope.
	dataset_frames = load_data()
	train_df, test_df = dataset_frames

	# Data Preprocessing
	def preprocess_data(df):
	    """Impute numeric gaps and label-encode object columns on a copy of ``df``.

	    Args:
	        df: Input DataFrame. It is copied first, so the caller's frame is
	            left untouched.

	    Returns:
	        tuple: ``(processed_df, label_encoders)`` where ``label_encoders``
	        maps each object-dtype column name to the ``LabelEncoder`` that was
	        fitted on it.
	    """
	    df = df.copy()

	    # Restrict the median to numeric columns. Since pandas 2.0 a plain
	    # ``df.median()`` raises TypeError as soon as an object column is
	    # present, which is exactly the input the encoding loop below expects.
	    # Only numeric columns are imputed; object columns are passed to the
	    # encoder as they are.
	    numeric_medians = df.median(numeric_only=True)
	    df = df.fillna(numeric_medians)

	    label_encoders = {}
	    for col in df.select_dtypes(include=['object']).columns:
	        le = LabelEncoder()
	        df[col] = le.fit_transform(df[col])
	        # Keep the fitted encoder so the integer codes can be mapped back.
	        label_encoders[col] = le

	    return df, label_encoders

	# Impute/encode the training frame and keep the fitted encoders around.
	train_df, encoders = preprocess_data(train_df)

	# Target is the price_range column; every other column is a predictor.
	y = train_df['price_range']
	X = train_df.drop('price_range', axis=1)

	# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, random_state=42
	)

	# Fit the scaler on the training rows only, then apply it to both splits.
	scaler = StandardScaler().fit(X_train)
	X_train = scaler.transform(X_train)
	X_test = scaler.transform(X_test)

	# Candidate classifiers, each instantiated with its default arguments.
	candidate_classes = [
	    ("Logistic Regression", LogisticRegression),
	    ("Random Forest", RandomForestClassifier),
	    ("Gradient Boosting", GradientBoostingClassifier),
	    ("AdaBoost", AdaBoostClassifier),
	    ("Extra Trees", ExtraTreesClassifier),
	    ("SVC", SVC),
	    ("Decision Tree", DecisionTreeClassifier),
	    ("K-Nearest Neighbors", KNeighborsClassifier),
	    ("Naive Bayes", GaussianNB),
	]
	models = {label: estimator_cls() for label, estimator_cls in candidate_classes}

	performance = {}      # model name -> accuracy on the held-out split
	trained_models = {}   # model name -> fitted estimator

	for name, model in models.items():
	    # Fit on the scaled training split and score on the held-out split.
	    model.fit(X_train, y_train)
	    performance[name] = accuracy_score(y_test, model.predict(X_test))
	    trained_models[name] = model

	    # Persist each fitted model next to the script, spaces -> underscores.
	    with open(f"{name.replace(' ', '_')}.pkl", "wb") as f:
	        pickle.dump(model, f)

	# The winner is the entry with the highest accuracy (first one on ties).
	best_model_name, _ = max(performance.items(), key=lambda entry: entry[1])
	best_model = trained_models[best_model_name]

	# Page title and a one-paragraph description of the app.
	app_intro = "This application evaluates multiple machine learning models for predicting phone price ranges based on various phone specifications."
	st.title("📊 Machine Learning Model Evaluation App")
	st.write(app_intro)

	# Preview the first rows of the training frame. train_df has already been
	# through preprocess_data at this point, so the preprocessed values are shown.
	preview_rows = train_df.head()
	st.write("## 🔍 Data Overview")
	st.write(preview_rows)

	# Data Visualization
	st.write("## 📈 Data Visualization")

	# Target Distribution: one bar per price_range value over the whole
	# training frame (y is taken before the train/test split).
	st.write("### 🎯 Target Distribution")
	fig, ax = plt.subplots(figsize=(6, 4))
	sns.countplot(x=y, ax=ax)
	ax.set_xlabel("Price Range")
	ax.set_ylabel("Count")
	st.pyplot(fig)
	# pyplot keeps every figure it creates registered until it is closed.
	# Streamlit re-executes this script on each rerun, so without an explicit
	# close the process would accumulate one more open figure every time.
	plt.close(fig)

	# Leaderboard: one row per model, best accuracy first.
	st.write("## 🏆 Model Performance")
	performance_df = (
	    pd.DataFrame.from_dict(performance, orient='index', columns=['Accuracy'])
	    .sort_values(by='Accuracy', ascending=False)
	)
	st.table(performance_df)

	# Call out the winning model with its accuracy to four decimals.
	st.write(f"### 🎖️ Best Model: {best_model_name} with accuracy {performance[best_model_name]:.4f}")

	# Per-class precision/recall/F1 of the best model on the held-out split.
	st.write("## 📊 Classification Report")
	y_pred_best = best_model.predict(X_test)
	report_dict = classification_report(y_test, y_pred_best, output_dict=True)
	# Transpose so each class/average becomes a row and each metric a column.
	report_df = pd.DataFrame(report_dict).T
	styled_report = report_df.style.format("{:.2f}")
	st.dataframe(styled_report)

	# Confusion Matrix
	st.write("## 🔥 Confusion Matrix")
	# Derive the class list from both the true and the predicted labels and pass
	# it to confusion_matrix explicitly. By default confusion_matrix uses every
	# class seen in either array, so labels taken from y_test alone could be
	# shorter than the matrix and break the heatmap axes.
	class_values = np.unique(
	    np.concatenate([np.asarray(y_test), np.asarray(y_pred_best)])
	)
	cm = confusion_matrix(y_test, y_pred_best, labels=class_values)
	labels = list(map(str, class_values))  # axis tick labels as strings

	fig_cm = ff.create_annotated_heatmap(
	    z=cm,
	    x=labels,
	    y=labels,
	    annotation_text=cm.astype(str),  # show the exact counts in each cell
	    colorscale='Blues',
	    showscale=True
	)

	st.plotly_chart(fig_cm)