| | import streamlit as st |
| | import pandas as pd |
| | import numpy as np |
| | import seaborn as sns |
| | import matplotlib.pyplot as plt |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.preprocessing import StandardScaler |
| | from sklearn.neighbors import KNeighborsClassifier |
| | from sklearn.ensemble import RandomForestClassifier |
| | from lightgbm import LGBMClassifier |
| | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score |
| | from sklearn.datasets import load_wine |
| |
|
| | |
# ---------------------------------------------------------------------------
# Page configuration and header — must run before any other st.* call.
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title="Wine Quality Analysis",
    page_icon="🍇",
    layout="wide",
)

st.title("🍇 Wine Quality Analysis Dashboard")
st.markdown("""
This dashboard analyzes wine quality data using different machine learning models.
The dataset includes various wine attributes and their classifications.
""")
| |
|
| | |
@st.cache_data
def load_data():
    """Load the sklearn wine dataset once and cache the result.

    Returns:
        A pair ``(frame, raw)`` where ``frame`` is a DataFrame of the wine
        features with an added integer ``'class'`` target column, and ``raw``
        is the original dataset object (kept for its ``feature_names`` /
        ``target_names`` metadata used by the plotting pages).
    """
    raw = load_wine()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame['class'] = raw.target
    return frame, raw


df, wine_data = load_data()
| |
|
| | |
# Sidebar navigation — `page` selects which branch below is rendered.
_PAGES = ["Data Overview", "Exploratory Analysis", "Model Training", "Model Comparison"]
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", _PAGES)
| |
|
| | |
if page == "Data Overview":
    st.header("Dataset Overview")

    # Headline metrics for the loaded dataset.
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric(
            label="Total Records",
            value=f"{len(df):,}"
        )

    with col2:
        # Every column except the trailing 'class' target is a feature.
        st.metric(
            label="Features",
            value=len(df.columns) - 1
        )

    with col3:
        st.metric(
            label="Target Classes",
            value=df['class'].nunique()
        )

    with col4:
        # Cast to a plain int so the metric shows "0" rather than a numpy scalar repr.
        st.metric(
            label="Missing Values",
            value=int(df.isnull().sum().sum())
        )

    st.write("")

    st.subheader("Sample Data")
    st.dataframe(
        df.head(),
        use_container_width=True,
        height=230
    )

    st.subheader("Target Class Distribution")

    col1, col2 = st.columns([2, 1])

    with col1:
        fig, ax = plt.subplots(figsize=(10, 6))
        # hue=x + legend=False keeps the per-class palette colors without the
        # FutureWarning that seaborn >= 0.13 emits for `palette` without `hue`.
        # Draw on the created `ax` explicitly instead of the implicit pyplot state.
        sns.countplot(data=df, x='class', hue='class', palette='rocket',
                      legend=False, ax=ax)
        ax.set_title('Distribution of Wine Classes')
        st.pyplot(fig)
        plt.close(fig)  # free the figure; Streamlit reruns would otherwise accumulate open figures

    with col2:
        st.write("")
        st.write("")
        class_distribution = df['class'].value_counts()
        for class_name, count in class_distribution.items():
            st.metric(
                label=f"Class {class_name}",
                value=count
            )
| |
|
| | |
| | elif page == "Exploratory Analysis": |
| | st.header("Exploratory Data Analysis") |
| |
|
| | |
| | st.subheader("Feature Distributions") |
| | feature_to_plot = st.selectbox("Select Feature", df.columns[:-1]) |
| |
|
| | fig, ax = plt.subplots(figsize=(10, 6)) |
| | sns.histplot(data=df, x=feature_to_plot, kde=True, color='purple') |
| | plt.title(f'Distribution of {feature_to_plot}') |
| | plt.xticks(rotation=45) |
| | st.pyplot(fig) |
| |
|
| | |
| | st.subheader("Correlation Heatmap") |
| | fig, ax = plt.subplots(figsize=(12, 8)) |
| | sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm') |
| | plt.title('Feature Correlation Heatmap') |
| | st.pyplot(fig) |
| |
|
| | |
| | elif page == "Model Training": |
| | st.header("Model Training and Evaluation") |
| |
|
| | |
| | X = df.drop('class', axis=1) |
| | y = df['class'] |
| |
|
| | |
| | test_size = st.slider("Select Test Size", 0.1, 0.4, 0.2, 0.05) |
| | X_train, X_test, y_train, y_test = train_test_split( |
| | X, y, test_size=test_size, random_state=42, stratify=y |
| | ) |
| |
|
| | |
| | scaler = StandardScaler() |
| | X_train_scaled = scaler.fit_transform(X_train) |
| | X_test_scaled = scaler.transform(X_test) |
| |
|
| | |
| | model_choice = st.selectbox( |
| | "Select Model", |
| | ["KNN", "Random Forest", "LightGBM"] |
| | ) |
| |
|
| | if st.button("Train Model"): |
| | with st.spinner("Training model..."): |
| | if model_choice == "KNN": |
| | model = KNeighborsClassifier(n_neighbors=5) |
| | elif model_choice == "Random Forest": |
| | model = RandomForestClassifier(n_estimators=100, random_state=42) |
| | else: |
| | model = LGBMClassifier(n_estimators=100, random_state=42) |
| |
|
| | model.fit(X_train_scaled, y_train) |
| | y_pred = model.predict(X_test_scaled) |
| |
|
| | |
| | col1, col2 = st.columns(2) |
| |
|
| | with col1: |
| | st.subheader("Model Performance") |
| | accuracy = accuracy_score(y_test, y_pred) |
| | st.metric(label="Accuracy", value=f"{accuracy:.4f}") |
| | st.text("Classification Report:") |
| | st.text(classification_report(y_test, y_pred)) |
| |
|
| | with col2: |
| | st.subheader("Confusion Matrix") |
| | fig, ax = plt.subplots(figsize=(8, 6)) |
| | sns.heatmap( |
| | confusion_matrix(y_test, y_pred), |
| | annot=True, |
| | fmt='d', |
| | cmap='Blues', |
| | xticklabels=wine_data.target_names, |
| | yticklabels=wine_data.target_names |
| | ) |
| | plt.title(f'{model_choice} Confusion Matrix') |
| | plt.xlabel('Predicted') |
| | plt.ylabel('Actual') |
| | st.pyplot(fig) |
| |
|
| | |
| | if model_choice in ["Random Forest", "LightGBM"]: |
| | st.subheader("Feature Importance") |
| | feature_importance = pd.Series( |
| | model.feature_importances_, index=wine_data.feature_names |
| | ).sort_values(ascending=False) |
| |
|
| | fig, ax = plt.subplots(figsize=(10, 6)) |
| | sns.barplot( |
| | data=feature_importance.reset_index(), |
| | x=0, |
| | y='index', |
| | palette='viridis' |
| | ) |
| | plt.title('Top Features by Importance') |
| | plt.xlabel('Importance') |
| | plt.ylabel('Feature') |
| | st.pyplot(fig) |
| |
|
| | |
| | else: |
| | st.header("Model Comparison") |
| |
|
| | if st.button("Compare All Models"): |
| | with st.spinner("Training all models..."): |
| | |
| | X = df.drop('class', axis=1) |
| | y = df['class'] |
| |
|
| | |
| | X_train, X_test, y_train, y_test = train_test_split( |
| | X, y, test_size=0.2, random_state=42, stratify=y |
| | ) |
| |
|
| | |
| | scaler = StandardScaler() |
| | X_train_scaled = scaler.fit_transform(X_train) |
| | X_test_scaled = scaler.transform(X_test) |
| |
|
| | |
| | models = { |
| | "KNN": KNeighborsClassifier(n_neighbors=5), |
| | "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42), |
| | "LightGBM": LGBMClassifier(n_estimators=100, random_state=42) |
| | } |
| |
|
| | results = {} |
| | for name, model in models.items(): |
| | model.fit(X_train_scaled, y_train) |
| | y_pred = model.predict(X_test_scaled) |
| | results[name] = { |
| | 'accuracy': accuracy_score(y_test, y_pred), |
| | 'predictions': y_pred |
| | } |
| |
|
| | |
| | st.subheader("Accuracy Comparison") |
| | accuracy_df = pd.DataFrame({ |
| | 'Model': list(results.keys()), |
| | 'Accuracy': [results[model]['accuracy'] for model in results.keys()] |
| | }) |
| |
|
| | col1, col2 = st.columns(2) |
| |
|
| | with col1: |
| | st.dataframe(accuracy_df) |
| |
|
| | with col2: |
| | fig, ax = plt.subplots(figsize=(10, 6)) |
| | sns.barplot( |
| | data=accuracy_df, |
| | x='Model', |
| | y='Accuracy', |
| | palette='rocket' |
| | ) |
| | plt.title('Model Accuracy Comparison') |
| | plt.ylim(0, 1) |
| | st.pyplot(fig) |
| |
|
| | |
| | st.subheader("Detailed Model Performance") |
| | for name in results.keys(): |
| | st.write(f"\n{name}:") |
| | st.text(classification_report(y_test, results[name]['predictions'])) |
| |
|
| | fig, ax = plt.subplots(figsize=(8, 6)) |
| | sns.heatmap( |
| | confusion_matrix(y_test, results[name]['predictions']), |
| | annot=True, |
| | fmt='d', |
| | cmap='Blues', |
| | xticklabels=wine_data.target_names, |
| | yticklabels=wine_data.target_names |
| | ) |
| | plt.title(f'{name} Confusion Matrix') |
| | plt.xlabel('Predicted') |
| | plt.ylabel('Actual') |
| | st.pyplot(fig) |
| |
|
| | |
| | if name in ["Random Forest", "LightGBM"]: |
| | st.subheader(f"{name} Feature Importance") |
| | feature_importance = pd.Series( |
| | models[name].feature_importances_, index=wine_data.feature_names |
| | ).sort_values(ascending=False) |
| |
|
| | fig, ax = plt.subplots(figsize=(10, 6)) |
| | sns.barplot( |
| | data=feature_importance.reset_index(), |
| | x=0, |
| | y='index', |
| | palette='viridis' |
| | ) |
| | plt.title(f'{name} Feature Importance') |
| | plt.xlabel('Importance') |
| | plt.ylabel('Feature') |
| | st.pyplot(fig) |
| | |
# Page footer, rendered regardless of the selected page.
st.markdown("""
---
Created with ❤️ using Streamlit
""")
| |
|