# Seattle Weather ML dashboard — Streamlit application source.
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.naive_bayes import GaussianNB | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, confusion_matrix | |
| from sklearn.preprocessing import StandardScaler | |
# Configure the Streamlit page; st.set_page_config must be the first
# Streamlit call in the script.
st.set_page_config(
    page_title="Seattle Weather Analysis",
    page_icon="🌦️",
    layout="wide"
)
# Title and introduction shown at the top of every page of the app.
st.title("🌦️ Seattle Weather Machine Learning")
st.markdown("""
This dashboard analyzes Seattle weather data using different machine learning models.
The dataset includes weather attributes and their classification.
""")
| # ====================== | |
| # Helper Functions | |
| # ====================== | |
def get_dataset_overview(df):
    """Return headline statistics for the raw weather DataFrame.

    The returned dict maps display labels to: row count, feature count
    (the 'weather' target column is not counted), number of distinct
    weather classes, and the total number of missing cells.
    """
    n_rows, n_cols = df.shape
    return {
        "Total Records": n_rows,
        "Features": n_cols - 1,  # exclude the 'weather' target column
        "Target Classes": len(df['weather'].unique()),
        "Missing Values": df.isnull().sum().sum(),
    }
def load_data():
    """Load and preprocess the Seattle weather dataset.

    Reads 'seattle-weather.csv', drops the 'date' column, and encodes the
    'weather' labels as integers via weather_mapping.

    Returns:
        (df, df_cleaned, X, y, X_train, X_test, y_train, y_test,
        weather_mapping) where X is the unscaled feature frame (used for
        correlation/feature-importance display) and the train/test splits
        are standardized.

    Note: the StandardScaler is fit on the TRAINING split only and then
    applied to the test split — fitting it on the full dataset (as a
    naive implementation would) leaks test-set mean/std into training.
    """
    df = pd.read_csv('seattle-weather.csv')
    df_cleaned = df.drop(columns=['date'])
    weather_mapping = {'drizzle': 0, 'rain': 1, 'sun': 2, 'snow': 3, 'fog': 4}
    df_cleaned['weather_encoded'] = df_cleaned['weather'].map(weather_mapping)

    # Split features and target
    X = df_cleaned.drop(columns=['weather', 'weather_encoded'])
    y = df_cleaned['weather_encoded']

    # Split FIRST, then scale: scaler statistics must come from the
    # training data only to avoid test-set leakage.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )
    return df, df_cleaned, X, y, X_train, X_test, y_train, y_test, weather_mapping
def plot_weather_distribution(df):
    """Render a count plot of the 'weather' column in the Streamlit app."""
    fig, ax = plt.subplots()
    # hue= + legend=False replaces the deprecated bare `palette=` usage
    # (seaborn >= 0.13 warns when palette is passed without hue).
    sns.countplot(x='weather', hue='weather', data=df, palette='viridis',
                  legend=False, ax=ax)
    ax.set_title("Distribution of Weather Types")
    st.pyplot(fig)
    # Close the figure so repeated Streamlit reruns don't accumulate
    # open matplotlib figures.
    plt.close(fig)
def plot_temp_relationship(df):
    """Scatter-plot temp_max vs temp_min, colored by weather type."""
    fig, ax = plt.subplots()
    sns.scatterplot(x='temp_max', y='temp_min', hue='weather', data=df, ax=ax)
    ax.set_title("Relationship Between Temp_max and Temp_min")
    st.pyplot(fig)
    # Close the figure so repeated Streamlit reruns don't leak figures.
    plt.close(fig)
def train_models(X_train, X_test, y_train, y_test):
    """Fit Naive Bayes, Decision Tree, and Random Forest classifiers.

    Returns a dict keyed by model name, each entry holding the fitted
    model, its test-set accuracy, 5-fold CV mean/std on the training
    split, and the test-set predictions.
    """
    candidates = (
        ('Naive Bayes', GaussianNB()),
        ('Decision Tree', DecisionTreeClassifier(random_state=42, max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    )
    results = {}
    for name, clf in candidates:
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        results[name] = {
            'model': clf,
            'accuracy': accuracy_score(y_test, predictions),
            'cv_mean': scores.mean(),
            'cv_std': scores.std(),
            'pred': predictions,
        }
    return results
def plot_confusion_matrix(y_test, y_pred, model_name, weather_mapping):
    """Render a labeled confusion-matrix heatmap for one model.

    Passing labels= pins the matrix to all five encoded classes in
    weather_mapping order; without it, any class absent from y_test or
    y_pred would shrink the matrix and misalign the tick labels.
    """
    fig, ax = plt.subplots()
    class_names = list(weather_mapping.keys())
    conf_matrix = confusion_matrix(
        y_test, y_pred, labels=list(weather_mapping.values())
    )
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names, ax=ax)
    ax.set_title(f"Confusion Matrix - {model_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # Close the figure so Streamlit reruns don't accumulate figures.
    plt.close(fig)
def plot_feature_importance(model, X, model_name):
    """Plot sorted feature importances for tree-based models.

    Silently does nothing for models without a feature_importances_
    attribute (e.g. GaussianNB).
    """
    if not hasattr(model, "feature_importances_"):
        return
    fig, ax = plt.subplots()
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
    }).sort_values('Importance', ascending=False)
    # hue= + legend=False replaces the deprecated bare `palette=` usage
    # (seaborn >= 0.13 warns when palette is passed without hue).
    sns.barplot(x='Importance', y='Feature', hue='Feature',
                data=feature_importance, palette='viridis',
                legend=False, ax=ax)
    ax.set_title(f"{model_name} Feature Importance")
    st.pyplot(fig)
    # Close the figure so Streamlit reruns don't accumulate figures.
    plt.close(fig)
| # ====================== | |
| # Main App | |
| # ====================== | |
def _render_overview(df):
    """'Data Overview' page: headline metrics, sample rows, class balance, stats."""
    st.header("Dataset Overview")
    overview = get_dataset_overview(df)
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Records", overview["Total Records"])
    col2.metric("Features", overview["Features"])
    col3.metric("Target Classes", overview["Target Classes"])
    col4.metric("Missing Values", overview["Missing Values"])

    st.subheader("First Few Rows")
    st.dataframe(df.head())

    st.subheader("Weather Type Distribution")
    weather_dist = df['weather'].value_counts()
    col1, col2 = st.columns(2)
    col1.dataframe(weather_dist)
    fig, ax = plt.subplots()
    weather_dist.plot(kind='pie', autopct='%1.1f%%', ax=ax)
    ax.set_ylabel('')
    ax.set_title("Weather Type Percentage")
    col2.pyplot(fig)
    plt.close(fig)  # avoid figure accumulation across Streamlit reruns

    st.subheader("Descriptive Statistics")
    st.dataframe(df.describe())


def _render_visualization(df, X, y):
    """'Data Visualization' page: user-selected exploratory plot."""
    st.header("Weather Data Visualizations")
    viz_option = st.selectbox("Choose Visualization", [
        "Weather Type Distribution",
        "Temperature Relationship",
        "Correlation Heatmap"
    ])
    if viz_option == "Weather Type Distribution":
        plot_weather_distribution(df)
    elif viz_option == "Temperature Relationship":
        plot_temp_relationship(df)
    elif viz_option == "Correlation Heatmap":
        fig, ax = plt.subplots(figsize=(10, 8))
        # Correlate the (unscaled) features together with the encoded target.
        corr_matrix = pd.concat([X, y], axis=1).corr()
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm",
                    vmin=-1, vmax=1, ax=ax)
        ax.set_title("Correlation Heatmap")
        st.pyplot(fig)
        plt.close(fig)


def _render_training(X, X_train, X_test, y_train, y_test, weather_mapping):
    """'Model Training' page: per-model metrics, confusion matrix, importances."""
    st.header("Machine Learning Models")
    results = train_models(X_train, X_test, y_train, y_test)
    model_select = st.selectbox("Choose Model", list(results.keys()))
    model_result = results[model_select]
    st.write(f"{model_select} Results:")
    st.write(f"Test Accuracy: {model_result['accuracy']:.4f}")
    st.write(f"Cross-Validation Mean Accuracy: {model_result['cv_mean']:.4f}")
    st.write(f"Cross-Validation Std: {model_result['cv_std']:.4f}")
    plot_confusion_matrix(y_test, model_result['pred'], model_select, weather_mapping)
    # Naive Bayes has no feature_importances_; the tree models do.
    if model_select != 'Naive Bayes':
        plot_feature_importance(model_result['model'], X, model_select)


def _render_comparison(X_train, X_test, y_train, y_test):
    """'Model Comparison' page: side-by-side accuracy table and bar charts."""
    st.header("Model Performance Comparison")
    results = train_models(X_train, X_test, y_train, y_test)
    comparison_df = pd.DataFrame({
        'Model': list(results.keys()),
        'Test Accuracy': [r['accuracy'] for r in results.values()],
        'CV Mean Accuracy': [r['cv_mean'] for r in results.values()],
        'CV Std': [r['cv_std'] for r in results.values()]
    })
    st.dataframe(comparison_df)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    sns.barplot(x='Model', y='Test Accuracy', data=comparison_df, ax=ax1)
    ax1.set_title('Test Accuracy Comparison')
    ax1.tick_params(axis='x', rotation=45)
    sns.barplot(x='Model', y='CV Mean Accuracy', data=comparison_df, ax=ax2)
    # ±2-std error bars around each model's CV mean accuracy.
    ax2.errorbar(x=range(len(comparison_df)),
                 y=comparison_df['CV Mean Accuracy'],
                 yerr=comparison_df['CV Std'] * 2,
                 fmt='none', color='black', capsize=5)
    ax2.set_title('Cross-validation Accuracy')
    ax2.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


def main():
    """App entry point: load data once, then dispatch to the selected page."""
    (df, df_cleaned, X, y,
     X_train, X_test, y_train, y_test, weather_mapping) = load_data()
    menu = st.sidebar.selectbox("Choose Analysis", [
        "Data Overview",
        "Data Visualization",
        "Model Training",
        "Model Comparison"
    ])
    if menu == "Data Overview":
        _render_overview(df)
    elif menu == "Data Visualization":
        _render_visualization(df, X, y)
    elif menu == "Model Training":
        _render_training(X, X_train, X_test, y_train, y_test, weather_mapping)
    elif menu == "Model Comparison":
        _render_comparison(X_train, X_test, y_train, y_test)


if __name__ == "__main__":
    main()