"""Streamlit dashboard script for an electric-vehicle dataset.

Loads a CSV, imputes missing values, label-encodes text columns, derives a
binary target from 'Electric Range', and ranks features with a random forest.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from streamlit.errors import StreamlitAPIException

# Year that 'Model Year' is subtracted from to obtain 'Vehicle_Age'.
REFERENCE_YEAR = 2025

# The option only silences a warning, so failing to set it is harmless.
# Streamlit releases that no longer define it reject the call instead of
# ignoring it, which would otherwise abort the whole app at startup.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except StreamlitAPIException:
    pass

st.title("Electric Vehicle ML Dashboard (Optimized for Hugging Face)")


# Load data
@st.cache_data
def load_data():
    """Download the dataset CSV and return it as a DataFrame.

    The result is cached by Streamlit, so the download is not repeated on
    every script rerun.
    """
    url = "https://drive.google.com/uc?export=download&id=1QBTnXxORRbJzE5Z2aqKHsVqgB7mqowiN"
    return pd.read_csv(url)


df = load_data()

st.subheader("1. Data Preview")
st.dataframe(df.head())

# Columns without a single observed value cannot be imputed: their mode is
# empty and their median is NaN. Drop them before filling the rest.
df = df.dropna(axis=1, how='all')

# Fill missing values: most frequent value for text columns, median for
# numeric columns.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Encode categories: each text column gets its own integer encoding.
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature engineering
if 'Model Year' in df.columns:
    df['Vehicle_Age'] = REFERENCE_YEAR - df['Model Year']

# Target setup: 1 when the range is above the dataset median, else 0.
if 'Electric Range' not in df.columns:
    st.error("'Electric Range' column missing!")
    st.stop()

df['Target'] = (df['Electric Range'] > df['Electric Range'].median()).astype(int)
y = df['Target']
# 'Electric Range' defines the target, so it must not remain a feature.
X = df.drop(columns=['Electric Range', 'Target'])

# Feature selection via Random Forest
# NOTE(review): the scaler and the forest are fitted on all rows here. If a
# held-out evaluation follows later in the script, fit them on the training
# split only to avoid leakage -- confirm against the rest of the file.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_scaled, y)

# Names of the five features with the highest importance scores.
# NOTE(review): the reviewed excerpt ended mid-token on this statement;
# `.tolist()` is the assumed completion -- confirm against the full file.
top_features = (
    pd.Series(rf.feature_importances_, index=X.columns)
    .nlargest(5)
    .index.tolist()
)