"""SimpleML: a small Streamlit app for training a quick ML model on tabular data.

The script loads a CSV (or a generated demo dataset), lets the user pick a
target column and feature columns, detects whether the problem is a
classification or a regression task, trains the chosen scikit-learn model on
an 80/20 split, shows evaluation metrics and plots, and finally offers an
interactive single-row prediction form.
"""

import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Metadata
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# A numeric target with at most this many distinct values is treated as a
# classification target; anything above is treated as regression.
MAX_NUMERIC_CLASSES = 10

# Target column generated by create_demo_data() for each demo task.
DEMO_TARGETS = {"classification": "Purchase", "regression": "Salary"}

# Page config (must be the first Streamlit command executed)
st.set_page_config(
    page_title="SimpleML",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Title
st.title("🤖 SimpleML")
st.markdown(f"**Author:** {AUTHOR} | **Email:** {EMAIL} | **License:** {LICENSE}")
st.write("""
Upload a CSV or use the demo dataset to train a machine learning model (classification or regression) in seconds.
""")


# === GENERATE DEMO DATASET ===
@st.cache_data
def create_demo_data(task="classification"):
    """Build a reproducible 500-row synthetic dataset for the demo buttons.

    Args:
        task: ``"classification"`` adds a binary ``Purchase`` column;
            ``"regression"`` adds a continuous ``Salary`` column.

    Returns:
        A DataFrame with the columns Age, Income, Experience,
        Education_Level and City, plus the task-specific target column.

    Raises:
        ValueError: If ``task`` is neither ``"classification"`` nor
            ``"regression"``.
    """
    if task not in ("classification", "regression"):
        raise ValueError(f"Unknown task: {task!r}")

    # Fixed seed so the demo data (and therefore the demo metrics) are stable.
    np.random.seed(42)
    n = 500
    data = {
        "Age": np.random.normal(35, 12, n).astype(int),
        "Income": np.random.normal(45000, 15000, n),
        "Experience": np.random.randint(0, 20, n),
        "Education_Level": np.random.choice(["High School", "Bachelor", "Master", "PhD"], n),
        "City": np.random.choice(["Madrid", "Barcelona", "Valencia", "Seville"], n),
    }
    df = pd.DataFrame(data)

    if task == "classification":
        # Create binary target: Purchase (0/1)
        purchase_prob = (
            0.3
            + (df["Income"] > df["Income"].median()) * 0.4
            + (df["Experience"] > 10) * 0.2
            + (df["Education_Level"] == "Master") * 0.1
            + (df["Education_Level"] == "PhD") * 0.15
        )
        # The summed probability can exceed 1, hence the clip.
        df["Purchase"] = np.random.binomial(1, np.clip(purchase_prob, 0, 1), n)
        return df

    # Create continuous target: Salary
    df["Salary"] = (
        25000
        + df["Experience"] * 1500
        + (df["Income"] / 100)
        + (df["Age"] * 100)
        + (df["Education_Level"] == "Master") * 8000
        + (df["Education_Level"] == "PhD") * 15000
        + np.random.normal(0, 5000, n)
    )
    return df


# === LOAD DATA ===
if "demo_loaded" not in st.session_state:
    st.session_state.demo_loaded = False
    st.session_state.task_type = "classification"

if st.button("🧪 Load Classification Demo Dataset"):
    st.session_state.demo_loaded = True
    st.session_state.task_type = "classification"
    st.session_state.df = create_demo_data("classification")
    st.success("✅ Classification demo loaded!")

if st.button("🧪 Load Regression Demo Dataset"):
    st.session_state.demo_loaded = True
    st.session_state.task_type = "regression"
    st.session_state.df = create_demo_data("regression")
    st.success("✅ Regression demo loaded!")

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

# Use demo or uploaded file.
# NOTE: an uploaded file takes precedence over a demo dataset for as long as
# it stays in the uploader widget.
if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        st.stop()
    if df.empty:
        st.error("The uploaded CSV file contains no data rows.")
        st.stop()
    st.session_state.df = df
    st.session_state.demo_loaded = False
    st.success("✅ File uploaded successfully.")
elif "df" in st.session_state:
    df = st.session_state.df
    task_type = st.session_state.task_type
    if st.session_state.demo_loaded:
        st.info(f"Using **{task_type}** demo dataset.")
else:
    df = None
    st.info("👆 Upload a CSV or load a demo dataset to begin.")
    st.stop()

# Show data preview
with st.expander("🔍 Data Preview (first 10 rows)"):
    st.dataframe(df.head(10))

# === TARGET & FEATURE SELECTION ===
st.subheader("🎯 Select Target Variable")

# For demo datasets, preselect the generated target column instead of the
# first column so the default configuration is meaningful.
default_target_index = 0
if st.session_state.demo_loaded:
    demo_target = DEMO_TARGETS.get(st.session_state.task_type)
    if demo_target in df.columns:
        default_target_index = list(df.columns).index(demo_target)

target_col = st.selectbox("Target column (y):", df.columns, index=default_target_index)

# Detect the task from the selected target itself. Non-numeric targets can
# only be classified; numeric targets with few distinct values are treated as
# class labels, everything else as a regression target.
target_series = df[target_col]
target_is_numeric = pd.api.types.is_numeric_dtype(target_series)
if not target_is_numeric or target_series.nunique() <= MAX_NUMERIC_CLASSES:
    task_type = "classification"
else:
    task_type = "regression"

st.write(f"**Detected task:** `{task_type}`")

# Select features
feature_cols = [col for col in df.columns if col != target_col]
selected_features = st.multiselect(
    "Select features (X):",
    feature_cols,
    default=feature_cols,
)

if not selected_features:
    st.warning("⚠️ Please select at least one feature.")
    st.stop()

# Prepare data: the models used below cannot be relied on to accept NaN, so
# rows with a missing value in any selected column are dropped up front.
model_data = df[list(selected_features) + [target_col]].dropna()
dropped_rows = len(df) - len(model_data)
if dropped_rows:
    st.warning(f"⚠️ Dropped {dropped_rows} row(s) containing missing values.")
if len(model_data) < 2:
    st.error("Not enough complete rows to train a model.")
    st.stop()

X = model_data[selected_features].copy()
y = model_data[target_col].copy()

# Handle categorical variables
le_dict = {}
for col in X.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le

# Encode any non-numeric classification target so class names are available
# for the report, the confusion matrix and the prediction output.
if task_type == "classification" and not target_is_numeric:
    le_target = LabelEncoder()
    y = le_target.fit_transform(y.astype(str))
    class_names = le_target.classes_
else:
    class_names = None

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === MODEL SELECTION ===
st.subheader("⚙️ Choose Model")

if task_type == "classification":
    model_choice = st.selectbox("Model:", ["Random Forest Classifier", "Logistic Regression"])
    if model_choice == "Random Forest Classifier":
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    else:
        model = LogisticRegression(max_iter=1000, random_state=42)
else:
    model_choice = st.selectbox("Model:", ["Random Forest Regressor", "Linear Regression"])
    if model_choice == "Random Forest Regressor":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    else:
        model = LinearRegression()

# Train model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# === RESULTS ===
st.header("📈 Results")

if task_type == "classification":
    # Metrics (zero_division=0 keeps the default value but silences the
    # warning raised when a class is never predicted)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    st.subheader("📊 Classification Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Accuracy", f"{acc:.3f}")
    col2.metric("Precision", f"{prec:.3f}")
    col3.metric("Recall", f"{rec:.3f}")
    col4.metric("F1-Score", f"{f1:.3f}")

    # Use the full label set from all of y, not just the test split: a class
    # missing from the test split would otherwise make the number of rows in
    # the report / confusion matrix disagree with the number of names.
    if class_names is not None:
        label_ids = np.arange(len(class_names))
        report_names = [str(name) for name in class_names]
        axis_names = report_names
    else:
        label_ids = np.unique(y)
        report_names = None
        axis_names = [f"Class {label}" for label in label_ids]

    # Classification report
    with st.expander("📋 Detailed Classification Report"):
        report = classification_report(
            y_test,
            y_pred,
            labels=label_ids,
            target_names=report_names,
            output_dict=True,
            zero_division=0,
        )
        st.dataframe(pd.DataFrame(report).transpose())

    # Confusion Matrix
    st.subheader("🧩 Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    fig = px.imshow(
        cm,
        text_auto=True,
        labels=dict(x="Predicted", y="Actual"),
        x=axis_names,
        y=axis_names,
        title="Confusion Matrix",
    )
    st.plotly_chart(fig, use_container_width=True)

else:  # regression
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    st.subheader("📊 Regression Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("MAE", f"{mae:.2f}")
    col2.metric("MSE", f"{mse:.2f}")
    col3.metric("RMSE", f"{rmse:.2f}")
    col4.metric("R²", f"{r2:.3f}")

    # Prediction vs Actual plot
    st.subheader("📉 Predicted vs Actual")
    fig = px.scatter(
        x=y_test,
        y=y_pred,
        labels={"x": "Actual", "y": "Predicted"},
        title="Predicted vs Actual Values",
    )
    # Diagonal reference line: points on it are predicted exactly.
    fig.add_trace(
        go.Scatter(
            x=[y_test.min(), y_test.max()],
            y=[y_test.min(), y_test.max()],
            mode="lines",
            name="Ideal Fit",
            line=dict(dash="dash", color="red"),
        )
    )
    st.plotly_chart(fig, use_container_width=True)

# Feature Importance (for tree-based models)
if "Forest" in model_choice:
    st.subheader("🔑 Feature Importance")
    importance = model.feature_importances_
    feat_imp_df = pd.DataFrame({
        "Feature": selected_features,
        "Importance": importance,
    }).sort_values("Importance", ascending=False)

    fig = px.bar(
        feat_imp_df,
        x="Importance",
        y="Feature",
        orientation="h",
        title="Feature Importance",
    )
    st.plotly_chart(fig, use_container_width=True)

    with st.expander("📋 Feature Importance Table"):
        st.dataframe(feat_imp_df)

# === PREDICTION DEMO ===
st.header("🔮 Make a Prediction")
st.write("Enter values below to predict:")

input_data = {}
for feature in selected_features:
    widget_key = f"pred_{feature}"
    if feature in le_dict:  # Categorical
        # Offer exactly the values the encoder was fitted on, so the
        # transform below can never hit an unseen label.
        encoder = le_dict[feature]
        choice = st.selectbox(f"{feature}:", list(encoder.classes_), key=widget_key)
        input_data[feature] = encoder.transform([str(choice)])[0]
    else:  # Numeric
        # X holds only complete rows, so the median is never NaN.
        median_value = X[feature].median()
        if pd.api.types.is_integer_dtype(X[feature]):
            val = st.number_input(f"{feature}:", value=int(median_value), step=1, key=widget_key)
        else:
            val = st.number_input(f"{feature}:", value=float(median_value), step=0.1, key=widget_key)
        input_data[feature] = val

if st.button("🚀 Predict"):
    # Keep the column order identical to the order used for training.
    input_df = pd.DataFrame([input_data], columns=selected_features)
    prediction = model.predict(input_df)[0]
    if task_type == "classification" and class_names is not None:
        prediction = class_names[prediction]
    st.success(f"**Prediction:** `{prediction}`")

# Footer
st.markdown("---")
st.caption(f"© {AUTHOR} | License {LICENSE} | Contact: {EMAIL}")