File size: 10,133 Bytes
5498214
94bd0c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5498214
94bd0c4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import io

# App metadata, interpolated into the header line below.
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# Page-level settings, gathered in one mapping and applied in a single call.
_page_settings = {
    "page_title": "SimpleML",
    "page_icon": "🤖",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Header: app title, attribution line, and a short usage hint.
_attribution = f"**Author:** {AUTHOR} | **Email:** {EMAIL} | **License:** {LICENSE}"
_intro_text = """
Upload a CSV or use the demo dataset to train a machine learning model (classification or regression) in seconds.
"""
st.title("🤖 SimpleML")
st.markdown(_attribution)
st.write(_intro_text)

# === GENERATE DEMO DATASET ===
@st.cache_data
def create_demo_data(task="classification"):
    """Build a reproducible 500-row synthetic dataset for the demo buttons.

    Args:
        task: ``"classification"`` adds a binary ``Purchase`` target;
            ``"regression"`` adds a continuous ``Salary`` target.

    Returns:
        A DataFrame with the feature columns ``Age``, ``Income``,
        ``Experience``, ``Education_Level`` and ``City`` plus the
        task-specific target column.

    Raises:
        ValueError: If ``task`` is neither ``"classification"`` nor
            ``"regression"`` (previously this silently returned ``None``).
    """
    if task not in ("classification", "regression"):
        raise ValueError(
            f"Unknown demo task {task!r}; expected 'classification' or 'regression'."
        )

    # A private generator seeded with 42 draws the same numbers as
    # np.random.seed(42) followed by np.random.* calls, but it does not
    # reseed the process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(42)
    n = 500
    # Draw order matters for reproducibility: keep the columns in this order.
    data = {
        "Age": rng.normal(35, 12, n).astype(int),
        "Income": rng.normal(45000, 15000, n),
        "Experience": rng.randint(0, 20, n),
        "Education_Level": rng.choice(["High School", "Bachelor", "Master", "PhD"], n),
        "City": rng.choice(["Madrid", "Barcelona", "Valencia", "Seville"], n),
    }
    df = pd.DataFrame(data)

    if task == "classification":
        # Binary target: base purchase probability of 0.3, raised by
        # above-median income, long experience and higher degrees.
        purchase_prob = (
            0.3 +
            (df["Income"] > df["Income"].median()) * 0.4 +
            (df["Experience"] > 10) * 0.2 +
            (df["Education_Level"] == "Master") * 0.1 +
            (df["Education_Level"] == "PhD") * 0.15
        )
        # The summed bonuses can exceed 1, so clip to a valid probability.
        df["Purchase"] = rng.binomial(1, np.clip(purchase_prob, 0, 1), n)
        return df

    # task == "regression": continuous target built as a linear combination
    # of the features plus Gaussian noise (standard deviation 5000).
    df["Salary"] = (
        25000 +
        df["Experience"] * 1500 +
        (df["Income"] / 100) +
        (df["Age"] * 100) +
        (df["Education_Level"] == "Master") * 8000 +
        (df["Education_Level"] == "PhD") * 15000 +
        rng.normal(0, 5000, n)
    )
    return df

# === LOAD DATA ===
# Session defaults are created once, the first time the key is missing.
if "demo_loaded" not in st.session_state:
    st.session_state.demo_loaded = False
    st.session_state.task_type = "classification"

# One button per demo flavour: (task for create_demo_data, label, message).
for demo_task, button_label, loaded_message in (
    ("classification", "🧪 Load Classification Demo Dataset", "✅ Classification demo loaded!"),
    ("regression", "🧪 Load Regression Demo Dataset", "✅ Regression demo loaded!"),
):
    if st.button(button_label):
        st.session_state.demo_loaded = True
        st.session_state.task_type = demo_task
        st.session_state.df = create_demo_data(demo_task)
        st.success(loaded_message)

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

# Use demo or uploaded file.
# NOTE: an uploaded file is checked first, so while one is present it
# overwrites st.session_state.df even right after a demo button was clicked.
if uploaded_file:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # User-supplied files may be malformed; report it instead of crashing.
        st.error(f"❌ Could not read the uploaded CSV: {exc}")
        st.stop()
    if df.empty:
        st.error("❌ The uploaded CSV contains no data rows.")
        st.stop()
    st.session_state.df = df
    st.session_state.demo_loaded = False
    st.success("✅ File uploaded successfully.")
elif "df" in st.session_state:
    # Reuse the dataset stored by an earlier button click or upload.
    df = st.session_state.df
    task_type = st.session_state.task_type
    if st.session_state.demo_loaded:
        st.info(f"Using **{task_type}** demo dataset.")
else:
    # Nothing to work with yet: show a hint and halt this script run.
    df = None
    st.info("👆 Upload a CSV or load a demo dataset to begin.")
    st.stop()

# Show data preview
with st.expander("🔍 Data Preview (first 10 rows)"):
    st.dataframe(df.head(10))

# === TARGET & FEATURE SELECTION ===
st.subheader("🎯 Select Target Variable")
target_col = st.selectbox("Target column (y):", df.columns)

# Infer the task from the selected target column itself. The stored demo
# task type is deliberately not trusted here: the user may pick any column
# as target, and e.g. a continuous float column sent to a classifier (or a
# text column sent to a regressor) cannot be fitted.
target_series = df[target_col]
if not pd.api.types.is_numeric_dtype(target_series):
    # Text / categorical targets can only be treated as class labels.
    task_type = "classification"
elif target_series.nunique() <= 10:
    # Numeric target with few distinct values: treat them as class labels.
    task_type = "classification"
else:
    task_type = "regression"

st.write(f"**Detected task:** `{task_type}`")

# Select features: every column except the target is offered and preselected.
feature_cols = [col for col in df.columns if col != target_col]
selected_features = st.multiselect(
    "Select features (X):",
    feature_cols,
    default=feature_cols
)

if not selected_features:
    st.warning("⚠️ Please select at least one feature.")
    st.stop()

# Prepare data. Rows without a target value cannot be used for training or
# scoring, so they are left out of both X and y.
has_target = target_series.notna()
X = df.loc[has_target, selected_features].copy()
y = df.loc[has_target, target_col].copy()

if len(X) < 2:
    st.error("❌ Not enough rows with a target value to train and evaluate a model.")
    st.stop()

# Handle categorical variables: integer-encode each text/category feature.
# Encoders are fitted on the full column of df so that every value visible
# in the dataset can be transformed later, and are kept in le_dict by name.
le_dict = {}
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    le.fit(df[col].astype(str))
    X[col] = le.transform(X[col].astype(str))
    le_dict[col] = le

# Fill gaps in numeric features with the column median so that estimators
# which reject NaN values can still be trained.
numeric_cols = X.select_dtypes(include="number").columns
if len(numeric_cols) and X[numeric_cols].isna().any().any():
    X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

# Non-numeric class labels are encoded to integers; class_names keeps the
# original labels (None when the target was used as-is).
if task_type == "classification" and not pd.api.types.is_numeric_dtype(y):
    le_target = LabelEncoder()
    y = le_target.fit_transform(y.astype(str))
    class_names = le_target.classes_
else:
    class_names = None

# Train/test split: 80/20 with a fixed seed for repeatable results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === MODEL SELECTION ===
st.subheader("⚙️ Choose Model")

# Display name -> factory for the estimators offered for the current task.
# Factories defer construction so only the chosen estimator is instantiated.
if task_type == "classification":
    available_models = {
        "Random Forest Classifier": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        "Logistic Regression": lambda: LogisticRegression(max_iter=1000, random_state=42),
    }
else:
    available_models = {
        "Random Forest Regressor": lambda: RandomForestRegressor(n_estimators=100, random_state=42),
        "Linear Regression": lambda: LinearRegression(),
    }

model_choice = st.selectbox("Model:", list(available_models))
model = available_models[model_choice]()

# Fit on the training split, then predict the held-out split for evaluation.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# === RESULTS ===
st.header("📈 Results")

if task_type == "classification":
    # Pin the label set explicitly. Without it, a class that is missing from
    # the test split makes the number of target names disagree with the
    # labels found in y_test/y_pred, and the report and matrix axes break.
    if class_names is not None:
        # Encoded targets are the integers 0..k-1, in class_names order.
        label_ids = np.arange(len(class_names))
        report_names = [str(name) for name in class_names]
        axis_names = report_names
    else:
        label_ids = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
        report_names = [str(label) for label in label_ids]
        # Show the actual label value rather than its position.
        axis_names = [f"Class {label}" for label in label_ids]

    # Metrics (weighted by class support). zero_division=0 scores classes
    # that were never predicted as 0 instead of emitting warnings.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    st.subheader("📊 Classification Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Accuracy", f"{acc:.3f}")
    col2.metric("Precision", f"{prec:.3f}")
    col3.metric("Recall", f"{rec:.3f}")
    col4.metric("F1-Score", f"{f1:.3f}")

    # Classification report
    with st.expander("📋 Detailed Classification Report"):
        report = classification_report(
            y_test,
            y_pred,
            labels=label_ids,
            target_names=report_names,
            output_dict=True,
            zero_division=0,
        )
        st.dataframe(pd.DataFrame(report).transpose())

    # Confusion Matrix (rows: actual, columns: predicted), using the same
    # label set as the axis names so both always have matching lengths.
    st.subheader("🧩 Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    fig = px.imshow(
        cm,
        text_auto=True,
        labels=dict(x="Predicted", y="Actual"),
        x=axis_names,
        y=axis_names,
        title="Confusion Matrix"
    )
    st.plotly_chart(fig, use_container_width=True)

else:  # regression
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    st.subheader("📊 Regression Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("MAE", f"{mae:.2f}")
    col2.metric("MSE", f"{mse:.2f}")
    col3.metric("RMSE", f"{rmse:.2f}")
    col4.metric("R²", f"{r2:.3f}")

    # Prediction vs Actual plot, with the y = x diagonal as the reference
    # line on which perfect predictions would fall.
    st.subheader("📉 Predicted vs Actual")
    fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual', 'y': 'Predicted'}, title="Predicted vs Actual Values")
    fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()],
                             mode='lines', name='Ideal Fit', line=dict(dash='dash', color='red')))
    st.plotly_chart(fig, use_container_width=True)

# Feature Importance: shown only when a "Forest" model was chosen.
if "Forest" in model_choice:
    st.subheader("🔑 Feature Importance")

    # Pair each selected feature with its importance score, highest first.
    ranking = pd.DataFrame(
        {'Feature': selected_features, 'Importance': model.feature_importances_}
    )
    ranking = ranking.sort_values('Importance', ascending=False)

    importance_chart = px.bar(
        ranking,
        x='Importance',
        y='Feature',
        orientation='h',
        title="Feature Importance",
    )
    st.plotly_chart(importance_chart, use_container_width=True)

    # The same ranking as a table, collapsed by default.
    with st.expander("📋 Feature Importance Table"):
        st.dataframe(ranking)

# === PREDICTION DEMO ===
st.header("🔮 Make a Prediction")

st.write("Enter values below to predict:")

# One input widget per selected feature; values are collected by feature name.
input_data = {}
for feature in selected_features:
    field_label = f"{feature}:"
    widget_key = f"pred_{feature}"

    if feature in le_dict:
        # Encoded feature: let the user pick an original value, then map it
        # through the encoder stored for this column.
        options = df[feature].dropna().unique()
        picked = st.selectbox(field_label, options, key=widget_key)
        input_data[feature] = le_dict[feature].transform([str(picked)])[0]
        continue

    # Numeric feature: default to the column median; integer columns step
    # by 1, everything else by 0.1.
    column_median = df[feature].median()
    if df[feature].dtype in [np.int64, np.int32]:
        default_value, increment = int(column_median), 1
    else:
        default_value, increment = float(column_median), 0.1
    input_data[feature] = st.number_input(
        field_label, value=default_value, step=increment, key=widget_key
    )

if st.button("🚀 Predict"):
    # Single-row frame built from the collected inputs.
    single_row = pd.DataFrame([input_data])
    predicted = model.predict(single_row)[0]
    # Map an encoded class index back to its original label when available.
    if task_type == "classification" and class_names is not None:
        predicted = class_names[predicted]
    st.success(f"**Prediction:** `{predicted}`")

# Footer: horizontal rule followed by a one-line credit caption.
_footer_text = " | ".join((f"© {AUTHOR}", f"License {LICENSE}", f"Contact: {EMAIL}"))
st.markdown("---")
st.caption(_footer_text)