Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -29,24 +29,23 @@ if uploaded_file is not None:
|
|
| 29 |
for column in df.select_dtypes(include=['object']).columns:
|
| 30 |
if df[column].nunique() < 10: # If the column has fewer than 10 unique values, encode it
|
| 31 |
df[column] = le.fit_transform(df[column].astype(str))
|
| 32 |
-
|
| 33 |
# Handle missing values (impute numerical columns with median and categorical columns with mode)
|
| 34 |
-
# Handle categorical columns
|
| 35 |
categorical_columns = df.select_dtypes(include=['object']).columns
|
| 36 |
if len(categorical_columns) > 0:
|
| 37 |
imputer = SimpleImputer(strategy='most_frequent')
|
| 38 |
df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
|
| 39 |
-
|
| 40 |
# Handle numerical columns
|
| 41 |
numerical_columns = df.select_dtypes(include=['number']).columns
|
| 42 |
if len(numerical_columns) > 0:
|
| 43 |
imputer = SimpleImputer(strategy='median')
|
| 44 |
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
|
| 45 |
-
|
| 46 |
# Remove outliers (using z-score method)
|
| 47 |
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
|
| 48 |
-
df = df[(z_scores < 3).all(axis=1)]
|
| 49 |
-
|
| 50 |
# Normalize numerical data
|
| 51 |
scaler = StandardScaler()
|
| 52 |
df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
|
|
@@ -57,16 +56,16 @@ if uploaded_file is not None:
|
|
| 57 |
# Ensure that all columns are numeric before using them in models
|
| 58 |
for column in df.select_dtypes(include=['object']).columns:
|
| 59 |
df[column] = pd.to_numeric(df[column], errors='coerce')
|
| 60 |
-
|
| 61 |
return df
|
| 62 |
|
| 63 |
# Apply the clean_dataset function
|
| 64 |
df_cleaned = clean_dataset(df)
|
| 65 |
-
|
| 66 |
# Show the cleaned dataset
|
| 67 |
st.write("Cleaned Dataset:")
|
| 68 |
st.dataframe(df_cleaned)
|
| 69 |
-
|
| 70 |
# Model Training Section
|
| 71 |
st.subheader("Model Training")
|
| 72 |
if df_cleaned.empty:
|
|
@@ -76,10 +75,10 @@ if uploaded_file is not None:
|
|
| 76 |
features = [col for col in df_cleaned.columns if col != target]
|
| 77 |
X = df_cleaned[features]
|
| 78 |
y = df_cleaned[target]
|
| 79 |
-
|
| 80 |
# Determine if the target is continuous or categorical
|
| 81 |
is_classification = len(y.unique()) <= 10 # If the target has 10 or fewer unique values, treat it as classification
|
| 82 |
-
|
| 83 |
# Ensure there is enough data before proceeding with train-test split
|
| 84 |
if len(X) == 0 or len(y) == 0:
|
| 85 |
st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
|
|
@@ -87,7 +86,7 @@ if uploaded_file is not None:
|
|
| 87 |
# Split the data into training and test sets with customizable training size
|
| 88 |
train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
|
| 89 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
|
| 90 |
-
|
| 91 |
# Store results in a list
|
| 92 |
results = []
|
| 93 |
|
|
@@ -122,8 +121,7 @@ if uploaded_file is not None:
|
|
| 122 |
model.fit(X_train, y_train)
|
| 123 |
y_pred = model.predict(X_test)
|
| 124 |
mse = mean_squared_error(y_test, y_pred)
|
| 125 |
-
|
| 126 |
-
results.append([name, accuracy, mse])
|
| 127 |
|
| 128 |
# Display results in a table
|
| 129 |
st.subheader("Model Performance Results")
|
|
|
|
| 29 |
for column in df.select_dtypes(include=['object']).columns:
|
| 30 |
if df[column].nunique() < 10: # If the column has fewer than 10 unique values, encode it
|
| 31 |
df[column] = le.fit_transform(df[column].astype(str))
|
| 32 |
+
|
| 33 |
# Handle missing values (impute numerical columns with median and categorical columns with mode)
|
|
|
|
| 34 |
categorical_columns = df.select_dtypes(include=['object']).columns
|
| 35 |
if len(categorical_columns) > 0:
|
| 36 |
imputer = SimpleImputer(strategy='most_frequent')
|
| 37 |
df[categorical_columns] = imputer.fit_transform(df[categorical_columns])
|
| 38 |
+
|
| 39 |
# Handle numerical columns
|
| 40 |
numerical_columns = df.select_dtypes(include=['number']).columns
|
| 41 |
if len(numerical_columns) > 0:
|
| 42 |
imputer = SimpleImputer(strategy='median')
|
| 43 |
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
|
| 44 |
+
|
| 45 |
# Remove outliers (using z-score method)
|
| 46 |
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))
|
| 47 |
+
df = df[(z_scores < 3).all(axis=1)]
|
| 48 |
+
|
| 49 |
# Normalize numerical data
|
| 50 |
scaler = StandardScaler()
|
| 51 |
df[df.select_dtypes(include=['number']).columns] = scaler.fit_transform(df.select_dtypes(include=['number']))
|
|
|
|
| 56 |
# Ensure that all columns are numeric before using them in models
|
| 57 |
for column in df.select_dtypes(include=['object']).columns:
|
| 58 |
df[column] = pd.to_numeric(df[column], errors='coerce')
|
| 59 |
+
|
| 60 |
return df
|
| 61 |
|
| 62 |
# Apply the clean_dataset function
|
| 63 |
df_cleaned = clean_dataset(df)
|
| 64 |
+
|
| 65 |
# Show the cleaned dataset
|
| 66 |
st.write("Cleaned Dataset:")
|
| 67 |
st.dataframe(df_cleaned)
|
| 68 |
+
|
| 69 |
# Model Training Section
|
| 70 |
st.subheader("Model Training")
|
| 71 |
if df_cleaned.empty:
|
|
|
|
| 75 |
features = [col for col in df_cleaned.columns if col != target]
|
| 76 |
X = df_cleaned[features]
|
| 77 |
y = df_cleaned[target]
|
| 78 |
+
|
| 79 |
# Determine if the target is continuous or categorical
|
| 80 |
is_classification = len(y.unique()) <= 10 # If the target has 10 or fewer unique values, treat it as classification
|
| 81 |
+
|
| 82 |
# Ensure there is enough data before proceeding with train-test split
|
| 83 |
if len(X) == 0 or len(y) == 0:
|
| 84 |
st.warning("Insufficient data after cleaning. Please adjust the cleaning parameters.")
|
|
|
|
| 86 |
# Split the data into training and test sets with customizable training size
|
| 87 |
train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
|
| 88 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
|
| 89 |
+
|
| 90 |
# Store results in a list
|
| 91 |
results = []
|
| 92 |
|
|
|
|
| 121 |
model.fit(X_train, y_train)
|
| 122 |
y_pred = model.predict(X_test)
|
| 123 |
mse = mean_squared_error(y_test, y_pred)
|
| 124 |
+
results.append([name, None, mse])
|
|
|
|
| 125 |
|
| 126 |
# Display results in a table
|
| 127 |
st.subheader("Model Performance Results")
|