Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,46 +1,26 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
from
|
| 4 |
-
from
|
| 5 |
-
from
|
| 6 |
-
from io import BytesIO
|
| 7 |
-
from utils.data_cleaning import handle_missing_values, remove_outliers_iqr, cap_extreme_values
|
| 8 |
-
from utils.model_training import encode_categorical, train_and_evaluate
|
| 9 |
-
from utils.visualizations import plot_correlation_heatmap, plot_model_performance, save_plot_as_png
|
| 10 |
-
# Streamlit app title
|
| 11 |
-
st.title("Model Training with Outlier Removal, Metrics, and Correlation Heatmap")
|
| 12 |
|
| 13 |
# File uploader
|
|
|
|
| 14 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
| 15 |
|
| 16 |
if uploaded_file is not None:
|
| 17 |
-
# Read the uploaded CSV file
|
| 18 |
df = pd.read_csv(uploaded_file)
|
| 19 |
|
| 20 |
-
#
|
| 21 |
st.write("Dataset:")
|
| 22 |
st.dataframe(df)
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
df =
|
| 26 |
-
|
| 27 |
-
# Handle missing values
|
| 28 |
-
st.write("Handling Missing (Null) Values:")
|
| 29 |
-
fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
|
| 30 |
-
df = handle_missing_values(df, method=fill_method)
|
| 31 |
-
|
| 32 |
-
# Remove outliers using the IQR method
|
| 33 |
-
st.write("Removing Outliers Using IQR:")
|
| 34 |
-
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
|
| 35 |
-
for col in numeric_cols:
|
| 36 |
-
df = remove_outliers_iqr(df, col)
|
| 37 |
-
|
| 38 |
-
# Capping Extreme Values (based on 5% and 95% percentiles)
|
| 39 |
-
st.write("Handling Extreme Values (Capping):")
|
| 40 |
df = cap_extreme_values(df)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
st.write("Dataset After Cleaning:")
|
| 44 |
st.dataframe(df)
|
| 45 |
|
| 46 |
# Add clean data download option
|
|
@@ -66,40 +46,18 @@ if uploaded_file is not None:
|
|
| 66 |
mime="image/png"
|
| 67 |
)
|
| 68 |
|
| 69 |
-
#
|
| 70 |
target = st.selectbox("Select Target Variable", df.columns)
|
| 71 |
features = [col for col in df.columns if col != target]
|
| 72 |
X = df[features]
|
| 73 |
y = df[target]
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
if len(y.unique())
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
|
| 81 |
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
|
| 82 |
-
|
| 83 |
-
metrics = train_and_evaluate(X_train, X_test, y_train, y_test, model_type=model_type)
|
| 84 |
-
|
| 85 |
-
# Displaying model performance metrics
|
| 86 |
-
metrics_df = pd.DataFrame(metrics)
|
| 87 |
-
st.subheader(f"{model_type.title()} Model Performance Metrics")
|
| 88 |
st.dataframe(metrics_df)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
st.pyplot(performance_graph)
|
| 94 |
-
|
| 95 |
-
# Save performance graph as PNG
|
| 96 |
-
performance_graph_buf = save_plot_as_png(performance_graph)
|
| 97 |
-
st.download_button(
|
| 98 |
-
label=f"Download {model_type.title()} Performance Graph as PNG",
|
| 99 |
-
data=performance_graph_buf,
|
| 100 |
-
file_name=f"{model_type}_performance_graph.png",
|
| 101 |
-
mime="image/png"
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
else:
|
| 105 |
-
st.error("The target variable must contain at least two unique values for classification or regression.")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
+
from data_cleaning import handle_missing_values, remove_outliers_iqr, cap_extreme_values
|
| 4 |
+
from model_training import train_classification_model, train_regression_model
|
| 5 |
+
from visualization import plot_correlation_heatmap, save_plot_as_png
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# File uploader
|
| 8 |
+
st.title("Model Training with Metrics and Correlation Heatmap")
|
| 9 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
| 10 |
|
| 11 |
if uploaded_file is not None:
|
|
|
|
| 12 |
df = pd.read_csv(uploaded_file)
|
| 13 |
|
| 14 |
+
# Show the dataset
|
| 15 |
st.write("Dataset:")
|
| 16 |
st.dataframe(df)
|
| 17 |
|
| 18 |
+
# Clean data: Missing values, outliers, and extreme values
|
| 19 |
+
df = handle_missing_values(df)
|
| 20 |
+
df = remove_outliers_iqr(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
df = cap_extreme_values(df)
|
| 22 |
|
| 23 |
+
st.write("Cleaned Dataset:")
|
|
|
|
| 24 |
st.dataframe(df)
|
| 25 |
|
| 26 |
# Add clean data download option
|
|
|
|
| 46 |
mime="image/png"
|
| 47 |
)
|
| 48 |
|
| 49 |
+
# Target and features selection
|
| 50 |
target = st.selectbox("Select Target Variable", df.columns)
|
| 51 |
features = [col for col in df.columns if col != target]
|
| 52 |
X = df[features]
|
| 53 |
y = df[target]
|
| 54 |
|
| 55 |
+
# Train and evaluate models
|
| 56 |
+
if y.dtype == 'object' or len(y.unique()) <= 10: # Classification
|
| 57 |
+
st.subheader("Classification Model Training")
|
| 58 |
+
metrics_df = train_classification_model(X, y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
st.dataframe(metrics_df)
|
| 60 |
+
else: # Regression
|
| 61 |
+
st.subheader("Regression Model Training")
|
| 62 |
+
regression_metrics_df = train_regression_model(X, y)
|
| 63 |
+
st.dataframe(regression_metrics_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|