Spaces:
Sleeping
Sleeping
File size: 8,454 Bytes
fc84ade 063744e 5cb3e7f 1f9976d bf2f644 5cb3e7f 32533be 5cb3e7f 1f9976d 9369add 0e899e9 9369add 43d6671 1f9976d 9369add 1f9976d 9369add d6bf5be fc84ade 9369add c38654f 9369add 34b1335 1531532 9369add 46016e0 9369add 34b1335 67c4e86 063744e 5cb3e7f fc84ade 67c4e86 5cb3e7f 1f9976d 67c4e86 34b1335 a89946b 32533be a89946b 9369add 43d6671 9369add 1f9976d 9369add |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io
# Import custom functions from your utils
from utils.data_cleaning import preprocess_data, remove_outliers_iqr, cap_extreme_values, convert_string_to_numeric
from utils.model_training import train_all_models
# New Function: Combined Histogram and Bar Plot Comparison
def combined_histogram_barplot(df):
    """
    Draw one stacked subplot per column: histograms first, then bar plots.

    Columns of dtype ``float64``/``int64`` get a 20-bin histogram; columns of
    dtype ``object`` get a bar plot of their value counts. Columns of any
    other dtype are not plotted.

    Args:
        df: pandas DataFrame to visualize.

    Returns:
        The matplotlib Figure containing the subplots. If no column
        qualifies, a figure with a single placeholder message is returned.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    total_plots = len(numeric_columns) + len(categorical_columns)

    if total_plots == 0:
        # plt.subplots(0, 1) raises ValueError, so return a placeholder
        # figure instead of failing when there is nothing to plot.
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.text(
            0.5, 0.5, "No numeric or categorical columns to plot",
            ha="center", va="center", transform=ax.transAxes,
        )
        ax.set_axis_off()
        return fig

    # squeeze=False guarantees an array of Axes even when total_plots == 1;
    # with the default squeeze=True a single Axes object is returned and
    # indexing it (axes[i]) fails.
    fig, axes = plt.subplots(
        total_plots, 1, figsize=(10, 5 * total_plots), squeeze=False
    )
    axes = axes.ravel()

    # Histogram for numeric columns
    for i, col in enumerate(numeric_columns):
        # Drop NaNs: hist() cannot auto-detect a finite range when they are present.
        axes[i].hist(df[col].dropna(), bins=20, color='blue', alpha=0.7, edgecolor='black')
        axes[i].set_title(f"Histogram of {col}")
        axes[i].set_xlabel(col)
        axes[i].set_ylabel("Frequency")

    # Bar plots for categorical columns, placed after the numeric subplots
    for j, col in enumerate(categorical_columns, start=len(numeric_columns)):
        df[col].value_counts().plot(kind='bar', ax=axes[j], color='orange', alpha=0.7, edgecolor='black')
        axes[j].set_title(f"Bar Plot of {col}")
        axes[j].set_xlabel(col)
        axes[j].set_ylabel("Count")

    # Lay out this specific figure rather than whatever pyplot considers current.
    fig.tight_layout()
    return fig
# Plotting Functions
def plot_correlation_heatmap(df):
    """
    Plot a correlation heatmap for the numeric columns in the dataframe.

    Args:
        df: pandas DataFrame; non-numeric columns are ignored.

    Returns:
        The matplotlib Figure containing the annotated heatmap.
    """
    # Select numeric columns explicitly: DataFrame.corr() raises on string
    # columns in pandas >= 2.0, where numeric_only defaults to False.
    corr = df.select_dtypes(include="number").corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    # Draw on the explicit Axes so the result does not depend on pyplot's
    # notion of the "current" figure.
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title("Correlation Heatmap")
    return fig
def save_figure_as_png(fig):
    """
    Render ``fig`` as PNG into an in-memory buffer.

    Args:
        fig: object exposing ``savefig(file, format=...)``.

    Returns:
        An ``io.BytesIO`` holding the PNG bytes, rewound to position 0 so it
        can be read from the start.
    """
    png_stream = io.BytesIO()
    fig.savefig(png_stream, format="png")
    # Rewind so consumers read from the first byte.
    png_stream.seek(0)
    return png_stream
def plot_histogram(df, column):
    """
    Draw a 30-bin histogram with a KDE overlay for one column.

    Args:
        df: DataFrame holding the data.
        column: name of the column to plot.

    Returns:
        The matplotlib Figure the histogram was drawn on.
    """
    fig = plt.figure(figsize=(8, 6))
    values = df[column]
    # histplot draws on the current Axes of the figure just created.
    ax = sns.histplot(values, kde=True, bins=30, color="skyblue")
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    return fig
def plot_box_plot(df, column):
    """
    Draw a horizontal box plot for one column.

    Args:
        df: DataFrame holding the data.
        column: name of the column to plot.

    Returns:
        The matplotlib Figure the box plot was drawn on.
    """
    fig = plt.figure(figsize=(8, 6))
    values = df[column]
    ax = sns.boxplot(x=values)
    ax.set_title(f"Box Plot of {column}")
    return fig
def plot_pair_plot(df):
    """
    Build a seaborn pair plot over the ``float64``/``int64`` columns.

    Args:
        df: DataFrame holding the data.

    Returns:
        Whatever ``sns.pairplot`` returns for the numeric subset of ``df``.
    """
    numeric_only = df.select_dtypes(include=['float64', 'int64'])
    return sns.pairplot(numeric_only)
def plot_scatter_plot(df, x_col, y_col):
    """
    Draw a scatter plot of one column against another.

    Args:
        df: DataFrame holding the data.
        x_col: name of the column plotted on the x-axis.
        y_col: name of the column plotted on the y-axis.

    Returns:
        The matplotlib Figure the scatter plot was drawn on.
    """
    fig = plt.figure(figsize=(8, 6))
    xs, ys = df[x_col], df[y_col]
    ax = sns.scatterplot(x=xs, y=ys, color="green")
    ax.set_title(f"Scatter Plot between {x_col} and {y_col}")
    return fig
def plot_bar_plot(df, column):
    """
    Draw a count plot (one bar per distinct value) for one column.

    Args:
        df: DataFrame holding the data.
        column: name of the column to plot.

    Returns:
        The matplotlib Figure the bar plot was drawn on.
    """
    fig = plt.figure(figsize=(8, 6))
    values = df[column]
    ax = sns.countplot(x=values)
    ax.set_title(f"Bar Plot of {column}")
    return fig
# Streamlit app: upload a CSV, clean it, visualize it, then train models on it.
st.title("Data Analysis, Model Training, and Visualization")

# File Uploader
uploaded_file = st.file_uploader("Upload a CSV file for data analysis", type=["csv"])

if uploaded_file is not None:
    try:
        # Load the dataset inside the try block so a malformed or empty CSV is
        # reported through st.error instead of an uncaught traceback.
        df = pd.read_csv(uploaded_file)
        st.write("### Dataset Preview")
        st.dataframe(df)

        # Data Cleaning
        st.subheader("Data Cleaning")
        st.write("Handling missing values, removing outliers, and capping extreme values...")
        df_cleaned = preprocess_data(df)
        df_cleaned = remove_outliers_iqr(df_cleaned)
        df_cleaned = cap_extreme_values(df_cleaned)

        # Convert string columns to numeric (if any)
        st.subheader("String to Numeric Conversion")
        st.write("Converting string categorical values to numeric using Label Encoding...")
        df_cleaned = convert_string_to_numeric(df_cleaned)
        st.write("### Cleaned Dataset")
        st.dataframe(df_cleaned)

        # Download option for cleaned dataset
        st.download_button(
            label="Download Cleaned Dataset (CSV)",
            data=df_cleaned.to_csv(index=False),
            file_name="cleaned_dataset.csv",
            mime="text/csv"
        )

        # Correlation Heatmap
        st.subheader("Correlation Heatmap")
        st.write("Visualizing correlations between numeric features...")
        heatmap_fig = plot_correlation_heatmap(df_cleaned)
        st.pyplot(heatmap_fig)
        # Capture the PNG bytes before releasing the figure.
        heatmap_buffer = save_figure_as_png(heatmap_fig)
        # Streamlit re-runs this script on every interaction; figures created
        # through pyplot stay registered until closed, so close each one once
        # it has been rendered to avoid unbounded growth.
        plt.close(heatmap_fig)
        st.download_button(
            label="Download Correlation Heatmap (PNG)",
            data=heatmap_buffer,
            file_name="correlation_heatmap.png",
            mime="image/png"
        )

        # Additional Visualizations
        st.subheader("Additional Visualizations")
        numeric_columns = df_cleaned.select_dtypes(include=['float64', 'int64']).columns.tolist()
        categorical_columns = df_cleaned.select_dtypes(include=['object']).columns.tolist()

        # Combined Histogram and Bar Plot
        st.subheader("Combined Histogram and Bar Plot")
        combined_plot = combined_histogram_barplot(df_cleaned)
        st.pyplot(combined_plot)
        plt.close(combined_plot)

        # Distribution Plot
        if numeric_columns:
            st.write("### Distribution Plots (Histograms)")
            for col in numeric_columns:
                st.write(f"#### {col}")
                hist_plot = plot_histogram(df_cleaned, col)
                st.pyplot(hist_plot)
                plt.close(hist_plot)

        # Box Plot
        if numeric_columns:
            st.write("### Box Plots (Outlier Detection)")
            for col in numeric_columns:
                st.write(f"#### {col}")
                box_plot = plot_box_plot(df_cleaned, col)
                st.pyplot(box_plot)
                plt.close(box_plot)

        # Pair Plot
        if len(numeric_columns) > 1:
            st.write("### Pair Plot")
            pair_plot = plot_pair_plot(df_cleaned)
            st.pyplot(pair_plot)
            # The pair plot is a seaborn grid object; its underlying Figure is
            # exposed as `.figure` (`.fig` on older seaborn releases).
            pair_fig = getattr(pair_plot, "figure", None)
            if pair_fig is None:
                pair_fig = pair_plot.fig
            plt.close(pair_fig)

        # Scatter Plot
        if len(numeric_columns) > 1:
            st.write("### Scatter Plot")
            x_col = st.selectbox("Select X-axis for Scatter Plot", numeric_columns)
            # Default the Y-axis to the second numeric column so the initial
            # plot is not a column against itself; the guard above guarantees
            # that index 1 exists.
            y_col = st.selectbox("Select Y-axis for Scatter Plot", numeric_columns, index=1)
            if x_col and y_col:
                scatter_plot = plot_scatter_plot(df_cleaned, x_col, y_col)
                st.pyplot(scatter_plot)
                plt.close(scatter_plot)

        # Bar Plot
        if categorical_columns:
            st.write("### Bar Plots (For Categorical Data)")
            for col in categorical_columns:
                st.write(f"#### {col}")
                bar_plot = plot_bar_plot(df_cleaned, col)
                st.pyplot(bar_plot)
                plt.close(bar_plot)

        # Select Target and Features
        st.subheader("Feature and Target Selection")
        target = st.selectbox("Select Target Variable", df_cleaned.columns)
        # Every column other than the chosen target is used as a feature.
        features = [col for col in df_cleaned.columns if col != target]
        if not features:
            st.warning("No features available after removing the target variable.")
        else:
            X = df_cleaned[features]
            y = df_cleaned[target]

            # Train and Evaluate Models
            st.subheader("Model Training and Evaluation")
            st.write("Training models and calculating metrics...")
            model_results = train_all_models(X, y)
            st.write("### Model Training Results")
            st.dataframe(model_results)

            # Download option for model results
            st.download_button(
                label="Download Model Results (CSV)",
                data=model_results.to_csv(index=False),
                file_name="model_results.csv",
                mime="text/csv"
            )
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user rather than
        # crashing the app.
        st.error(f"An error occurred: {e}")
else:
    st.info("Please upload a CSV file to proceed.")
|