Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -128,7 +128,7 @@ def add_footer():
|
|
| 128 |
"""
|
| 129 |
<footer>
|
| 130 |
Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
|
| 131 |
-
Inspired by the project "Predicta" by
|
| 132 |
</footer>
|
| 133 |
""",
|
| 134 |
unsafe_allow_html=True
|
|
@@ -146,7 +146,7 @@ def add_sidebar():
|
|
| 146 |
"DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
|
| 147 |
)
|
| 148 |
st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
|
| 149 |
-
st.sidebar.write("**Inspired by:** Predicta by
|
| 150 |
st.sidebar.markdown("---")
|
| 151 |
st.sidebar.write("**Your**")
|
| 152 |
st.sidebar.image(
|
|
@@ -367,45 +367,427 @@ def download_image(fig, key_prefix):
|
|
| 367 |
# Data processing functions
|
| 368 |
def dataset_overview(df):
|
| 369 |
st.subheader("Dataset Overview")
|
| 370 |
-
st.
|
| 371 |
-
st.write("
|
| 372 |
-
st.write(df.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
def clean_data(df):
|
| 375 |
st.subheader("Clean Data")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
cleaned_df = df.dropna().drop_duplicates()
|
| 377 |
-
st.write("Cleaned Dataset:
|
|
|
|
| 378 |
return cleaned_df
|
| 379 |
|
| 380 |
def detect_outlier(df):
|
| 381 |
st.subheader("Detect Outliers")
|
| 382 |
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
for col in numerical_cols:
|
| 384 |
Q1, Q3 = df[col].quantile([0.25, 0.75])
|
| 385 |
IQR = Q3 - Q1
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
def encoder(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
st.subheader("Encode Data")
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
encoded_df
|
| 396 |
-
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
|
|
|
| 399 |
def data_transformer(df):
    """
    Render the "Data Transformer" section and hand back a copy of the dataset.

    No transformation is implemented yet: the function copies ``df``, shows the
    first rows of that copy, and returns the copy.

    Args:
        df (pd.DataFrame): Dataset to transform.

    Returns:
        pd.DataFrame: A copy of ``df``.
    """
    st.subheader("Data Transformer")
    # Placeholder for future transformations; the copy keeps the caller's frame separate.
    result = df.copy()
    preview = result.head()
    st.write("Transformed Dataset:", preview)
    return result
|
| 404 |
|
|
|
|
| 405 |
def data_analysis(df):
    """
    Render the "Data Analysis" section with the descriptive statistics of ``df``.

    Args:
        df (pd.DataFrame): Dataset to summarise.
    """
    st.subheader("Data Analysis")
    summary = df.describe()
    st.write(summary)
|
| 408 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
def feature_importance_analyzer(df):
|
| 410 |
st.subheader("Feature Importance Analyzer")
|
| 411 |
target_column = st.selectbox("Select Target Column", df.columns)
|
|
|
|
| 128 |
"""
|
| 129 |
<footer>
|
| 130 |
Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
|
| 131 |
+
Inspired by the project "Predicta" by <a href="https://github.com/ahammadnafiz" target="_blank"> Ahammad Nafiz </a>.
|
| 132 |
</footer>
|
| 133 |
""",
|
| 134 |
unsafe_allow_html=True
|
|
|
|
| 146 |
"DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
|
| 147 |
)
|
| 148 |
st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
|
| 149 |
+
st.sidebar.write("**Inspired by:** Predicta by Ahammad Nafiz")
|
| 150 |
st.sidebar.markdown("---")
|
| 151 |
st.sidebar.write("**Your**")
|
| 152 |
st.sidebar.image(
|
|
|
|
| 367 |
# Data processing functions
|
| 368 |
def dataset_overview(df):
    """
    Render a multi-section overview of ``df`` in the Streamlit page.

    Sections, in order: basic size information, per-column dtype and
    missing-value table, numerical summary (describe + skewness + kurtosis),
    top-5 values of every categorical column, duplicate-row check, and the
    first five rows.

    Args:
        df (pd.DataFrame): Dataset to describe.
    """
    row_total = len(df)
    st.subheader("Dataset Overview")

    # --- Size and memory footprint ---
    st.markdown("#### Basic Information")
    st.write(f"**Rows**: {row_total:,} | **Columns**: {len(df.columns):,}")
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    st.write(f"**Memory Usage**: {memory_mb:.2f} MB")

    # --- One row per column: dtype, non-null count, missing count and share ---
    st.markdown("#### Data Types and Missing Values")
    missing_per_column = df.isna().sum()
    column_profile = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Non-Null Count": df.count(),
        "Missing Values": missing_per_column,
        "Missing %": (missing_per_column / row_total * 100).round(2),
    })
    column_profile = column_profile.reset_index(drop=True)
    st.dataframe(column_profile.style.highlight_null(color='lightcoral'))

    # --- Numerical columns (int64 / float64 only) ---
    st.markdown("#### Numerical Columns Summary")
    numeric_names = df.select_dtypes(include=['int64', 'float64']).columns
    if numeric_names.size > 0:
        numeric_part = df[numeric_names]
        numeric_table = numeric_part.describe().T.round(2)
        numeric_table['Skewness'] = numeric_part.skew().round(2)
        numeric_table['Kurtosis'] = numeric_part.kurt().round(2)
        st.dataframe(numeric_table)

    # --- Categorical columns: five most frequent values each ---
    st.markdown("#### Categorical Columns Summary")
    category_names = df.select_dtypes(include=['object', 'category']).columns
    for name in category_names:  # an empty selection simply renders nothing
        top_values = df[name].value_counts().head(5)
        st.write(f"**{name}** (Top 5 values):")
        top_table = pd.DataFrame({
            "Value": top_values.index,
            "Count": top_values.values,
            "% of Total": (top_values.values / row_total * 100).round(2),
        })
        st.dataframe(top_table)

    # --- Fully duplicated rows ---
    st.markdown("#### Duplicate Rows")
    duplicate_count = df.duplicated().sum()
    if duplicate_count <= 0:
        st.success("No duplicate rows detected.")
    else:
        st.warning(f"Found {duplicate_count} duplicate rows ({duplicate_count / row_total * 100:.2f}% of total).")

    # --- Raw sample ---
    st.markdown("#### Sample Data (First 5 Rows)")
    st.dataframe(df.head())
|
| 413 |
|
| 414 |
def clean_data(df):
    """
    Report missing values and duplicates, then return the cleaned dataset.

    Cleaning drops every row that contains a missing value and then drops
    duplicate rows. The input frame itself is not modified.

    Args:
        df (pd.DataFrame): Dataset to clean.

    Returns:
        pd.DataFrame: ``df`` without rows containing NaN and without duplicates.
    """
    st.subheader("Clean Data")

    # Per-column missing counts and percentages, worst columns first.
    st.markdown("#### Missing Values")
    null_counts = df.isnull().sum()
    report = null_counts.to_frame(name="Missing Values")
    report["Missing Percentage (%)"] = (null_counts / len(df) * 100).round(2)
    report = report.sort_values(by="Missing Values", ascending=False)
    st.dataframe(report)

    # Duplicate rows are only reported here; removal happens below.
    st.markdown("#### Duplicate Rows")
    n_duplicates = df.duplicated().sum()
    if n_duplicates <= 0:
        st.success("No duplicate rows detected.")
    else:
        st.warning(f"Found {n_duplicates} duplicate rows. They will be removed.")

    without_missing = df.dropna()
    cleaned_df = without_missing.drop_duplicates()
    st.write(f"Cleaned Dataset: {len(cleaned_df)} rows remaining after cleaning.")
    st.dataframe(cleaned_df.head())
    return cleaned_df
|
| 436 |
|
| 437 |
def detect_outlier(df):
    """
    Report IQR-based outliers for the numerical columns of ``df``.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. The function renders a per-column summary table (sorted
    by outlier count) and a box plot of one user-selected column with both
    fences drawn on it.

    Args:
        df (pd.DataFrame): Dataset to inspect. Only ``int64`` / ``float64``
            columns are considered.

    Returns:
        None: Everything is rendered through Streamlit.
    """
    st.subheader("Detect Outliers")
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    # `.empty` checks for "no columns"; `.any()` would instead test the
    # truthiness of the column labels themselves (a column named 0 or "" is falsy).
    if numerical_cols.empty:
        st.warning("No numerical columns available for outlier detection.")
        return

    st.markdown("#### Outlier Detection Summary")
    total_rows = len(df)
    outlier_summary = []
    for col in numerical_cols:
        lower_bound, upper_bound = _iqr_fences(df[col])
        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
        outlier_count = int(is_outlier.sum())
        # An empty frame has no outliers; avoid dividing by zero rows.
        outlier_percentage = round(outlier_count / total_rows * 100, 2) if total_rows else 0.0
        outlier_summary.append({
            "Column": col,
            "Outliers": outlier_count,
            "Outlier Percentage (%)": outlier_percentage
        })

    outlier_df = pd.DataFrame(outlier_summary).sort_values(by="Outliers", ascending=False)
    st.dataframe(outlier_df)

    st.markdown("#### Outlier Visualization")
    selected_col = st.selectbox("Select a column to visualize outliers", numerical_cols)
    # Compare against None so that a falsy column label (0, "") is still plotted.
    if selected_col is not None:
        lower_bound, upper_bound = _iqr_fences(df[selected_col])

        fig, ax = plt.subplots()
        sns.boxplot(data=df, x=selected_col, ax=ax)
        # The values run along the x-axis (x=selected_col), so the fences must
        # be vertical lines; horizontal lines would sit on the wrong axis.
        ax.axvline(lower_bound, color='red', linestyle='--', label='Lower Bound')
        ax.axvline(upper_bound, color='blue', linestyle='--', label='Upper Bound')
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)


def _iqr_fences(series):
    """Return the ``(lower, upper)`` 1.5 * IQR outlier fences of a numeric Series."""
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr
|
| 478 |
+
|
| 479 |
+
# Data Encoder
|
| 480 |
def encoder(df):
    """
    Encodes categorical columns in the dataset using user-selected methods (Label Encoding,
    One-Hot Encoding, or Frequency Encoding). Provides control over column selection, handles
    missing values, and displays encoding details.

    The last successfully encoded frame is kept in ``st.session_state.encoded_df`` so it
    survives Streamlit reruns. Each time "Apply Encoding" is pressed, every selected column
    is rebuilt from its ORIGINAL values in ``df`` (after the chosen missing-value handling),
    so pressing the button repeatedly, or switching a column's method, never encodes
    already-encoded data. Encodings of columns that are no longer selected are kept.

    Args:
        df (pd.DataFrame): Input dataset to encode.

    Returns:
        pd.DataFrame: Encoded dataset. This is ``df`` itself when it has no categorical
        columns or when encoding raised an error; otherwise the frame stored in session state.
    """
    st.subheader("Encode Data")

    # Initialize (or refresh) the cached encoded frame. If the cached frame's row index
    # no longer matches df, it was built from a different dataset and must be discarded.
    needs_reset = (
        'encoded_df' not in st.session_state
        or not st.session_state.encoded_df.index.equals(df.index)
    )
    if needs_reset:
        st.session_state.encoded_df = df.copy()
    # Maps a source column to the one-hot columns generated for it, so they can be
    # removed again when that column is re-encoded.
    if needs_reset or 'encoder_generated_columns' not in st.session_state:
        st.session_state.encoder_generated_columns = {}

    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.warning("No categorical columns ('object' or 'category') found in the dataset.")
        return df

    # Display original categorical columns
    st.markdown("### Categorical Columns Detected")
    # map(str, ...) because column labels are not guaranteed to be strings.
    st.write(f"Found {len(categorical_cols)} categorical columns: {', '.join(map(str, categorical_cols))}")
    for col in categorical_cols:
        missing_count = df[col].isna().sum()
        st.write(f"- **{col}**: {df[col].nunique()} unique values, "
                 f"{missing_count} missing ({missing_count / len(df) * 100:.2f}%)")

    # User configuration
    st.markdown("### Encoding Configuration")
    encoding_methods = {
        "Label Encoding": "Assigns integers to categories (best for ordinal data).",
        "One-Hot Encoding": "Creates binary columns for each category (best for non-ordinal data, avoid high cardinality).",
        "Frequency Encoding": "Replaces categories with their frequency (useful for high-cardinality columns)."
    }
    method_names = list(encoding_methods.keys())

    # Select columns to encode
    cols_to_encode = st.multiselect("Select Columns to Encode", categorical_cols, default=categorical_cols,
                                    help="Choose which categorical columns to encode. Unselected columns remain unchanged.")

    if not cols_to_encode:
        st.warning("Please select at least one column to encode.")
        return st.session_state.encoded_df

    # Missing value handling
    missing_strategy = st.selectbox("Handle Missing Values",
                                    ["Keep as NaN", "Impute with Mode", "Impute with Custom Value"],
                                    help="Choose how to handle missing values before encoding.")
    custom_value = None
    if missing_strategy == "Impute with Custom Value":
        custom_value = st.text_input("Enter Custom Value for Missing Entries", value="Unknown")

    # Prepare the source values of every selected column from the original data.
    source_columns = {}
    for col in cols_to_encode:
        series = df[col]
        if missing_strategy == "Impute with Mode":
            modes = series.mode()
            series = _fill_missing(series, modes.iloc[0] if not modes.empty else "Unknown")
        elif missing_strategy == "Impute with Custom Value":
            series = _fill_missing(series, custom_value)
        source_columns[col] = series

    # Encoding method selection per column
    st.markdown("### Assign Encoding Methods")
    encoding_assignments = {}
    for col in cols_to_encode:
        unique_count = df[col].nunique()
        default_method = "One-Hot Encoding" if unique_count <= 10 else "Frequency Encoding"
        encoding_assignments[col] = st.selectbox(
            f"Encoding Method for {col}",
            method_names,
            index=method_names.index(default_method),
            help=f"{encoding_methods[default_method]} Unique values: {unique_count}"
        )

    # Apply encoding
    if st.button("Apply Encoding"):
        try:
            encoded_df = st.session_state.encoded_df.copy()
            # Work on a copy of the bookkeeping so a failed run leaves session state untouched.
            generated = dict(st.session_state.encoder_generated_columns)

            for col, method in encoding_assignments.items():
                # Undo any earlier encoding of this column: drop its old one-hot columns
                # and restore the (imputed) original values.
                encoded_df = encoded_df.drop(columns=generated.pop(col, []), errors="ignore")
                encoded_df[col] = source_columns[col]

                if method == "Label Encoding":
                    le = LabelEncoder()
                    # Convert to string to handle mixed types and NaNs
                    encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
                    st.session_state[f"label_encoder_{col}"] = le  # Store encoder for reference
                    st.write(f"**{col}**: Label Encoded. Classes: {list(le.classes_)}")

                elif method == "One-Hot Encoding":
                    if df[col].nunique() > 50:
                        st.warning(f"**{col}** has {df[col].nunique()} unique values. One-Hot Encoding may create many columns.")
                    # Drop NaN for one-hot encoding, reintroduce after
                    mask = encoded_df[col].notna()
                    ohe_df = pd.get_dummies(encoded_df.loc[mask, col], prefix=col, drop_first=True)
                    encoded_df = pd.concat([encoded_df.drop(columns=[col]), ohe_df], axis=1)
                    encoded_df.loc[~mask, ohe_df.columns] = np.nan
                    generated[col] = list(ohe_df.columns)
                    st.write(f"**{col}**: One-Hot Encoded. Created {len(ohe_df.columns)} new columns.")

                elif method == "Frequency Encoding":
                    # Use the imputed values so the chosen missing-value strategy is
                    # reflected in both the frequencies and the mapped column.
                    freq_map = encoded_df[col].value_counts(normalize=True).to_dict()
                    # astype(float) so a 'category' column also ends up numeric.
                    encoded_df[col] = encoded_df[col].map(freq_map).astype(float)
                    st.write(f"**{col}**: Frequency Encoded. Values mapped to proportions.")

            # Update session state
            st.session_state.encoded_df = encoded_df
            st.session_state.encoder_generated_columns = generated

            # Display results
            st.markdown("### Encoded Dataset Preview")
            st.dataframe(encoded_df.head())

            # Data quality check
            new_cols = len(encoded_df.columns) - len(df.columns)
            if new_cols > 0:
                st.info(f"Encoding added {new_cols} new columns.")
            remaining_missing = encoded_df.isna().sum().sum()
            if remaining_missing > 0:
                st.warning(f"Encoded dataset still has {remaining_missing} missing values.")

            # Download option
            csv_bytes = encoded_df.to_csv(index=False).encode()
            st.download_button(
                label="Download Encoded Dataset",
                data=csv_bytes,
                file_name="encoded_dataset.csv",
                mime="text/csv",
                key="download_encoded"
            )

        except Exception as e:
            # UI boundary: report the problem and fall back to the unencoded data.
            st.error(f"Error during encoding: {str(e)}")
            return df

    # Preview current encoded state
    else:
        st.markdown("### Current Dataset Preview")
        st.dataframe(st.session_state.encoded_df.head())

    return st.session_state.encoded_df


def _fill_missing(series, fill_value):
    """Fill NaN in ``series`` with ``fill_value``, registering it first as a category if needed."""
    if isinstance(series.dtype, pd.CategoricalDtype) and fill_value not in series.cat.categories:
        # fillna on a categorical Series rejects values that are not existing categories.
        series = series.cat.add_categories([fill_value])
    return series.fillna(fill_value)
|
| 616 |
|
| 617 |
+
# Data Transformer
|
| 618 |
def data_transformer(df):
    """
    Render the "Data Transformer" section and hand back a copy of the dataset.

    No transformation is implemented yet: the function copies ``df``, shows the
    first rows of that copy, and returns the copy.

    Args:
        df (pd.DataFrame): Dataset to transform.

    Returns:
        pd.DataFrame: A copy of ``df``.
    """
    st.subheader("Data Transformer")
    # Placeholder for future transformations; the copy keeps the caller's frame separate.
    result = df.copy()
    preview = result.head()
    st.write("Transformed Dataset:", preview)
    return result
|
| 623 |
|
| 624 |
+
# Data Analysis
|
| 625 |
def data_analysis(df):
    """
    Performs an in-depth analysis of the dataset, including numerical and categorical summaries,
    interactive visualizations, data quality checks, and column-specific exploration.

    The output is split over four tabs: "Summary Statistics", "Visual Exploration",
    "Data Quality" and "Column Deep Dive". Numerical means ``int64`` / ``float64`` columns,
    categorical means ``object`` / ``category`` columns. Percentages are reported as 0 for
    an empty dataset instead of dividing by zero.

    Args:
        df (pd.DataFrame): Input dataset to analyze.
    """
    st.subheader("Data Analysis")

    total_rows = len(df)
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    # Initialize tabs for different analysis types
    tab1, tab2, tab3, tab4 = st.tabs(["Summary Statistics", "Visual Exploration", "Data Quality", "Column Deep Dive"])

    with tab1:
        st.markdown("### Summary Statistics")
        # Numerical Columns
        if numerical_cols.size > 0:
            st.markdown("#### Numerical Columns")
            numeric_frame = df[numerical_cols]
            numerical_summary = numeric_frame.describe().T.round(2)
            numerical_summary['Skewness'] = numeric_frame.skew().round(2)
            numerical_summary['Kurtosis'] = numeric_frame.kurt().round(2)
            numerical_summary['Missing %'] = (numeric_frame.isna().mean() * 100).round(2)
            st.dataframe(numerical_summary.style.highlight_max(axis=0, color='lightgreen'))
            # pandas reports EXCESS kurtosis (normal distribution == 0), so the
            # heavy-tail threshold is 0, not 3.
            st.write("*Skewness > 1 or < -1 indicates high skew. "
                     "Excess kurtosis > 0 indicates heavier tails than a normal distribution.*")
        else:
            st.info("No numerical columns found.")

        # Categorical Columns
        if categorical_cols.size > 0:
            st.markdown("#### Categorical Columns")
            cat_summary = pd.DataFrame({
                "Column": categorical_cols,
                "Unique Values": [df[col].nunique() for col in categorical_cols],
                "Most Frequent": [_most_frequent(df[col]) for col in categorical_cols],
                "Missing %": [round(_percent(df[col].isna().sum(), total_rows), 2) for col in categorical_cols]
            })
            st.dataframe(cat_summary)
        else:
            st.info("No categorical columns found.")

    with tab2:
        st.markdown("### Visual Exploration")
        viz_type = st.selectbox("Select Visualization Type",
                                ["Distribution (Numerical)", "Count Plot (Categorical)", "Correlation Heatmap", "Pair Plot"],
                                key="data_analysis_viz")

        if viz_type == "Distribution (Numerical)" and numerical_cols.size > 0:
            col = st.selectbox("Select Column", numerical_cols, key="num_dist_col")
            fig, ax = plt.subplots()
            sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
            download_image(fig, f"dist_{col}")
            plt.close(fig)

        elif viz_type == "Count Plot (Categorical)" and categorical_cols.size > 0:
            col = st.selectbox("Select Column", categorical_cols, key="cat_count_col")
            fig, ax = plt.subplots()
            sns.countplot(data=df, x=col, ax=ax)
            plt.xticks(rotation=45, ha='right')
            ax.set_title(f"Count Plot of {col}")
            st.pyplot(fig)
            download_image(fig, f"count_{col}")
            plt.close(fig)

        elif viz_type == "Correlation Heatmap" and numerical_cols.size > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
            ax.set_title("Correlation Heatmap")
            st.pyplot(fig)
            download_image(fig, "corr_heatmap")
            plt.close(fig)

        elif viz_type == "Pair Plot" and numerical_cols.size > 0:
            selected_cols = st.multiselect("Select Columns (max 4)", numerical_cols, max_selections=4, key="pair_cols")
            if len(selected_cols) >= 2:
                # pairplot returns a seaborn grid object, not a Matplotlib Figure,
                # so it is saved here directly instead of via download_image.
                fig = sns.pairplot(df[selected_cols].dropna())
                st.pyplot(fig)
                img_bytes = BytesIO()
                fig.savefig(img_bytes, format='png', bbox_inches='tight')
                img_bytes.seek(0)
                st.download_button(label="Download Pair Plot", data=img_bytes,
                                   file_name=f"pairplot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                                   mime="image/png", key=f"pairplot_{datetime.now().strftime('%H%M%S')}")
                plt.close()

    with tab3:
        st.markdown("### Data Quality Checks")
        # Missing Values
        missing_counts = df.isna().sum()
        missing_total = missing_counts.sum()
        if missing_total > 0:
            st.warning(f"**Missing Values**: {missing_total} across {df.isna().any().sum()} columns.")
            missing_df = pd.DataFrame({
                "Column": df.columns,
                "Missing Count": missing_counts,
                "Missing %": (missing_counts / total_rows * 100).round(2)
            })
            missing_df = missing_df[missing_df["Missing Count"] > 0]
            st.dataframe(missing_df)
        else:
            st.success("No missing values detected.")

        # Duplicates
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            st.warning(f"**Duplicates**: {duplicates} duplicate rows ({duplicates / total_rows * 100:.2f}%).")
        else:
            st.success("No duplicate rows detected.")

        # Outliers (Numerical), using the 1.5 * IQR rule
        if numerical_cols.size > 0:
            outlier_summary = []
            for col in numerical_cols:
                Q1, Q3 = df[col].quantile([0.25, 0.75])
                IQR = Q3 - Q1
                outliers = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
                if outliers > 0:
                    outlier_summary.append({"Column": col, "Outlier Count": outliers,
                                            "Outlier %": round(_percent(outliers, total_rows), 2)})
            if outlier_summary:
                st.warning("**Outliers Detected**:")
                st.dataframe(pd.DataFrame(outlier_summary))
            else:
                st.success("No outliers detected in numerical columns.")

    with tab4:
        st.markdown("### Column Deep Dive")
        selected_col = st.selectbox("Select Column for Detailed Analysis", df.columns, key="deep_dive_col")
        if selected_col is None:
            # Nothing to select when the dataset has no columns.
            st.info("The dataset has no columns to inspect.")
            return

        column = df[selected_col]
        missing_count = column.isna().sum()
        unique_count = column.nunique()
        st.write(f"**Column**: {selected_col}")
        st.write(f"**Data Type**: {column.dtype}")
        st.write(f"**Missing Values**: {missing_count} ({_percent(missing_count, total_rows):.2f}%)")
        st.write(f"**Unique Values**: {unique_count} ({_percent(unique_count, total_rows):.2f}%)")

        if pd.api.types.is_numeric_dtype(column):
            st.write("**Summary Statistics**:")
            stats = column.describe().round(2)
            # skew()/kurt() of a single column return a scalar that can be a plain
            # Python float (NaN for very short columns), which has no .round method.
            stats['Skewness'] = _round2(column.skew())
            stats['Kurtosis'] = _round2(column.kurt())
            st.dataframe(stats)
            fig = px.histogram(df, x=selected_col, nbins=30, title=f"Distribution of {selected_col}")
            st.plotly_chart(fig)
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        elif pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.CategoricalDtype):
            st.write("**Top 5 Values**:")
            value_counts = column.value_counts().head(5)
            st.dataframe(pd.DataFrame({
                "Value": value_counts.index,
                "Count": value_counts.values,
                "% of Total": (value_counts.values / total_rows * 100).round(2)
            }))
            fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {selected_col}")
            st.plotly_chart(fig)


def _percent(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return float(part) / total * 100 if total else 0.0


def _round2(value):
    """Round a scalar statistic to two decimals, accepting Python and NumPy floats alike."""
    return round(float(value), 2)


def _most_frequent(series):
    """Return the first mode of ``series``, or NaN when it has no non-null values."""
    modes = series.mode()
    return modes.iloc[0] if not modes.empty else np.nan
|
| 778 |
+
|
| 779 |
+
def download_image(fig, key_prefix):
    """
    Offer a Matplotlib figure as a PNG download button.

    Args:
        fig: Figure to export; it is saved through ``fig.savefig``.
        key_prefix (str): Prefix used for both the downloaded file name and the
            Streamlit widget key.
    """
    # Render the figure into an in-memory buffer and rewind it for reading.
    buffer = BytesIO()
    fig.savefig(buffer, format='png', bbox_inches='tight')
    buffer.seek(0)

    # Both names carry a timestamp; the widget key uses time of day only.
    file_name = f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    widget_key = f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}"
    st.download_button(
        label="Download Image",
        data=buffer,
        file_name=file_name,
        mime="image/png",
        key=widget_key,
    )
|
| 789 |
+
|
| 790 |
+
# Feature Importance Analysis
|
| 791 |
def feature_importance_analyzer(df):
|
| 792 |
st.subheader("Feature Importance Analyzer")
|
| 793 |
target_column = st.selectbox("Select Target Column", df.columns)
|