MahatirTusher committed on
Commit
23f3efd
·
verified ·
1 Parent(s): a133fd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +398 -16
app.py CHANGED
@@ -128,7 +128,7 @@ def add_footer():
128
  """
129
  <footer>
130
  Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
131
- Inspired by the project "Predicta" by Ahmed Nafiz.
132
  </footer>
133
  """,
134
  unsafe_allow_html=True
@@ -146,7 +146,7 @@ def add_sidebar():
146
  "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
147
  )
148
  st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
149
- st.sidebar.write("**Inspired by:** Predicta by Ahmed Nafiz")
150
  st.sidebar.markdown("---")
151
  st.sidebar.write("**Your**")
152
  st.sidebar.image(
@@ -367,45 +367,427 @@ def download_image(fig, key_prefix):
367
  # Data processing functions
368
  def dataset_overview(df):
369
  st.subheader("Dataset Overview")
370
- st.write(f"Rows: {len(df)}, Columns: {len(df.columns)}")
371
- st.write("Data Types:", df.dtypes)
372
- st.write(df.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
  def clean_data(df):
375
  st.subheader("Clean Data")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  cleaned_df = df.dropna().drop_duplicates()
377
- st.write("Cleaned Dataset:", cleaned_df.head())
 
378
  return cleaned_df
379
 
380
  def detect_outlier(df):
381
  st.subheader("Detect Outliers")
382
  numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
 
 
 
 
 
 
383
  for col in numerical_cols:
384
  Q1, Q3 = df[col].quantile([0.25, 0.75])
385
  IQR = Q3 - Q1
386
- outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))]
387
- if not outliers.empty:
388
- st.write(f"Outliers in {col}:", outliers)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
 
 
 
 
 
 
 
 
 
390
  def encoder(df):
 
 
 
 
 
 
 
 
 
 
 
391
  st.subheader("Encode Data")
392
- le = LabelEncoder()
393
- encoded_df = df.copy()
394
- for col in df.select_dtypes(include=['object', 'category']).columns:
395
- encoded_df[col] = le.fit_transform(df[col])
396
- st.write("Encoded Dataset:", encoded_df.head())
397
- return encoded_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
 
399
  def data_transformer(df):
400
  st.subheader("Data Transformer")
401
  transformed_df = df.copy() # Placeholder for future transformations
402
  st.write("Transformed Dataset:", transformed_df.head())
403
  return transformed_df
404
 
 
405
  def data_analysis(df):
 
 
 
 
 
 
 
406
  st.subheader("Data Analysis")
407
- st.write(df.describe())
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  def feature_importance_analyzer(df):
410
  st.subheader("Feature Importance Analyzer")
411
  target_column = st.selectbox("Select Target Column", df.columns)
 
128
  """
129
  <footer>
130
  Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
131
+ Inspired by the project "Predicta" by <a href="https://github.com/ahammadnafiz" target="_blank"> Ahammad Nafiz </a>.
132
  </footer>
133
  """,
134
  unsafe_allow_html=True
 
146
  "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
147
  )
148
  st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
149
+ st.sidebar.write("**Inspired by:** Predicta by Ahammad Nafiz")
150
  st.sidebar.markdown("---")
151
  st.sidebar.write("**Your**")
152
  st.sidebar.image(
 
367
  # Data processing functions
368
def dataset_overview(df):
    """
    Render a structured overview of the dataset: size and memory footprint,
    per-column schema with missing-value statistics, numerical and categorical
    summaries, a duplicate-row check, and a head() preview.

    Args:
        df (pd.DataFrame): Dataset to summarize.
    """
    st.subheader("Dataset Overview")

    # Guard: every percentage below divides by len(df); an empty frame would
    # raise ZeroDivisionError in the duplicate-rate f-string.
    if df.empty:
        st.warning("The dataset is empty — nothing to summarize.")
        return

    st.markdown("#### Basic Information")
    st.write(f"**Rows**: {len(df):,} | **Columns**: {len(df.columns):,}")
    st.write(f"**Memory Usage**: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    st.markdown("#### Data Types and Missing Values")
    # Compute the per-column missing counts once and reuse (was computed twice).
    missing_counts = df.isna().sum()
    dtypes_df = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Non-Null Count": df.count(),
        "Missing Values": missing_counts,
        "Missing %": (missing_counts / len(df) * 100).round(2)
    }).reset_index(drop=True)
    # Note: the previous .style.highlight_null(color=...) was a no-op — this
    # summary table never contains nulls — and the `color` keyword requires
    # pandas >= 1.5, so the styling call is dropped.
    st.dataframe(dtypes_df)

    st.markdown("#### Numerical Columns Summary")
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if numerical_cols.size > 0:
        numerical_summary = df[numerical_cols].describe().T.round(2)
        numerical_summary['Skewness'] = df[numerical_cols].skew().round(2)
        numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2)
        st.dataframe(numerical_summary)

    st.markdown("#### Categorical Columns Summary")
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if categorical_cols.size > 0:
        for col in categorical_cols:
            value_counts = df[col].value_counts().head(5)
            st.write(f"**{col}** (Top 5 values):")
            st.dataframe(pd.DataFrame({
                "Value": value_counts.index,
                "Count": value_counts.values,
                "% of Total": (value_counts.values / len(df) * 100).round(2)
            }))

    st.markdown("#### Duplicate Rows")
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        st.warning(f"Found {duplicate_count} duplicate rows ({duplicate_count / len(df) * 100:.2f}% of total).")
    else:
        st.success("No duplicate rows detected.")

    st.markdown("#### Sample Data (First 5 Rows)")
    st.dataframe(df.head())
413
 
414
def clean_data(df):
    """
    Report missing values and duplicates, then return a cleaned copy of the
    dataset with NaN-containing rows and duplicate rows removed.

    Args:
        df (pd.DataFrame): Dataset to clean.

    Returns:
        pd.DataFrame: Cleaned dataset (NaN rows and duplicates dropped).
    """
    st.subheader("Clean Data")

    # Per-column missing-value report, worst columns first.
    st.markdown("#### Missing Values")
    na_counts = df.isnull().sum()
    na_pct = (na_counts / len(df) * 100).round(2)
    st.dataframe(
        pd.DataFrame({
            "Missing Values": na_counts,
            "Missing Percentage (%)": na_pct
        }).sort_values(by="Missing Values", ascending=False)
    )

    # Duplicate-row report.
    st.markdown("#### Duplicate Rows")
    n_dupes = df.duplicated().sum()
    if n_dupes > 0:
        st.warning(f"Found {n_dupes} duplicate rows. They will be removed.")
    else:
        st.success("No duplicate rows detected.")

    # Drop rows with any NaN, then drop exact duplicate rows.
    cleaned_df = df.dropna().drop_duplicates()
    st.write(f"Cleaned Dataset: {len(cleaned_df)} rows remaining after cleaning.")
    st.dataframe(cleaned_df.head())
    return cleaned_df
436
 
437
def detect_outlier(df):
    """
    Summarize IQR-based outliers (1.5 * IQR fences) for every numerical column
    and visualize a user-selected column as a boxplot with fence markers.

    Args:
        df (pd.DataFrame): Dataset to scan for outliers.
    """
    st.subheader("Detect Outliers")
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    # Fix: explicit size check. The previous `not numerical_cols.any()` relied
    # on element-wise truthiness of a string Index, which is fragile and
    # deprecated for non-boolean data.
    if numerical_cols.size == 0:
        st.warning("No numerical columns available for outlier detection.")
        return

    st.markdown("#### Outlier Detection Summary")
    outlier_summary = []
    for col in numerical_cols:
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_count = len(outliers)
        outlier_percentage = round((outlier_count / len(df) * 100), 2)
        outlier_summary.append({
            "Column": col,
            "Outliers": outlier_count,
            "Outlier Percentage (%)": outlier_percentage
        })

    outlier_df = pd.DataFrame(outlier_summary).sort_values(by="Outliers", ascending=False)
    st.dataframe(outlier_df)

    st.markdown("#### Outlier Visualization")
    selected_col = st.selectbox("Select a column to visualize outliers", numerical_cols)
    if selected_col:
        Q1, Q3 = df[selected_col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        fig, ax = plt.subplots()
        sns.boxplot(data=df, x=selected_col, ax=ax)
        # Fix: the boxplot is horizontal (x=<numeric column>), so the IQR
        # fences lie on the x-axis — axvline, not axhline, which drew them
        # on the wrong (categorical) axis.
        ax.axvline(lower_bound, color='red', linestyle='--', label='Lower Bound')
        ax.axvline(upper_bound, color='blue', linestyle='--', label='Upper Bound')
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)
478
+
479
+ # Data Encoder
480
def encoder(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encodes categorical columns in the dataset using user-selected methods (Label Encoding,
    One-Hot Encoding, or Frequency Encoding). Provides control over column selection, handles
    missing values, and displays encoding details.

    The working copy lives in st.session_state['encoded_df'] so it survives
    Streamlit reruns; this function both reads and updates that entry.

    Args:
        df (pd.DataFrame): Input dataset to encode.

    Returns:
        pd.DataFrame: Encoded dataset.
    """
    st.subheader("Encode Data")

    # Initialize session state for encoded DataFrame
    # (only on the first run; later reruns keep the previously encoded state)
    if 'encoded_df' not in st.session_state:
        st.session_state.encoded_df = df.copy()

    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.warning("No categorical columns ('object' or 'category') found in the dataset.")
        return df

    # Display original categorical columns
    st.markdown("### Categorical Columns Detected")
    st.write(f"Found {len(categorical_cols)} categorical columns: {', '.join(categorical_cols)}")
    for col in categorical_cols:
        st.write(f"- **{col}**: {df[col].nunique()} unique values, "
                 f"{df[col].isna().sum()} missing ({df[col].isna().sum() / len(df) * 100:.2f}%)")

    # User configuration
    st.markdown("### Encoding Configuration")
    encoding_methods = {
        "Label Encoding": "Assigns integers to categories (best for ordinal data).",
        "One-Hot Encoding": "Creates binary columns for each category (best for non-ordinal data, avoid high cardinality).",
        "Frequency Encoding": "Replaces categories with their frequency (useful for high-cardinality columns)."
    }

    # Select columns to encode
    cols_to_encode = st.multiselect("Select Columns to Encode", categorical_cols, default=categorical_cols,
                                    help="Choose which categorical columns to encode. Unselected columns remain unchanged.")

    if not cols_to_encode:
        st.warning("Please select at least one column to encode.")
        return st.session_state.encoded_df

    # Missing value handling
    missing_strategy = st.selectbox("Handle Missing Values",
                                    ["Keep as NaN", "Impute with Mode", "Impute with Custom Value"],
                                    help="Choose how to handle missing values before encoding.")
    custom_value = None
    if missing_strategy == "Impute with Custom Value":
        custom_value = st.text_input("Enter Custom Value for Missing Entries", value="Unknown")

    # Apply missing value handling
    # NOTE: imputation reads from the ORIGINAL df, not the session-state copy,
    # so any prior encoding of these columns is overwritten with fresh values.
    encoded_df = st.session_state.encoded_df.copy()
    for col in cols_to_encode:
        if missing_strategy == "Impute with Mode":
            mode_val = df[col].mode()[0] if not df[col].mode().empty else "Unknown"
            encoded_df[col] = df[col].fillna(mode_val)
        elif missing_strategy == "Impute with Custom Value":
            encoded_df[col] = df[col].fillna(custom_value)

    # Encoding method selection per column
    st.markdown("### Assign Encoding Methods")
    encoding_assignments = {}
    for col in cols_to_encode:
        # Heuristic default: one-hot for low-cardinality (<= 10 categories),
        # frequency encoding otherwise.
        default_method = "One-Hot Encoding" if df[col].nunique() <= 10 else "Frequency Encoding"
        encoding_assignments[col] = st.selectbox(
            f"Encoding Method for {col}",
            list(encoding_methods.keys()),
            index=list(encoding_methods.keys()).index(default_method),
            help=f"{encoding_methods[default_method]} Unique values: {df[col].nunique()}"
        )

    # Apply encoding
    if st.button("Apply Encoding"):
        try:
            for col, method in encoding_assignments.items():
                if method == "Label Encoding":
                    le = LabelEncoder()
                    # Convert to string to handle mixed types and NaNs
                    # (with "Keep as NaN", astype(str) turns NaN into the
                    # literal 'nan', which LabelEncoder encodes as a class)
                    encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
                    st.session_state[f"label_encoder_{col}"] = le  # Store encoder for reference
                    st.write(f"**{col}**: Label Encoded. Classes: {list(le.classes_)}")

                elif method == "One-Hot Encoding":
                    if df[col].nunique() > 50:
                        st.warning(f"**{col}** has {df[col].nunique()} unique values. One-Hot Encoding may create many columns.")
                    # Drop NaN for one-hot encoding, reintroduce after
                    # (re-inserting NaN likely upcasts the dummy columns to
                    # float — TODO confirm the downstream models accept that)
                    mask = encoded_df[col].notna()
                    ohe_df = pd.get_dummies(encoded_df.loc[mask, col], prefix=col, drop_first=True)
                    encoded_df = pd.concat([encoded_df.drop(columns=[col]), ohe_df], axis=1)
                    encoded_df.loc[~mask, ohe_df.columns] = np.nan
                    st.write(f"**{col}**: One-Hot Encoded. Created {len(ohe_df.columns)} new columns.")

                elif method == "Frequency Encoding":
                    # Map each category to its relative frequency in the
                    # original df; unseen/NaN values map to NaN.
                    freq_map = df[col].value_counts(normalize=True).to_dict()
                    encoded_df[col] = df[col].map(freq_map)
                    st.write(f"**{col}**: Frequency Encoded. Values mapped to proportions.")

            # Update session state
            st.session_state.encoded_df = encoded_df

            # Display results
            st.markdown("### Encoded Dataset Preview")
            st.dataframe(encoded_df.head())

            # Data quality check
            new_cols = len(encoded_df.columns) - len(df.columns)
            if new_cols > 0:
                st.info(f"Encoding added {new_cols} new columns.")
            if encoded_df.isna().sum().sum() > 0:
                st.warning(f"Encoded dataset still has {encoded_df.isna().sum().sum()} missing values.")

            # Download option
            csv_bytes = encoded_df.to_csv(index=False).encode()
            st.download_button(
                label="Download Encoded Dataset",
                data=csv_bytes,
                file_name="encoded_dataset.csv",
                mime="text/csv",
                key="download_encoded"
            )

        except Exception as e:
            # Surface the failure to the user and fall back to the raw input.
            st.error(f"Error during encoding: {str(e)}")
            return df

    # Preview current encoded state
    else:
        st.markdown("### Current Dataset Preview")
        st.dataframe(st.session_state.encoded_df.head())

    return st.session_state.encoded_df
616
 
617
# Data Transformer
def data_transformer(df):
    """
    Placeholder transformation step: returns a copy of the dataset unchanged.

    Args:
        df (pd.DataFrame): Input dataset.

    Returns:
        pd.DataFrame: Copy of the input (no transformations implemented yet).
    """
    st.subheader("Data Transformer")
    # Work on a copy so the caller's DataFrame is never mutated.
    result = df.copy()
    st.write("Transformed Dataset:", result.head())
    return result
623
 
624
+ # Data Analysis
625
def data_analysis(df: pd.DataFrame) -> None:
    """
    Performs an in-depth analysis of the dataset, including numerical and categorical summaries,
    interactive visualizations, data quality checks, and column-specific exploration.

    Renders four Streamlit tabs; all tab bodies execute on every rerun, so
    `numerical_cols`/`categorical_cols` assigned inside the first tab are
    function-scoped and reused by the later tabs.

    Args:
        df (pd.DataFrame): Input dataset to analyze.
    """
    st.subheader("Data Analysis")

    # Initialize tabs for different analysis types
    tab1, tab2, tab3, tab4 = st.tabs(["Summary Statistics", "Visual Exploration", "Data Quality", "Column Deep Dive"])

    with tab1:
        st.markdown("### Summary Statistics")
        # Numerical Columns
        numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
        if numerical_cols.size > 0:
            st.markdown("#### Numerical Columns")
            numerical_summary = df[numerical_cols].describe().T.round(2)
            numerical_summary['Skewness'] = df[numerical_cols].skew().round(2)
            numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2)
            numerical_summary['Missing %'] = (df[numerical_cols].isna().sum() / len(df) * 100).round(2)
            # NOTE(review): highlight_max(color=...) requires pandas >= 1.5
            # (older versions use a different keyword) — confirm pinned version.
            st.dataframe(numerical_summary.style.highlight_max(axis=0, color='lightgreen'))
            st.write("*Skewness > 1 or < -1 indicates high skew. Kurtosis > 3 indicates heavy tails.*")
        else:
            st.info("No numerical columns found.")

        # Categorical Columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if categorical_cols.size > 0:
            st.markdown("#### Categorical Columns")
            cat_summary = pd.DataFrame({
                "Column": categorical_cols,
                "Unique Values": [df[col].nunique() for col in categorical_cols],
                "Most Frequent": [df[col].mode()[0] if not df[col].mode().empty else np.nan for col in categorical_cols],
                "Missing %": [(df[col].isna().sum() / len(df) * 100).round(2) for col in categorical_cols]
            })
            st.dataframe(cat_summary)
        else:
            st.info("No categorical columns found.")

    with tab2:
        st.markdown("### Visual Exploration")
        viz_type = st.selectbox("Select Visualization Type",
                                ["Distribution (Numerical)", "Count Plot (Categorical)", "Correlation Heatmap", "Pair Plot"],
                                key="data_analysis_viz")

        # Each branch renders one figure and offers it for download via the
        # module-level download_image helper.
        if viz_type == "Distribution (Numerical)" and numerical_cols.size > 0:
            col = st.selectbox("Select Column", numerical_cols, key="num_dist_col")
            fig, ax = plt.subplots()
            sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
            download_image(fig, f"dist_{col}")
            plt.close(fig)

        elif viz_type == "Count Plot (Categorical)" and categorical_cols.size > 0:
            col = st.selectbox("Select Column", categorical_cols, key="cat_count_col")
            fig, ax = plt.subplots()
            sns.countplot(data=df, x=col, ax=ax)
            plt.xticks(rotation=45, ha='right')
            ax.set_title(f"Count Plot of {col}")
            st.pyplot(fig)
            download_image(fig, f"count_{col}")
            plt.close(fig)

        elif viz_type == "Correlation Heatmap" and numerical_cols.size > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
            ax.set_title("Correlation Heatmap")
            st.pyplot(fig)
            download_image(fig, "corr_heatmap")
            plt.close(fig)

        elif viz_type == "Pair Plot" and numerical_cols.size > 0:
            selected_cols = st.multiselect("Select Columns (max 4)", numerical_cols, max_selections=4, key="pair_cols")
            if len(selected_cols) >= 2:
                # pairplot returns a PairGrid, not a Figure, so the download
                # button is built inline rather than via download_image.
                fig = sns.pairplot(df[selected_cols].dropna())
                st.pyplot(fig)
                img_bytes = BytesIO()
                fig.savefig(img_bytes, format='png', bbox_inches='tight')
                img_bytes.seek(0)
                st.download_button(label="Download Pair Plot", data=img_bytes,
                                   file_name=f"pairplot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                                   mime="image/png", key=f"pairplot_{datetime.now().strftime('%H%M%S')}")
                plt.close()

    with tab3:
        st.markdown("### Data Quality Checks")
        # Missing Values
        missing_total = df.isna().sum().sum()
        if missing_total > 0:
            st.warning(f"**Missing Values**: {missing_total} across {df.isna().any().sum()} columns.")
            missing_df = pd.DataFrame({
                "Column": df.columns,
                "Missing Count": df.isna().sum(),
                "Missing %": (df.isna().sum() / len(df) * 100).round(2)
            })
            # Only show columns that actually have missing values.
            missing_df = missing_df[missing_df["Missing Count"] > 0]
            st.dataframe(missing_df)
        else:
            st.success("No missing values detected.")

        # Duplicates
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            st.warning(f"**Duplicates**: {duplicates} duplicate rows ({duplicates / len(df) * 100:.2f}%).")
        else:
            st.success("No duplicate rows detected.")

        # Outliers (Numerical) — 1.5 * IQR fences per column.
        if numerical_cols.size > 0:
            outlier_summary = []
            for col in numerical_cols:
                Q1, Q3 = df[col].quantile([0.25, 0.75])
                IQR = Q3 - Q1
                outliers = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
                if outliers > 0:
                    outlier_summary.append({"Column": col, "Outlier Count": outliers,
                                            "Outlier %": (outliers / len(df) * 100).round(2)})
            if outlier_summary:
                st.warning("**Outliers Detected**:")
                st.dataframe(pd.DataFrame(outlier_summary))
            else:
                st.success("No outliers detected in numerical columns.")

    with tab4:
        st.markdown("### Column Deep Dive")
        selected_col = st.selectbox("Select Column for Detailed Analysis", df.columns, key="deep_dive_col")
        st.write(f"**Column**: {selected_col}")
        st.write(f"**Data Type**: {df[selected_col].dtype}")
        st.write(f"**Missing Values**: {df[selected_col].isna().sum()} ({df[selected_col].isna().sum() / len(df) * 100:.2f}%)")
        st.write(f"**Unique Values**: {df[selected_col].nunique()} ({df[selected_col].nunique() / len(df) * 100:.2f}%)")

        if pd.api.types.is_numeric_dtype(df[selected_col]):
            st.write("**Summary Statistics**:")
            stats = df[selected_col].describe().round(2)
            stats['Skewness'] = df[selected_col].skew().round(2)
            stats['Kurtosis'] = df[selected_col].kurt().round(2)
            st.dataframe(stats)
            fig = px.histogram(df, x=selected_col, nbins=30, title=f"Distribution of {selected_col}")
            st.plotly_chart(fig)
        # NOTE(review): pd.api.types.is_categorical_dtype is deprecated in
        # recent pandas — confirm version or switch to isinstance checks.
        elif pd.api.types.is_object_dtype(df[selected_col]) or pd.api.types.is_categorical_dtype(df[selected_col]):
            st.write("**Top 5 Values**:")
            value_counts = df[selected_col].value_counts().head(5)
            st.dataframe(pd.DataFrame({
                "Value": value_counts.index,
                "Count": value_counts.values,
                "% of Total": (value_counts.values / len(df) * 100).round(2)
            }))
            fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {selected_col}")
            st.plotly_chart(fig)
+
779
def download_image(fig, key_prefix):
    """
    Render a Streamlit download button that serves a Matplotlib figure as PNG.

    Args:
        fig: Matplotlib figure to export.
        key_prefix (str): Prefix used in both the file name and the widget key.

    NOTE(review): this file appears to define `download_image` earlier as well;
    this later definition shadows it — confirm and keep only one copy.
    """
    img_bytes = BytesIO()
    fig.savefig(img_bytes, format='png', bbox_inches='tight')
    img_bytes.seek(0)
    # Fix: capture the timestamp once so the file name and the widget key can
    # never disagree (the original called datetime.now() twice, which could
    # straddle a second boundary).
    now = datetime.now()
    st.download_button(label="Download Image", data=img_bytes,
                       file_name=f"{key_prefix}_{now.strftime('%Y%m%d_%H%M%S')}.png",
                       mime="image/png", key=f"download_{key_prefix}_{now.strftime('%H%M%S')}")
+ mime="image/png", key=f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}")
789
+
790
+ # Feature Importance Analysis
791
  def feature_importance_analyzer(df):
792
  st.subheader("Feature Importance Analyzer")
793
  target_column = st.selectbox("Select Target Column", df.columns)