Spaces: Upload 13 files
- README.MD +0 -0
- app.py +1023 -0
- chatbot.py +1051 -0
- data_preprocessing.py +387 -0
- data_quality.py +252 -0
- dataset_overview.py +1159 -0
- explainability.py +176 -0
- insights.py +369 -0
- ml_pipeline.py +940 -0
- requirements.txt +16 -0
- statistical_analysis.py +928 -0
- utils.py +208 -0
- visualization.py +435 -0
README.MD
ADDED
Binary file (7.64 kB).
app.py
ADDED
@@ -0,0 +1,1023 @@
import streamlit as st
import pandas as pd
import plotly.io as pio
import traceback
import sys
from datetime import datetime

from data_preprocessing import preprocess_data
from insights import generate_business_insights
from dataset_overview import eda_analysis  # Updated import
from visualization import auto_visualizations
from ml_pipeline import run_ml_pipeline
from statistical_analysis import statistical_analysis
from data_quality import quality_report
from chatbot import data_chatbot

# Set plotly template
pio.templates.default = "plotly_white"

# ---------------------------------------
# PAGE CONFIG
# ---------------------------------------

st.set_page_config(
    page_title="AI Data Analyst Pro",
    layout="wide",
    page_icon="📊",
    initial_sidebar_state="expanded"
)

# ---------------------------------------
# CUSTOM ERROR HANDLER
# ---------------------------------------

class StreamlitExceptionHandler:
    """Custom exception handler for Streamlit"""

    @staticmethod
    def handle_exception(e, context="application"):
        """Handle exceptions with user-friendly messages"""
        error_type = type(e).__name__
        error_msg = str(e)

        # Create user-friendly error message
        user_message = f"""
### ❌ An error occurred in the {context}

**Error Type:** {error_type}

**What happened:** {error_msg if error_msg else "An unexpected error occurred"}

**Possible solutions:**
"""

        # Add specific solutions based on error type
        if "MemoryError" in error_type:
            user_message += """
- Your dataset might be too large. Try uploading a smaller file.
- Close other applications to free up memory.
- Consider sampling your data before uploading.
"""
        elif "KeyError" in error_type or "IndexError" in error_type:
            user_message += """
- The requested column or index doesn't exist in your dataset.
- Check if you've selected valid columns for the operation.
- Try refreshing the page and uploading your data again.
"""
        elif "ValueError" in error_type:
            user_message += """
- The data values don't match the expected format.
- Check for invalid values in your dataset (e.g., text in numeric columns).
- Ensure your data types are correct for the selected operation.
"""
        elif "TypeError" in error_type:
            user_message += """
- There's a mismatch in data types.
- Check if you're mixing numeric and text data in operations.
- Use the preprocessing tab to convert data types appropriately.
"""
        elif "FileNotFoundError" in error_type:
            user_message += """
- The file couldn't be found. Please upload it again.
- Check if the file path is correct.
"""
        elif "PermissionError" in error_type:
            user_message += """
- Permission denied when accessing the file.
- Make sure the file isn't open in another program.
"""
        elif "pd.errors.EmptyDataError" in error_type:
            user_message += """
- The uploaded file is empty.
- Please upload a file containing data.
"""
        elif "pd.errors.ParserError" in error_type:
            user_message += """
- Couldn't parse the file. Check if it's a valid CSV or Excel file.
- Ensure the file format matches the selected file type.
"""
        else:
            user_message += """
- Try refreshing the page and uploading your data again.
- Check if your data format is compatible with the operation.
- If the problem persists, try with a smaller sample of your data.
"""

        # Add technical details in an expander for debugging
        user_message += f"""

**Technical Details:**
"""

        return user_message

# Initialize session state for error tracking
if "error_log" not in st.session_state:
    st.session_state.error_log = []

if "last_successful_operation" not in st.session_state:
    st.session_state.last_successful_operation = None

# ---------------------------------------
# ADVANCED CSS WITH RESPONSIVE DESIGN
# ---------------------------------------

st.markdown("""
<style>
/* Global Styles */
.main {
    padding: 0rem 1rem;
}

/* Header Styling */
.header-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 2rem;
    border-radius: 20px;
    margin-bottom: 2rem;
    color: white;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,0,0,0.2);
}

.header-title {
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 0.5rem;
    animation: fadeInDown 1s;
}

.header-subtitle {
    font-size: 1.1rem;
    opacity: 0.95;
    animation: fadeInUp 1s;
}

/* Card Styling */
.custom-card {
    background: white;
    padding: 1.5rem;
    border-radius: 15px;
    box-shadow: 0 5px 15px rgba(0,0,0,0.08);
    margin-bottom: 1.5rem;
    border: 1px solid rgba(0,0,0,0.05);
    transition: transform 0.3s, box-shadow 0.3s;
}

.custom-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 8px 25px rgba(0,0,0,0.15);
}

/* Error Message Styling */
.error-container {
    background: linear-gradient(135deg, #ff6b6b 0%, #ff4757 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 15px;
    margin: 1rem 0;
    box-shadow: 0 10px 30px rgba(255, 71, 87, 0.3);
    animation: slideInRight 0.5s;
}

.error-title {
    font-size: 1.5rem;
    font-weight: 700;
    margin-bottom: 1rem;
}

.error-solution {
    background: rgba(255, 255, 255, 0.2);
    padding: 1rem;
    border-radius: 10px;
    margin-top: 1rem;
}

/* Success Message Styling */
.success-container {
    background: linear-gradient(135deg, #51cf66 0%, #37b24d 100%);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    margin: 1rem 0;
    animation: fadeInUp 0.5s;
}

/* Warning Message Styling */
.warning-container {
    background: linear-gradient(135deg, #ffd43b 0%, #fcc419 100%);
    color: #2c3e50;
    padding: 1rem;
    border-radius: 10px;
    margin: 1rem 0;
    animation: fadeInUp 0.5s;
}

/* Metric Cards */
.metric-card {
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    padding: 1.2rem;
    border-radius: 12px;
    text-align: center;
    border-left: 4px solid #667eea;
}

.metric-value {
    font-size: 2rem;
    font-weight: 700;
    color: #2c3e50;
    margin: 0.5rem 0;
}

.metric-label {
    font-size: 0.9rem;
    color: #7f8c8d;
    text-transform: uppercase;
    letter-spacing: 1px;
}

/* Chatbot Styling */
.chat-container {
    max-width: 800px;
    margin: 2rem auto;
    background: #f8f9fa;
    border-radius: 20px;
    padding: 1.5rem;
    box-shadow: 0 5px 20px rgba(0,0,0,0.1);
}

.user-message {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 12px 18px;
    border-radius: 20px 20px 5px 20px;
    margin: 10px 0;
    max-width: 80%;
    margin-left: auto;
    animation: slideInRight 0.5s;
}

.bot-message {
    background: white;
    color: #2c3e50;
    padding: 12px 18px;
    border-radius: 20px 20px 20px 5px;
    margin: 10px 0;
    max-width: 80%;
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    animation: slideInLeft 0.5s;
}

/* Loading Spinner */
.custom-spinner {
    border: 4px solid #f3f3f3;
    border-top: 4px solid #667eea;
    border-radius: 50%;
    width: 40px;
    height: 40px;
    animation: spin 1s linear infinite;
    margin: 20px auto;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

/* Animations */
@keyframes fadeInDown {
    from {
        opacity: 0;
        transform: translateY(-20px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
}

@keyframes fadeInUp {
    from {
        opacity: 0;
        transform: translateY(20px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
}

@keyframes slideInRight {
    from {
        opacity: 0;
        transform: translateX(30px);
    }
    to {
        opacity: 1;
        transform: translateX(0);
    }
}

@keyframes slideInLeft {
    from {
        opacity: 0;
        transform: translateX(-30px);
    }
    to {
        opacity: 1;
        transform: translateX(0);
    }
}

/* Responsive Design */
@media (max-width: 768px) {
    .header-title {
        font-size: 1.8rem;
    }

    .metric-value {
        font-size: 1.5rem;
    }

    .user-message, .bot-message {
        max-width: 95%;
    }
}

/* Sidebar Styling */
.css-1d391kg {
    background: linear-gradient(180deg, #f8f9fa 0%, #e9ecef 100%);
}

/* Button Styling */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 0.5rem 2rem;
    border-radius: 25px;
    font-weight: 600;
    transition: transform 0.2s, box-shadow 0.2s;
}

.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}

.stButton > button:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

/* Progress Bar */
.stProgress > div > div {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
}

/* Tabs */
.stTabs [data-baseweb="tab-list"] {
    gap: 2rem;
    background-color: #f8f9fa;
    padding: 0.5rem;
    border-radius: 30px;
}

.stTabs [data-baseweb="tab"] {
    border-radius: 25px;
    padding: 0.5rem 2rem;
}

/* Tooltip */
.tooltip {
    position: relative;
    display: inline-block;
    cursor: help;
}

.tooltip .tooltiptext {
    visibility: hidden;
    width: 200px;
    background-color: #555;
    color: #fff;
    text-align: center;
    border-radius: 6px;
    padding: 5px;
    position: absolute;
    z-index: 1;
    bottom: 125%;
    left: 50%;
    margin-left: -100px;
    opacity: 0;
    transition: opacity 0.3s;
}

.tooltip:hover .tooltiptext {
    visibility: visible;
    opacity: 1;
}
</style>
""", unsafe_allow_html=True)

# ---------------------------------------
# HEADER WITH ANIMATION
# ---------------------------------------

st.markdown("""
<div class="header-container">
    <div class="header-title">📊 AI Data Analyst Pro</div>
    <div class="header-subtitle">Intelligent Data Analysis & Visualization Platform</div>
</div>
""", unsafe_allow_html=True)

# ---------------------------------------
# SESSION STATE INITIALIZATION
# ---------------------------------------

if "data" not in st.session_state:
    st.session_state.data = None

if "processed_data" not in st.session_state:
    st.session_state.processed_data = None

if "uploaded_file_name" not in st.session_state:
    st.session_state.uploaded_file_name = None

if "upload_error" not in st.session_state:
    st.session_state.upload_error = None

if "data_loaded" not in st.session_state:
    st.session_state.data_loaded = False

if "operation_status" not in st.session_state:
    st.session_state.operation_status = {}

# ---------------------------------------
# HELPER FUNCTIONS
# ---------------------------------------

def safe_dataframe_operation(func, df, *args, **kwargs):
    """Safely execute dataframe operations with error handling"""
    try:
        result = func(df, *args, **kwargs)
        st.session_state.last_successful_operation = func.__name__
        return result, None
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, func.__name__)
        return None, error_msg

def validate_dataset(df):
    """Validate dataset for common issues"""
    issues = []

    if df.empty:
        issues.append("The dataset is empty")

    if df.shape[0] == 0:
        issues.append("No rows in the dataset")

    if df.shape[1] == 0:
        issues.append("No columns in the dataset")

    # Check for memory issues
    memory_usage = df.memory_usage(deep=True).sum() / 1024**3  # GB
    if memory_usage > 1:
        issues.append(f"Large dataset detected ({memory_usage:.2f} GB). Some operations may be slow.")

    # Check for mixed types
    for col in df.columns:
        if df[col].dtype == 'object':
            # Check if column has mixed types
            types = df[col].apply(type).unique()
            if len(types) > 1:
                issues.append(f"Column '{col}' has mixed data types: {types}")

    return issues

def show_validation_warnings(issues):
    """Display validation warnings"""
    if issues:
        st.markdown("""
        <div class="warning-container">
        <strong>⚠️ Data Quality Warnings:</strong><br>
        """ + "<br>".join([f"• {issue}" for issue in issues]) + """
        </div>
        """, unsafe_allow_html=True)

# ---------------------------------------
# SIDEBAR WITH ENHANCED NAVIGATION
# ---------------------------------------

with st.sidebar:
    st.markdown("### 🧭 Navigation")

    # Custom radio buttons styling
    page = st.radio(
        "Select Module",
        ["📤 Upload Dataset", "🛠️ Preprocessing", "🔍 EDA",
         "📈 Visualization", "🤖 Machine Learning", "💡 Insights",
         "💬 Chatbot", "📋 Data Quality", "📐 Statistical Analysis"],
        label_visibility="collapsed"
    )

    st.markdown("---")

    # Dataset info in sidebar
    if st.session_state.data is not None:
        st.markdown("### 📂 Current Dataset")
        df = st.session_state.data
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Rows", f"{df.shape[0]:,}")
        with col2:
            st.metric("Columns", df.shape[1])

        # Show data quality indicator
        missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
        if missing_pct == 0:
            st.success("✅ Data Quality: Excellent")
        elif missing_pct < 5:
            st.info(f"ℹ️ Data Quality: Good ({missing_pct:.1f}% missing)")
        elif missing_pct < 20:
            st.warning(f"⚠️ Data Quality: Fair ({missing_pct:.1f}% missing)")
        else:
            st.error(f"❌ Data Quality: Poor ({missing_pct:.1f}% missing)")

        # Quick actions
        st.markdown("### ⚡ Quick Actions")

        col1, col2 = st.columns(2)
        with col1:
            if st.button("🔄 Reset Data", use_container_width=True):
                st.session_state.data = None
                st.session_state.processed_data = None
                st.session_state.data_loaded = False
                st.rerun()

        with col2:
            if st.button("📥 Download Sample", use_container_width=True):
                # Create sample data download
                sample_df = df.head(100)
                csv = sample_df.to_csv(index=False)
                st.download_button(
                    label="Download Sample",
                    data=csv,
                    file_name="sample_data.csv",
                    mime="text/csv"
                )

        # Show operation history
        if st.session_state.operation_status:
            with st.expander("📋 Operation History"):
                for op, status in st.session_state.operation_status.items():
                    if status == "success":
                        st.success(f"✅ {op}")
                    elif status == "error":
                        st.error(f"❌ {op}")
                    else:
                        st.info(f"⏳ {op}")
    else:
        st.info("👆 Upload a dataset to get started")

# ---------------------------------------
# MAIN CONTENT AREA
# ---------------------------------------

# Map page names to functions
page_map = {
    "📤 Upload Dataset": "upload",
    "🛠️ Preprocessing": "preprocess",
    "🔍 EDA": "eda",
    "📈 Visualization": "visualization",
    "🤖 Machine Learning": "ml",
    "💡 Insights": "insights",
    "💬 Chatbot": "chatbot",
    "📋 Data Quality": "quality",
    "📐 Statistical Analysis": "statistical"
}

current_page = page_map[page]

# ---------------------------------------
# UPLOAD DATASET PAGE
# ---------------------------------------

if current_page == "upload":

    col1, col2, col3 = st.columns([1, 2, 1])

    with col2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.markdown("### 📂 Upload Your Dataset")

        # File uploader with size limit warning
        file = st.file_uploader(
            "Choose a CSV or Excel file",
            type=["csv", "xlsx"],
            help="Maximum recommended file size: 200MB. Larger files may cause performance issues."
        )

        if file:
            try:
                # Check file size
                file_size = file.size / 1024**2  # MB
                if file_size > 200:
                    st.warning(f"⚠️ Large file detected ({file_size:.2f} MB). Processing may be slow.")

                with st.spinner("📂 Loading file..."):
                    # Read file based on extension
                    if file.name.endswith("csv"):
                        # Try different encodings
                        encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
                        df = None

                        for encoding in encodings:
                            try:
                                file.seek(0)  # rewind before each attempt so retries read from the start
                                df = pd.read_csv(file, encoding=encoding)
                                break
                            except UnicodeDecodeError:
                                continue

                        if df is None:
                            st.error("❌ Could not read CSV file with any common encoding.")
                            st.stop()

                    elif file.name.endswith(("xlsx", "xls")):
                        try:
                            df = pd.read_excel(file)
                        except Exception as e:
                            st.error(f"❌ Error reading Excel file: {str(e)}")
                            st.info("💡 Try saving the file as CSV and uploading again.")
                            st.stop()

                    # Validate dataset
                    issues = validate_dataset(df)
                    show_validation_warnings(issues)

                    if not issues or all("Large dataset" not in issue for issue in issues):
                        # Store in session state
                        st.session_state.data = df
                        st.session_state.uploaded_file_name = file.name
                        st.session_state.data_loaded = True
                        st.session_state.upload_error = None

                        # Show success message
                        st.markdown("""
                        <div class="success-container">
                        <strong>✅ Successfully loaded:</strong> {}<br>
                        <strong>📊 Shape:</strong> {} rows × {} columns
                        </div>
                        """.format(file.name, df.shape[0], df.shape[1]), unsafe_allow_html=True)

                        # File statistics
                        st.markdown("### 📊 File Statistics")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Total Rows", f"{df.shape[0]:,}")
                        with col2:
                            st.metric("Total Columns", df.shape[1])
                        with col3:
                            memory = df.memory_usage(deep=True).sum() / 1024**2
                            st.metric("Memory Usage", f"{memory:.2f} MB")

                        # Data preview with scroll
                        st.markdown("### 👁️ Data Preview")
                        st.dataframe(
                            df.head(10),
                            use_container_width=True,
                            height=300
                        )

                        # Column info with sorting
                        st.markdown("### 📋 Column Information")
                        col_info = pd.DataFrame({
                            'Column': df.columns,
                            'Type': df.dtypes.astype(str),
                            'Non-Null Count': df.count().values,
                            'Null Count': df.isnull().sum().values,
                            'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
                            'Unique Values': [df[col].nunique() for col in df.columns]
                        })

                        # Sort by null count
                        col_info = col_info.sort_values('Null %', ascending=False)

                        st.dataframe(
                            col_info.style.background_gradient(subset=['Null %'], cmap='YlOrRd'),
                            use_container_width=True
                        )

                        # Quick stats
                        st.markdown("### 📈 Quick Statistics")

                        numeric_cols = df.select_dtypes(include=['number']).columns
                        if len(numeric_cols) > 0:
                            st.dataframe(
                                df[numeric_cols].describe(),
                                use_container_width=True
                            )

                        # Navigation buttons
                        st.markdown("### 🚀 Next Steps")
                        col1, col2, col3 = st.columns(3)

                        with col1:
                            if st.button("🛠️ Go to Preprocessing", use_container_width=True):
                                st.session_state.page = "🛠️ Preprocessing"
                                st.rerun()

                        with col2:
                            if st.button("📊 Go to EDA", use_container_width=True):
                                st.session_state.page = "📊 EDA"
                                st.rerun()

                        with col3:
                            if st.button("📈 Go to Visualization", use_container_width=True):
                                st.session_state.page = "📈 Visualization"
                                st.rerun()

            except pd.errors.EmptyDataError:
                st.error("❌ The uploaded file is empty. Please upload a file with data.")
            except pd.errors.ParserError as e:
                st.error(f"❌ Error parsing file: {str(e)}")
                st.info("💡 Check if your CSV file has consistent delimiters and quoting.")
            except MemoryError:
                st.error("❌ Out of memory! The file is too large to process.")
                st.info("💡 Try uploading a smaller file or sampling your data first.")
            except Exception as e:
                error_msg = StreamlitExceptionHandler.handle_exception(e, "file upload")
                st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

                # Log error
                st.session_state.error_log.append({
                    'timestamp': datetime.now(),
                    'error': str(e),
                    'traceback': traceback.format_exc()
                })

        st.markdown('</div>', unsafe_allow_html=True)

        # Sample data option
        with st.expander("🔄 Or use sample data"):
            st.markdown("Don't have a dataset? Try our sample data:")

            if st.button("Load Sample Dataset", use_container_width=True):
                try:
                    from utils import create_sample_dataset
                    sample_df = create_sample_dataset()
                    st.session_state.data = sample_df
                    st.session_state.uploaded_file_name = "sample_dataset.csv"
                    st.session_state.data_loaded = True
                    st.success("✅ Sample dataset loaded successfully!")
                    st.rerun()
                except Exception as e:
                    st.error(f"❌ Error loading sample data: {str(e)}")

# ---------------------------------------
# PREPROCESSING PAGE
# ---------------------------------------

elif current_page == "preprocess":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Validate data before preprocessing
            issues = validate_dataset(df)
            if issues:
                show_validation_warnings(issues)

            # Run preprocessing with error handling
            with st.spinner("🔄 Preprocessing data..."):
                processed_df, error = safe_dataframe_operation(preprocess_data, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['Preprocessing'] = 'error'
            else:
                st.session_state.processed_data = processed_df
                st.session_state.operation_status['Preprocessing'] = 'success'

                # Show success message
                st.markdown("""
                <div class="success-container">
                <strong>✅ Preprocessing completed successfully!</strong><br>
                You can now proceed to analysis or visualization.
                </div>
                """, unsafe_allow_html=True)
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "preprocessing")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# EDA PAGE
# ---------------------------------------

elif current_page == "eda":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Validate data
            issues = validate_dataset(df)
            if issues:
                show_validation_warnings(issues)

            # Run EDA with error handling
            with st.spinner("🔍 Performing Exploratory Data Analysis..."):
                result, error = safe_dataframe_operation(eda_analysis, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['EDA'] = 'error'
            else:
                st.session_state.operation_status['EDA'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "EDA")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# VISUALIZATION PAGE
# ---------------------------------------

elif current_page == "visualization":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Validate data
            issues = validate_dataset(df)
            if issues:
                show_validation_warnings(issues)

            # Run visualization with error handling
            with st.spinner("📊 Generating visualizations..."):
                result, error = safe_dataframe_operation(auto_visualizations, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['Visualization'] = 'error'
            else:
                st.session_state.operation_status['Visualization'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "visualization")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# MACHINE LEARNING PAGE
# ---------------------------------------

elif current_page == "ml":
    try:
        if st.session_state.data is not None:
            data_to_use = st.session_state.processed_data if st.session_state.processed_data is not None else st.session_state.data

            # Validate data for ML
            if data_to_use.shape[0] < 10:
                st.warning("⚠️ Dataset too small for machine learning (need at least 10 rows)")
            else:
                # Run ML pipeline with error handling
                with st.spinner("🤖 Running machine learning pipeline..."):
                    result, error = safe_dataframe_operation(run_ml_pipeline, data_to_use)

                if error:
                    st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                    st.session_state.operation_status['ML'] = 'error'
                else:
                    st.session_state.operation_status['ML'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "machine learning")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# INSIGHTS PAGE
# ---------------------------------------

elif current_page == "insights":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Generate insights with error handling
            with st.spinner("💡 Generating business insights..."):
                result, error = safe_dataframe_operation(generate_business_insights, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['Insights'] = 'error'
            else:
                st.session_state.operation_status['Insights'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "insights generation")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# CHATBOT PAGE
# ---------------------------------------

elif current_page == "chatbot":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Run chatbot with error handling
            with st.spinner("🤖 Initializing chatbot..."):
                result, error = safe_dataframe_operation(data_chatbot, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['Chatbot'] = 'error'
            else:
                st.session_state.operation_status['Chatbot'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "chatbot")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# DATA QUALITY PAGE
# ---------------------------------------

elif current_page == "quality":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Run quality report with error handling
            with st.spinner("📋 Generating quality report..."):
                from data_quality import quality_report
                result, error = safe_dataframe_operation(quality_report, df)

            if error:
                st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                st.session_state.operation_status['Data Quality'] = 'error'
            else:
                st.session_state.operation_status['Data Quality'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "data quality")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# STATISTICAL ANALYSIS PAGE
# ---------------------------------------

elif current_page == "statistical":
    try:
        if st.session_state.data is not None:
            df = st.session_state.data

            # Validate numeric data
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) == 0:
                st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
            else:
                # Run statistical analysis with error handling
                with st.spinner("📐 Performing statistical analysis..."):
                    from statistical_analysis import statistical_analysis
                    result, error = safe_dataframe_operation(statistical_analysis, df)

                if error:
                    st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
                    st.session_state.operation_status['Statistical Analysis'] = 'error'
                else:
                    st.session_state.operation_status['Statistical Analysis'] = 'success'
        else:
            st.warning("⚠️ Please upload a dataset first in the Upload section")
    except Exception as e:
        error_msg = StreamlitExceptionHandler.handle_exception(e, "statistical analysis")
        st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)

# ---------------------------------------
# ERROR LOG DISPLAY (Hidden by default)
# ---------------------------------------

if st.session_state.error_log and st.checkbox("🔧 Show Error Log (Debug Mode)"):
    st.markdown("### 📋 Error Log")
    for i, error_entry in enumerate(st.session_state.error_log[-5:]):  # Show last 5 errors
        with st.expander(f"Error {i+1}: {error_entry['timestamp'].strftime('%Y-%m-%d %H:%M:%S')}"):
            st.code(error_entry['error'])
            st.code(error_entry['traceback'])

# ---------------------------------------
# FOOTER
# ---------------------------------------

st.markdown("---")
st.markdown(
    "<p style='text-align: center; color: gray;'>Made with ❤️ using Streamlit | Version 2.0 | Enhanced Error Handling</p>",
    unsafe_allow_html=True
)
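The sample-data path above imports create_sample_dataset from utils.py, which is part of this upload but whose contents are not shown in this diff. A minimal sketch of what such a helper could look like, assuming it takes no required arguments and returns a pandas DataFrame; the column names and distributions are illustrative, not taken from the repository:

# Hypothetical sketch -- utils.py is in this upload, but its contents are not
# shown in this diff, so this is NOT the author's implementation.
import numpy as np
import pandas as pd

def create_sample_dataset(n_rows: int = 200) -> pd.DataFrame:
    """Return a small synthetic DataFrame for demoing the app."""
    rng = np.random.default_rng(42)  # fixed seed for a reproducible demo
    return pd.DataFrame({
        "age": rng.integers(18, 70, n_rows),            # illustrative columns,
        "salary": rng.normal(50_000, 12_000, n_rows),   # not from the repo
        "category": rng.choice(["A", "B", "C"], n_rows),
    })

Any no-argument callable that returns a non-empty DataFrame would satisfy the call site in app.py.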
chatbot.py
ADDED
@@ -0,0 +1,1051 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
from plotly.subplots import make_subplots
|
| 7 |
+
import re
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
|
| 10 |
+
def data_chatbot(df):
|
| 11 |
+
"""
|
| 12 |
+
Advanced chatbot that provides data access and visualizations based on user questions
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
st.markdown("""
|
| 16 |
+
<style>
|
| 17 |
+
.chat-header {
|
| 18 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 19 |
+
padding: 25px;
|
| 20 |
+
border-radius: 15px;
|
| 21 |
+
color: white;
|
| 22 |
+
text-align: center;
|
| 23 |
+
margin-bottom: 25px;
|
| 24 |
+
box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3);
|
| 25 |
+
}
|
| 26 |
+
.chat-header h2 {
|
| 27 |
+
font-size: 2.2rem;
|
| 28 |
+
margin-bottom: 10px;
|
| 29 |
+
}
|
| 30 |
+
.chat-header p {
|
| 31 |
+
font-size: 1.1rem;
|
| 32 |
+
opacity: 0.95;
|
| 33 |
+
}
|
| 34 |
+
.user-message {
|
| 35 |
+
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
|
| 36 |
+
padding: 15px 20px;
|
| 37 |
+
border-radius: 20px 20px 5px 20px;
|
| 38 |
+
margin: 10px 0;
|
| 39 |
+
max-width: 80%;
|
| 40 |
+
margin-left: auto;
|
| 41 |
+
border-left: 4px solid #1976d2;
|
| 42 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
| 43 |
+
}
|
| 44 |
+
.bot-message {
|
| 45 |
+
        background: white;
        padding: 15px 20px;
        border-radius: 20px 20px 20px 5px;
        margin: 10px 0;
        max-width: 80%;
        border-left: 4px solid #4caf50;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    .metric-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        text-align: center;
        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        border-left: 4px solid #667eea;
    }
    .viz-container {
        background: white;
        padding: 20px;
        border-radius: 15px;
        margin: 20px 0;
        box-shadow: 0 5px 20px rgba(0,0,0,0.1);
    }
    .insight-badge {
        background: #4caf50;
        color: white;
        padding: 5px 10px;
        border-radius: 15px;
        font-size: 12px;
        display: inline-block;
        margin-right: 5px;
    }
    </style>

    <div class="chat-header">
        <h2>🤖 Smart Data Assistant</h2>
        <p>Ask questions and get instant visualizations - I'll show you the data!</p>
    </div>
    """, unsafe_allow_html=True)

    # Initialize session state for the chat history and the latest results
    if "chat_messages" not in st.session_state:
        st.session_state.chat_messages = []

    if "last_viz" not in st.session_state:
        st.session_state.last_viz = None

    if "last_data" not in st.session_state:
        st.session_state.last_data = None

    # Main layout: chat on the left, visualizations and data on the right
    main_col, viz_col = st.columns([1, 1])

    with main_col:
        # Chat history
        chat_container = st.container()

        with chat_container:
            if not st.session_state.chat_messages:
                st.info("""
                👋 **Hi! I can show you data and create visualizations. Try asking:**

                **📊 Show Data:**
                • "Show me the first 10 rows"
                • "Show me data where age > 30"
                • "Display top 5 by sales"

                **📈 Create Visualizations:**
                • "Show me a bar chart of category"
                • "Plot histogram of age"
                • "Create scatter plot of price vs quantity"
                • "Show trend of sales over time"

                **🔍 Analyze:**
                • "What's the average of salary?"
                • "Show statistics for all columns"
                • "Find outliers in price"
                """)

            for msg in st.session_state.chat_messages:
                if msg["role"] == "user":
                    st.markdown(f'<div class="user-message"><b>👤 You:</b> {msg["content"]}</div>', unsafe_allow_html=True)
                else:
                    st.markdown(f'<div class="bot-message">{msg["content"]}</div>', unsafe_allow_html=True)

        # Input area (a non-empty label avoids Streamlit's empty-label warning;
        # it stays hidden because label_visibility is collapsed)
        st.markdown("<br>", unsafe_allow_html=True)
        input_col1, input_col2 = st.columns([5, 1])

        with input_col1:
            user_query = st.text_input("Your question",
                                       placeholder="💬 Ask a question or request a visualization...",
                                       key="chat_input", label_visibility="collapsed")

        with input_col2:
            send_button = st.button("📤 Ask", use_container_width=True)

        if send_button and user_query:
            # Add user message
            st.session_state.chat_messages.append({"role": "user", "content": user_query})

            # Process query and get response with data/viz
            with st.spinner("🔍 Processing your request..."):
                response, viz_data, table_data = process_query_with_viz(user_query, df)

            # Add bot response
            st.session_state.chat_messages.append({"role": "bot", "content": response})

            # Store visualization and data for display
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data

            st.rerun()

    with viz_col:
        # Display visualizations and data
        if st.session_state.last_viz:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📊 Generated Visualization")
            display_visualization(st.session_state.last_viz)
            st.markdown('</div>', unsafe_allow_html=True)

        if st.session_state.last_data is not None:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📋 Data Result")
            st.dataframe(st.session_state.last_data, use_container_width=True, height=300)
            st.markdown('</div>', unsafe_allow_html=True)

    # Quick action buttons
    st.markdown("---")
    st.markdown("### 🔍 Quick Actions")

    col1, col2, col3, col4, col5 = st.columns(5)

    actions = [
        ("📊 First 10 Rows", "Show me first 10 rows", col1),
        ("📈 Bar Chart", "Show bar chart of first categorical column", col2),
        ("📉 Histogram", "Plot histogram of first numeric column", col3),
        ("🔎 Filter", "Show rows where value > average", col4),
        ("📋 Statistics", "Show me statistics", col5)
    ]

    for label, query, col in actions:
        if col.button(label, use_container_width=True):
            st.session_state.chat_messages.append({"role": "user", "content": query})
            response, viz_data, table_data = process_query_with_viz(query, df)
            st.session_state.chat_messages.append({"role": "bot", "content": response})
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data
            st.rerun()

    # Clear button
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        if st.button("🗑️ Clear Chat & Visualizations", use_container_width=True):
            st.session_state.chat_messages = []
            st.session_state.last_viz = None
            st.session_state.last_data = None
            st.rerun()


def process_query_with_viz(query, df):
    """Process a query and return (response_text, figure_or_None, table_or_None)."""
    query_lower = query.lower().strip()

    # Get column information
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    all_cols = df.columns.tolist()

    # Extract numbers from the query (default row count: 10)
    numbers = re.findall(r'\d+', query_lower)
    n = int(numbers[0]) if numbers else 10

    # 1. SHOW DATA - First/Last/Random rows
    if any(word in query_lower for word in ['first', 'head', 'top']):
        return show_first_rows(df, n)

    elif any(word in query_lower for word in ['last', 'tail', 'bottom']):
        return show_last_rows(df, n)

    elif 'random' in query_lower or 'sample' in query_lower:
        return show_random_rows(df, n)

    # 2. FILTER DATA (outlier queries skip this branch so that
    # "find outliers in price" reaches detect_outliers below)
    elif any(word in query_lower for word in ['find', 'where', 'filter', 'search', 'with']) \
            and 'outlier' not in query_lower:
        return filter_data(query_lower, df)

    # 3. SORT DATA
    elif 'sort' in query_lower or 'order by' in query_lower:
        return sort_data(query_lower, df)

    # 4. BAR CHART
    elif any(word in query_lower for word in ['bar chart', 'bar plot', 'bar graph', 'count plot']):
        return create_bar_chart(query_lower, df, categorical_cols)

    # 5. HISTOGRAM
    elif any(word in query_lower for word in ['histogram', 'distribution', 'hist', 'frequency']):
        return create_histogram(query_lower, df, numeric_cols)

    # 6. SCATTER PLOT
    elif any(word in query_lower for word in ['scatter', 'scatter plot', 'scatterplot', 'relationship']):
        return create_scatter_plot(query_lower, df, numeric_cols)

    # 7. LINE CHART / TREND
    elif any(word in query_lower for word in ['line chart', 'line plot', 'trend', 'over time']):
        return create_line_chart(query_lower, df, numeric_cols, datetime_cols)

    # 8. BOX PLOT ('outlier(s)' is handled by the dedicated branch below)
    elif any(word in query_lower for word in ['box plot', 'boxplot', 'box']):
        return create_box_plot(query_lower, df, numeric_cols, categorical_cols)

    # 9. PIE CHART
    elif any(word in query_lower for word in ['pie chart', 'pie', 'proportion', 'percentage']):
        return create_pie_chart(query_lower, df, categorical_cols)

    # 10. HEATMAP / CORRELATION
    elif any(word in query_lower for word in ['heatmap', 'correlation', 'corr', 'heat map']):
        return create_heatmap(df, numeric_cols)

    # 11. VIOLIN PLOT
    elif 'violin' in query_lower:
        return create_violin_plot(query_lower, df, numeric_cols, categorical_cols)

    # 12. STATISTICS
    elif any(word in query_lower for word in ['statistics', 'stats', 'describe', 'summary']):
        return show_statistics(query_lower, df, numeric_cols, all_cols)

    # 13. COLUMN INFORMATION
    elif any(word in query_lower for word in ['column info', 'column details', 'info about']):
        return show_column_info(query_lower, df, all_cols)

    # 14. MISSING VALUES
    elif any(word in query_lower for word in ['missing', 'null', 'na', 'empty']):
        return show_missing_values(df)

    # 15. OUTLIERS
    elif 'outlier' in query_lower:
        return detect_outliers(query_lower, df, numeric_cols)

    # 16. UNIQUE VALUES
    elif any(word in query_lower for word in ['unique', 'distinct', 'categories']):
        return show_unique_values(query_lower, df, all_cols, categorical_cols)

    # 17. COMPARE COLUMNS
    elif 'compare' in query_lower:
        return compare_columns(query_lower, df, numeric_cols, categorical_cols)

    # 18. HELP
    elif any(word in query_lower for word in ['help', 'what can you do', 'capabilities']):
        return show_help(), None, None

    # 19. DEFAULT - Try to understand if asking about a specific column
    else:
        return handle_general_query(query_lower, df, numeric_cols, categorical_cols, all_cols)
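

# A minimal usage sketch of the dispatcher above (illustrative only; the toy
# DataFrame and query are made-up examples, not part of the app):
#
#     toy = pd.DataFrame({"age": [25, 41, 37], "city": ["Oslo", "Pune", "Lima"]})
#     text, fig, table = process_query_with_viz("show me first 2 rows", toy)
#     # text  -> markdown response string
#     # fig   -> Plotly figure or None
#     # table -> DataFrame or None, depending on which branch matched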


def show_first_rows(df, n=10):
    """Show first n rows"""
    data = df.head(n)
    response = f"### 👁️ First {n} Rows\n\nHere's the data you requested:"
    return response, None, data


def show_last_rows(df, n=10):
    """Show last n rows"""
    data = df.tail(n)
    response = f"### 👁️ Last {n} Rows\n\nHere's the data you requested:"
    return response, None, data


def show_random_rows(df, n=5):
    """Show random n rows"""
    data = df.sample(min(n, len(df)))
    response = f"### 🎲 Random Sample of {n} Rows\n\nHere's a random sample from your data:"
    return response, None, data


def filter_data(query, df):
    """Filter data based on conditions parsed from the query"""
    # Common (column, operator, value) patterns
    patterns = [
        (r'(\w+)\s*>\s*(\d+\.?\d*)', '>'),
        (r'(\w+)\s*<\s*(\d+\.?\d*)', '<'),
        (r'(\w+)\s*>=\s*(\d+\.?\d*)', '>='),
        (r'(\w+)\s*<=\s*(\d+\.?\d*)', '<='),
        (r'(\w+)\s*=\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*==\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*contains\s*["\']?([^"\']+)["\']?', 'contains'),
        (r'(\w+)\s*is\s*["\']?([^"\']+)["\']?', '=='),
    ]

    for pattern, op in patterns:
        match = re.search(pattern, query.lower())
        if match:
            col = match.group(1)
            val = match.group(2)

            # Find the matching column (case-insensitive)
            for c in df.columns:
                if c.lower() == col:
                    try:
                        if op in ['>', '<', '>=', '<=']:
                            val = float(val)
                            if op == '>':
                                filtered = df[df[c] > val]
                                condition = f"{c} > {val}"
                            elif op == '<':
                                filtered = df[df[c] < val]
                                condition = f"{c} < {val}"
                            elif op == '>=':
                                filtered = df[df[c] >= val]
                                condition = f"{c} >= {val}"
                            elif op == '<=':
                                filtered = df[df[c] <= val]
                                condition = f"{c} <= {val}"
                        elif op == 'contains':
                            filtered = df[df[c].astype(str).str.contains(val, case=False, na=False)]
                            condition = f"{c} contains '{val}'"
                        else:
                            if df[c].dtype in ['int64', 'float64']:
                                filtered = df[df[c] == float(val)]
                            else:
                                filtered = df[df[c].astype(str).str.lower() == val.lower()]
                            condition = f"{c} = {val}"

                        if len(filtered) > 0:
                            response = f"### 🔍 Found {len(filtered)} rows where {condition}\n\nShowing first 20 results:"
                            return response, None, filtered.head(20)
                        else:
                            return f"❌ No rows found where {condition}", None, None
                    except (ValueError, TypeError):
                        # Value could not be coerced to the column's type; try the next pattern
                        pass

    return "❌ I couldn't understand the filter condition. Try something like: 'show rows where age > 30'", None, None
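

# Illustration of how the patterns above tokenize a query (hypothetical input):
#
#     re.search(r'(\w+)\s*>\s*(\d+\.?\d*)', "show rows where age > 30")
#     # -> group(1) == 'age', group(2) == '30',
#     # which filter_data turns into df[df['age'] > 30.0]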


def sort_data(query, df):
    """Sort data by column"""
    # Extract column name; fall back to the first column
    for col in df.columns:
        if col.lower() in query:
            sort_col = col
            break
    else:
        sort_col = df.columns[0] if len(df.columns) > 0 else None

    if not sort_col:
        return "❌ Please specify a column to sort by", None, None

    # Determine order
    if 'desc' in query or 'highest' in query or 'largest' in query:
        ascending = False
        order = "descending"
    else:
        ascending = True
        order = "ascending"

    # Get number (default: 20)
    numbers = re.findall(r'\d+', query)
    n = int(numbers[0]) if numbers else 20

    sorted_df = df.sort_values(sort_col, ascending=ascending).head(n)

    response = f"### 📊 Sorted by {sort_col} ({order})\n\nShowing top {n} results:"
    return response, None, sorted_df


def create_bar_chart(query, df, categorical_cols):
    """Create bar chart for categorical column"""
    # Find requested column, defaulting to the first categorical one
    col = None
    for c in categorical_cols:
        if c.lower() in query:
            col = c
            break

    if not col and categorical_cols:
        col = categorical_cols[0]

    if col:
        value_counts = df[col].value_counts().head(20)

        fig = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f"Bar Chart of {col} (Top 20)",
            labels={'x': col, 'y': 'Count'},
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            xaxis_tickangle=-45,
            height=500
        )

        response = f"### 📊 Bar Chart of '{col}'\n\nHere's the distribution of values:"
        return response, fig, None

    return "❌ No categorical column found for bar chart", None, None


def create_histogram(query, df, numeric_cols):
    """Create histogram for numeric column"""
    # Find requested column, defaulting to the first numeric one
    col = None
    for c in numeric_cols:
        if c.lower() in query:
            col = c
            break

    if not col and numeric_cols:
        col = numeric_cols[0]

    if col:
        fig = px.histogram(
            df,
            x=col,
            nbins=30,
            title=f"Histogram of {col}",
            marginal="box",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        # Add statistics
        data = df[col].dropna()
        stats = f"Mean: {data.mean():.2f} | Median: {data.median():.2f} | Std: {data.std():.2f}"

        response = f"### 📊 Histogram of '{col}'\n\n{stats}"
        return response, fig, None

    return "❌ No numeric column found for histogram", None, None


def create_scatter_plot(query, df, numeric_cols):
    """Create scatter plot between two numeric columns"""
    # Find up to two numeric columns mentioned in the query
    cols = []
    for col in numeric_cols:
        if col.lower() in query:
            cols.append(col)

    if len(cols) >= 2:
        x_col, y_col = cols[0], cols[1]
    elif len(numeric_cols) >= 2:
        x_col, y_col = numeric_cols[0], numeric_cols[1]
    else:
        return "❌ Need at least 2 numeric columns for scatter plot", None, None

    # Note: trendline="ols" requires the statsmodels package to be installed
    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=f"Scatter Plot: {y_col} vs {x_col}",
        trendline="ols",
        opacity=0.6,
        color_discrete_sequence=['#667eea']
    )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(color='#2c3e50'),
        height=500
    )

    # Calculate the Pearson correlation
    corr = df[x_col].corr(df[y_col])

    response = f"### 📊 Scatter Plot: {y_col} vs {x_col}\n\nCorrelation: {corr:.4f}"
    return response, fig, None
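

# For reference, the Pearson coefficient reported above is
#     corr = cov(x, y) / (std(x) * std(y))
# A hand-rolled equivalent of df[x_col].corr(df[y_col]), ignoring NaN handling
# and using a consistent ddof throughout, would be:
#     x, y = df[x_col], df[y_col]
#     ((x - x.mean()) * (y - y.mean())).mean() / (x.std(ddof=0) * y.std(ddof=0))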


def create_line_chart(query, df, numeric_cols, datetime_cols):
    """Create line chart for time series or sequential data"""
    # Find date column
    date_col = None
    for col in datetime_cols:
        if col.lower() in query:
            date_col = col
            break

    if not date_col and datetime_cols:
        date_col = datetime_cols[0]

    # Find value column
    val_col = None
    for col in numeric_cols:
        if col.lower() in query:
            val_col = col
            break

    if not val_col and numeric_cols:
        val_col = numeric_cols[0]

    if date_col and val_col:
        # Sort by date so the line is drawn in chronological order
        plot_df = df[[date_col, val_col]].dropna().sort_values(date_col)

        fig = px.line(
            plot_df,
            x=date_col,
            y=val_col,
            title=f"Trend of {val_col} over Time",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        response = f"### 📈 Line Chart: {val_col} over Time"
        return response, fig, None

    return "❌ Need a datetime column and numeric column for line chart", None, None


def create_box_plot(query, df, numeric_cols, categorical_cols):
    """Create box plot"""
    # Find numeric column
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Find categorical column for grouping
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.box(
                df,
                x=cat_col,
                y=num_col,
                title=f"Box Plot of {num_col} by {cat_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.box(
                df,
                y=num_col,
                title=f"Box Plot of {num_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for box plot", None, None


def create_pie_chart(query, df, categorical_cols):
    """Create pie chart for categorical column"""
    # Find categorical column
    col = None
    for c in categorical_cols:
        if c.lower() in query:
            col = c
            break

    if not col and categorical_cols:
        col = categorical_cols[0]

    if col:
        value_counts = df[col].value_counts().head(10)

        fig = px.pie(
            values=value_counts.values,
            names=value_counts.index,
            title=f"Pie Chart of {col} (Top 10)",
            hole=0.3,
            color_discrete_sequence=px.colors.qualitative.Set3
        )

        fig.update_layout(
            height=500,
            showlegend=True
        )

        response = f"### 🥧 Pie Chart of '{col}'\n\nProportion of values:"
        return response, fig, None

    return "❌ No categorical column found for pie chart", None, None


def create_heatmap(df, numeric_cols):
    """Create correlation heatmap"""
    if len(numeric_cols) < 2:
        return "❌ Need at least 2 numeric columns for correlation heatmap", None, None

    corr_matrix = df[numeric_cols].corr()

    fig = px.imshow(
        corr_matrix,
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title="Correlation Heatmap",
        zmin=-1, zmax=1
    )

    fig.update_layout(
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    response = "### 🔥 Correlation Heatmap\n\nStrong correlations are shown in dark red/blue:"
    return response, fig, None
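

# Note: .corr() above defaults to the Pearson (linear) correlation. For
# monotonic but non-linear relationships, a rank-based alternative is
#     df[numeric_cols].corr(method='spearman')
# (a possible variant, not what the heatmap above uses).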


def create_violin_plot(query, df, numeric_cols, categorical_cols):
    """Create violin plot"""
    # Find numeric column
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Find categorical column for grouping
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.violin(
                df,
                x=cat_col,
                y=num_col,
                title=f"Violin Plot of {num_col} by {cat_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.violin(
                df,
                y=num_col,
                title=f"Violin Plot of {num_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for violin plot", None, None


def show_statistics(query, df, numeric_cols, all_cols):
    """Show statistics for columns"""
    # Check if asking about specific column
    for col in all_cols:
        if col.lower() in query and col in numeric_cols:
            data = df[col].dropna()

            stats_data = pd.DataFrame({
                'Statistic': ['Count', 'Mean', 'Std Dev', 'Min', '25%', '50%', '75%', 'Max', 'Skewness', 'Kurtosis'],
                'Value': [
                    len(data),
                    f"{data.mean():.4f}",
                    f"{data.std():.4f}",
                    f"{data.min():.4f}",
                    f"{data.quantile(0.25):.4f}",
                    f"{data.median():.4f}",
                    f"{data.quantile(0.75):.4f}",
                    f"{data.max():.4f}",
                    f"{data.skew():.4f}",
                    f"{data.kurtosis():.4f}"
                ]
            })

            response = f"### 📊 Statistics for '{col}'"
            return response, None, stats_data

    # General statistics for all numeric columns
    if numeric_cols:
        stats_df = df[numeric_cols].describe().T
        stats_df['skew'] = df[numeric_cols].skew()
        stats_df['kurtosis'] = df[numeric_cols].kurtosis()

        response = "### 📈 Summary Statistics for Numeric Columns"
        return response, None, stats_df

    return "❌ No numeric columns found for statistics", None, None


def show_column_info(query, df, all_cols):
    """Show information about specific column or all columns"""
    # Check if asking about specific column
    for col in all_cols:
        if col.lower() in query:
            info_data = pd.DataFrame({
                'Property': ['Data Type', 'Unique Values', 'Missing Values', 'Missing %', 'Sample Values'],
                'Value': [
                    str(df[col].dtype),
                    df[col].nunique(),
                    df[col].isnull().sum(),
                    f"{(df[col].isnull().sum() / len(df) * 100):.2f}%",
                    str(df[col].dropna().iloc[:3].tolist())
                ]
            })

            response = f"### 📋 Column Information: '{col}'"
            return response, None, info_data

    # General column information
    col_info = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.astype(str),
        'Unique Values': [df[col].nunique() for col in df.columns],
        'Missing Values': df.isnull().sum().values,
        'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
    })

    response = "### 📋 All Columns Information"
    return response, None, col_info


def show_missing_values(df):
    """Show missing values analysis"""
    missing = df.isnull().sum()
    missing = missing[missing > 0]

    if len(missing) == 0:
        return "✅ **Good news!** No missing values found in the dataset.", None, None

    missing_data = pd.DataFrame({
        'Column': missing.index,
        'Missing Count': missing.values,
        'Missing %': (missing.values / len(df) * 100).round(2)
    }).sort_values('Missing %', ascending=False)

    total_missing = missing.sum()
    total_cells = df.shape[0] * df.shape[1]

    response = f"### 🔍 Missing Values Analysis\n\n**Total Missing:** {total_missing} out of {total_cells} cells ({total_missing/total_cells*100:.2f}%)"
    return response, None, missing_data


def detect_outliers(query, df, numeric_cols):
    """Detect outliers in numeric columns"""
    # Check if asking about specific column
    target_cols = []
    for col in numeric_cols:
        if col.lower() in query:
            target_cols.append(col)

    if not target_cols:
        target_cols = numeric_cols[:3]  # Check first 3 numeric columns

    outlier_data = []

    for col in target_cols:
        data = df[col].dropna()
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]

        outlier_data.append({
            'Column': col,
            'Outliers Count': len(outliers),
            'Outliers %': f"{(len(outliers)/len(data)*100):.2f}%",
            'Normal Range': f"[{Q1 - 1.5 * IQR:.4f}, {Q3 + 1.5 * IQR:.4f}]",
            'Severity': 'High' if len(outliers)/len(data)*100 > 10 else 'Medium' if len(outliers)/len(data)*100 > 5 else 'Low'
        })

    outlier_df = pd.DataFrame(outlier_data)

    response = "### ⚠️ Outlier Detection Results"
    return response, None, outlier_df
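

# Worked example of the 1.5*IQR fences used above (numbers are illustrative):
#     data = pd.Series([1, 2, 3, 4, 100])  ->  Q1 = 2.0, Q3 = 4.0, IQR = 2.0
#     fences = [2 - 1.5*2, 4 + 1.5*2] = [-1.0, 7.0], so 100 is flagged.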


def show_unique_values(query, df, all_cols, categorical_cols):
    """Show unique values in columns"""
    # Check if asking about specific column
    for col in all_cols:
        if col.lower() in query:
            value_counts = df[col].value_counts().reset_index()
            value_counts.columns = [col, 'Count']
            value_counts['Percentage'] = (value_counts['Count'] / len(df) * 100).round(2)

            response = f"### 🎯 Unique Values in '{col}'\n\n**Total Unique:** {df[col].nunique()}"
            return response, None, value_counts.head(20)

    # Show for categorical columns
    if categorical_cols:
        unique_data = []
        for col in categorical_cols[:10]:
            unique_data.append({
                'Column': col,
                'Unique Values': df[col].nunique(),
                'Most Common': df[col].value_counts().index[0] if len(df[col].value_counts()) > 0 else 'N/A',
                'Most Common Count': df[col].value_counts().values[0] if len(df[col].value_counts()) > 0 else 0
            })

        unique_df = pd.DataFrame(unique_data)
        response = "### 🎯 Unique Values in Categorical Columns"
        return response, None, unique_df

    return "❌ No categorical columns found", None, None


def compare_columns(query, df, numeric_cols, categorical_cols):
    """Compare two columns"""
    # Find two columns to compare
    cols = []
    for col in df.columns:
        if col.lower() in query:
            cols.append(col)

    if len(cols) >= 2:
        col1, col2 = cols[0], cols[1]

        if col1 in numeric_cols and col2 in numeric_cols:
            # Numeric comparison
            comparison_data = pd.DataFrame({
                'Metric': ['Mean', 'Median', 'Std Dev', 'Min', 'Max'],
                col1: [
                    df[col1].mean(),
                    df[col1].median(),
                    df[col1].std(),
                    df[col1].min(),
                    df[col1].max()
                ],
                col2: [
                    df[col2].mean(),
                    df[col2].median(),
                    df[col2].std(),
                    df[col2].min(),
                    df[col2].max()
                ]
            })

            response = f"### 🔄 Comparison: {col1} vs {col2}"
            return response, None, comparison_data

        elif col1 in categorical_cols and col2 in categorical_cols:
            # Categorical comparison - crosstab
            cross_tab = pd.crosstab(df[col1], df[col2])
            response = f"### 🔄 Cross-tabulation: {col1} vs {col2}"
            return response, None, cross_tab

    return "❌ Please specify two columns to compare", None, None


def show_help():
    """Show help information"""
    help_text = """
    ### 🤖 I Can Help You With:

    **📊 Show Data:**
    • "Show me first 10 rows"
    • "Show me last 5 rows"
    • "Show random sample of 10 rows"
    • "Find rows where age > 30"
    • "Sort by price descending"
    • "Top 5 by sales"

    **📈 Create Visualizations:**
    • "Show bar chart of category"
    • "Plot histogram of age"
    • "Create scatter plot of price vs quantity"
    • "Show line chart of sales over time"
    • "Create box plot of salary"
    • "Show pie chart of region"
    • "Display correlation heatmap"
    • "Create violin plot of price"

    **🔍 Analyze Data:**
    • "Show statistics for all columns"
    • "Tell me about [column name]"
    • "Any missing values?"
    • "Find outliers in price"
    • "Show unique values in category"
    • "Compare age and income"

    **Just ask naturally and I'll show you the data and visualizations!**
    """
    return help_text


def handle_general_query(query, df, numeric_cols, categorical_cols, all_cols):
    """Handle general queries that don't match specific patterns"""

    # Check if asking about a specific column
    for col in all_cols:
        if col.lower() in query:
            if col in numeric_cols:
                data = df[col].dropna()
                return f"**{col}** - Mean: {data.mean():.2f}, Min: {data.min():.2f}, Max: {data.max():.2f}", None, None
            else:
                return f"**{col}** - Unique values: {df[col].nunique()}, Most common: {df[col].value_counts().index[0] if len(df[col].value_counts()) > 0 else 'N/A'}", None, None

    # Check for dataset size
    if 'size' in query or 'large' in query or 'big' in query:
        size_mb = df.memory_usage(deep=True).sum() / 1024**2
        return f"Dataset size: {size_mb:.2f} MB ({df.shape[0]:,} rows × {df.shape[1]} columns)", None, None

    # Default response
    return "❌ I didn't understand. Try asking for data, visualizations, or type 'help'", None, None


def display_visualization(fig):
    """Display the visualization"""
    st.plotly_chart(fig, use_container_width=True)


# Simple version for quick integration
def run_simple_chatbot(df):
    """Simplified chatbot version"""
    st.markdown("### 💬 Simple Data Chat")

    if "simple_msgs" not in st.session_state:
        st.session_state.simple_msgs = []

    # Chat display
    for msg in st.session_state.simple_msgs:
        if msg["role"] == "user":
            st.info(f"👤 {msg['content']}")
        else:
            st.success(f"🤖 {msg['content']}")

    # Input
    user_input = st.text_input("Ask:", key="simple_chat_input")

    if st.button("Send") and user_input:
        st.session_state.simple_msgs.append({"role": "user", "content": user_input})

        # Simple responses
        response = "I don't understand. Try: rows, columns, missing, stats, chart"

        if "row" in user_input.lower():
            response = f"Dataset has {df.shape[0]} rows"
        elif "column" in user_input.lower():
            response = f"Dataset has {df.shape[1]} columns: {', '.join(df.columns[:5])}"
        elif "missing" in user_input.lower():
            missing = df.isnull().sum().sum()
            response = f"Found {missing} missing values" if missing > 0 else "No missing values"
        elif "stat" in user_input.lower():
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                response = f"Mean of {numeric[0]}: {df[numeric[0]].mean():.2f}"
        elif "chart" in user_input.lower() or "plot" in user_input.lower():
            response = "📊 Creating visualization... (check the plot above)"
            # Simple histogram of the first numeric column
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                fig = px.histogram(df, x=numeric[0], title=f"Distribution of {numeric[0]}")
                st.plotly_chart(fig, use_container_width=True)

        st.session_state.simple_msgs.append({"role": "bot", "content": response})
        st.rerun()

    if st.button("Clear Chat"):
        st.session_state.simple_msgs = []
        st.rerun()
data_preprocessing.py
ADDED
@@ -0,0 +1,387 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import IsolationForest
import plotly.express as px
import plotly.graph_objects as go

def preprocess_data(df):

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>⚙️ Data Preprocessing Pipeline</h2>
        <p style='color: gray;'>Clean, transform, and prepare your data for analysis</p>
    </div>
    """, unsafe_allow_html=True)

    # Create tabs for different preprocessing steps
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "📊 Overview", "🧹 Clean Data", "🔄 Transform",
        "📏 Scale & Encode", "📈 Feature Engineering"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Original Rows", df.shape[0])
        with col2:
            st.metric("Original Columns", df.shape[1])
        with col3:
            missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
            st.metric("Missing Data", f"{missing_pct:.1f}%")

        # Data quality before preprocessing
        st.subheader("Data Quality Check")

        quality_df = pd.DataFrame({
            'Column': df.columns,
            'Data Type': df.dtypes,
            'Missing Values': df.isnull().sum(),
            'Missing %': (df.isnull().sum() / len(df) * 100).round(2),
            'Unique Values': [df[col].nunique() for col in df.columns]
        })

        st.dataframe(quality_df, use_container_width=True)

        # Visualize missing values
        if df.isnull().sum().sum() > 0:
            st.subheader("Missing Value Heatmap")
            missing_df = df.isnull().astype(int)
            fig = px.imshow(missing_df.T,
                            color_continuous_scale='reds',
                            aspect="auto",
                            title="Missing Values Pattern")
            st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🧹 Data Cleaning Options")

        # Create a copy for processing
        processed_df = df.copy()

        # Remove duplicates
        st.markdown("### Duplicate Removal")
        duplicates = processed_df.duplicated().sum()
        st.write(f"Duplicate rows found: **{duplicates}**")

        if duplicates > 0:
            if st.button("Remove Duplicates", use_container_width=True):
                processed_df = processed_df.drop_duplicates()
                st.success(f"✅ Removed {duplicates} duplicate rows")

        # Handle missing values (plain assignment instead of chained
        # fillna(inplace=True), which newer pandas versions warn about)
        st.markdown("### Missing Value Handling")

        missing_cols = processed_df.columns[processed_df.isnull().any()].tolist()

        if missing_cols:
            selected_col = st.selectbox("Select column to handle missing values", missing_cols)

            col_type = processed_df[selected_col].dtype

            if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
                method = st.radio(
                    "Choose imputation method",
                    ["Mean", "Median", "Mode", "KNN Imputer", "Drop rows", "Fill with value"]
                )

                if method == "Mean":
                    processed_df[selected_col] = processed_df[selected_col].fillna(processed_df[selected_col].mean())
                elif method == "Median":
                    processed_df[selected_col] = processed_df[selected_col].fillna(processed_df[selected_col].median())
                elif method == "Mode":
                    processed_df[selected_col] = processed_df[selected_col].fillna(processed_df[selected_col].mode()[0])
                elif method == "KNN Imputer":
                    st.info("KNN Imputer will be applied to all numeric columns")
                    if st.button("Apply KNN Imputer"):
                        numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
                        imputer = KNNImputer(n_neighbors=5)
                        processed_df[numeric_cols] = imputer.fit_transform(processed_df[numeric_cols])
                elif method == "Drop rows":
                    if st.button(f"Drop rows with missing values in {selected_col}"):
                        processed_df = processed_df.dropna(subset=[selected_col])
                else:
                    fill_value = st.text_input("Enter fill value")
                    if fill_value:
                        if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
                            processed_df[selected_col] = processed_df[selected_col].fillna(float(fill_value))
                        else:
                            processed_df[selected_col] = processed_df[selected_col].fillna(fill_value)

            else:  # Categorical column
                method = st.radio(
                    "Choose imputation method",
                    ["Mode", "Drop rows", "Fill with value"]
                )

                if method == "Mode":
                    processed_df[selected_col] = processed_df[selected_col].fillna(processed_df[selected_col].mode()[0])
                elif method == "Drop rows":
                    if st.button(f"Drop rows with missing values in {selected_col}"):
                        processed_df = processed_df.dropna(subset=[selected_col])
                else:
                    fill_value = st.text_input("Enter fill value")
                    if fill_value:
                        processed_df[selected_col] = processed_df[selected_col].fillna(fill_value)
        else:
            st.success("✅ No missing values found!")

        # Outlier detection
        st.markdown("### Outlier Detection")
        numeric_cols = processed_df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) > 0:
            selected_num = st.selectbox("Select numeric column for outlier detection", numeric_cols)

            # Calculate IQR
            Q1 = processed_df[selected_num].quantile(0.25)
            Q3 = processed_df[selected_num].quantile(0.75)
            IQR = Q3 - Q1

            outliers = processed_df[
                (processed_df[selected_num] < Q1 - 1.5 * IQR) |
                (processed_df[selected_num] > Q3 + 1.5 * IQR)
            ]

            st.write(f"Outliers detected: **{len(outliers)}** rows")

            if len(outliers) > 0:
                if st.button(f"Remove outliers from {selected_num}"):
                    processed_df = processed_df[
                        (processed_df[selected_num] >= Q1 - 1.5 * IQR) &
                        (processed_df[selected_num] <= Q3 + 1.5 * IQR)
                    ]
                    st.success(f"✅ Removed {len(outliers)} outliers")

        st.markdown('</div>', unsafe_allow_html=True)

    # Update session state
    st.session_state.data = processed_df
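
    # Imputation sketch (illustrative numbers): for a skewed column such as
    #     [1, 2, 3, 1000, NaN]
    # mean-fill inserts 251.5 while median-fill inserts 2.5; the median is
    # usually the safer default when the distribution has heavy outliers.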

    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔄 Data Transformations")

        processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df

        # Column operations
        st.markdown("### Column Operations")

        operation = st.selectbox(
            "Choose operation",
            ["Create new column", "Rename column", "Drop column", "Change data type"]
        )

        if operation == "Create new column":
            col1, col2, col3 = st.columns(3)
            with col1:
                new_col_name = st.text_input("New column name")
            with col2:
                col_to_use = st.selectbox("Based on column", processed_df.columns)
            with col3:
                operation_type = st.selectbox(
                    "Operation",
                    ["Square", "Square Root", "Log", "Absolute", "Round", "Binary encode"]
                )

            # Ask for the threshold up front so it survives the button rerun
            if operation_type == "Binary encode":
                threshold = st.number_input("Threshold for binary encoding")

            if st.button("Create column") and new_col_name:
                if operation_type == "Square":
                    processed_df[new_col_name] = processed_df[col_to_use] ** 2
                elif operation_type == "Square Root":
                    processed_df[new_col_name] = np.sqrt(processed_df[col_to_use])
                elif operation_type == "Log":
                    processed_df[new_col_name] = np.log1p(processed_df[col_to_use])
                elif operation_type == "Absolute":
                    processed_df[new_col_name] = np.abs(processed_df[col_to_use])
                elif operation_type == "Round":
                    processed_df[new_col_name] = np.round(processed_df[col_to_use])
                elif operation_type == "Binary encode":
                    processed_df[new_col_name] = (processed_df[col_to_use] > threshold).astype(int)

                st.success(f"✅ Created column: {new_col_name}")

        elif operation == "Rename column":
            col_to_rename = st.selectbox("Select column to rename", processed_df.columns)
            new_name = st.text_input("New column name")

            if st.button("Rename") and new_name:
                processed_df.rename(columns={col_to_rename: new_name}, inplace=True)
                st.success(f"✅ Renamed {col_to_rename} to {new_name}")

        elif operation == "Drop column":
            cols_to_drop = st.multiselect("Select columns to drop", processed_df.columns)

            if st.button("Drop columns") and cols_to_drop:
                processed_df = processed_df.drop(columns=cols_to_drop)
                st.success(f"✅ Dropped columns: {', '.join(cols_to_drop)}")

        elif operation == "Change data type":
            col_to_change = st.selectbox("Select column", processed_df.columns)
            new_type = st.selectbox(
                "New data type",
                ["int", "float", "str", "datetime", "category"]
            )

            if st.button("Change type"):
                try:
                    if new_type == "int":
                        processed_df[col_to_change] = processed_df[col_to_change].astype(int)
                    elif new_type == "float":
                        processed_df[col_to_change] = processed_df[col_to_change].astype(float)
                    elif new_type == "str":
                        processed_df[col_to_change] = processed_df[col_to_change].astype(str)
                    elif new_type == "datetime":
                        processed_df[col_to_change] = pd.to_datetime(processed_df[col_to_change])
                    elif new_type == "category":
                        processed_df[col_to_change] = processed_df[col_to_change].astype('category')

                    st.success(f"✅ Changed {col_to_change} to {new_type}")
                except Exception as e:
                    st.error(f"Error: {str(e)}")

        st.markdown('</div>', unsafe_allow_html=True)

    # Update session state
    st.session_state.data = processed_df

    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📏 Feature Scaling & Encoding")

        processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### Feature Scaling")
            numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()

            if numeric_cols:
                scale_cols = st.multiselect("Select columns to scale", numeric_cols)
                scale_method = st.radio("Scaling method", ["StandardScaler", "MinMaxScaler"])

                if st.button("Apply Scaling") and scale_cols:
                    if scale_method == "StandardScaler":
                        scaler = StandardScaler()
                    else:
                        scaler = MinMaxScaler()

                    processed_df[scale_cols] = scaler.fit_transform(processed_df[scale_cols])
                    st.success(f"✅ Applied {scale_method} to {len(scale_cols)} columns")

        with col2:
            st.markdown("### Categorical Encoding")
            cat_cols = processed_df.select_dtypes(include=['object', 'category']).columns.tolist()

            if cat_cols:
                encode_cols = st.multiselect("Select columns to encode", cat_cols)
                encode_method = st.radio("Encoding method", ["Label Encoding", "One-Hot Encoding"])

                if st.button("Apply Encoding") and encode_cols:
                    if encode_method == "Label Encoding":
                        for col in encode_cols:
                            le = LabelEncoder()
                            processed_df[col + '_encoded'] = le.fit_transform(processed_df[col])
                        st.success(f"✅ Applied Label Encoding to {len(encode_cols)} columns")
                    else:
                        processed_df = pd.get_dummies(processed_df, columns=encode_cols)
                        st.success(f"✅ Applied One-Hot Encoding to {len(encode_cols)} columns")

        st.markdown('</div>', unsafe_allow_html=True)

    # Update session state
    st.session_state.data = processed_df
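
    # Encoding sketch (assuming a hypothetical 'color' column holding
    # ['red', 'blue', 'red']):
    #     LabelEncoder -> one integer column [1, 0, 1] (classes sorted: blue=0, red=1)
    #     get_dummies  -> two 0/1 columns, 'color_blue' and 'color_red'
    # One-hot avoids implying an order between categories, at the cost of width.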

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Feature Engineering")

        processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df

        # Feature interactions
        st.markdown("### Feature Interactions")
        numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()

        if len(numeric_cols) >= 2:
            col1, col2 = st.columns(2)
            with col1:
                feat1 = st.selectbox("First feature", numeric_cols)
            with col2:
                feat2 = st.selectbox("Second feature", [c for c in numeric_cols if c != feat1])

            interaction_type = st.selectbox(
                "Interaction type",
                ["Multiplication", "Addition", "Subtraction", "Division", "Ratio"]
            )

            new_col_name = st.text_input("New column name", f"{feat1}_{interaction_type}_{feat2}")

            if st.button("Create Interaction Feature"):
                if interaction_type == "Multiplication":
                    processed_df[new_col_name] = processed_df[feat1] * processed_df[feat2]
                elif interaction_type == "Addition":
                    processed_df[new_col_name] = processed_df[feat1] + processed_df[feat2]
                elif interaction_type == "Subtraction":
                    processed_df[new_col_name] = processed_df[feat1] - processed_df[feat2]
                elif interaction_type == "Division":
                    # Small epsilon guards against division by zero
                    processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2] + 1e-8)
                elif interaction_type == "Ratio":
                    processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2].sum() + 1e-8)

                st.success(f"✅ Created feature: {new_col_name}")

        # Binning
        st.markdown("### Feature Binning")
        if numeric_cols:
            bin_col = st.selectbox("Select column for binning", numeric_cols)
            n_bins = st.slider("Number of bins", 2, 20, 5)
            bin_labels = [f"Bin_{i}" for i in range(n_bins)]

            if st.button("Create Binned Feature"):
                processed_df[bin_col + '_binned'] = pd.cut(processed_df[bin_col],
                                                           bins=n_bins,
                                                           labels=bin_labels)
                st.success(f"✅ Created binned feature: {bin_col}_binned")

        st.markdown('</div>', unsafe_allow_html=True)

    # Update session state
    st.session_state.data = processed_df

    # Preview processed data
    st.markdown("---")
    st.subheader("📋 Processed Data Preview")

    data_to_show = st.session_state.data

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Final Rows", data_to_show.shape[0])
    with col2:
        st.metric("Final Columns", data_to_show.shape[1])
    with col3:
        final_missing = data_to_show.isnull().sum().sum()
        st.metric("Remaining Missing", final_missing)

    st.dataframe(data_to_show.head(10), use_container_width=True)

    # Download processed data
    csv = data_to_show.to_csv(index=False)
    st.download_button(
        label="📥 Download Processed Data",
        data=csv,
        file_name="processed_data.csv",
        mime="text/csv",
        use_container_width=True
    )

    return data_to_show
data_quality.py
ADDED
@@ -0,0 +1,252 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest

def quality_report(df):

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📋 Data Quality Report</h2>
        <p style='color: gray;'>Comprehensive data quality assessment</p>
    </div>
    """, unsafe_allow_html=True)

    # Overall quality score
    st.subheader("📊 Overall Data Quality Score")

    # Calculate various quality metrics
    completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
    uniqueness = (1 - df.duplicated().sum() / df.shape[0]) * 100

    # Data type consistency
    type_consistency = 100
    for col in df.columns:
        if df[col].dtype == 'object':
            # Check if column has consistent types
            try:
                pd.to_numeric(df[col], errors='raise')
                # If an object column is fully convertible to numeric, it is probably mistyped
                type_consistency -= 5
            except (ValueError, TypeError):
                pass

    # Outlier impact
    outlier_impact = 100
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
            outlier_pct = len(outliers) / len(df) * 100
            if outlier_pct > 10:
                outlier_impact -= 10

    quality_score = (completeness + uniqueness + type_consistency + outlier_impact) / 4

    # Display gauge
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=quality_score,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Quality Score"},
        gauge={
            'axis': {'range': [None, 100]},
            'bar': {'color': "#2E86AB"},
            'steps': [
                {'range': [0, 50], 'color': "#FF6B6B"},
                {'range': [50, 70], 'color': "#FFD93D"},
                {'range': [70, 85], 'color': "#6BCB77"},
                {'range': [85, 100], 'color': "#4CAF50"}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 90
            }
        }))

    st.plotly_chart(fig, use_container_width=True)

    # Quality metrics cards
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Completeness", f"{completeness:.1f}%",
                  delta=None, delta_color="normal")

    with col2:
        st.metric("Uniqueness", f"{uniqueness:.1f}%",
                  delta=None, delta_color="normal")

    with col3:
        st.metric("Type Consistency", f"{type_consistency:.1f}%",
                  delta=None, delta_color="normal")

    with col4:
        st.metric("Outlier Impact", f"{outlier_impact:.1f}%",
                  delta=None, delta_color="inverse")

    # Detailed quality report
    st.subheader("🔍 Detailed Quality Report")

    quality_df = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes,
        'Missing Count': df.isnull().sum().values,
        'Missing %': (df.isnull().sum().values / len(df) * 100).round(2),
        'Unique Values': [df[col].nunique() for col in df.columns],
        'Unique %': [round((df[col].nunique() / len(df) * 100), 2) for col in df.columns],
        'Duplicate Values?': [df[col].duplicated().any() for col in df.columns]
    })

    # Add outlier info for numeric columns
    outlier_info = []
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
            outlier_info.append(len(outliers))
        else:
            outlier_info.append(0)

    quality_df['Outliers'] = outlier_info

    st.dataframe(quality_df.style.background_gradient(subset=['Missing %', 'Outliers'], cmap='YlOrRd'),
                 use_container_width=True)

    # Visualizations
    st.subheader("📊 Quality Visualizations")

    col1, col2 = st.columns(2)

    with col1:
        # Missing values bar chart
        missing_cols = df.isnull().sum()[df.isnull().sum() > 0]
        if len(missing_cols) > 0:
            fig = px.bar(x=missing_cols.index, y=missing_cols.values,
                         title="Missing Values by Column",
                         labels={'x': 'Column', 'y': 'Missing Count'})
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.success("No missing values found!")

    with col2:
        # Data type distribution
        dtype_counts = df.dtypes.value_counts()
        fig = px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str),
                     title="Data Type Distribution")
        st.plotly_chart(fig, use_container_width=True)

    # Outlier detection with Isolation Forest
    st.subheader("🕵️ Anomaly Detection")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if len(numeric_cols) > 0:
        contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)

        iso_forest = IsolationForest(contamination=contamination, random_state=42)
        outliers = iso_forest.fit_predict(df[numeric_cols].fillna(0))

        n_outliers = (outliers == -1).sum()
        st.write(f"**Detected Anomalies:** {n_outliers} rows ({n_outliers/len(df)*100:.2f}%)")

        # Visualize outliers (if 2 or 3 numeric columns)
        if len(numeric_cols) >= 2:
            df_with_outliers = df[numeric_cols[:3]].copy()
            df_with_outliers['is_outlier'] = outliers

            if len(numeric_cols) == 2:
                fig = px.scatter(df_with_outliers, x=numeric_cols[0], y=numeric_cols[1],
                                 color='is_outlier', title="Anomaly Detection Results",
                                 color_continuous_scale=['blue', 'red'])
                st.plotly_chart(fig, use_container_width=True)
            elif len(numeric_cols) >= 3:
                fig = px.scatter_3d(df_with_outliers, x=numeric_cols[0],
                                    y=numeric_cols[1], z=numeric_cols[2],
                                    color='is_outlier', title="Anomaly Detection Results (3D)",
                                    color_continuous_scale=['blue', 'red'])
                st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No numeric columns available for anomaly detection")

    # Recommendations
    st.subheader("💡 Quality Improvement Recommendations")

    recommendations = []

    # Missing value recommendations
    missing_cols = df.columns[df.isnull().any()].tolist()
    if missing_cols:
        recommendations.append(f"• Handle missing values in {len(missing_cols)} columns: {', '.join(missing_cols[:5])}")

    # Duplicate recommendations
    if df.duplicated().sum() > 0:
        recommendations.append(f"• Remove {df.duplicated().sum()} duplicate rows")

    # Outlier recommendations
    outlier_cols = []
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
        if len(outliers) > len(df) * 0.1:  # More than 10% outliers
            outlier_cols.append(col)

    if outlier_cols:
        recommendations.append(f"• Investigate outliers in: {', '.join(outlier_cols[:3])}")

    # Data type recommendations
    for col in df.columns:
        if df[col].dtype == 'object':
            # Check if column should be numeric
            try:
                pd.to_numeric(df[col].dropna().iloc[:100])
                recommendations.append(f"• Convert '{col}' to numeric type")
            except (ValueError, TypeError):
                pass

    if recommendations:
        for rec in recommendations:
            st.markdown(rec)
    else:
        st.success("✅ Dataset quality looks good! No major issues detected.")

    # Download quality report
    report_text = f"""
DATA QUALITY REPORT
===================

Overall Quality Score: {quality_score:.1f}/100

Metrics:
• Completeness: {completeness:.1f}%
• Uniqueness: {uniqueness:.1f}%
• Type Consistency: {type_consistency:.1f}%
• Outlier Impact: {outlier_impact:.1f}%

Dataset Statistics:
• Rows: {df.shape[0]:,}
• Columns: {df.shape[1]}
• Missing Values: {df.isnull().sum().sum():,}
• Duplicate Rows: {df.duplicated().sum():,}

Recommendations:
{chr(10).join(recommendations)}
"""

    st.download_button(
        label="📥 Download Quality Report",
        data=report_text,
        file_name="data_quality_report.txt",
        mime="text/plain",
        use_container_width=True
    )
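A minimal sketch of the quality-score arithmetic used above, runnable without Streamlit. The sample frame is made up, and the type-consistency and outlier-impact terms are held at their no-penalty value of 100 for this example:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 2, None], "b": ["x", "x", "y", "y"]})

# Same formulas as quality_report: share of non-null cells, share of non-duplicate rows
completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100  # 87.5
uniqueness = (1 - df.duplicated().sum() / df.shape[0]) * 100                      # 100.0
quality_score = (completeness + uniqueness + 100 + 100) / 4
print(f"quality score: {quality_score:.1f}/100")  # -> quality score: 96.9/100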
dataset_overview.py
ADDED
@@ -0,0 +1,1159 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
from plotly.subplots import make_subplots
|
| 7 |
+
|
| 8 |
+
def eda_analysis(df):
|
| 9 |
+
"""
|
| 10 |
+
Comprehensive Exploratory Data Analysis (EDA) with visual insights
|
| 11 |
+
"""
|
| 12 |
+
st.markdown("""
|
| 13 |
+
<div style='text-align: center; margin-bottom: 2rem;'>
|
| 14 |
+
<h2>🔍 Exploratory Data Analysis (EDA)</h2>
|
| 15 |
+
<p style='color: gray;'>Discover patterns, relationships, and insights through visual exploration</p>
|
| 16 |
+
</div>
|
| 17 |
+
""", unsafe_allow_html=True)
|
| 18 |
+
|
| 19 |
+
# Error handling
|
| 20 |
+
if df.empty:
|
| 21 |
+
st.error("❌ The dataset is empty. Please upload a valid dataset.")
|
| 22 |
+
return
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
# Create tabs for different EDA aspects
|
| 26 |
+
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
|
| 27 |
+
"📋 Data Overview",
|
| 28 |
+
"🔍 Missing Data Analysis",
|
| 29 |
+
"📊 Univariate Analysis",
|
| 30 |
+
"🔄 Bivariate Analysis",
|
| 31 |
+
"📈 Multivariate Analysis",
|
| 32 |
+
"🎯 Pattern Discovery"
|
| 33 |
+
])
|
| 34 |
+
|
| 35 |
+
with tab1:
|
| 36 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 37 |
+
st.subheader("📋 Dataset Overview")
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
# Key metrics in cards
|
| 41 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 42 |
+
|
| 43 |
+
with col1:
|
| 44 |
+
st.metric("Total Rows", f"{df.shape[0]:,}")
|
| 45 |
+
with col2:
|
| 46 |
+
st.metric("Total Columns", df.shape[1])
|
| 47 |
+
with col3:
|
| 48 |
+
memory_usage = df.memory_usage(deep=True).sum() / 1024**2
|
| 49 |
+
st.metric("Memory Usage", f"{memory_usage:.2f} MB")
|
| 50 |
+
with col4:
|
| 51 |
+
missing_total = df.isnull().sum().sum()
|
| 52 |
+
st.metric("Missing Values", f"{missing_total:,}")
|
| 53 |
+
|
| 54 |
+
# Data preview with interactive controls
|
| 55 |
+
st.subheader("🔍 Data Preview")
|
| 56 |
+
col1, col2 = st.columns(2)
|
| 57 |
+
with col1:
|
| 58 |
+
preview_rows = st.slider("Number of rows to display", 5, 50, 10, key="preview_rows")
|
| 59 |
+
with col2:
|
| 60 |
+
preview_type = st.radio("Preview type", ["Head", "Tail", "Random Sample"],
|
| 61 |
+
horizontal=True, key="preview_type")
|
| 62 |
+
|
| 63 |
+
if preview_type == "Head":
|
| 64 |
+
st.dataframe(df.head(preview_rows), use_container_width=True)
|
| 65 |
+
elif preview_type == "Tail":
|
| 66 |
+
st.dataframe(df.tail(preview_rows), use_container_width=True)
|
| 67 |
+
else:
|
| 68 |
+
if len(df) > preview_rows:
|
| 69 |
+
st.dataframe(df.sample(preview_rows), use_container_width=True)
|
| 70 |
+
else:
|
| 71 |
+
st.warning("⚠️ Sample size larger than dataset. Showing all rows.")
|
| 72 |
+
st.dataframe(df, use_container_width=True)
|
| 73 |
+
|
| 74 |
+
# Column information with visual indicators
|
| 75 |
+
st.subheader("📋 Column Information")
|
| 76 |
+
|
| 77 |
+
col_info = pd.DataFrame({
|
| 78 |
+
'Column': df.columns,
|
| 79 |
+
'Data Type': df.dtypes.astype(str),
|
| 80 |
+
'Non-Null Count': df.count().values,
|
| 81 |
+
'Null Count': df.isnull().sum().values,
|
| 82 |
+
'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
|
| 83 |
+
'Unique Values': [df[col].nunique() for col in df.columns],
|
| 84 |
+
'Sample Values': [str(df[col].dropna().iloc[:3].tolist()) if len(df[col].dropna()) > 0 else "All null" for col in df.columns]
|
| 85 |
+
})
|
| 86 |
+
|
| 87 |
+
# Add color coding for data types
|
| 88 |
+
def color_data_type(val):
|
| 89 |
+
if 'int' in val or 'float' in val:
|
| 90 |
+
return 'background-color: #e3f2fd'
|
| 91 |
+
elif 'object' in val:
|
| 92 |
+
return 'background-color: #f1f8e9'
|
| 93 |
+
elif 'datetime' in val:
|
| 94 |
+
return 'background-color: #fff3e0'
|
| 95 |
+
return ''
|
| 96 |
+
|
| 97 |
+
st.dataframe(col_info.style.applymap(color_data_type, subset=['Data Type']),
|
| 98 |
+
use_container_width=True)
|
| 99 |
+
|
| 100 |
+
# Data type distribution
|
| 101 |
+
st.subheader("📊 Data Type Distribution")
|
| 102 |
+
|
| 103 |
+
dtype_counts = df.dtypes.value_counts()
|
| 104 |
+
if len(dtype_counts) > 0:
|
| 105 |
+
fig = make_subplots(rows=1, cols=2,
|
| 106 |
+
specs=[[{"type": "pie"}, {"type": "bar"}]],
|
| 107 |
+
subplot_titles=("Pie Chart", "Bar Chart"))
|
| 108 |
+
|
| 109 |
+
fig.add_trace(go.Pie(labels=dtype_counts.index.astype(str),
|
| 110 |
+
values=dtype_counts.values,
|
| 111 |
+
hole=0.3), row=1, col=1)
|
| 112 |
+
|
| 113 |
+
fig.add_trace(go.Bar(x=dtype_counts.index.astype(str),
|
| 114 |
+
y=dtype_counts.values,
|
| 115 |
+
marker_color=['#42a5f5', '#66bb6a', '#ffa726'][:len(dtype_counts)]),
|
| 116 |
+
row=1, col=2)
|
| 117 |
+
|
| 118 |
+
fig.update_layout(height=400, title_text="Column Types Distribution")
|
| 119 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 120 |
+
else:
|
| 121 |
+
st.warning("⚠️ No data type information available")
|
| 122 |
+
|
| 123 |
+
# Dataset statistics
|
| 124 |
+
st.subheader("📈 Dataset Statistics")
|
| 125 |
+
|
| 126 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 127 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
| 128 |
+
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
|
| 129 |
+
bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
|
| 130 |
+
|
| 131 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 132 |
+
with col1:
|
| 133 |
+
st.info(f"**Numeric:** {len(numeric_cols)} columns")
|
| 134 |
+
with col2:
|
| 135 |
+
st.info(f"**Categorical:** {len(categorical_cols)} columns")
|
| 136 |
+
with col3:
|
| 137 |
+
st.info(f"**Datetime:** {len(datetime_cols)} columns")
|
| 138 |
+
with col4:
|
| 139 |
+
st.info(f"**Boolean:** {len(bool_cols)} columns")
|
| 140 |
+
|
| 141 |
+
except Exception as e:
|
| 142 |
+
st.error(f"❌ Error in data overview: {str(e)}")
|
| 143 |
+
st.info("💡 Tip: Check if your dataset contains valid data types")
|
| 144 |
+
|
| 145 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 146 |
+
|
| 147 |
+
with tab2:
|
| 148 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 149 |
+
st.subheader("🔍 Missing Data Analysis")
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
if df.isnull().sum().sum() > 0:
|
| 153 |
+
# Missing data overview
|
| 154 |
+
missing_df = pd.DataFrame({
|
| 155 |
+
'Column': df.columns,
|
| 156 |
+
'Missing Count': df.isnull().sum().values,
|
| 157 |
+
'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
|
| 158 |
+
}).sort_values('Missing %', ascending=False)
|
| 159 |
+
|
| 160 |
+
missing_df = missing_df[missing_df['Missing Count'] > 0]
|
| 161 |
+
|
| 162 |
+
if len(missing_df) > 0:
|
| 163 |
+
# Visualize missing data
|
| 164 |
+
fig = make_subplots(rows=2, cols=2,
|
| 165 |
+
subplot_titles=("Missing Values Heatmap",
|
| 166 |
+
"Missing Values by Column",
|
| 167 |
+
"Missing Data Patterns",
|
| 168 |
+
"Missing Data Matrix"),
|
| 169 |
+
specs=[[{"type": "heatmap"}, {"type": "bar"}],
|
| 170 |
+
[{"type": "scatter"}, {"type": "heatmap"}]])
|
| 171 |
+
|
| 172 |
+
# Heatmap of missing values
|
| 173 |
+
missing_matrix = df.isnull().astype(int).T
|
| 174 |
+
fig.add_trace(go.Heatmap(z=missing_matrix.values,
|
| 175 |
+
y=missing_matrix.index,
|
| 176 |
+
colorscale='Reds',
|
| 177 |
+
showscale=False), row=1, col=1)
|
| 178 |
+
|
| 179 |
+
# Bar chart of missing values
|
| 180 |
+
fig.add_trace(go.Bar(x=missing_df['Column'].head(20),
|
| 181 |
+
y=missing_df['Missing Count'].head(20),
|
| 182 |
+
marker_color='#ef5350',
|
| 183 |
+
name="Missing Count"), row=1, col=2)
|
| 184 |
+
|
| 185 |
+
# Missing data patterns (rows with missing data)
|
| 186 |
+
missing_rows = df[df.isnull().any(axis=1)]
|
| 187 |
+
if len(missing_rows) > 0:
|
| 188 |
+
pattern_df = missing_rows.isnull().sum(axis=1).value_counts().reset_index()
|
| 189 |
+
pattern_df.columns = ['Missing Count per Row', 'Number of Rows']
|
| 190 |
+
pattern_df = pattern_df.sort_values('Missing Count per Row')
|
| 191 |
+
|
| 192 |
+
fig.add_trace(go.Scatter(x=pattern_df['Missing Count per Row'],
|
| 193 |
+
y=pattern_df['Number of Rows'],
|
| 194 |
+
mode='lines+markers',
|
| 195 |
+
name="Patterns"), row=2, col=1)
|
| 196 |
+
|
| 197 |
+
# Missing data matrix for first 50 rows
|
| 198 |
+
sample_missing = df.head(min(50, len(df))).isnull().astype(int).T
|
| 199 |
+
fig.add_trace(go.Heatmap(z=sample_missing.values,
|
| 200 |
+
y=sample_missing.index,
|
| 201 |
+
colorscale='Reds',
|
| 202 |
+
showscale=False,
|
| 203 |
+
name="Matrix"), row=2, col=2)
|
| 204 |
+
|
| 205 |
+
fig.update_layout(height=800, title_text="Missing Data Analysis",
|
| 206 |
+
showlegend=False)
|
| 207 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 208 |
+
|
| 209 |
+
# Detailed missing data table
|
| 210 |
+
st.subheader("📋 Missing Data Details")
|
| 211 |
+
|
| 212 |
+
# Add severity classification
|
| 213 |
+
def classify_severity(pct):
|
| 214 |
+
if pct == 0:
|
| 215 |
+
return "✅ None"
|
| 216 |
+
elif pct < 5:
|
| 217 |
+
return "🟢 Low"
|
| 218 |
+
elif pct < 20:
|
| 219 |
+
return "🟡 Medium"
|
| 220 |
+
else:
|
| 221 |
+
return "🔴 High"
|
| 222 |
+
|
| 223 |
+
missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity)
|
| 224 |
+
missing_df['Recommendation'] = missing_df['Missing %'].apply(
|
| 225 |
+
lambda x: "No action needed" if x == 0 else
|
| 226 |
+
"Consider imputation" if x < 5 else
|
| 227 |
+
"Imputation recommended" if x < 20 else
|
| 228 |
+
"Consider dropping column"
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
st.dataframe(missing_df, use_container_width=True)
|
| 232 |
+
|
| 233 |
+
# Missing data patterns
|
| 234 |
+
if len(missing_df) > 1:
|
| 235 |
+
st.subheader("🔄 Missing Data Patterns")
|
| 236 |
+
|
| 237 |
+
# Find columns with similar missing patterns
|
| 238 |
+
missing_corr = df[missing_df['Column'].tolist()].isnull().corr()
|
| 239 |
+
|
| 240 |
+
if len(missing_corr) > 1:
|
| 241 |
+
fig = px.imshow(missing_corr,
|
| 242 |
+
text_auto=True,
|
| 243 |
+
aspect="auto",
|
| 244 |
+
color_continuous_scale='RdBu_r',
|
| 245 |
+
title="Missing Value Correlation Matrix")
|
| 246 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 247 |
+
|
| 248 |
+
# Find highly correlated missing patterns
|
| 249 |
+
high_corr = []
|
| 250 |
+
for i in range(len(missing_corr.columns)):
|
| 251 |
+
for j in range(i+1, len(missing_corr.columns)):
|
| 252 |
+
if abs(missing_corr.iloc[i, j]) > 0.7:
|
| 253 |
+
high_corr.append({
|
| 254 |
+
'Column 1': missing_corr.columns[i],
|
| 255 |
+
'Column 2': missing_corr.columns[j],
|
| 256 |
+
'Correlation': missing_corr.iloc[i, j]
|
| 257 |
+
})
|
| 258 |
+
|
| 259 |
+
if high_corr:
|
| 260 |
+
st.info("🔍 **Columns with similar missing patterns:**")
|
| 261 |
+
for item in high_corr[:5]: # Show top 5
|
| 262 |
+
st.write(f"• {item['Column 1']} & {item['Column 2']}: {item['Correlation']:.2f}")
|
| 263 |
+
else:
|
| 264 |
+
st.success("✅ No missing values found in the dataset!")
|
| 265 |
+
else:
|
| 266 |
+
st.success("✅ No missing values found in the dataset!")
|
| 267 |
+
|
| 268 |
+
# Show complete data visualization
|
| 269 |
+
fig = go.Figure()
|
| 270 |
+
fig.add_trace(go.Indicator(
|
| 271 |
+
mode="number+gauge",
|
| 272 |
+
value=100,
|
| 273 |
+
title={'text': "Data Completeness"},
|
| 274 |
+
gauge={'axis': {'range': [0, 100]},
|
| 275 |
+
'bar': {'color': "green"},
|
| 276 |
+
'steps': [{'range': [0, 100], 'color': "lightgreen"}]}
|
| 277 |
+
))
|
| 278 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 279 |
+
|
| 280 |
+
except Exception as e:
|
| 281 |
+
st.error(f"❌ Error in missing data analysis: {str(e)}")
|
| 282 |
+
st.info("💡 Tip: Ensure your dataset has valid data for missing value analysis")
|
| 283 |
+
|
| 284 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 285 |
+
|
| 286 |
+
with tab3:
|
| 287 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 288 |
+
st.subheader("📊 Univariate Analysis")
|
| 289 |
+
|
| 290 |
+
try:
|
| 291 |
+
col_type = st.radio("Select column type", ["Numeric", "Categorical", "Datetime"],
|
| 292 |
+
horizontal=True, key="univariate_type")
|
| 293 |
+
|
| 294 |
+
if col_type == "Numeric":
|
| 295 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 296 |
+
if numeric_cols:
|
| 297 |
+
selected_col = st.selectbox("Select numeric column", numeric_cols, key="univariate_num")
|
| 298 |
+
|
| 299 |
+
data = df[selected_col].dropna()
|
| 300 |
+
|
| 301 |
+
if len(data) > 0:
|
| 302 |
+
# Create comprehensive visualization
|
| 303 |
+
fig = make_subplots(rows=2, cols=3,
|
| 304 |
+
subplot_titles=("Histogram", "Box Plot", "Violin Plot",
|
| 305 |
+
"ECDF", "QQ Plot", "Summary Stats"),
|
| 306 |
+
specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
|
| 307 |
+
[{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
|
| 308 |
+
|
| 309 |
+
# Histogram
|
| 310 |
+
fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram",
|
| 311 |
+
marker_color='#42a5f5'), row=1, col=1)
|
| 312 |
+
|
| 313 |
+
# Box plot
|
| 314 |
+
fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers',
|
| 315 |
+
marker_color='#66bb6a'), row=1, col=2)
|
| 316 |
+
|
| 317 |
+
# Violin plot
|
| 318 |
+
fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True,
|
| 319 |
+
line_color='black', fillcolor='#ffa726',
|
| 320 |
+
opacity=0.6), row=1, col=3)
|
| 321 |
+
|
| 322 |
+
# ECDF
|
| 323 |
+
sorted_data = np.sort(data)
|
| 324 |
+
ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)
|
| 325 |
+
fig.add_trace(go.Scatter(x=sorted_data, y=ecdf, mode='lines',
|
| 326 |
+
name="ECDF", line=dict(color='#ab47bc')),
|
| 327 |
+
row=2, col=1)
|
| 328 |
+
|
| 329 |
+
# QQ plot
|
| 330 |
+
theoretical_q = np.random.normal(data.mean(), data.std(), len(data))
|
| 331 |
+
theoretical_q.sort()
|
| 332 |
+
fig.add_trace(go.Scatter(x=theoretical_q, y=sorted_data,
|
| 333 |
+
mode='markers', name="QQ Plot",
|
| 334 |
+
marker=dict(color='#7e57c2', size=3)),
|
| 335 |
+
row=2, col=2)
|
| 336 |
+
|
| 337 |
+
# Add reference line to QQ plot
|
| 338 |
+
min_val = min(theoretical_q.min(), sorted_data.min())
|
| 339 |
+
max_val = max(theoretical_q.max(), sorted_data.max())
|
| 340 |
+
fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
|
| 341 |
+
mode='lines', line=dict(color='red', dash='dash'),
|
| 342 |
+
showlegend=False), row=2, col=2)
|
| 343 |
+
|
| 344 |
+
# Summary statistics as table
|
| 345 |
+
stats_text = f"""
|
| 346 |
+
<b>Summary Statistics</b><br>
|
| 347 |
+
Count: {len(data):,}<br>
|
| 348 |
+
Mean: {data.mean():.4f}<br>
|
| 349 |
+
Std: {data.std():.4f}<br>
|
| 350 |
+
Min: {data.min():.4f}<br>
|
| 351 |
+
Q1: {data.quantile(0.25):.4f}<br>
|
| 352 |
+
Median: {data.median():.4f}<br>
|
| 353 |
+
Q3: {data.quantile(0.75):.4f}<br>
|
| 354 |
+
Max: {data.max():.4f}<br>
|
| 355 |
+
IQR: {data.quantile(0.75) - data.quantile(0.25):.4f}<br>
|
| 356 |
+
Skewness: {data.skew():.4f}<br>
|
| 357 |
+
Kurtosis: {data.kurtosis():.4f}
|
| 358 |
+
"""
|
| 359 |
+
|
| 360 |
+
fig.add_annotation(x=0.5, y=0.5, text=stats_text,
|
| 361 |
+
showarrow=False, font=dict(size=10),
|
| 362 |
+
row=2, col=3, align='left')
|
| 363 |
+
|
| 364 |
+
fig.update_layout(height=800, title_text=f"Univariate Analysis: {selected_col}")
|
| 365 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 366 |
+
|
| 367 |
+
# Outlier detection
|
| 368 |
+
Q1 = data.quantile(0.25)
|
| 369 |
+
Q3 = data.quantile(0.75)
|
| 370 |
+
IQR = Q3 - Q1
|
| 371 |
+
outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
|
| 372 |
+
|
| 373 |
+
col1, col2 = st.columns(2)
|
| 374 |
+
with col1:
|
| 375 |
+
st.metric("Outliers Count", len(outliers))
|
| 376 |
+
with col2:
|
| 377 |
+
st.metric("Outliers %", f"{len(outliers)/len(data)*100:.2f}%")
|
| 378 |
+
|
| 379 |
+
if len(outliers) > 0:
|
| 380 |
+
with st.expander("View outlier values"):
|
| 381 |
+
st.write(outliers.tolist()[:20]) # Show first 20 outliers
|
| 382 |
+
if len(outliers) > 20:
|
| 383 |
+
st.info(f"... and {len(outliers) - 20} more outliers")
|
| 384 |
+
else:
|
| 385 |
+
st.warning("⚠️ No numeric columns available for analysis")
|
| 386 |
+
|
| 387 |
+
elif col_type == "Categorical":
|
| 388 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
| 389 |
+
if categorical_cols:
|
| 390 |
+
selected_col = st.selectbox("Select categorical column", categorical_cols,
|
| 391 |
+
key="univariate_cat")
|
| 392 |
+
|
| 393 |
+
# Get value counts
|
| 394 |
+
value_counts = df[selected_col].value_counts().reset_index()
|
| 395 |
+
value_counts.columns = [selected_col, 'count']
|
| 396 |
+
value_counts['percentage'] = (value_counts['count'] / len(df) * 100).round(2)
|
| 397 |
+
|
| 398 |
+
if len(value_counts) > 0:
|
| 399 |
+
# Create visualizations
|
| 400 |
+
fig = make_subplots(rows=2, cols=2,
|
| 401 |
+
subplot_titles=("Bar Chart (Top 20)", "Pie Chart (Top 10)",
|
| 402 |
+
"Treemap (Top 10)", "Frequency Table"),
|
| 403 |
+
specs=[[{"type": "xy"}, {"type": "domain"}],
|
| 404 |
+
[{"type": "domain"}, {"type": "table"}]])
|
| 405 |
+
|
| 406 |
+
# Bar chart (top 20)
|
| 407 |
+
top20 = value_counts.head(20)
|
| 408 |
+
fig.add_trace(go.Bar(x=top20[selected_col],
|
| 409 |
+
y=top20['count'],
|
| 410 |
+
marker_color='#42a5f5',
|
| 411 |
+
name="Count"), row=1, col=1)
|
| 412 |
+
|
| 413 |
+
# Pie chart (top 10)
|
| 414 |
+
top10 = value_counts.head(10)
|
| 415 |
+
fig.add_trace(go.Pie(labels=top10[selected_col],
|
| 416 |
+
values=top10['count'],
|
| 417 |
+
hole=0.3,
|
| 418 |
+
textinfo='percent+label',
|
| 419 |
+
name="Proportion"), row=1, col=2)
|
| 420 |
+
|
| 421 |
+
# Treemap (top 10)
|
| 422 |
+
fig.add_trace(go.Treemap(labels=top10[selected_col],
|
| 423 |
+
parents=['']*len(top10),
|
| 424 |
+
values=top10['count'],
|
| 425 |
+
textinfo='label+value',
|
| 426 |
+
name="Treemap"), row=2, col=1)
|
| 427 |
+
|
| 428 |
+
# Frequency table (top 10)
|
| 429 |
+
fig.add_trace(go.Table(header=dict(values=[selected_col, 'Count', 'Percentage']),
|
| 430 |
+
cells=dict(values=[top10[selected_col].tolist(),
|
| 431 |
+
top10['count'].tolist(),
|
| 432 |
+
top10['percentage'].tolist()]),
|
| 433 |
+
name="Table"), row=2, col=2)
|
| 434 |
+
|
| 435 |
+
fig.update_layout(height=800, title_text=f"Categorical Analysis: {selected_col}")
|
| 436 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 437 |
+
|
| 438 |
+
# Summary statistics for categorical
|
| 439 |
+
col1, col2, col3 = st.columns(3)
|
| 440 |
+
with col1:
|
| 441 |
+
st.metric("Unique Values", f"{value_counts.shape[0]:,}")
|
| 442 |
+
with col2:
|
| 443 |
+
st.metric("Most Frequent", f"{value_counts.iloc[0, 0]}")
|
| 444 |
+
with col3:
|
| 445 |
+
st.metric("Frequency", f"{value_counts.iloc[0, 1]:,} ({value_counts.iloc[0, 2]}%)")
|
| 446 |
+
|
| 447 |
+
# Cardinality warning
|
| 448 |
+
if value_counts.shape[0] > 50:
|
| 449 |
+
st.warning(f"⚠️ High cardinality detected: {value_counts.shape[0]} unique values. Consider grouping rare categories.")
|
| 450 |
+
else:
|
| 451 |
+
st.warning("⚠️ No categorical columns available for analysis")
|
| 452 |
+
|
| 453 |
+
elif col_type == "Datetime":
|
| 454 |
+
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
|
| 455 |
+
if datetime_cols:
|
| 456 |
+
selected_col = st.selectbox("Select datetime column", datetime_cols,
|
| 457 |
+
key="univariate_datetime")
|
| 458 |
+
|
| 459 |
+
# Extract temporal features
|
| 460 |
+
df_temp = df[selected_col].dropna()
|
| 461 |
+
|
| 462 |
+
if len(df_temp) > 0:
|
| 463 |
+
# Create temporal distributions
|
| 464 |
+
fig = make_subplots(rows=2, cols=2,
|
| 465 |
+
subplot_titles=("Year Distribution", "Month Distribution",
|
| 466 |
+
"Day of Week Distribution", "Hour Distribution"),
|
| 467 |
+
specs=[[{"type": "xy"}, {"type": "xy"}],
|
| 468 |
+
[{"type": "xy"}, {"type": "xy"}]])
|
| 469 |
+
|
| 470 |
+
# Year distribution
|
| 471 |
+
years = df_temp.dt.year.value_counts().sort_index()
|
| 472 |
+
if len(years) > 0:
|
| 473 |
+
fig.add_trace(go.Bar(x=years.index.astype(str), y=years.values,
|
| 474 |
+
marker_color='#42a5f5', name="Year"), row=1, col=1)
|
| 475 |
+
|
| 476 |
+
# Month distribution
|
| 477 |
+
months = df_temp.dt.month.value_counts().sort_index()
|
| 478 |
+
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
|
| 479 |
+
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
| 480 |
+
if len(months) > 0:
|
| 481 |
+
fig.add_trace(go.Bar(x=[month_names[i-1] for i in months.index],
|
| 482 |
+
y=months.values, marker_color='#66bb6a',
|
| 483 |
+
name="Month"), row=1, col=2)
|
| 484 |
+
|
| 485 |
+
# Day of week distribution
|
| 486 |
+
days = df_temp.dt.dayofweek.value_counts().sort_index()
|
| 487 |
+
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
|
| 488 |
+
if len(days) > 0:
|
| 489 |
+
fig.add_trace(go.Bar(x=[day_names[i] for i in days.index],
|
| 490 |
+
y=days.values, marker_color='#ffa726',
|
| 491 |
+
name="Day of Week"), row=2, col=1)
|
| 492 |
+
|
| 493 |
+
# Hour distribution (if time component exists)
|
| 494 |
+
if df_temp.dt.hour.nunique() > 1:
|
| 495 |
+
hours = df_temp.dt.hour.value_counts().sort_index()
|
| 496 |
+
fig.add_trace(go.Bar(x=hours.index.astype(str), y=hours.values,
|
| 497 |
+
marker_color='#ab47bc', name="Hour"), row=2, col=2)
|
| 498 |
+
|
| 499 |
+
fig.update_layout(height=800, title_text=f"Temporal Analysis: {selected_col}")
|
| 500 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 501 |
+
|
| 502 |
+
# Date range information
|
| 503 |
+
col1, col2, col3 = st.columns(3)
|
| 504 |
+
with col1:
|
| 505 |
+
st.metric("Start Date", df_temp.min().strftime('%Y-%m-%d'))
|
| 506 |
+
with col2:
|
| 507 |
+
st.metric("End Date", df_temp.max().strftime('%Y-%m-%d'))
|
| 508 |
+
with col3:
|
| 509 |
+
date_range = (df_temp.max() - df_temp.min()).days
|
| 510 |
+
st.metric("Date Range", f"{date_range} days")
|
| 511 |
+
else:
|
| 512 |
+
st.warning("⚠️ No datetime columns available for analysis")
|
| 513 |
+
|
| 514 |
+
except Exception as e:
|
| 515 |
+
st.error(f"❌ Error in univariate analysis: {str(e)}")
|
| 516 |
+
st.info("💡 Tip: Ensure the selected column contains valid data for analysis")
|
| 517 |
+
|
| 518 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 519 |
+
|
| 520 |
+
with tab4:
|
| 521 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 522 |
+
st.subheader("🔄 Bivariate Analysis")
|
| 523 |
+
|
| 524 |
+
try:
|
| 525 |
+
analysis_type = st.radio("Select analysis type",
|
| 526 |
+
["Numeric vs Numeric", "Numeric vs Categorical",
|
| 527 |
+
"Categorical vs Categorical"],
|
| 528 |
+
horizontal=True, key="bivariate_type")
|
| 529 |
+
|
| 530 |
+
if analysis_type == "Numeric vs Numeric":
|
| 531 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 532 |
+
if len(numeric_cols) >= 2:
|
| 533 |
+
col1, col2 = st.columns(2)
|
| 534 |
+
with col1:
|
| 535 |
+
x_col = st.selectbox("Select X axis", numeric_cols, key="bi_x")
|
| 536 |
+
with col2:
|
| 537 |
+
y_col = st.selectbox("Select Y axis", [c for c in numeric_cols if c != x_col],
|
| 538 |
+
key="bi_y")
|
| 539 |
+
|
| 540 |
+
# Clean data for analysis
|
| 541 |
+
plot_df = df[[x_col, y_col]].dropna()
|
| 542 |
+
|
| 543 |
+
if len(plot_df) > 0:
|
| 544 |
+
# Create comprehensive visualization
|
| 545 |
+
fig = make_subplots(rows=2, cols=3,
|
| 546 |
+
subplot_titles=("Scatter Plot", "Hexbin Plot", "Density Contour",
|
| 547 |
+
"Marginal Distributions", "Residuals", "Statistics"),
|
| 548 |
+
specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
|
| 549 |
+
[{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
|
| 550 |
+
|
| 551 |
+
# Scatter plot with trendline
|
| 552 |
+
fig.add_trace(go.Scatter(x=plot_df[x_col], y=plot_df[y_col],
|
| 553 |
+
mode='markers', name="Scatter",
|
| 554 |
+
marker=dict(size=5, opacity=0.6, color='#42a5f5')),
|
| 555 |
+
row=1, col=1)
|
| 556 |
+
|
| 557 |
+
# Add trendline
|
| 558 |
+
try:
|
| 559 |
+
z = np.polyfit(plot_df[x_col], plot_df[y_col], 1)
|
| 560 |
+
p = np.poly1d(z)
|
| 561 |
+
x_range = np.linspace(plot_df[x_col].min(), plot_df[x_col].max(), 100)
|
| 562 |
+
fig.add_trace(go.Scatter(x=x_range, y=p(x_range),
|
| 563 |
+
mode='lines', name="Trend",
|
| 564 |
+
line=dict(color='red', width=2)), row=1, col=1)
|
| 565 |
+
except:
|
| 566 |
+
pass
|
| 567 |
+
|
| 568 |
+
# Hexbin plot
|
| 569 |
+
fig.add_trace(go.Histogram2d(x=plot_df[x_col], y=plot_df[y_col],
|
| 570 |
+
colorscale='Viridis',
|
| 571 |
+
name="Hexbin"), row=1, col=2)
|
| 572 |
+
|
| 573 |
+
# Density contour
|
| 574 |
+
fig.add_trace(go.Histogram2dContour(x=plot_df[x_col], y=plot_df[y_col],
|
| 575 |
+
colorscale='Viridis',
|
| 576 |
+
name="Contour"), row=1, col=3)
|
| 577 |
+
|
| 578 |
+
# Marginal distributions
|
| 579 |
+
fig.add_trace(go.Histogram(x=plot_df[x_col], name=f"{x_col}",
|
| 580 |
+
marker_color='#66bb6a'), row=2, col=1)
|
| 581 |
+
fig.add_trace(go.Histogram(y=plot_df[y_col], name=f"{y_col}",
|
| 582 |
+
marker_color='#ffa726', orientation='h'),
|
| 583 |
+
row=2, col=1)
|
| 584 |
+
|
| 585 |
+
# Residuals
|
| 586 |
+
try:
|
| 587 |
+
residuals = plot_df[y_col] - p(plot_df[x_col])
|
| 588 |
+
fig.add_trace(go.Scatter(x=plot_df[x_col], y=residuals,
|
| 589 |
+
mode='markers', name="Residuals",
|
| 590 |
+
marker=dict(size=3, opacity=0.5, color='#ab47bc')),
|
| 591 |
+
row=2, col=2)
|
| 592 |
+
fig.add_hline(y=0, line_dash="dash", line_color="red", row=2, col=2)
|
| 593 |
+
except:
|
| 594 |
+
pass
|
| 595 |
+
|
| 596 |
+
# Statistics
|
| 597 |
+
corr = plot_df[x_col].corr(plot_df[y_col])
|
| 598 |
+
stats_text = f"""
|
| 599 |
+
<b>Statistics</b><br>
|
| 600 |
+
Correlation: {corr:.4f}<br>
|
| 601 |
+
R²: {corr**2:.4f}<br>
|
| 602 |
+
Covariance: {plot_df[x_col].cov(plot_df[y_col]):.4f}<br>
|
| 603 |
+
Sample Size: {len(plot_df)}<br>
|
| 604 |
+
"""
|
| 605 |
+
|
| 606 |
+
fig.add_annotation(x=0.5, y=0.5, text=stats_text,
|
| 607 |
+
showarrow=False, font=dict(size=10),
|
| 608 |
+
row=2, col=3, align='left')
|
| 609 |
+
|
| 610 |
+
fig.update_layout(height=800, title_text=f"Bivariate Analysis: {x_col} vs {y_col}")
|
| 611 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 612 |
+
|
| 613 |
+
# Correlation interpretation
|
| 614 |
+
if abs(corr) > 0.7:
|
| 615 |
+
st.success(f"✅ Strong {'positive' if corr > 0 else 'negative'} correlation detected")
|
| 616 |
+
elif abs(corr) > 0.3:
|
| 617 |
+
st.info(f"ℹ️ Moderate {'positive' if corr > 0 else 'negative'} correlation detected")
|
| 618 |
+
else:
|
| 619 |
+
st.warning(f"⚠️ Weak or no correlation detected")
|
| 620 |
+
else:
|
| 621 |
+
st.warning("⚠️ Need at least 2 numeric columns for this analysis")
|
| 622 |
+
|
| 623 |
+
elif analysis_type == "Numeric vs Categorical":
|
| 624 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 625 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
| 626 |
+
|
| 627 |
+
if numeric_cols and categorical_cols:
|
| 628 |
+
col1, col2 = st.columns(2)
|
| 629 |
+
with col1:
|
| 630 |
+
num_col = st.selectbox("Select numeric column", numeric_cols, key="bi_num")
|
| 631 |
+
with col2:
|
| 632 |
+
cat_col = st.selectbox("Select categorical column", categorical_cols, key="bi_cat")
|
| 633 |
+
|
| 634 |
+
# Clean data
|
| 635 |
+
plot_df = df[[num_col, cat_col]].dropna()
|
| 636 |
+
|
| 637 |
+
if len(plot_df) > 0 and plot_df[cat_col].nunique() <= 30:
|
| 638 |
+
# Create visualizations
|
| 639 |
+
fig = make_subplots(rows=2, cols=2,
|
| 640 |
+
subplot_titles=("Box Plot", "Violin Plot",
|
| 641 |
+
"Strip Plot", "Bar Chart (Means ± SD)"),
|
| 642 |
+
specs=[[{"type": "xy"}, {"type": "xy"}],
|
| 643 |
+
[{"type": "xy"}, {"type": "xy"}]])
|
| 644 |
+
|
| 645 |
+
# Box plot
|
| 646 |
+
fig.add_trace(go.Box(x=plot_df[cat_col], y=plot_df[num_col],
|
| 647 |
+
name="Box Plot", marker_color='#42a5f5'), row=1, col=1)
|
| 648 |
+
|
| 649 |
+
# Violin plot
|
| 650 |
+
fig.add_trace(go.Violin(x=plot_df[cat_col], y=plot_df[num_col],
|
| 651 |
+
box_visible=True, line_color='black',
|
| 652 |
+
fillcolor='#66bb6a', opacity=0.6,
|
| 653 |
+
name="Violin Plot"), row=1, col=2)
|
| 654 |
+
|
| 655 |
+
# Strip plot
|
| 656 |
+
fig.add_trace(go.Scatter(x=plot_df[cat_col], y=plot_df[num_col],
|
| 657 |
+
mode='markers', name="Strip Plot",
|
| 658 |
+
marker=dict(size=3, opacity=0.3, color='#ffa726')),
|
| 659 |
+
row=2, col=1)
|
| 660 |
+
|
| 661 |
+
# Bar chart with error bars
|
| 662 |
+
stats_by_cat = plot_df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count']).reset_index()
|
| 663 |
+
stats_by_cat = stats_by_cat.sort_values('mean', ascending=False).head(15)
|
| 664 |
+
|
| 665 |
+
fig.add_trace(go.Bar(x=stats_by_cat[cat_col], y=stats_by_cat['mean'],
|
| 666 |
+
error_y=dict(type='data', array=stats_by_cat['std']),
|
| 667 |
+
name="Mean ± SD", marker_color='#ab47bc'),
|
| 668 |
+
row=2, col=2)
|
| 669 |
+
|
| 670 |
+
fig.update_layout(height=800, title_text=f"{num_col} by {cat_col}")
|
| 671 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 672 |
+
|
| 673 |
+
# ANOVA test for groups with >2 categories
|
| 674 |
+
if plot_df[cat_col].nunique() >= 2:
|
| 675 |
+
groups = [group[num_col].values for name, group in plot_df.groupby(cat_col)]
|
| 676 |
+
if all(len(g) > 0 for g in groups):
|
| 677 |
+
f_stat, p_val = stats.f_oneway(*groups)
|
| 678 |
+
st.write(f"**One-way ANOVA Results:** F-statistic = {f_stat:.4f}, p-value = {p_val:.4f}")
|
| 679 |
+
if p_val < 0.05:
|
| 680 |
+
st.success("✅ Significant differences exist between groups")
|
| 681 |
+
else:
|
| 682 |
+
st.info("ℹ️ No significant differences found between groups")
|
| 683 |
+
elif plot_df[cat_col].nunique() > 30:
|
| 684 |
+
st.warning(f"⚠️ Categorical column has {plot_df[cat_col].nunique()} unique values. Consider grouping or selecting another column.")
|
| 685 |
+
else:
|
| 686 |
+
st.warning("⚠️ Need both numeric and categorical columns for this analysis")
|
| 687 |
+
|
| 688 |
+
elif analysis_type == "Categorical vs Categorical":
|
| 689 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
| 690 |
+
|
| 691 |
+
if len(categorical_cols) >= 2:
|
| 692 |
+
col1, col2 = st.columns(2)
|
| 693 |
+
with col1:
|
| 694 |
+
cat1 = st.selectbox("Select first categorical column", categorical_cols, key="bi_cat1")
|
| 695 |
+
with col2:
|
| 696 |
+
cat2 = st.selectbox("Select second categorical column",
|
| 697 |
+
[c for c in categorical_cols if c != cat1], key="bi_cat2")
|
| 698 |
+
|
| 699 |
+
# Create contingency table
|
| 700 |
+
contingency = pd.crosstab(df[cat1], df[cat2])
|
| 701 |
+
|
| 702 |
+
if contingency.size > 0:
|
| 703 |
+
fig = make_subplots(rows=1, cols=2,
|
| 704 |
+
subplot_titles=("Stacked Bar Chart", "Heatmap"),
|
| 705 |
+
specs=[[{"type": "xy"}, {"type": "heatmap"}]])
|
| 706 |
+
|
| 707 |
+
# Stacked bar chart
|
| 708 |
+
for col in contingency.columns[:10]: # Limit to 10 categories
|
| 709 |
+
fig.add_trace(go.Bar(x=contingency.index[:10], y=contingency[col][:10],
|
| 710 |
+
name=str(col)), row=1, col=1)
|
| 711 |
+
|
| 712 |
+
# Heatmap
|
| 713 |
+
fig.add_trace(go.Heatmap(z=contingency.values[:10, :10],
|
| 714 |
+
x=contingency.columns[:10].astype(str),
|
| 715 |
+
y=contingency.index[:10].astype(str),
|
| 716 |
+
colorscale='Viridis',
|
| 717 |
+
text=contingency.values[:10, :10],
|
| 718 |
+
texttemplate="%{text}"), row=1, col=2)
|
| 719 |
+
|
| 720 |
+
fig.update_layout(height=600, title_text=f"Relationship: {cat1} vs {cat2}",
|
| 721 |
+
barmode='stack')
|
| 722 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 723 |
+
|
| 724 |
+
# Chi-square test
|
| 725 |
+
from scipy.stats import chi2_contingency
|
| 726 |
+
chi2, p_val, dof, expected = chi2_contingency(contingency)
|
| 727 |
+
|
| 728 |
+
st.write(f"**Chi-square Test Results:**")
|
| 729 |
+
st.write(f"χ² = {chi2:.4f}, df = {dof}, p-value = {p_val:.4f}")
|
| 730 |
+
|
| 731 |
+
if p_val < 0.05:
|
| 732 |
+
st.success("✅ Significant association found between variables")
|
| 733 |
+
|
| 734 |
+
# Cramer's V for effect size
|
| 735 |
+
n = contingency.sum().sum()
|
| 736 |
+
cramer_v = np.sqrt(chi2 / (n * (min(contingency.shape) - 1)))
|
| 737 |
+
st.write(f"**Cramer's V (effect size):** {cramer_v:.4f}")
|
| 738 |
+
else:
|
| 739 |
+
st.info("ℹ️ No significant association found")
|
| 740 |
+
else:
|
| 741 |
+
st.warning("⚠️ Need at least 2 categorical columns for this analysis")
|
| 742 |
+
|
| 743 |
+
except Exception as e:
|
| 744 |
+
st.error(f"❌ Error in bivariate analysis: {str(e)}")
|
| 745 |
+
st.info("💡 Tip: Check if selected columns have sufficient data for analysis")
|
| 746 |
+
|
| 747 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 748 |
+
|
| 749 |
+
with tab5:
|
| 750 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 751 |
+
st.subheader("📈 Multivariate Analysis")
|
| 752 |
+
|
| 753 |
+
try:
|
| 754 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 755 |
+
|
| 756 |
+
if len(numeric_cols) >= 3:
|
| 757 |
+
analysis_type = st.radio("Select analysis type",
|
| 758 |
+
["Correlation Matrix", "Parallel Coordinates",
|
| 759 |
+
"3D Scatter", "Radar Chart"],
|
| 760 |
+
horizontal=True, key="multivariate_type")
|
| 761 |
+
|
| 762 |
+
if analysis_type == "Correlation Matrix":
|
| 763 |
+
corr_matrix = df[numeric_cols].corr()
|
| 764 |
+
|
| 765 |
+
fig = px.imshow(corr_matrix,
|
| 766 |
+
text_auto=True,
|
| 767 |
+
aspect="auto",
|
| 768 |
+
color_continuous_scale='RdBu_r',
|
| 769 |
+
title="Correlation Matrix Heatmap",
|
| 770 |
+
zmin=-1, zmax=1)
|
| 771 |
+
|
| 772 |
+
fig.update_layout(height=700)
|
| 773 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 774 |
+
|
| 775 |
+
# Find highly correlated pairs
|
| 776 |
+
high_corr = []
|
| 777 |
+
for i in range(len(numeric_cols)):
|
| 778 |
+
for j in range(i+1, len(numeric_cols)):
|
| 779 |
+
if abs(corr_matrix.iloc[i, j]) > 0.7:
|
| 780 |
+
high_corr.append({
|
| 781 |
+
'Feature 1': numeric_cols[i],
|
| 782 |
+
'Feature 2': numeric_cols[j],
|
| 783 |
+
'Correlation': corr_matrix.iloc[i, j]
|
| 784 |
+
})
|
| 785 |
+
|
| 786 |
+
if high_corr:
|
| 787 |
+
st.subheader("🔍 Highly Correlated Pairs (|r| > 0.7)")
|
| 788 |
+
for item in high_corr:
|
| 789 |
+
st.write(f"• **{item['Feature 1']}** & **{item['Feature 2']}**: {item['Correlation']:.4f}")
|
| 790 |
+
|
| 791 |
+
elif analysis_type == "Parallel Coordinates":
|
| 792 |
+
# Select dimensions
|
| 793 |
+
selected_dims = st.multiselect("Select dimensions (columns)",
|
| 794 |
+
numeric_cols,
|
| 795 |
+
default=numeric_cols[:min(4, len(numeric_cols))])
|
| 796 |
+
|
| 797 |
+
if len(selected_dims) >= 2:
|
| 798 |
+
# Optional color dimension
|
| 799 |
+
color_dim = st.selectbox("Color by", ["None"] + numeric_cols +
|
| 800 |
+
df.select_dtypes(include=['object', 'category']).columns.tolist())
|
| 801 |
+
|
| 802 |
+
plot_df = df[selected_dims].dropna()
|
| 803 |
+
|
| 804 |
+
if len(plot_df) > 0:
|
| 805 |
+
if color_dim == "None":
|
| 806 |
+
fig = px.parallel_coordinates(plot_df,
|
| 807 |
+
dimensions=selected_dims,
|
| 808 |
+
title="Parallel Coordinates Plot")
|
| 809 |
+
else:
|
| 810 |
+
if color_dim in numeric_cols:
|
| 811 |
+
fig = px.parallel_coordinates(plot_df,
|
| 812 |
+
dimensions=selected_dims,
|
| 813 |
+
color=color_dim,
|
| 814 |
+
color_continuous_scale=px.colors.diverging.RdBu,
|
| 815 |
+
title=f"Parallel Coordinates colored by {color_dim}")
|
| 816 |
+
else:
|
| 817 |
+
# Categorical color
|
| 818 |
+
temp_df = df[selected_dims + [color_dim]].dropna()
|
| 819 |
+
fig = px.parallel_coordinates(temp_df,
|
| 820 |
+
dimensions=selected_dims,
|
| 821 |
+
color=color_dim,
|
| 822 |
+
title=f"Parallel Coordinates colored by {color_dim}")
|
| 823 |
+
|
| 824 |
+
fig.update_layout(height=600)
|
| 825 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 826 |
+
|
| 827 |
+
elif analysis_type == "3D Scatter":
|
| 828 |
+
if len(numeric_cols) >= 3:
|
| 829 |
+
col1, col2, col3 = st.columns(3)
|
| 830 |
+
with col1:
|
| 831 |
+
x_3d = st.selectbox("X axis", numeric_cols, key="3d_x")
|
| 832 |
+
with col2:
|
| 833 |
+
y_3d = st.selectbox("Y axis", [c for c in numeric_cols if c != x_3d], key="3d_y")
|
| 834 |
+
with col3:
|
| 835 |
+
z_3d = st.selectbox("Z axis", [c for c in numeric_cols if c not in [x_3d, y_3d]],
|
| 836 |
+
key="3d_z")
|
| 837 |
+
|
| 838 |
+
color_3d = st.selectbox("Color by", ["None"] +
|
| 839 |
+
df.select_dtypes(include=['object', 'category']).columns.tolist())
|
| 840 |
+
|
| 841 |
+
plot_df = df[[x_3d, y_3d, z_3d]].dropna()
|
| 842 |
+
|
| 843 |
+
if len(plot_df) > 0:
|
| 844 |
+
if color_3d == "None":
|
| 845 |
+
fig = px.scatter_3d(plot_df, x=x_3d, y=y_3d, z=z_3d,
|
| 846 |
+
title=f"3D Scatter Plot",
|
| 847 |
+
opacity=0.7)
|
| 848 |
+
else:
|
| 849 |
+
temp_df = df[[x_3d, y_3d, z_3d, color_3d]].dropna()
|
| 850 |
+
fig = px.scatter_3d(temp_df, x=x_3d, y=y_3d, z=z_3d,
|
| 851 |
+
color=color_3d,
|
| 852 |
+
title=f"3D Scatter colored by {color_3d}",
|
| 853 |
+
opacity=0.7)
|
| 854 |
+
|
| 855 |
+
fig.update_layout(height=700)
|
| 856 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 857 |
+
|
| 858 |
+
elif analysis_type == "Radar Chart":
|
| 859 |
+
# Select features for radar
|
| 860 |
+
radar_features = st.multiselect("Select features for radar chart",
|
| 861 |
+
numeric_cols,
|
| 862 |
+
default=numeric_cols[:min(5, len(numeric_cols))])
|
| 863 |
+
|
| 864 |
+
if len(radar_features) >= 3:
|
| 865 |
+
# Select how many samples to show
|
| 866 |
+
n_samples = st.slider("Number of samples to show", 1, min(10, len(df)), 3)
|
| 867 |
+
|
| 868 |
+
fig = go.Figure()
|
| 869 |
+
|
| 870 |
+
for i in range(n_samples):
|
| 871 |
+
sample = df.iloc[i][radar_features].values
|
| 872 |
+
fig.add_trace(go.Scatterpolar(
|
| 873 |
+
r=sample,
|
| 874 |
+
theta=radar_features,
|
| 875 |
+
fill='toself',
|
| 876 |
+
name=f'Sample {i}'
|
| 877 |
+
))
|
| 878 |
+
|
| 879 |
+
fig.update_layout(
|
| 880 |
+
polar=dict(
|
| 881 |
+
radialaxis=dict(
|
| 882 |
+
visible=True,
|
| 883 |
+
range=[df[radar_features].min().min(), df[radar_features].max().max()]
|
| 884 |
+
)),
|
| 885 |
+
title=f"Radar Chart - First {n_samples} Samples",
|
| 886 |
+
height=600
|
| 887 |
+
)
|
| 888 |
+
|
| 889 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 890 |
+
else:
|
| 891 |
+
st.warning("⚠️ Need at least 3 numeric columns for multivariate analysis")
|
| 892 |
+
|
| 893 |
+
except Exception as e:
|
| 894 |
+
st.error(f"❌ Error in multivariate analysis: {str(e)}")
|
| 895 |
+
st.info("💡 Tip: Ensure you have enough numeric columns for multivariate analysis")
|
| 896 |
+
|
| 897 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 898 |
+
|
| 899 |
+
        with tab6:
            st.markdown('<div class="custom-card">', unsafe_allow_html=True)
            st.subheader("🎯 Pattern Discovery")

            try:
                analysis_type = st.radio("Select pattern discovery method",
                                         ["Clustering Visualization", "Outlier Detection",
                                          "Trend Detection", "Seasonal Patterns"],
                                         horizontal=True, key="pattern_type")

                numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

                if analysis_type == "Clustering Visualization":
                    if len(numeric_cols) >= 2:
                        from sklearn.cluster import KMeans
                        from sklearn.preprocessing import StandardScaler

                        # Select features for clustering
                        cluster_features = st.multiselect("Select features for clustering",
                                                          numeric_cols,
                                                          default=numeric_cols[:min(3, len(numeric_cols))])

                        if len(cluster_features) >= 2:
                            n_clusters = st.slider("Number of clusters", 2, 8, 3)

                            # Prepare data
                            X = df[cluster_features].dropna()

                            if len(X) > 0:
                                # Scale data
                                scaler = StandardScaler()
                                X_scaled = scaler.fit_transform(X)

                                # Perform clustering
                                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                                clusters = kmeans.fit_predict(X_scaled)

                                # Create visualization
                                if len(cluster_features) == 2:
                                    fig = px.scatter(x=X[cluster_features[0]], y=X[cluster_features[1]],
                                                     color=clusters.astype(str),
                                                     title=f"K-Means Clustering (k={n_clusters})",
                                                     labels={'x': cluster_features[0], 'y': cluster_features[1],
                                                             'color': 'Cluster'})
                                elif len(cluster_features) >= 3:
                                    fig = px.scatter_3d(x=X[cluster_features[0]], y=X[cluster_features[1]],
                                                        z=X[cluster_features[2]], color=clusters.astype(str),
                                                        title=f"K-Means Clustering (k={n_clusters})",
                                                        labels={cluster_features[0]: cluster_features[0],
                                                                cluster_features[1]: cluster_features[1],
                                                                cluster_features[2]: cluster_features[2],
                                                                'color': 'Cluster'})

                                fig.update_layout(height=600)
                                st.plotly_chart(fig, use_container_width=True)

                                # Cluster statistics
                                st.subheader("📊 Cluster Statistics")
                                X['Cluster'] = clusters
                                cluster_stats = X.groupby('Cluster')[cluster_features].mean()
                                st.dataframe(cluster_stats.style.format("{:.4f}"))

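The slider above leaves k entirely to the user. If a data-driven default were wanted, a silhouette scan over the same range is a common heuristic; a minimal sketch using names already in scope in this branch (KMeans, X_scaled), with silhouette_score as the one extra import:

# Hedged sketch: pick k in [2, 8] with the best silhouette score on the scaled data
from sklearn.metrics import silhouette_score

best_k, best_score = 2, -1.0
for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    if score > best_score:
        best_k, best_score = k, score
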
elif analysis_type == "Outlier Detection":
|
| 962 |
+
if len(numeric_cols) >= 2:
|
| 963 |
+
from sklearn.ensemble import IsolationForest
|
| 964 |
+
|
| 965 |
+
# Select features for outlier detection
|
| 966 |
+
outlier_features = st.multiselect("Select features for outlier detection",
|
| 967 |
+
numeric_cols,
|
| 968 |
+
default=numeric_cols[:min(3, len(numeric_cols))])
|
| 969 |
+
|
| 970 |
+
if len(outlier_features) >= 2:
|
| 971 |
+
contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
|
| 972 |
+
|
| 973 |
+
# Prepare data
|
| 974 |
+
X = df[outlier_features].dropna()
|
| 975 |
+
|
| 976 |
+
if len(X) > 0:
|
| 977 |
+
# Detect outliers
|
| 978 |
+
iso_forest = IsolationForest(contamination=contamination, random_state=42)
|
| 979 |
+
outliers = iso_forest.fit_predict(X)
|
| 980 |
+
|
| 981 |
+
# Create visualization
|
| 982 |
+
if len(outlier_features) == 2:
|
| 983 |
+
fig = px.scatter(x=X[outlier_features[0]], y=X[outlier_features[1]],
|
| 984 |
+
color=outliers,
|
| 985 |
+
color_continuous_scale=['blue', 'red'],
|
| 986 |
+
title=f"Outlier Detection (contamination={contamination})",
|
| 987 |
+
labels={'x': outlier_features[0], 'y': outlier_features[1],
|
| 988 |
+
'color': 'Outlier'})
|
| 989 |
+
elif len(outlier_features) >= 3:
|
| 990 |
+
fig = px.scatter_3d(x=X[outlier_features[0]], y=X[outlier_features[1]],
|
| 991 |
+
z=X[outlier_features[2]], color=outliers,
|
| 992 |
+
color_continuous_scale=['blue', 'red'],
|
| 993 |
+
title=f"Outlier Detection (contamination={contamination})",
|
| 994 |
+
labels={outlier_features[0]: outlier_features[0],
|
| 995 |
+
outlier_features[1]: outlier_features[1],
|
| 996 |
+
outlier_features[2]: outlier_features[2],
|
| 997 |
+
'color': 'Outlier'})
|
| 998 |
+
|
| 999 |
+
fig.update_layout(height=600)
|
| 1000 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1001 |
+
|
| 1002 |
+
# Outlier statistics
|
| 1003 |
+
n_outliers = (outliers == -1).sum()
|
| 1004 |
+
st.write(f"**Outliers detected:** {n_outliers} ({n_outliers/len(X)*100:.2f}%)")
|
| 1005 |
+
|
| 1006 |
+
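One quirk of the scatter above: IsolationForest's fit_predict returns -1 for outliers and 1 for inliers, and passing those integers as color makes Plotly render a continuous color bar. A small sketch that maps them to categorical labels instead (`outlier_labels` is a hypothetical name):

# Hedged sketch: categorical labels give a discrete legend instead of a color bar
outlier_labels = np.where(outliers == -1, 'Outlier', 'Inlier')
# then: px.scatter(..., color=outlier_labels) instead of color=outliers
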
elif analysis_type == "Trend Detection":
|
| 1007 |
+
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
|
| 1008 |
+
|
| 1009 |
+
if datetime_cols and numeric_cols:
|
| 1010 |
+
date_col = st.selectbox("Select date column", datetime_cols)
|
| 1011 |
+
value_col = st.selectbox("Select value column", numeric_cols)
|
| 1012 |
+
|
| 1013 |
+
# Prepare time series data
|
| 1014 |
+
ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
|
| 1015 |
+
|
| 1016 |
+
if len(ts_df) > 10:
|
| 1017 |
+
# Calculate moving averages
|
| 1018 |
+
window = st.slider("Moving average window", 2, 30, 7)
|
| 1019 |
+
ts_df['MA'] = ts_df[value_col].rolling(window=window).mean()
|
| 1020 |
+
|
| 1021 |
+
# Detect trend using linear regression
|
| 1022 |
+
from sklearn.linear_model import LinearRegression
|
| 1023 |
+
|
| 1024 |
+
X = np.arange(len(ts_df)).reshape(-1, 1)
|
| 1025 |
+
y = ts_df[value_col].values
|
| 1026 |
+
|
| 1027 |
+
model = LinearRegression()
|
| 1028 |
+
model.fit(X, y)
|
| 1029 |
+
trend = model.predict(X)
|
| 1030 |
+
|
| 1031 |
+
# Create visualization
|
| 1032 |
+
fig = go.Figure()
|
| 1033 |
+
fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df[value_col],
|
| 1034 |
+
mode='lines', name='Original'))
|
| 1035 |
+
fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df['MA'],
|
| 1036 |
+
mode='lines', name=f'{window}-period MA',
|
| 1037 |
+
line=dict(color='orange')))
|
| 1038 |
+
fig.add_trace(go.Scatter(x=ts_df[date_col], y=trend,
|
| 1039 |
+
mode='lines', name='Linear Trend',
|
| 1040 |
+
line=dict(color='red', dash='dash')))
|
| 1041 |
+
|
| 1042 |
+
fig.update_layout(title="Trend Detection",
|
| 1043 |
+
xaxis_title="Date",
|
| 1044 |
+
yaxis_title=value_col,
|
| 1045 |
+
height=500)
|
| 1046 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1047 |
+
|
| 1048 |
+
# Trend statistics
|
| 1049 |
+
slope = model.coef_[0]
|
| 1050 |
+
st.write(f"**Trend slope:** {slope:.4f} units per time step")
|
| 1051 |
+
if slope > 0:
|
| 1052 |
+
st.success("✅ Upward trend detected")
|
| 1053 |
+
elif slope < 0:
|
| 1054 |
+
st.warning("⚠�� Downward trend detected")
|
| 1055 |
+
else:
|
| 1056 |
+
st.info("ℹ️ No clear trend detected")
|
| 1057 |
+
|
| 1058 |
+
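The slope sign alone does not say whether the trend is distinguishable from noise. A hedged alternative via scipy.stats.linregress, which also returns a p-value — assuming scipy is available in this Space (other modules here import it, but that is an assumption for app.py):

# Hedged sketch: slope, p-value and correlation for the same index-vs-value fit
from scipy import stats

res = stats.linregress(np.arange(len(ts_df)), ts_df[value_col].values)
# res.slope matches model.coef_[0]; res.pvalue < 0.05 suggests a real trend
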
elif analysis_type == "Seasonal Patterns":
|
| 1059 |
+
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
|
| 1060 |
+
|
| 1061 |
+
if datetime_cols and numeric_cols:
|
| 1062 |
+
date_col = st.selectbox("Select date column", datetime_cols, key="seasonal_date")
|
| 1063 |
+
value_col = st.selectbox("Select value column", numeric_cols, key="seasonal_value")
|
| 1064 |
+
|
| 1065 |
+
# Extract seasonal components
|
| 1066 |
+
df_temp = df[[date_col, value_col]].dropna()
|
| 1067 |
+
df_temp['year'] = pd.DatetimeIndex(df_temp[date_col]).year
|
| 1068 |
+
df_temp['month'] = pd.DatetimeIndex(df_temp[date_col]).month
|
| 1069 |
+
df_temp['quarter'] = pd.DatetimeIndex(df_temp[date_col]).quarter
|
| 1070 |
+
df_temp['dayofweek'] = pd.DatetimeIndex(df_temp[date_col]).dayofweek
|
| 1071 |
+
|
| 1072 |
+
# Create seasonal visualizations
|
| 1073 |
+
fig = make_subplots(rows=2, cols=2,
|
| 1074 |
+
subplot_titles=("Year-over-Year", "Monthly Pattern",
|
| 1075 |
+
"Quarterly Pattern", "Day of Week Pattern"),
|
| 1076 |
+
specs=[[{"type": "xy"}, {"type": "xy"}],
|
| 1077 |
+
[{"type": "xy"}, {"type": "xy"}]])
|
| 1078 |
+
|
| 1079 |
+
# Year-over-Year
|
| 1080 |
+
yearly_avg = df_temp.groupby('year')[value_col].mean().reset_index()
|
| 1081 |
+
fig.add_trace(go.Scatter(x=yearly_avg['year'], y=yearly_avg[value_col],
|
| 1082 |
+
mode='lines+markers', name="Yearly Avg"), row=1, col=1)
|
| 1083 |
+
|
| 1084 |
+
# Monthly pattern
|
| 1085 |
+
monthly_avg = df_temp.groupby('month')[value_col].mean().reset_index()
|
| 1086 |
+
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
|
| 1087 |
+
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
| 1088 |
+
fig.add_trace(go.Bar(x=[month_names[m-1] for m in monthly_avg['month']],
|
| 1089 |
+
y=monthly_avg[value_col], name="Monthly Avg"), row=1, col=2)
|
| 1090 |
+
|
| 1091 |
+
# Quarterly pattern
|
| 1092 |
+
quarterly_avg = df_temp.groupby('quarter')[value_col].mean().reset_index()
|
| 1093 |
+
quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
|
| 1094 |
+
fig.add_trace(go.Bar(x=[quarter_names[q-1] for q in quarterly_avg['quarter']],
|
| 1095 |
+
y=quarterly_avg[value_col], name="Quarterly Avg"), row=2, col=1)
|
| 1096 |
+
|
| 1097 |
+
# Day of week pattern
|
| 1098 |
+
dow_avg = df_temp.groupby('dayofweek')[value_col].mean().reset_index()
|
| 1099 |
+
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
|
| 1100 |
+
fig.add_trace(go.Bar(x=[day_names[d] for d in dow_avg['dayofweek']],
|
| 1101 |
+
y=dow_avg[value_col], name="Day of Week Avg"), row=2, col=2)
|
| 1102 |
+
|
| 1103 |
+
fig.update_layout(height=800, title_text="Seasonal Pattern Analysis")
|
| 1104 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1105 |
+
|
| 1106 |
+
except Exception as e:
|
| 1107 |
+
st.error(f"❌ Error in pattern discovery: {str(e)}")
|
| 1108 |
+
st.info("💡 Tip: Ensure you have sufficient data for pattern detection")
|
| 1109 |
+
|
| 1110 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 1111 |
+
|
| 1112 |
+
except Exception as e:
|
| 1113 |
+
st.error(f"❌ Critical error in EDA: {str(e)}")
|
| 1114 |
+
st.info("💡 Please check your dataset and try again")
|
| 1115 |
+
|
| 1116 |
+
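The groupby averages above are a quick approximation of seasonality. For a regular, evenly spaced series, a classical decomposition separates trend and seasonal components more cleanly; a minimal sketch, assuming statsmodels is available (it is not imported anywhere in this file):

# Hedged sketch: classical decomposition; requires an evenly spaced series
# and a known period (e.g. 12 for monthly data)
from statsmodels.tsa.seasonal import seasonal_decompose

series = df_temp.set_index(date_col)[value_col].sort_index()
decomp = seasonal_decompose(series, model='additive', period=12)
# decomp.trend, decomp.seasonal and decomp.resid are Series aligned to the index
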
    # Export options
    st.markdown("---")
    st.markdown("### 📥 Export EDA Report")

    try:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        report_text = f"""
EXPLORATORY DATA ANALYSIS REPORT
=================================

Dataset Information:
• Total Rows: {df.shape[0]:,}
• Total Columns: {df.shape[1]}
• Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB

Column Types:
• Numeric: {len(numeric_cols)}
• Categorical: {len(df.select_dtypes(include=['object', 'category']).columns)}
• Datetime: {len(df.select_dtypes(include=['datetime64']).columns)}

Data Quality:
• Missing Values: {df.isnull().sum().sum():,}
• Complete Cases: {df.dropna().shape[0]:,}
• Duplicate Rows: {df.duplicated().sum():,}

Analysis Performed:
• Data Overview
• Missing Data Analysis
• Univariate Analysis
• Bivariate Analysis
• Multivariate Analysis
• Pattern Discovery
"""

        st.download_button(
            label="📥 Download EDA Report",
            data=report_text,
            file_name="eda_report.txt",
            mime="text/plain",
            use_container_width=True
        )
    except Exception as e:
        st.error(f"❌ Error generating report: {str(e)}")
explainability.py
ADDED
@@ -0,0 +1,176 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import shap

def explain_model(model, X, y=None, feature_names=None):
    """
    Explain model predictions using various techniques
    """
    st.subheader("🔍 Model Explainability")

    if feature_names is None:
        feature_names = X.columns if hasattr(X, 'columns') else [f"Feature {i}" for i in range(X.shape[1])]

    # Create tabs for different explanation methods
    tab1, tab2, tab3 = st.tabs(["Feature Importance", "SHAP Values", "Partial Dependence"])

    with tab1:
        st.markdown("### 📊 Feature Importance")

        # Method selection
        method = st.radio(
            "Importance method",
            ["Built-in", "Permutation"],
            horizontal=True
        )

        if method == "Built-in":
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': importance
                }).sort_values('importance', ascending=False)

                fig = px.bar(importance_df.head(20), x='importance', y='feature',
                             orientation='h', title="Feature Importance (Built-in)")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Model doesn't have built-in feature importance")

        else:  # Permutation importance
            if y is not None:
                with st.spinner("Calculating permutation importance..."):
                    perm_importance = permutation_importance(model, X, y, n_repeats=10)

                    importance_df = pd.DataFrame({
                        'feature': feature_names,
                        'importance': perm_importance.importances_mean,
                        'std': perm_importance.importances_std
                    }).sort_values('importance', ascending=False)

                    fig = go.Figure()
                    fig.add_trace(go.Bar(
                        x=importance_df['importance'].head(20),
                        y=importance_df['feature'].head(20),
                        orientation='h',
                        error_x=dict(
                            type='data',
                            array=importance_df['std'].head(20),
                            visible=True
                        )
                    ))
                    fig.update_layout(title="Permutation Importance (with error bars)",
                                      xaxis_title="Importance")
                    st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need target values for permutation importance")

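permutation_importance above uses default scoring and no fixed seed, so repeated runs shuffle differently and the ranking can wobble. A hedged variant pinning the seed and parallelizing, using only documented arguments of the same function:

# Hedged sketch: reproducible, parallel permutation importance
perm_importance = permutation_importance(
    model, X, y, n_repeats=10, random_state=42, n_jobs=-1
)
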
    with tab2:
        st.markdown("### 📈 SHAP Values")

        if hasattr(model, 'predict'):
            with st.spinner("Calculating SHAP values (this may take a moment)..."):
                try:
                    # Create explainer based on model type
                    if str(type(model)).find('sklearn') != -1:
                        explainer = shap.Explainer(model, X[:100])  # Use subset for speed
                    else:
                        explainer = shap.TreeExplainer(model) if hasattr(model, 'feature_importances_') else shap.Explainer(model, X[:100])

                    # Calculate SHAP values
                    shap_values = explainer(X[:100])  # Limit to 100 samples for performance

                    # Summary plot
                    st.markdown("#### SHAP Summary Plot")
                    fig, ax = plt.subplots()
                    shap.summary_plot(shap_values, X[:100], feature_names=feature_names, show=False)
                    st.pyplot(fig)
                    plt.close()

                    # Waterfall plot for a single prediction
                    st.markdown("#### Single Prediction Explanation")
                    sample_idx = st.slider("Select sample index", 0, min(99, len(X)-1), 0)

                    fig, ax = plt.subplots()
                    shap.waterfall_plot(shap_values[sample_idx], show=False)
                    st.pyplot(fig)
                    plt.close()

                except Exception as e:
                    st.error(f"Error calculating SHAP values: {str(e)}")
                    st.info("Try using a smaller sample or a different model type")
        else:
            st.warning("Model doesn't support prediction")

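X[:100] above takes the first 100 rows, which can be unrepresentative if the data is sorted. shap ships a sample utility for drawing a random background set; a minimal sketch keeping the same 100-row budget (treat the exact signature as an assumption to verify against the installed shap version):

# Hedged sketch: random background/sample instead of the first 100 rows
background = shap.sample(X, 100, random_state=42)
explainer = shap.Explainer(model, background)
shap_values = explainer(background)
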
    with tab3:
        st.markdown("### 📉 Partial Dependence Plots")

        if hasattr(model, 'predict') and len(feature_names) > 0:
            from sklearn.inspection import partial_dependence

            selected_feature = st.selectbox("Select feature for PDP", feature_names)

            if selected_feature:
                feature_idx = list(feature_names).index(selected_feature)

                # Calculate partial dependence
                pdp = partial_dependence(model, X, [feature_idx], grid_resolution=50)

                # Create plot
                fig = go.Figure()
                fig.add_trace(go.Scatter(
                    x=pdp['values'][0],
                    y=pdp['average'][0],
                    mode='lines+markers',
                    name='Partial Dependence'
                ))

                fig.update_layout(
                    title=f"Partial Dependence Plot for {selected_feature}",
                    xaxis_title=selected_feature,
                    yaxis_title="Prediction"
                )

                st.plotly_chart(fig, use_container_width=True)

                # Individual conditional expectation (ICE) plots
                if st.checkbox("Show ICE plots"):
                    # Sweep each sample's feature value across the PDP grid so the
                    # ICE curves share the x-axis with the average PDP line
                    grid = pdp['values'][0]
                    ice_data = []
                    for i in range(min(10, X.shape[0])):  # Show up to 10 lines
                        row = X.iloc[[i]].copy()
                        preds = []
                        for v in grid:
                            row.iloc[0, feature_idx] = v
                            preds.append(model.predict(row)[0])
                        ice_data.append(preds)

                    fig = go.Figure()
                    for i, ice in enumerate(ice_data):
                        fig.add_trace(go.Scatter(
                            x=grid,
                            y=ice,
                            mode='lines',
                            name=f'Sample {i}',
                            line=dict(width=1, color='lightgray')
                        ))

                    # Add average line
                    fig.add_trace(go.Scatter(
                        x=grid,
                        y=pdp['average'][0],
                        mode='lines',
                        name='Average',
                        line=dict(width=3, color='red')
                    ))

                    fig.update_layout(
                        title=f"ICE Plots for {selected_feature}",
                        xaxis_title=selected_feature,
                        yaxis_title="Prediction"
                    )

                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("Need more features for partial dependence plots")
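For reference, a hedged usage sketch of this module from inside a Streamlit page — the iris dataset and RandomForest below are placeholders, not part of the upload, and explain_model must run within a Streamlit script since it calls st.* directly:

# Hedged usage sketch for explain_model()
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from explainability import explain_model

data = load_iris(as_frame=True)
X, y = data.data, data.target
model = RandomForestClassifier(random_state=42).fit(X, y)
explain_model(model, X, y=y, feature_names=X.columns)  # renders the three tabs
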
insights.py
ADDED
@@ -0,0 +1,369 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

def generate_business_insights(df):

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>💡 Automated Business Insights</h2>
        <p style='color: gray;'>AI-powered analysis to uncover hidden patterns and opportunities</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Create tabs for different insight categories
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "📊 Overview", "📈 Trends & Patterns", "🎯 Key Drivers",
        "⚠️ Anomalies", "💡 Recommendations"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Dataset Overview")

        # Key metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Records", f"{df.shape[0]:,}")
        with col2:
            st.metric("Total Features", df.shape[1])
        with col3:
            completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
            st.metric("Data Completeness", f"{completeness:.1f}%")
        with col4:
            if numeric_cols:
                total_value = df[numeric_cols].sum().sum()
                st.metric("Total Value", f"{total_value:,.0f}" if total_value < 1e6 else f"{total_value/1e6:,.1f}M")

        # Column composition
        st.markdown("### 📋 Column Composition")

        comp_data = {
            'Type': ['Numeric', 'Categorical', 'Datetime'],
            'Count': [len(numeric_cols), len(categorical_cols), len(datetime_cols)]
        }

        fig = px.pie(comp_data, values='Count', names='Type',
                     title="Column Type Distribution",
                     color_discrete_sequence=px.colors.qualitative.Set3)
        st.plotly_chart(fig, use_container_width=True)

        # Data quality score
        st.markdown("### 📊 Data Quality Score")

        quality_score = 0
        quality_metrics = []

        # Completeness score
        completeness_score = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
        quality_metrics.append(completeness_score)

        # Uniqueness score (avoid duplicates)
        duplicate_pct = (df.duplicated().sum() / df.shape[0]) * 100
        uniqueness_score = 100 - duplicate_pct
        quality_metrics.append(uniqueness_score)

        # Consistency score (data type consistency)
        type_consistency = 100  # Default high
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if column has mixed types
                try:
                    pd.to_numeric(df[col], errors='raise')
                    # If convertible to numeric, it's consistent
                except Exception:
                    pass  # Object type is fine
            else:
                # Numeric columns are consistent
                pass
        quality_metrics.append(type_consistency)

        # Average quality score
        avg_quality = np.mean(quality_metrics)

        # Display gauge chart
        fig = go.Figure(go.Indicator(
            mode = "gauge+number",
            value = avg_quality,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "Overall Data Quality"},
            gauge = {
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "gray"},
                    {'range': [80, 100], 'color': "lightgreen"}],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90}}))

        st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

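As written, the consistency loop above never lowers type_consistency, so that component of the quality score is always 100. A hedged sketch of a check that actually penalizes object columns where only part of the values parse as numbers (mixed types):

# Hedged sketch: penalize object columns that are partially numeric
mixed_cols = 0
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    parsed = pd.to_numeric(df[col], errors='coerce')
    frac_numeric = parsed.notna().mean()
    if 0 < frac_numeric < 1:  # some values numeric, some not -> inconsistent
        mixed_cols += 1
type_consistency = 100 * (1 - mixed_cols / max(len(df.columns), 1))
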
    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Trends & Patterns")

        if len(numeric_cols) > 0:
            # Correlation analysis
            if len(numeric_cols) >= 2:
                st.markdown("### 🔗 Key Relationships")

                corr_matrix = df[numeric_cols].corr()

                # Find strongest correlations
                corr_pairs = []
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        corr_pairs.append({
                            'feature1': numeric_cols[i],
                            'feature2': numeric_cols[j],
                            'correlation': corr_matrix.iloc[i, j]
                        })

                corr_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)

                # Display top 5 correlations
                for pair in corr_pairs[:5]:
                    strength = abs(pair['correlation'])
                    if strength > 0.7:
                        emoji = "🟢"
                        desc = "Strong"
                    elif strength > 0.3:
                        emoji = "🟡"
                        desc = "Moderate"
                    else:
                        emoji = "🔴"
                        desc = "Weak"

                    direction = "positive" if pair['correlation'] > 0 else "negative"

                    st.markdown(
                        f"{emoji} **{pair['feature1']}** & **{pair['feature2']}**: "
                        f"{pair['correlation']:.3f} ({desc} {direction} correlation)"
                    )

                # Insight
                if corr_pairs:
                    st.info(f"💡 **Insight**: {corr_pairs[0]['feature1']} and {corr_pairs[0]['feature2']} "
                            f"have the strongest {'positive' if corr_pairs[0]['correlation'] > 0 else 'negative'} "
                            f"relationship in the dataset.")

            # Distribution insights
            st.markdown("### 📊 Distribution Analysis")

            skewness = df[numeric_cols].skew()
            skewed_cols = skewness[abs(skewness) > 1].index.tolist()

            if skewed_cols:
                st.warning(f"⚠️ **Skewed Features**: {', '.join(skewed_cols[:3])}" +
                           (" and more" if len(skewed_cols) > 3 else ""))
                st.markdown("💡 These features might benefit from transformation for better model performance.")

                # Show distribution of most skewed feature
                col_to_show = skewed_cols[0]
                fig = px.histogram(df, x=col_to_show, nbins=30,
                                   title=f"Distribution of {col_to_show} (Most Skewed)",
                                   marginal="box")
                st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No numeric columns available for trend analysis")

        st.markdown('</div>', unsafe_allow_html=True)

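Where the tab flags skewed features it stops at a warning. A minimal sketch of the transformation it alludes to, for non-negative columns (np is already imported in this file; the _log1p suffix is a hypothetical naming choice):

# Hedged sketch: log1p reduces right skew for non-negative features
for col in skewed_cols:
    if df[col].min() >= 0:
        df[f"{col}_log1p"] = np.log1p(df[col])  # new column, original kept
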
    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎯 Key Business Drivers")

        if len(numeric_cols) > 0:
            # Find features with highest variance (potential impact)
            variances = df[numeric_cols].var().sort_values(ascending=False)

            st.markdown("### 📊 High Variance Features")
            st.markdown("Features with high variance often indicate key business drivers")

            fig = px.bar(x=variances.index[:10], y=variances.values[:10],
                         title="Top 10 Features by Variance",
                         labels={'x': 'Feature', 'y': 'Variance'})
            st.plotly_chart(fig, use_container_width=True)

            # Feature importance based on mutual information
            if len(numeric_cols) >= 2:
                st.markdown("### 🔍 Predictive Power")

                # Use last numeric column as potential target
                target = numeric_cols[-1]
                features = numeric_cols[:-1]

                if len(features) > 0:
                    from sklearn.feature_selection import mutual_info_regression

                    mi_scores = mutual_info_regression(df[features].fillna(0), df[target].fillna(0))
                    mi_df = pd.DataFrame({
                        'feature': features,
                        'importance': mi_scores
                    }).sort_values('importance', ascending=False)

                    fig = px.bar(mi_df.head(10), x='importance', y='feature',
                                 orientation='h',
                                 title=f"Feature Importance for Predicting {target}")
                    st.plotly_chart(fig, use_container_width=True)

                    st.info(f"💡 **Key Driver**: {mi_df.iloc[0]['feature']} appears to be the most "
                            f"important factor for predicting {target}")
        else:
            st.info("No numeric columns available for driver analysis")

        st.markdown('</div>', unsafe_allow_html=True)

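fillna(0) before mutual_info_regression injects artificial zeros that can inflate or deflate the scores. A hedged alternative estimating on complete cases only (`complete` is a hypothetical local name; random_state is a documented parameter of the same function):

# Hedged sketch: estimate mutual information on complete cases only
complete = df[features + [target]].dropna()
mi_scores = mutual_info_regression(complete[features], complete[target], random_state=42)
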
    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("⚠️ Anomaly Detection")

        if len(numeric_cols) > 0:
            # Outlier detection using IQR
            outlier_report = []

            for col in numeric_cols:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1

                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
                outlier_pct = (len(outliers) / len(df)) * 100

                if outlier_pct > 5:
                    outlier_report.append({
                        'column': col,
                        'outlier_pct': outlier_pct,
                        'lower_bound': Q1 - 1.5 * IQR,
                        'upper_bound': Q3 + 1.5 * IQR
                    })

            if outlier_report:
                st.warning(f"⚠️ Found {len(outlier_report)} columns with significant outliers")

                for item in outlier_report[:5]:
                    st.markdown(f"**{item['column']}**: {item['outlier_pct']:.1f}% outliers "
                                f"(outside [{item['lower_bound']:.2f}, {item['upper_bound']:.2f}])")

                # Visualize outliers for first column
                col_to_show = outlier_report[0]['column']
                fig = px.box(df, y=col_to_show, title=f"Outliers in {col_to_show}")
                st.plotly_chart(fig, use_container_width=True)

                st.markdown("💡 **Recommendation**: Investigate these outliers - they may represent "
                            "unusual but important business events or data quality issues.")
            else:
                st.success("✅ No significant outliers detected in numeric columns")
        else:
            st.info("No numeric columns available for outlier detection")

        st.markdown('</div>', unsafe_allow_html=True)

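The IQR rule above is sensible; for heavy-tailed columns a median/MAD ("robust z-score") flag is a common cross-check. A minimal sketch that would slot into the same per-column loop (stats is imported at the top of this file; treat the scale='normal' argument as an assumption against the installed scipy version):

# Hedged sketch: robust z-score via median absolute deviation
med = df[col].median()
mad = stats.median_abs_deviation(df[col].dropna(), scale='normal')
robust_z = (df[col] - med) / (mad + 1e-9)
outliers_mask = robust_z.abs() > 3.5  # 3.5 is a conventional cutoff
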
    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("💡 Strategic Recommendations")

        # Generate business recommendations based on data insights
        recommendations = []

        if len(numeric_cols) > 0:
            # Check for growth opportunities
            growth_cols = []
            for col in numeric_cols:
                if df[col].min() >= 0 and df[col].max() > df[col].min() * 10:
                    growth_cols.append(col)

            if growth_cols:
                recommendations.append({
                    'area': 'Growth Opportunity',
                    'recommendation': f"Focus on {growth_cols[0]} which shows high variability "
                                      f"(range: {df[growth_cols[0]].min():.2f} to {df[growth_cols[0]].max():.2f})",
                    'priority': 'High'
                })

            # Check for efficiency opportunities
            if len(numeric_cols) >= 2:
                # Find features with high correlation - potential redundancy
                corr_matrix = df[numeric_cols].corr()
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        if abs(corr_matrix.iloc[i, j]) > 0.9:
                            recommendations.append({
                                'area': 'Efficiency',
                                'recommendation': f"Consider consolidating {numeric_cols[i]} and {numeric_cols[j]} "
                                                  f"as they are highly correlated ({corr_matrix.iloc[i, j]:.2f})",
                                'priority': 'Medium'
                            })
                            break
                    if len(recommendations) > 3:
                        break

        if categorical_cols:
            # Check for customer/market segments
            for col in categorical_cols[:2]:
                if df[col].nunique() > 1 and df[col].nunique() <= 10:
                    top_segment = df[col].value_counts().index[0]
                    recommendations.append({
                        'area': 'Segmentation',
                        'recommendation': f"Target the dominant segment in {col}: '{top_segment}' "
                                          f"({df[col].value_counts().iloc[0]:,} records)",
                        'priority': 'Medium'
                    })

        # Display recommendations
        if recommendations:
            for rec in recommendations:
                priority_color = "🔴" if rec['priority'] == 'High' else "🟡" if rec['priority'] == 'Medium' else "🟢"
                st.markdown(f"{priority_color} **{rec['area']}**: {rec['recommendation']}")
        else:
            st.info("No specific recommendations generated. Try uploading a dataset with more variety.")

        # Add download insights option
        st.markdown("---")
        st.markdown("### 📥 Export Insights")

        insight_text = f"""
BUSINESS INSIGHTS REPORT
=======================

Dataset: {df.shape[0]} rows × {df.shape[1]} columns

KEY METRICS:
• Total Records: {df.shape[0]:,}
• Total Features: {df.shape[1]}
• Data Completeness: {completeness:.1f}%

COLUMN COMPOSITION:
• Numeric: {len(numeric_cols)}
• Categorical: {len(categorical_cols)}
• Datetime: {len(datetime_cols)}

RECOMMENDATIONS:
"""

        for rec in recommendations:
            insight_text += f"\n• {rec['area']}: {rec['recommendation']} (Priority: {rec['priority']})"

        st.download_button(
            label="📥 Download Insights Report",
            data=insight_text,
            file_name="business_insights.txt",
            mime="text/plain",
            use_container_width=True
        )

        st.markdown('</div>', unsafe_allow_html=True)
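A hedged usage sketch of how this function would be wired into a page of the app — the uploader flow below is illustrative, not the module's actual entry point:

# Hedged usage sketch for generate_business_insights()
import pandas as pd
import streamlit as st
from insights import generate_business_insights

uploaded = st.file_uploader("Upload a CSV", type="csv")
if uploaded is not None:
    df = pd.read_csv(uploaded)
    generate_business_insights(df)
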
ml_pipeline.py
ADDED
@@ -0,0 +1,940 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_curve, auc,
                             mean_squared_error, r2_score, mean_absolute_error)
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              AdaBoostClassifier, AdaBoostRegressor,
                              VotingClassifier, VotingRegressor)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time
import warnings
warnings.filterwarnings('ignore')

class MLPipelineError(Exception):
    """Custom exception for ML pipeline errors"""
    pass

def validate_ml_data(df, target, features):
    """Validate data for machine learning"""
    issues = []

    if df.empty:
        issues.append("Dataset is empty")
        return issues

    if target not in df.columns:
        issues.append(f"Target column '{target}' not found in dataset")

    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        issues.append(f"Features not found: {missing_features}")

    # Check for sufficient data
    if df.shape[0] < 10:
        issues.append("Dataset too small (minimum 10 rows required)")

    # Check for constant columns
    for col in features:
        if df[col].nunique() == 1:
            issues.append(f"Feature '{col}' is constant")

    # Check target for classification
    if target in df.columns:
        if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
            if df[target].nunique() == 1:
                issues.append("Target has only one class")
            elif df[target].nunique() > 50:
                issues.append(f"Target has {df[target].nunique()} classes, which may cause issues")

    return issues

def safe_ml_operation(func, *args, **kwargs):
    """Safely execute ML operations with error handling"""
    try:
        result = func(*args, **kwargs)
        return result, None
    except ValueError as e:
        error_msg = f"Value Error: {str(e)}. Check your data types and values."
        return None, error_msg
    except MemoryError:
        error_msg = "Memory Error: Dataset too large. Try reducing the number of features or using a sample."
        return None, error_msg
    except Exception as e:
        error_msg = f"ML Error: {str(e)}"
        return None, error_msg

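The two helpers above are defined here but their call pattern only becomes visible further down the file. For orientation, a hedged sketch of how they compose — the "label"/"f1"/"f2" column names and the RandomForest are illustrative placeholders:

# Hedged usage sketch: validate first, then run a guarded fit
issues = validate_ml_data(df, target="label", features=["f1", "f2"])
if not issues:
    model, err = safe_ml_operation(
        RandomForestClassifier(random_state=42).fit, df[["f1", "f2"]], df["label"]
    )
    if err:
        st.error(err)  # safe_ml_operation returns (result, error_message)
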
def run_ml_pipeline(df):
    """
    Enhanced machine learning pipeline with comprehensive error handling
    """
    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>🤖 Advanced Machine Learning Pipeline</h2>
        <p style='color: gray;'>Train, evaluate, and compare multiple ML models with automatic error handling</p>
    </div>
    """, unsafe_allow_html=True)

    try:
        # Check if dataset is suitable for ML
        if df.shape[0] < 10:
            st.error("❌ Dataset too small for machine learning (need at least 10 rows)")
            return

        # Create tabs for different ML stages
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "⚙️ Configuration",
            "📊 Model Training",
            "📈 Model Evaluation",
            "🔮 Predictions",
            "📋 ML Report"
        ])

        with tab1:
            st.markdown('<div class="custom-card">', unsafe_allow_html=True)
            st.subheader("⚙️ Model Configuration")

            try:
                # Target selection with validation
                st.markdown("### 🎯 Target Variable")

                # Auto-detect potential target columns
                potential_targets = []
                target_types = {}

                for col in df.columns:
                    try:
                        if df[col].dtype in ['int64', 'float64']:
                            if df[col].nunique() <= 20:
                                potential_targets.append(col)
                                target_types[col] = "Classification (low cardinality)"
                            else:
                                potential_targets.append(col)
                                target_types[col] = "Regression"
                        elif df[col].dtype in ['object', 'category']:
                            if df[col].nunique() <= 50:
                                potential_targets.append(col)
                                target_types[col] = f"Classification ({df[col].nunique()} classes)"
                    except Exception as e:
                        st.warning(f"⚠️ Couldn't analyze column {col}: {str(e)}")

                if not potential_targets:
                    st.error("❌ No suitable target columns found. Need numeric or categorical columns with reasonable cardinality.")
                    return

                target = st.selectbox(
                    "Select target column",
                    potential_targets,
                    help=f"Column types: {target_types}"
                )

                # Task type detection
                if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
                    task_type = "Classification"
                    unique_values = df[target].nunique()

                    if unique_values == 2:
                        st.success("✅ **Binary Classification** problem detected")
                    elif unique_values <= 10:
                        st.info(f"📊 **Multi-class Classification** with {unique_values} classes")
                    else:
                        st.warning(f"⚠️ **Multi-class Classification** with {unique_values} classes - may be challenging")

                    # Check class balance
                    class_dist = df[target].value_counts(normalize=True)
                    if class_dist.min() < 0.1:
                        st.warning("⚠️ Class imbalance detected. Consider using class weights or resampling.")
                else:
                    task_type = "Regression"
                    st.info("📈 **Regression** task detected")

                    # Check target distribution
                    target_skew = df[target].skew()
                    if abs(target_skew) > 1:
                        st.warning(f"⚠️ Target variable is highly skewed (skewness: {target_skew:.2f}). Consider log transformation.")

                # Feature selection
                st.markdown("### 🔍 Feature Selection")

                # Auto-select features (exclude target)
                all_features = [col for col in df.columns if col != target]

                # Remove problematic columns
                problematic_cols = []
                for col in all_features:
                    try:
                        if df[col].nunique() == 1:
                            problematic_cols.append(col)
                        elif df[col].isnull().sum() > len(df) * 0.5:
                            problematic_cols.append(col)
                    except Exception:
                        problematic_cols.append(col)

                if problematic_cols:
                    st.warning(f"⚠️ Problematic columns detected (will be excluded): {problematic_cols}")
                    all_features = [f for f in all_features if f not in problematic_cols]

                if not all_features:
                    st.error("❌ No valid features remaining after filtering.")
                    return

                # Select features
                selected_features = st.multiselect(
                    "Choose features for modeling",
                    all_features,
                    default=all_features[:min(10, len(all_features))],
                    help="Select the columns to use as features. Using too many features may cause overfitting."
                )

                if not selected_features:
                    st.warning("⚠️ Please select at least one feature")
                    return

                # Validate selected features
                validation_issues = validate_ml_data(df, target, selected_features)
                if validation_issues:
                    for issue in validation_issues:
                        st.warning(f"⚠️ {issue}")

                # Data preprocessing options
                st.markdown("### 🛠️ Preprocessing Options")

                col1, col2 = st.columns(2)
                with col1:
                    test_size = st.slider("Test set size (%)", 10, 40, 20, 5) / 100
                    scaler_option = st.selectbox("Feature scaling", ["None", "StandardScaler", "MinMaxScaler"])

                with col2:
                    cv_folds = st.slider("Cross-validation folds", 2, 10, 5)
                    if task_type == "Classification":
                        handle_imbalance = st.checkbox("Handle class imbalance", value=False,
                                                       help="Use class weights or sampling techniques")
                    else:
                        handle_imbalance = False

                # Model selection based on task type
                st.markdown("### 🤖 Model Selection")

                if task_type == "Classification":
                    models = {
                        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
                        "K-Nearest Neighbors": KNeighborsClassifier(),
                        "Decision Tree": DecisionTreeClassifier(random_state=42),
                        "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
                        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
                        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
                        "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
                        "AdaBoost": AdaBoostClassifier(random_state=42),
                        "SVM": SVC(probability=True, random_state=42)
                    }

                    # Default models for quick selection
                    default_models = ["Logistic Regression", "Random Forest", "XGBoost"]
                else:  # Regression
                    models = {
                        "Linear Regression": LinearRegression(),
                        "Ridge Regression": Ridge(random_state=42),
                        "Lasso Regression": Lasso(random_state=42),
                        "Decision Tree": DecisionTreeRegressor(random_state=42),
                        "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
                        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
                        "XGBoost": XGBRegressor(random_state=42),
                        "LightGBM": LGBMRegressor(verbose=-1, random_state=42),
                        "AdaBoost": AdaBoostRegressor(random_state=42),
                        "SVR": SVR()
                    }

                    default_models = ["Linear Regression", "Random Forest", "XGBoost"]

                selected_models = st.multiselect(
                    "Choose models to train",
                    list(models.keys()),
                    default=default_models,
                    help="Select multiple models to compare performance"
                )

                if not selected_models:
                    st.warning("⚠️ Please select at least one model")
                    return

                # Advanced options
                with st.expander("⚡ Advanced Options"):
                    do_tuning = st.checkbox("Perform hyperparameter tuning", value=False,
                                            help="Grid search for best parameters (may be slow)")

                    if do_tuning:
                        tuning_folds = st.slider("Tuning CV folds", 2, 5, 3)
                        max_tuning_iter = st.slider("Max tuning iterations per model", 5, 50, 20)

                    use_sampling = st.checkbox("Use data sampling (for large datasets)", value=False,
                                               help="Use a sample for faster experimentation")

                    if use_sampling:
                        sample_size = st.slider("Sample size (%)", 10, 100, 100, 10) / 100

                    random_state = st.number_input("Random seed", value=42, min_value=0, max_value=999)

                st.markdown('</div>', unsafe_allow_html=True)

                # Store configuration in session state
                st.session_state['ml_config'] = {
                    'target': target,
                    'features': selected_features,
                    'task_type': task_type,
                    'test_size': test_size,
                    'scaler': scaler_option,
                    'cv_folds': cv_folds,
                    'handle_imbalance': handle_imbalance,
                    'models': {name: models[name] for name in selected_models},
                    'do_tuning': do_tuning,
                    'random_state': random_state
                }

            except Exception as e:
                st.error(f"❌ Error in configuration: {str(e)}")
                st.info("💡 Tip: Check your data types and ensure all columns are valid")
                return

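The do_tuning flag above is stored in the config, but the grid search itself lives further down the file, beyond this excerpt. For orientation, a hedged sketch of the kind of GridSearchCV call the flag implies — the grid values are illustrative, not the module's actual grids:

# Hedged sketch: what hyperparameter tuning for Random Forest could look like
param_grid = {"n_estimators": [100, 300], "max_depth": [None, 10, 30]}
search = GridSearchCV(RandomForestClassifier(random_state=42),
                      param_grid, cv=3, n_jobs=-1)
search.fit(X_train, y_train)  # X_train/y_train come from the split in tab2 below
best_model = search.best_estimator_
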
        with tab2:
            if 'ml_config' not in st.session_state:
                st.info("ℹ️ Please configure your model in the 'Configuration' tab first")
                return

            if st.button("🚀 Start Training", use_container_width=True, type="primary"):
                try:
                    config = st.session_state['ml_config']

                    st.markdown('<div class="custom-card">', unsafe_allow_html=True)

                    # Prepare data with error handling
                    with st.spinner("📊 Preparing data..."):
                        try:
                            X = df[config['features']].copy()
                            y = df[config['target']].copy()

                            # Handle missing values
                            if X.isnull().sum().sum() > 0:
                                st.info(f"⚠️ Handling {X.isnull().sum().sum()} missing values in features...")
                                X = X.fillna(X.mean(numeric_only=True)).fillna(X.mode().iloc[0])

                            # Handle categorical features
                            cat_features = X.select_dtypes(include=['object', 'category']).columns
                            if len(cat_features) > 0:
                                st.info(f"🔄 Encoding categorical features: {list(cat_features)}")
                                X = pd.get_dummies(X, columns=cat_features)

                            # Handle target encoding for classification
                            le = None
                            if config['task_type'] == "Classification" and y.dtype == 'object':
                                le = LabelEncoder()
                                y = le.fit_transform(y)
                                st.info(f"📊 Target classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")

                            # Handle class imbalance
                            if config['task_type'] == "Classification" and config['handle_imbalance']:
                                from sklearn.utils.class_weight import compute_class_weight
                                class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
                                st.info(f"⚖️ Using class weights: {dict(zip(np.unique(y), class_weights))}")

                            # Scale features
                            scaler = None
                            if config['scaler'] != "None":
                                if config['scaler'] == "StandardScaler":
                                    scaler = StandardScaler()
                                else:
                                    scaler = MinMaxScaler()
                                X_scaled = scaler.fit_transform(X)
                                X = pd.DataFrame(X_scaled, columns=X.columns)

                            # Split data
                            stratify = y if config['task_type'] == "Classification" else None
                            X_train, X_test, y_train, y_test = train_test_split(
                                X, y, test_size=config['test_size'],
                                random_state=config['random_state'],
                                stratify=stratify
                            )

                            st.success(f"✅ Data prepared: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")

                        except Exception as e:
                            st.error(f"❌ Error in data preparation: {str(e)}")
                            return

                    # Train models
                    results = []
                    trained_models = {}
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    for i, (model_name, model) in enumerate(config['models'].items()):
                        status_text.text(f"🔄 Training {model_name}...")

                        try:
                            # Apply class weights if needed
                            if config['task_type'] == "Classification" and config['handle_imbalance']:
                                if hasattr(model, 'class_weight'):
                                    model.set_params(class_weight='balanced')

                            # Train
                            start_time = time.time()
                            model.fit(X_train, y_train)
                            training_time = time.time() - start_time

                            # Store trained model
                            trained_models[model_name] = {
                                'model': model,
                                'scaler': scaler,
                                'label_encoder': le,
                                'features': X.columns.tolist()
                            }

                            # Predict
                            y_pred = model.predict(X_test)

                            # Calculate metrics
                            if config['task_type'] == "Classification":
                                try:
                                    accuracy = accuracy_score(y_test, y_pred)
                                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

                                    # Cross-validation
                                    cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'])

                                    results.append({
                                        "Model": model_name,
                                        "Accuracy": f"{accuracy:.4f}",
                                        "Precision": f"{precision:.4f}",
                                        "Recall": f"{recall:.4f}",
                                        "F1 Score": f"{f1:.4f}",
                                        "CV Score": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
                                        "Time (s)": f"{training_time:.2f}"
                                    })
                                except Exception as e:
                                    st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")

                            else:  # Regression
                                try:
                                    mse = mean_squared_error(y_test, y_pred)
                                    rmse = np.sqrt(mse)
                                    mae = mean_absolute_error(y_test, y_pred)
                                    r2 = r2_score(y_test, y_pred)

                                    # Cross-validation
                                    cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'], scoring='r2')

                                    results.append({
                                        "Model": model_name,
                                        "R² Score": f"{r2:.4f}",
                                        "RMSE": f"{rmse:.4f}",
                                        "MAE": f"{mae:.4f}",
                                        "CV R²": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
                                        "Time (s)": f"{training_time:.2f}"
                                    })
                                except Exception as e:
                                    st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")

                        except MemoryError:
                            st.error(f"❌ Out of memory training {model_name}. Try using fewer features or a sample.")
                        except Exception as e:
                            st.warning(f"⚠️ Error training {model_name}: {str(e)}")

                        progress_bar.progress((i + 1) / len(config['models']))

                    status_text.text("✅ Training complete!")
|
| 458 |
+
|
| 459 |
+
if not results:
|
| 460 |
+
st.error("❌ No models were successfully trained")
|
| 461 |
+
return
|
| 462 |
+
|
| 463 |
+
# Display results
|
| 464 |
+
st.subheader("📊 Model Performance Comparison")
|
| 465 |
+
results_df = pd.DataFrame(results)
|
| 466 |
+
|
| 467 |
+
# Highlight best model
|
| 468 |
+
if config['task_type'] == "Classification":
|
| 469 |
+
best_idx = results_df['F1 Score'].astype(float).idxmax()
|
| 470 |
+
else:
|
| 471 |
+
best_idx = results_df['R² Score'].astype(float).idxmax()
|
| 472 |
+
|
| 473 |
+
# Style dataframe
|
| 474 |
+
def highlight_best(s):
|
| 475 |
+
is_best = s.index == best_idx
|
| 476 |
+
return ['background-color: #90EE90' if v else '' for v in is_best]
|
| 477 |
+
|
| 478 |
+
st.dataframe(results_df.style.apply(highlight_best), use_container_width=True)
|
| 479 |
+
|
| 480 |
+
# Store results
|
| 481 |
+
st.session_state['trained_models'] = trained_models
|
| 482 |
+
st.session_state['X_train'] = X_train
|
| 483 |
+
st.session_state['X_test'] = X_test
|
| 484 |
+
st.session_state['y_train'] = y_train
|
| 485 |
+
st.session_state['y_test'] = y_test
|
| 486 |
+
st.session_state['task_type'] = config['task_type']
|
| 487 |
+
st.session_state['results_df'] = results_df
|
| 488 |
+
|
| 489 |
+
# Best model info
|
| 490 |
+
best_model_name = results_df.iloc[best_idx]['Model']
|
| 491 |
+
st.success(f"🏆 **Best Model:** {best_model_name}")
|
| 492 |
+
|
| 493 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 494 |
+
|
| 495 |
+
except Exception as e:
|
| 496 |
+
st.error(f"❌ Critical error in training: {str(e)}")
|
| 497 |
+
st.info("💡 Try reducing the number of features or models")
|
| 498 |
+
|
| 499 |
+
with tab3:
|
| 500 |
+
if 'trained_models' not in st.session_state:
|
| 501 |
+
st.info("ℹ️ Train some models first in the 'Model Training' tab")
|
| 502 |
+
return
|
| 503 |
+
|
| 504 |
+
try:
|
| 505 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 506 |
+
st.subheader("📈 Detailed Model Evaluation")
|
| 507 |
+
|
| 508 |
+
# Model selection for detailed evaluation
|
| 509 |
+
selected_eval_model = st.selectbox(
|
| 510 |
+
"Select model for detailed evaluation",
|
| 511 |
+
list(st.session_state['trained_models'].keys())
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
model_info = st.session_state['trained_models'][selected_eval_model]
|
| 515 |
+
model = model_info['model']
|
| 516 |
+
X_test = st.session_state['X_test']
|
| 517 |
+
y_test = st.session_state['y_test']
|
| 518 |
+
task_type = st.session_state['task_type']
|
| 519 |
+
|
| 520 |
+
try:
|
| 521 |
+
y_pred = model.predict(X_test)
|
| 522 |
+
|
| 523 |
+
if task_type == "Classification":
|
| 524 |
+
# Confusion Matrix
|
| 525 |
+
st.markdown("### Confusion Matrix")
|
| 526 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 527 |
+
|
| 528 |
+
fig = px.imshow(cm,
|
| 529 |
+
text_auto=True,
|
| 530 |
+
aspect="auto",
|
| 531 |
+
color_continuous_scale='Blues',
|
| 532 |
+
title=f"Confusion Matrix - {selected_eval_model}")
|
| 533 |
+
|
| 534 |
+
fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
|
| 535 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 536 |
+
|
| 537 |
+
# Classification Report
|
| 538 |
+
st.markdown("### Classification Report")
|
| 539 |
+
report = classification_report(y_test, y_pred, output_dict=True)
|
| 540 |
+
report_df = pd.DataFrame(report).transpose()
|
| 541 |
+
st.dataframe(report_df.style.format("{:.4f}"), use_container_width=True)
|
| 542 |
+
|
| 543 |
+
# ROC Curve (for binary classification)
|
| 544 |
+
if len(np.unique(y_test)) == 2 and hasattr(model, "predict_proba"):
|
| 545 |
+
st.markdown("### ROC Curve")
|
| 546 |
+
y_pred_proba = model.predict_proba(X_test)[:, 1]
|
| 547 |
+
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
|
| 548 |
+
roc_auc = auc(fpr, tpr)
|
| 549 |
+
|
| 550 |
+
fig = go.Figure()
|
| 551 |
+
fig.add_trace(go.Scatter(x=fpr, y=tpr,
|
| 552 |
+
mode='lines',
|
| 553 |
+
name=f'ROC (AUC = {roc_auc:.3f})',
|
| 554 |
+
line=dict(color='blue', width=2)))
|
| 555 |
+
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
|
| 556 |
+
mode='lines',
|
| 557 |
+
name='Random',
|
| 558 |
+
line=dict(color='gray', dash='dash')))
|
| 559 |
+
|
| 560 |
+
fig.update_layout(xaxis_title="False Positive Rate",
|
| 561 |
+
yaxis_title="True Positive Rate",
|
| 562 |
+
title=f"ROC Curve - {selected_eval_model}")
|
| 563 |
+
|
| 564 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 565 |
+
|
| 566 |
+
else: # Regression
|
| 567 |
+
# Actual vs Predicted plot
|
| 568 |
+
st.markdown("### Actual vs Predicted")
|
| 569 |
+
|
| 570 |
+
fig = px.scatter(x=y_test, y=y_pred,
|
| 571 |
+
labels={'x': 'Actual', 'y': 'Predicted'},
|
| 572 |
+
title=f"Actual vs Predicted - {selected_eval_model}",
|
| 573 |
+
trendline="ols")
|
| 574 |
+
|
| 575 |
+
# Add perfect prediction line
|
| 576 |
+
min_val = min(y_test.min(), y_pred.min())
|
| 577 |
+
max_val = max(y_test.max(), y_pred.max())
|
| 578 |
+
fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
|
| 579 |
+
mode='lines', name='Perfect Prediction',
|
| 580 |
+
line=dict(color='red', dash='dash')))
|
| 581 |
+
|
| 582 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 583 |
+
|
| 584 |
+
# Residuals plot
|
| 585 |
+
st.markdown("### Residuals Analysis")
|
| 586 |
+
residuals = y_test - y_pred
|
| 587 |
+
|
| 588 |
+
fig = make_subplots(rows=1, cols=2,
|
| 589 |
+
subplot_titles=("Residuals vs Predicted", "Residuals Distribution"))
|
| 590 |
+
|
| 591 |
+
fig.add_trace(go.Scatter(x=y_pred, y=residuals,
|
| 592 |
+
mode='markers',
|
| 593 |
+
name='Residuals',
|
| 594 |
+
marker=dict(color='blue', opacity=0.5)), row=1, col=1)
|
| 595 |
+
|
| 596 |
+
fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)
|
| 597 |
+
|
| 598 |
+
fig.add_trace(go.Histogram(x=residuals, nbinsx=30,
|
| 599 |
+
name='Distribution',
|
| 600 |
+
marker_color='green'), row=1, col=2)
|
| 601 |
+
|
| 602 |
+
fig.update_layout(title=f"Residual Analysis - {selected_eval_model}")
|
| 603 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 604 |
+
|
| 605 |
+
# Residual statistics
|
| 606 |
+
col1, col2, col3 = st.columns(3)
|
| 607 |
+
with col1:
|
| 608 |
+
st.metric("Mean Residual", f"{residuals.mean():.4f}")
|
| 609 |
+
with col2:
|
| 610 |
+
st.metric("Std Residual", f"{residuals.std():.4f}")
|
| 611 |
+
with col3:
|
| 612 |
+
st.metric("Residual Range", f"{residuals.max() - residuals.min():.4f}")
|
| 613 |
+
|
| 614 |
+
# Feature Importance (if available)
|
| 615 |
+
if hasattr(model, 'feature_importances_'):
|
| 616 |
+
st.markdown("### Feature Importance")
|
| 617 |
+
feature_importance = pd.DataFrame({
|
| 618 |
+
'feature': X_test.columns,
|
| 619 |
+
'importance': model.feature_importances_
|
| 620 |
+
}).sort_values('importance', ascending=True)
|
| 621 |
+
|
| 622 |
+
fig = px.bar(feature_importance.tail(10),
|
| 623 |
+
x='importance', y='feature',
|
| 624 |
+
orientation='h',
|
| 625 |
+
title="Top 10 Feature Importances",
|
| 626 |
+
color='importance',
|
| 627 |
+
color_continuous_scale='Viridis')
|
| 628 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 629 |
+
|
| 630 |
+
except Exception as e:
|
| 631 |
+
st.error(f"❌ Error in evaluation: {str(e)}")
|
| 632 |
+
|
| 633 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 634 |
+
|
| 635 |
+
except Exception as e:
|
| 636 |
+
st.error(f"❌ Error loading evaluation: {str(e)}")
|
| 637 |
+
|
| 638 |
+
with tab4:
|
| 639 |
+
if 'trained_models' not in st.session_state:
|
| 640 |
+
st.info("ℹ️ Train some models first in the 'Model Training' tab")
|
| 641 |
+
return
|
| 642 |
+
|
| 643 |
+
try:
|
| 644 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 645 |
+
st.subheader("🔮 Make Predictions")
|
| 646 |
+
|
| 647 |
+
# Model selection for predictions
|
| 648 |
+
selected_pred_model = st.selectbox(
|
| 649 |
+
"Select model for predictions",
|
| 650 |
+
list(st.session_state['trained_models'].keys()),
|
| 651 |
+
key="pred_model"
|
| 652 |
+
)
|
| 653 |
+
|
| 654 |
+
model_info = st.session_state['trained_models'][selected_pred_model]
|
| 655 |
+
model = model_info['model']
|
| 656 |
+
scaler = model_info['scaler']
|
| 657 |
+
le = model_info.get('label_encoder')
|
| 658 |
+
feature_names = model_info['features']
|
| 659 |
+
|
| 660 |
+
# Input method
|
| 661 |
+
input_method = st.radio(
|
| 662 |
+
"Input method",
|
| 663 |
+
["Manual input", "Upload new data", "Batch prediction"],
|
| 664 |
+
horizontal=True
|
| 665 |
+
)
|
| 666 |
+
|
| 667 |
+
if input_method == "Manual input":
|
| 668 |
+
st.markdown("### Enter feature values")
|
| 669 |
+
|
| 670 |
+
input_data = {}
|
| 671 |
+
cols = st.columns(3)
|
| 672 |
+
|
| 673 |
+
for i, feature in enumerate(feature_names):
|
| 674 |
+
with cols[i % 3]:
|
| 675 |
+
try:
|
| 676 |
+
# Get feature range from training data
|
| 677 |
+
if feature in st.session_state['X_train'].columns:
|
| 678 |
+
min_val = float(st.session_state['X_train'][feature].min())
|
| 679 |
+
max_val = float(st.session_state['X_train'][feature].max())
|
| 680 |
+
mean_val = float(st.session_state['X_train'][feature].mean())
|
| 681 |
+
|
| 682 |
+
input_data[feature] = st.slider(
|
| 683 |
+
f"{feature}",
|
| 684 |
+
min_val, max_val, mean_val,
|
| 685 |
+
format="%.4f",
|
| 686 |
+
key=f"manual_{feature}"
|
| 687 |
+
)
|
| 688 |
+
else:
|
| 689 |
+
input_data[feature] = st.number_input(
|
| 690 |
+
f"{feature}",
|
| 691 |
+
value=0.0,
|
| 692 |
+
key=f"manual_{feature}"
|
| 693 |
+
)
|
| 694 |
+
except Exception as e:
|
| 695 |
+
st.warning(f"⚠️ Error with {feature}: {str(e)}")
|
| 696 |
+
input_data[feature] = 0.0
|
| 697 |
+
|
| 698 |
+
if st.button("🔮 Predict", use_container_width=True):
|
| 699 |
+
try:
|
| 700 |
+
# Convert input to DataFrame
|
| 701 |
+
input_df = pd.DataFrame([input_data])
|
| 702 |
+
|
| 703 |
+
# Ensure all features are present
|
| 704 |
+
for col in feature_names:
|
| 705 |
+
if col not in input_df.columns:
|
| 706 |
+
input_df[col] = 0
|
| 707 |
+
|
| 708 |
+
input_df = input_df[feature_names]
|
| 709 |
+
|
| 710 |
+
# Scale if needed
|
| 711 |
+
if scaler is not None:
|
| 712 |
+
input_scaled = scaler.transform(input_df)
|
| 713 |
+
input_df = pd.DataFrame(input_scaled, columns=feature_names)
|
| 714 |
+
|
| 715 |
+
# Make prediction
|
| 716 |
+
prediction = model.predict(input_df)[0]
|
| 717 |
+
|
| 718 |
+
# Decode if needed
|
| 719 |
+
if le is not None:
|
| 720 |
+
prediction = le.inverse_transform([int(prediction)])[0]
|
| 721 |
+
|
| 722 |
+
# Display prediction with styling
|
| 723 |
+
st.markdown("""
|
| 724 |
+
<div class="success-container" style="text-align: center; padding: 2rem;">
|
| 725 |
+
<h3>🎯 Prediction Result</h3>
|
| 726 |
+
<h1 style="font-size: 3rem;">{}</h1>
|
| 727 |
+
</div>
|
| 728 |
+
""".format(prediction), unsafe_allow_html=True)
|
| 729 |
+
|
| 730 |
+
except Exception as e:
|
| 731 |
+
st.error(f"❌ Prediction error: {str(e)}")
|
| 732 |
+
|
| 733 |
+
elif input_method == "Upload new data":
|
| 734 |
+
pred_file = st.file_uploader("Upload data for predictions",
|
| 735 |
+
type=["csv", "xlsx"],
|
| 736 |
+
key="pred_file")
|
| 737 |
+
|
| 738 |
+
if pred_file:
|
| 739 |
+
try:
|
| 740 |
+
if pred_file.name.endswith("csv"):
|
| 741 |
+
pred_df = pd.read_csv(pred_file)
|
| 742 |
+
else:
|
| 743 |
+
pred_df = pd.read_excel(pred_file)
|
| 744 |
+
|
| 745 |
+
st.subheader("📋 Uploaded Data Preview")
|
| 746 |
+
st.dataframe(pred_df.head())
|
| 747 |
+
|
| 748 |
+
if st.button("🔮 Predict for all rows", use_container_width=True):
|
| 749 |
+
with st.spinner("Making predictions..."):
|
| 750 |
+
try:
|
| 751 |
+
# Prepare data
|
| 752 |
+
pred_processed = pred_df.copy()
|
| 753 |
+
|
| 754 |
+
# Handle categorical features if needed
|
| 755 |
+
for col in pred_processed.columns:
|
| 756 |
+
if pred_processed[col].dtype == 'object':
|
| 757 |
+
pred_processed = pd.get_dummies(pred_processed, columns=[col])
|
| 758 |
+
|
| 759 |
+
# Align columns with training data
|
| 760 |
+
for col in feature_names:
|
| 761 |
+
if col not in pred_processed.columns:
|
| 762 |
+
pred_processed[col] = 0
|
| 763 |
+
|
| 764 |
+
pred_processed = pred_processed[feature_names]
|
| 765 |
+
|
| 766 |
+
# Scale if needed
|
| 767 |
+
if scaler is not None:
|
| 768 |
+
pred_scaled = scaler.transform(pred_processed)
|
| 769 |
+
pred_processed = pd.DataFrame(pred_scaled, columns=feature_names)
|
| 770 |
+
|
| 771 |
+
# Make predictions
|
| 772 |
+
predictions = model.predict(pred_processed)
|
| 773 |
+
|
| 774 |
+
# Decode if needed
|
| 775 |
+
if le is not None:
|
| 776 |
+
predictions = le.inverse_transform(predictions.astype(int))
|
| 777 |
+
|
| 778 |
+
# Add predictions to dataframe
|
| 779 |
+
pred_df['Prediction'] = predictions
|
| 780 |
+
|
| 781 |
+
st.subheader("📊 Predictions Result")
|
| 782 |
+
st.dataframe(pred_df)
|
| 783 |
+
|
| 784 |
+
# Download predictions
|
| 785 |
+
csv = pred_df.to_csv(index=False)
|
| 786 |
+
st.download_button(
|
| 787 |
+
label="📥 Download Predictions",
|
| 788 |
+
data=csv,
|
| 789 |
+
file_name="predictions.csv",
|
| 790 |
+
mime="text/csv",
|
| 791 |
+
use_container_width=True
|
| 792 |
+
)
|
| 793 |
+
|
| 794 |
+
except Exception as e:
|
| 795 |
+
st.error(f"❌ Prediction error: {str(e)}")
|
| 796 |
+
|
| 797 |
+
except Exception as e:
|
| 798 |
+
st.error(f"❌ Error reading file: {str(e)}")
|
| 799 |
+
|
| 800 |
+
elif input_method == "Batch prediction":
|
| 801 |
+
st.markdown("### Batch Prediction Settings")
|
| 802 |
+
|
| 803 |
+
n_samples = st.number_input("Number of samples to generate",
|
| 804 |
+
min_value=1, max_value=1000, value=10)
|
| 805 |
+
|
| 806 |
+
if st.button("🎲 Generate Random Samples & Predict", use_container_width=True):
|
| 807 |
+
try:
|
| 808 |
+
# Generate random samples based on training data distribution
|
| 809 |
+
random_samples = {}
|
| 810 |
+
for feature in feature_names:
|
| 811 |
+
if feature in st.session_state['X_train'].columns:
|
| 812 |
+
mean = st.session_state['X_train'][feature].mean()
|
| 813 |
+
std = st.session_state['X_train'][feature].std()
|
| 814 |
+
random_samples[feature] = np.random.normal(mean, std, n_samples)
|
| 815 |
+
else:
|
| 816 |
+
random_samples[feature] = np.zeros(n_samples)
|
| 817 |
+
|
| 818 |
+
batch_df = pd.DataFrame(random_samples)
|
| 819 |
+
|
| 820 |
+
# Scale if needed
|
| 821 |
+
if scaler is not None:
|
| 822 |
+
batch_scaled = scaler.transform(batch_df)
|
| 823 |
+
batch_df = pd.DataFrame(batch_scaled, columns=feature_names)
|
| 824 |
+
|
| 825 |
+
# Make predictions
|
| 826 |
+
predictions = model.predict(batch_df)
|
| 827 |
+
|
| 828 |
+
# Decode if needed
|
| 829 |
+
if le is not None:
|
| 830 |
+
predictions = le.inverse_transform(predictions.astype(int))
|
| 831 |
+
|
| 832 |
+
# Add predictions to dataframe
|
| 833 |
+
batch_df['Prediction'] = predictions
|
| 834 |
+
|
| 835 |
+
st.subheader("📊 Batch Predictions")
|
| 836 |
+
st.dataframe(batch_df)
|
| 837 |
+
|
| 838 |
+
# Statistics
|
| 839 |
+
if le is None: # Numerical predictions
|
| 840 |
+
st.subheader("📈 Prediction Statistics")
|
| 841 |
+
col1, col2, col3 = st.columns(3)
|
| 842 |
+
with col1:
|
| 843 |
+
st.metric("Mean", f"{predictions.mean():.4f}")
|
| 844 |
+
with col2:
|
| 845 |
+
st.metric("Std", f"{predictions.std():.4f}")
|
| 846 |
+
with col3:
|
| 847 |
+
st.metric("Range", f"{predictions.max() - predictions.min():.4f}")
|
| 848 |
+
|
| 849 |
+
# Download predictions
|
| 850 |
+
csv = batch_df.to_csv(index=False)
|
| 851 |
+
st.download_button(
|
| 852 |
+
label="📥 Download Batch Predictions",
|
| 853 |
+
data=csv,
|
| 854 |
+
file_name="batch_predictions.csv",
|
| 855 |
+
mime="text/csv",
|
| 856 |
+
use_container_width=True
|
| 857 |
+
)
|
| 858 |
+
|
| 859 |
+
except Exception as e:
|
| 860 |
+
st.error(f"❌ Batch prediction error: {str(e)}")
|
| 861 |
+
|
| 862 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 863 |
+
|
| 864 |
+
except Exception as e:
|
| 865 |
+
st.error(f"❌ Error in prediction: {str(e)}")
|
| 866 |
+
|
| 867 |
+
with tab5:
|
| 868 |
+
if 'results_df' not in st.session_state:
|
| 869 |
+
st.info("ℹ️ Train some models first in the 'Model Training' tab")
|
| 870 |
+
return
|
| 871 |
+
|
| 872 |
+
try:
|
| 873 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 874 |
+
st.subheader("📋 Machine Learning Report")
|
| 875 |
+
|
| 876 |
+
results_df = st.session_state['results_df']
|
| 877 |
+
config = st.session_state.get('ml_config', {})
|
| 878 |
+
|
| 879 |
+
# Generate report
|
| 880 |
+
report = f"""
|
| 881 |
+
# Machine Learning Pipeline Report
|
| 882 |
+
|
| 883 |
+
## Configuration Summary
|
| 884 |
+
- **Task Type:** {config.get('task_type', 'N/A')}
|
| 885 |
+
- **Target Variable:** {config.get('target', 'N/A')}
|
| 886 |
+
- **Number of Features:** {len(config.get('features', []))}
|
| 887 |
+
- **Test Size:** {config.get('test_size', 0.2)*100:.0f}%
|
| 888 |
+
- **Cross-Validation Folds:** {config.get('cv_folds', 5)}
|
| 889 |
+
- **Feature Scaling:** {config.get('scaler', 'None')}
|
| 890 |
+
|
| 891 |
+
## Dataset Information
|
| 892 |
+
- **Total Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0] + st.session_state.get('X_test', pd.DataFrame()).shape[0]}
|
| 893 |
+
- **Training Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0]}
|
| 894 |
+
- **Test Samples:** {st.session_state.get('X_test', pd.DataFrame()).shape[0]}
|
| 895 |
+
|
| 896 |
+
## Model Performance Summary
|
| 897 |
+
|
| 898 |
+
{results_df.to_string()}
|
| 899 |
+
|
| 900 |
+
## Best Model
|
| 901 |
+
**{results_df.iloc[0]['Model']}** performed best based on {'F1 Score' if config.get('task_type') == 'Classification' else 'R² Score'}.
|
| 902 |
+
|
| 903 |
+
## Recommendations
|
| 904 |
+
"""
|
| 905 |
+
|
| 906 |
+
# Add recommendations based on results
|
| 907 |
+
if config.get('task_type') == 'Classification':
|
| 908 |
+
if float(results_df['Accuracy'].iloc[0]) > 0.9:
|
| 909 |
+
report += "\n- ✓ Excellent model performance achieved"
|
| 910 |
+
elif float(results_df['Accuracy'].iloc[0]) > 0.7:
|
| 911 |
+
report += "\n- ✓ Good model performance"
|
| 912 |
+
else:
|
| 913 |
+
report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
|
| 914 |
+
else:
|
| 915 |
+
if float(results_df['R² Score'].iloc[0]) > 0.8:
|
| 916 |
+
report += "\n- ✓ Excellent model performance achieved"
|
| 917 |
+
elif float(results_df['R² Score'].iloc[0]) > 0.6:
|
| 918 |
+
report += "\n- ✓ Good model performance"
|
| 919 |
+
else:
|
| 920 |
+
report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
|
| 921 |
+
|
| 922 |
+
st.markdown(report)
|
| 923 |
+
|
| 924 |
+
# Download report
|
| 925 |
+
st.download_button(
|
| 926 |
+
label="📥 Download ML Report",
|
| 927 |
+
data=report,
|
| 928 |
+
file_name="ml_report.txt",
|
| 929 |
+
mime="text/plain",
|
| 930 |
+
use_container_width=True
|
| 931 |
+
)
|
| 932 |
+
|
| 933 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 934 |
+
|
| 935 |
+
except Exception as e:
|
| 936 |
+
st.error(f"❌ Error generating report: {str(e)}")
|
| 937 |
+
|
| 938 |
+
except Exception as e:
|
| 939 |
+
st.error(f"❌ Critical error in ML pipeline: {str(e)}")
|
| 940 |
+
st.info("💡 Please check your data and try again. If the problem persists, try with a smaller dataset.")
|
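Not part of the commit: the pipeline above keeps every trained model only in `st.session_state`, so artifacts vanish when the session ends. A minimal sketch of one way to persist and reload the per-model bundle with joblib (which requirements.txt below pins); the bundle keys mirror the dict built in the training tab, and the function names and file path are illustrative, not part of the app:

import joblib

def save_bundle(bundle, path="model_bundle.joblib"):
    # 'bundle' is assumed to be one entry of st.session_state['trained_models'],
    # i.e. {'model': ..., 'scaler': ..., 'label_encoder': ..., 'features': [...]}
    joblib.dump(bundle, path)

def load_bundle(path="model_bundle.joblib"):
    # Returns the same dict shape, ready for reuse in the prediction tab
    return joblib.load(path)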
requirements.txt
ADDED
@@ -0,0 +1,16 @@
streamlit>=1.28.0
pandas>=2.0.0
numpy>=1.24.0
scikit-learn>=1.3.0
plotly>=5.17.0
matplotlib>=3.7.0
xgboost>=1.7.0
lightgbm>=4.0.0
openpyxl>=3.1.0
scipy>=1.10.0
shap>=0.42.0
imbalanced-learn>=0.11.0
category-encoders>=2.6.0
statsmodels>=0.14.0
seaborn>=0.12.0
joblib>=1.3.0
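The stack above installs with `pip install -r requirements.txt`: streamlit, pandas, scikit-learn, plotly and statsmodels cover the UI, modeling, charting and statistical-test paths used in the modules shown, while xgboost, lightgbm, shap, imbalanced-learn and category-encoders appear to back the heavier optional model and explainability features.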
statistical_analysis.py
ADDED
@@ -0,0 +1,928 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import scipy.stats as stats
|
| 5 |
+
import plotly.express as px
|
| 6 |
+
import plotly.graph_objects as go
|
| 7 |
+
from plotly.subplots import make_subplots
|
| 8 |
+
import statsmodels.api as sm
|
| 9 |
+
from statsmodels.formula.api import ols
|
| 10 |
+
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
| 11 |
+
from statsmodels.tsa.stattools import adfuller, kpss
|
| 12 |
+
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import warnings
|
| 15 |
+
warnings.filterwarnings('ignore')
|
| 16 |
+
|
| 17 |
+
def statistical_analysis(df):
|
| 18 |
+
"""
|
| 19 |
+
Enhanced statistical analysis with advanced statistical tests and visualizations
|
| 20 |
+
"""
|
| 21 |
+
st.markdown("""
|
| 22 |
+
<div style='text-align: center; margin-bottom: 2rem;'>
|
| 23 |
+
<h2>📐 Advanced Statistical Analysis</h2>
|
| 24 |
+
<p style='color: gray;'>Comprehensive statistical tests, hypothesis testing, and probability analysis</p>
|
| 25 |
+
</div>
|
| 26 |
+
""", unsafe_allow_html=True)
|
| 27 |
+
|
| 28 |
+
# Error handling for empty dataframe
|
| 29 |
+
if df.empty:
|
| 30 |
+
st.error("❌ The dataset is empty. Please upload a valid dataset.")
|
| 31 |
+
return
|
| 32 |
+
|
| 33 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 34 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
| 35 |
+
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
|
| 36 |
+
|
| 37 |
+
if not numeric_cols:
|
| 38 |
+
st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
|
| 39 |
+
return
|
| 40 |
+
|
| 41 |
+
# Create tabs for different statistical analyses
|
| 42 |
+
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
|
| 43 |
+
"📊 Descriptive Stats",
|
| 44 |
+
"📈 Correlation Analysis",
|
| 45 |
+
"🔬 Hypothesis Testing",
|
| 46 |
+
"📊 Distribution Analysis",
|
| 47 |
+
"📉 Time Series Analysis",
|
| 48 |
+
"🎲 Probability & Sampling"
|
| 49 |
+
])
|
| 50 |
+
|
| 51 |
+
with tab1:
|
| 52 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 53 |
+
st.subheader("📊 Descriptive Statistics")
|
| 54 |
+
|
| 55 |
+
try:
|
| 56 |
+
# Basic statistics with confidence intervals
|
| 57 |
+
stats_df = pd.DataFrame()
|
| 58 |
+
for col in numeric_cols:
|
| 59 |
+
data = df[col].dropna()
|
| 60 |
+
if len(data) > 0:
|
| 61 |
+
# Calculate confidence interval
|
| 62 |
+
ci = stats.t.interval(0.95, len(data)-1, loc=data.mean(), scale=stats.sem(data))
|
| 63 |
+
|
| 64 |
+
stats_df[col] = {
|
| 65 |
+
'Count': len(data),
|
| 66 |
+
'Mean': data.mean(),
|
| 67 |
+
'Std Dev': data.std(),
|
| 68 |
+
'Variance': data.var(),
|
| 69 |
+
'Min': data.min(),
|
| 70 |
+
'Q1 (25%)': data.quantile(0.25),
|
| 71 |
+
'Median (50%)': data.median(),
|
| 72 |
+
'Q3 (75%)': data.quantile(0.75),
|
| 73 |
+
'Max': data.max(),
|
| 74 |
+
'Range': data.max() - data.min(),
|
| 75 |
+
'IQR': data.quantile(0.75) - data.quantile(0.25),
|
| 76 |
+
'Skewness': data.skew(),
|
| 77 |
+
'Kurtosis': data.kurtosis(),
|
| 78 |
+
'Coefficient of Variation (%)': (data.std() / data.mean() * 100) if data.mean() != 0 else np.nan,
|
| 79 |
+
'95% CI Lower': ci[0],
|
| 80 |
+
'95% CI Upper': ci[1]
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
stats_df = pd.DataFrame(stats_df).T
|
| 84 |
+
st.dataframe(stats_df.style.format("{:.4f}"), use_container_width=True)
|
| 85 |
+
|
| 86 |
+
# Summary cards
|
| 87 |
+
st.subheader("📊 Summary Cards")
|
| 88 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 89 |
+
|
| 90 |
+
with col1:
|
| 91 |
+
st.metric("Total Numeric Columns", len(numeric_cols))
|
| 92 |
+
with col2:
|
| 93 |
+
st.metric("Total Observations", f"{df.shape[0]:,}")
|
| 94 |
+
with col3:
|
| 95 |
+
st.metric("Complete Cases", f"{df.dropna().shape[0]:,}")
|
| 96 |
+
with col4:
|
| 97 |
+
completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
|
| 98 |
+
st.metric("Data Completeness", f"{completeness:.1f}%")
|
| 99 |
+
|
| 100 |
+
# Distribution visualization
|
| 101 |
+
st.subheader("Distribution Analysis")
|
| 102 |
+
selected_col = st.selectbox("Select column for detailed distribution analysis", numeric_cols)
|
| 103 |
+
|
| 104 |
+
data = df[selected_col].dropna()
|
| 105 |
+
|
| 106 |
+
fig = make_subplots(rows=2, cols=2,
|
| 107 |
+
subplot_titles=("Histogram with KDE", "Box Plot",
|
| 108 |
+
"Violin Plot", "Q-Q Plot"),
|
| 109 |
+
specs=[[{"type": "xy"}, {"type": "xy"}],
|
| 110 |
+
[{"type": "xy"}, {"type": "xy"}]])
|
| 111 |
+
|
| 112 |
+
# Histogram with KDE
|
| 113 |
+
hist_data = go.Histogram(x=data, nbinsx=30, name="Histogram", opacity=0.7)
|
| 114 |
+
fig.add_trace(hist_data, row=1, col=1)
|
| 115 |
+
|
| 116 |
+
# Box plot
|
| 117 |
+
box_data = go.Box(y=data, name="Box Plot", boxpoints='outliers')
|
| 118 |
+
fig.add_trace(box_data, row=1, col=2)
|
| 119 |
+
|
| 120 |
+
# Violin plot
|
| 121 |
+
violin_data = go.Violin(y=data, name="Violin Plot", box_visible=True, meanline_visible=True)
|
| 122 |
+
fig.add_trace(violin_data, row=2, col=1)
|
| 123 |
+
|
| 124 |
+
# Q-Q plot
|
| 125 |
+
theoretical_q = np.random.normal(data.mean(), data.std(), len(data))
|
| 126 |
+
theoretical_q.sort()
|
| 127 |
+
data_sorted = np.sort(data)
|
| 128 |
+
qq_data = go.Scatter(x=theoretical_q, y=data_sorted, mode='markers', name='Q-Q')
|
| 129 |
+
fig.add_trace(qq_data, row=2, col=2)
|
| 130 |
+
|
| 131 |
+
# Add reference line to Q-Q plot
|
| 132 |
+
min_val = min(theoretical_q.min(), data_sorted.min())
|
| 133 |
+
max_val = max(theoretical_q.max(), data_sorted.max())
|
| 134 |
+
ref_line = go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
|
| 135 |
+
mode='lines', name='Reference', line=dict(color='red', dash='dash'))
|
| 136 |
+
fig.add_trace(ref_line, row=2, col=2)
|
| 137 |
+
|
| 138 |
+
fig.update_layout(height=800, title_text=f"Distribution Analysis of {selected_col}")
|
| 139 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 140 |
+
|
| 141 |
+
# Outlier detection
|
| 142 |
+
Q1 = data.quantile(0.25)
|
| 143 |
+
Q3 = data.quantile(0.75)
|
| 144 |
+
IQR = Q3 - Q1
|
| 145 |
+
outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
|
| 146 |
+
|
| 147 |
+
if len(outliers) > 0:
|
| 148 |
+
st.warning(f"⚠️ **Outliers detected**: {len(outliers)} outliers found ({len(outliers)/len(data)*100:.2f}%)")
|
| 149 |
+
with st.expander("View outlier values"):
|
| 150 |
+
st.write(outliers.tolist())
|
| 151 |
+
else:
|
| 152 |
+
st.success("✅ No outliers detected in this column")
|
| 153 |
+
|
| 154 |
+
except Exception as e:
|
| 155 |
+
st.error(f"❌ Error in descriptive statistics: {str(e)}")
|
| 156 |
+
st.info("💡 Tip: Check if your data contains non-numeric values or extreme outliers")
|
| 157 |
+
|
| 158 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 159 |
+
|
| 160 |
+
with tab2:
|
| 161 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 162 |
+
st.subheader("📈 Advanced Correlation Analysis")
|
| 163 |
+
|
| 164 |
+
try:
|
| 165 |
+
if len(numeric_cols) >= 2:
|
| 166 |
+
# Multiple correlation methods
|
| 167 |
+
corr_method = st.radio(
|
| 168 |
+
"Select correlation method",
|
| 169 |
+
["Pearson (linear)", "Spearman (rank)", "Kendall (ordinal)"],
|
| 170 |
+
horizontal=True
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
method_map = {
|
| 174 |
+
"Pearson (linear)": "pearson",
|
| 175 |
+
"Spearman (rank)": "spearman",
|
| 176 |
+
"Kendall (ordinal)": "kendall"
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
# Calculate correlation matrix
|
| 180 |
+
corr_matrix = df[numeric_cols].corr(method=method_map[corr_method])
|
| 181 |
+
|
| 182 |
+
# Heatmap with improved visualization
|
| 183 |
+
fig = px.imshow(corr_matrix,
|
| 184 |
+
text_auto=True,
|
| 185 |
+
aspect="auto",
|
| 186 |
+
color_continuous_scale='RdBu_r',
|
| 187 |
+
title=f"{corr_method} Correlation Matrix",
|
| 188 |
+
zmin=-1, zmax=1)
|
| 189 |
+
|
| 190 |
+
fig.update_layout(height=600)
|
| 191 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 192 |
+
|
| 193 |
+
# Correlation significance testing
|
| 194 |
+
st.subheader("📊 Correlation Significance Testing")
|
| 195 |
+
|
| 196 |
+
col1, col2 = st.columns(2)
|
| 197 |
+
with col1:
|
| 198 |
+
feat1 = st.selectbox("Select first feature", numeric_cols, key="corr_feat1")
|
| 199 |
+
with col2:
|
| 200 |
+
feat2 = st.selectbox("Select second feature", [c for c in numeric_cols if c != feat1], key="corr_feat2")
|
| 201 |
+
|
| 202 |
+
data1 = df[feat1].dropna()
|
| 203 |
+
data2 = df[feat2].dropna()
|
| 204 |
+
|
| 205 |
+
# Align data
|
| 206 |
+
combined = pd.concat([data1, data2], axis=1).dropna()
|
| 207 |
+
if len(combined) > 0:
|
| 208 |
+
corr_coef, p_value = stats.pearsonr(combined.iloc[:, 0], combined.iloc[:, 1])
|
| 209 |
+
|
| 210 |
+
st.write(f"**Pearson correlation coefficient:** {corr_coef:.4f}")
|
| 211 |
+
st.write(f"**P-value:** {p_value:.4f}")
|
| 212 |
+
|
| 213 |
+
if p_value < 0.05:
|
| 214 |
+
st.success(f"✅ Statistically significant correlation (p < 0.05)")
|
| 215 |
+
else:
|
| 216 |
+
st.info(f"ℹ️ No statistically significant correlation (p >= 0.05)")
|
| 217 |
+
|
| 218 |
+
# Confidence interval for correlation
|
| 219 |
+
n = len(combined)
|
| 220 |
+
r = corr_coef
|
| 221 |
+
z = np.arctanh(r)
|
| 222 |
+
se = 1 / np.sqrt(n - 3)
|
| 223 |
+
ci_z = stats.norm.interval(0.95, loc=z, scale=se)
|
| 224 |
+
ci_r = np.tanh(ci_z)
|
| 225 |
+
|
| 226 |
+
st.write(f"**95% Confidence Interval:** [{ci_r[0]:.4f}, {ci_r[1]:.4f}]")
|
| 227 |
+
|
| 228 |
+
# Scatter plot with regression line
|
| 229 |
+
fig = px.scatter(combined, x=combined.columns[0], y=combined.columns[1],
|
| 230 |
+
trendline="ols", title=f"Relationship: {feat1} vs {feat2}")
|
| 231 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 232 |
+
|
| 233 |
+
# Partial correlation analysis
|
| 234 |
+
st.subheader("🔍 Partial Correlation Analysis")
|
| 235 |
+
if len(numeric_cols) >= 3:
|
| 236 |
+
from sklearn.linear_model import LinearRegression
|
| 237 |
+
|
| 238 |
+
control_var = st.selectbox("Select control variable",
|
| 239 |
+
[c for c in numeric_cols if c not in [feat1, feat2]])
|
| 240 |
+
|
| 241 |
+
# Calculate partial correlation
|
| 242 |
+
X_control = df[[control_var]].dropna()
|
| 243 |
+
y1 = df[feat1].dropna()
|
| 244 |
+
y2 = df[feat2].dropna()
|
| 245 |
+
|
| 246 |
+
# Align data
|
| 247 |
+
aligned_data = pd.concat([X_control, y1, y2], axis=1).dropna()
|
| 248 |
+
|
| 249 |
+
if len(aligned_data) > 0:
|
| 250 |
+
# Residualize
|
| 251 |
+
model1 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat1])
|
| 252 |
+
res1 = aligned_data[feat1] - model1.predict(aligned_data[[control_var]])
|
| 253 |
+
|
| 254 |
+
model2 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat2])
|
| 255 |
+
res2 = aligned_data[feat2] - model2.predict(aligned_data[[control_var]])
|
| 256 |
+
|
| 257 |
+
partial_corr, partial_p = stats.pearsonr(res1, res2)
|
| 258 |
+
|
| 259 |
+
st.write(f"**Partial correlation (controlling for {control_var}):** {partial_corr:.4f}")
|
| 260 |
+
st.write(f"**P-value:** {partial_p:.4f}")
|
| 261 |
+
|
| 262 |
+
if abs(partial_corr) < abs(corr_coef):
|
| 263 |
+
st.info(f"ℹ️ The correlation decreases when controlling for {control_var}, suggesting it may be a confounding variable")
|
| 264 |
+
else:
|
| 265 |
+
st.warning("⚠️ Need at least 2 numeric columns for correlation analysis")
|
| 266 |
+
|
| 267 |
+
except Exception as e:
|
| 268 |
+
st.error(f"❌ Error in correlation analysis: {str(e)}")
|
| 269 |
+
st.info("💡 Tip: Ensure your data has sufficient non-null values for correlation calculation")
|
| 270 |
+
|
| 271 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 272 |
+
|
| 273 |
+
with tab3:
|
| 274 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 275 |
+
st.subheader("🔬 Statistical Hypothesis Testing")
|
| 276 |
+
|
| 277 |
+
try:
|
| 278 |
+
test_category = st.selectbox(
|
| 279 |
+
"Select test category",
|
| 280 |
+
["Parametric Tests", "Non-parametric Tests", "ANOVA & Post-hoc", "Goodness of Fit"]
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
if test_category == "Parametric Tests":
|
| 284 |
+
param_test = st.selectbox(
|
| 285 |
+
"Select parametric test",
|
| 286 |
+
["One-Sample t-test", "Independent t-test", "Paired t-test", "Z-test"]
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
if param_test == "One-Sample t-test":
|
| 290 |
+
if numeric_cols:
|
| 291 |
+
col = st.selectbox("Select variable", numeric_cols)
|
| 292 |
+
test_value = st.number_input("Test value (population mean)", value=0.0)
|
| 293 |
+
|
| 294 |
+
data = df[col].dropna()
|
| 295 |
+
if len(data) > 0:
|
| 296 |
+
t_stat, p_value = stats.ttest_1samp(data, test_value)
|
| 297 |
+
|
| 298 |
+
st.write(f"**t-statistic:** {t_stat:.4f}")
|
| 299 |
+
st.write(f"**p-value:** {p_value:.4f}")
|
| 300 |
+
st.write(f"**Degrees of freedom:** {len(data)-1}")
|
| 301 |
+
|
| 302 |
+
# Effect size (Cohen's d)
|
| 303 |
+
cohens_d = (data.mean() - test_value) / data.std()
|
| 304 |
+
st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")
|
| 305 |
+
|
| 306 |
+
if p_value < 0.05:
|
| 307 |
+
st.success(f"✅ Reject null hypothesis: Mean is significantly different from {test_value}")
|
| 308 |
+
else:
|
| 309 |
+
st.info(f"ℹ️ Fail to reject null hypothesis: Mean is not significantly different from {test_value}")
|
| 310 |
+
|
| 311 |
+
# Visualization
|
| 312 |
+
fig = go.Figure()
|
| 313 |
+
fig.add_trace(go.Histogram(x=data, name="Sample", opacity=0.7))
|
| 314 |
+
fig.add_vline(x=test_value, line_dash="dash", line_color="red",
|
| 315 |
+
annotation_text=f"Test value: {test_value}")
|
| 316 |
+
fig.add_vline(x=data.mean(), line_color="green",
|
| 317 |
+
annotation_text=f"Sample mean: {data.mean():.2f}")
|
| 318 |
+
fig.update_layout(title=f"One-Sample t-test: {col}")
|
| 319 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 320 |
+
|
| 321 |
+
elif param_test == "Independent t-test":
|
| 322 |
+
if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
|
| 323 |
+
num_col = st.selectbox("Select numeric variable", numeric_cols, key="ind_num")
|
| 324 |
+
cat_col = st.selectbox("Select grouping variable", categorical_cols, key="ind_cat")
|
| 325 |
+
|
| 326 |
+
groups = df[cat_col].dropna().unique()
|
| 327 |
+
if len(groups) == 2:
|
| 328 |
+
group1 = df[df[cat_col] == groups[0]][num_col].dropna()
|
| 329 |
+
group2 = df[df[cat_col] == groups[1]][num_col].dropna()
|
| 330 |
+
|
| 331 |
+
# Test for equal variances
|
| 332 |
+
levene_stat, levene_p = stats.levene(group1, group2)
|
| 333 |
+
equal_var = levene_p > 0.05
|
| 334 |
+
|
| 335 |
+
t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=equal_var)
|
| 336 |
+
|
| 337 |
+
st.write(f"**Groups:** {groups[0]} (n={len(group1)}) vs {groups[1]} (n={len(group2)})")
|
| 338 |
+
st.write(f"**Levene's test for equal variances:** p={levene_p:.4f}")
|
| 339 |
+
st.write(f"**Assuming {'equal' if equal_var else 'unequal'} variances")
|
| 340 |
+
st.write(f"**t-statistic:** {t_stat:.4f}")
|
| 341 |
+
st.write(f"**p-value:** {p_value:.4f}")
|
| 342 |
+
|
| 343 |
+
# Effect size (Cohen's d)
|
| 344 |
+
pooled_std = np.sqrt(((len(group1)-1)*group1.std()**2 + (len(group2)-1)*group2.std()**2) /
|
| 345 |
+
(len(group1)+len(group2)-2))
|
| 346 |
+
cohens_d = (group1.mean() - group2.mean()) / pooled_std
|
| 347 |
+
st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")
|
| 348 |
+
|
| 349 |
+
if p_value < 0.05:
|
| 350 |
+
st.success(f"✅ Significant difference found between groups")
|
| 351 |
+
else:
|
| 352 |
+
st.info(f"ℹ️ No significant difference found between groups")
|
| 353 |
+
|
| 354 |
+
# Visualization
|
| 355 |
+
fig = px.box(df, x=cat_col, y=num_col, title=f"Comparison: {num_col} by {cat_col}")
|
| 356 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 357 |
+
else:
|
| 358 |
+
st.warning(f"⚠️ Independent t-test requires exactly 2 groups. Found {len(groups)} groups.")
|
| 359 |
+
|
| 360 |
+
elif param_test == "Paired t-test":
|
| 361 |
+
if len(numeric_cols) >= 2:
|
| 362 |
+
col1 = st.selectbox("Select first measurement", numeric_cols, key="paired1")
|
| 363 |
+
col2 = st.selectbox("Select second measurement", numeric_cols, key="paired2")
|
| 364 |
+
|
| 365 |
+
paired_data = df[[col1, col2]].dropna()
|
| 366 |
+
if len(paired_data) > 0:
|
| 367 |
+
t_stat, p_value = stats.ttest_rel(paired_data[col1], paired_data[col2])
|
| 368 |
+
|
| 369 |
+
st.write(f"**Sample size:** {len(paired_data)}")
|
| 370 |
+
st.write(f"**Mean difference:** {(paired_data[col1] - paired_data[col2]).mean():.4f}")
|
| 371 |
+
st.write(f"**t-statistic:** {t_stat:.4f}")
|
| 372 |
+
st.write(f"**p-value:** {p_value:.4f}")
|
| 373 |
+
|
| 374 |
+
if p_value < 0.05:
|
| 375 |
+
st.success(f"✅ Significant difference found between measurements")
|
| 376 |
+
else:
|
| 377 |
+
st.info(f"ℹ️ No significant difference found between measurements")
|
| 378 |
+
|
| 379 |
+
# Visualization
|
| 380 |
+
fig = go.Figure()
|
| 381 |
+
fig.add_trace(go.Scatter(x=paired_data[col1], y=paired_data[col2],
|
| 382 |
+
mode='markers', text=paired_data.index))
|
| 383 |
+
fig.add_trace(go.Scatter(x=[paired_data[col1].min(), paired_data[col1].max()],
|
| 384 |
+
y=[paired_data[col1].min(), paired_data[col1].max()],
|
| 385 |
+
mode='lines', name='y=x', line=dict(dash='dash')))
|
| 386 |
+
fig.update_layout(title=f"Paired Comparison: {col1} vs {col2}")
|
| 387 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 388 |
+
|
| 389 |
+
elif test_category == "Non-parametric Tests":
|
| 390 |
+
nonparam_test = st.selectbox(
|
| 391 |
+
"Select non-parametric test",
|
| 392 |
+
["Mann-Whitney U", "Wilcoxon Signed-Rank", "Kruskal-Wallis H", "Friedman Test"]
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
if nonparam_test == "Mann-Whitney U":
|
| 396 |
+
if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
|
| 397 |
+
num_col = st.selectbox("Select numeric variable", numeric_cols, key="mw_num")
|
| 398 |
+
cat_col = st.selectbox("Select grouping variable", categorical_cols, key="mw_cat")
|
| 399 |
+
|
| 400 |
+
groups = df[cat_col].dropna().unique()
|
| 401 |
+
if len(groups) == 2:
|
| 402 |
+
group1 = df[df[cat_col] == groups[0]][num_col].dropna()
|
| 403 |
+
group2 = df[df[cat_col] == groups[1]][num_col].dropna()
|
| 404 |
+
|
| 405 |
+
u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
|
| 406 |
+
|
| 407 |
+
st.write(f"**U-statistic:** {u_stat:.4f}")
|
| 408 |
+
st.write(f"**p-value:** {p_value:.4f}")
|
| 409 |
+
|
| 410 |
+
# Effect size (r = Z/√N)
|
| 411 |
+
from scipy.stats import norm
|
| 412 |
+
z_score = norm.ppf(p_value/2) if p_value < 1 else 0
|
| 413 |
+
effect_size = abs(z_score) / np.sqrt(len(group1) + len(group2))
|
| 414 |
+
st.write(f"**Effect size (r):** {effect_size:.4f}")
|
| 415 |
+
|
| 416 |
+
if p_value < 0.05:
|
| 417 |
+
st.success(f"✅ Significant difference found between groups")
|
| 418 |
+
else:
|
| 419 |
+
st.info(f"ℹ️ No significant difference found between groups")
|
| 420 |
+
|
| 421 |
+
# Visualization
|
| 422 |
+
fig = px.violin(df, x=cat_col, y=num_col, box=True, points="all",
|
| 423 |
+
title=f"Mann-Whitney U Test: {num_col} by {cat_col}")
|
| 424 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 425 |
+
|
| 426 |
+
elif test_category == "ANOVA & Post-hoc":
|
| 427 |
+
if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
|
| 428 |
+
num_col = st.selectbox("Select numeric variable", numeric_cols, key="anova_num")
|
| 429 |
+
cat_col = st.selectbox("Select grouping variable", categorical_cols, key="anova_cat")
|
| 430 |
+
|
| 431 |
+
groups = [df[df[cat_col] == group][num_col].dropna()
|
| 432 |
+
for group in df[cat_col].unique() if len(df[df[cat_col] == group]) > 0]
|
| 433 |
+
|
| 434 |
+
if len(groups) >= 2:
|
| 435 |
+
# One-way ANOVA
|
| 436 |
+
f_stat, p_value = stats.f_oneway(*groups)
|
| 437 |
+
|
| 438 |
+
st.write("**One-way ANOVA Results:**")
|
| 439 |
+
st.write(f"**F-statistic:** {f_stat:.4f}")
|
| 440 |
+
st.write(f"**p-value:** {p_value:.4f}")
|
| 441 |
+
|
| 442 |
+
if p_value < 0.05:
|
| 443 |
+
st.success("✅ Significant differences found between groups")
|
| 444 |
+
|
| 445 |
+
# Post-hoc Tukey HSD
|
| 446 |
+
if st.button("Run Tukey HSD Post-hoc Test"):
|
| 447 |
+
tukey = pairwise_tukeyhsd(df[num_col].dropna(), df[cat_col].dropna())
|
| 448 |
+
tukey_df = pd.DataFrame(data=tukey.summary().data[1:],
|
| 449 |
+
columns=tukey.summary().data[0])
|
| 450 |
+
st.dataframe(tukey_df)
|
| 451 |
+
|
| 452 |
+
# Visualize confidence intervals
|
| 453 |
+
fig = go.Figure()
|
| 454 |
+
for i, row in enumerate(tukey_df.itertuples()):
|
| 455 |
+
if row.padj < 0.05:
|
| 456 |
+
color = 'green'
|
| 457 |
+
else:
|
| 458 |
+
color = 'red'
|
| 459 |
+
fig.add_trace(go.Scatter(x=[row[4], row[5]], y=[i, i],
|
| 460 |
+
mode='lines', line=dict(color=color, width=3),
|
| 461 |
+
name=f"{row[1]} vs {row[2]}"))
|
| 462 |
+
fig.update_layout(title="Tukey HSD Confidence Intervals",
|
| 463 |
+
xaxis_title="Mean Difference",
|
| 464 |
+
yaxis_title="Comparison")
|
| 465 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 466 |
+
else:
|
| 467 |
+
st.info("ℹ️ No significant differences found between groups")
|
| 468 |
+
|
| 469 |
+
# Visualization
|
| 470 |
+
fig = px.box(df, x=cat_col, y=num_col, title=f"ANOVA: {num_col} by {cat_col}")
|
| 471 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 472 |
+
|
| 473 |
+
except Exception as e:
|
| 474 |
+
st.error(f"❌ Error in hypothesis testing: {str(e)}")
|
| 475 |
+
st.info("💡 Tip: Ensure you have sufficient data and appropriate variable types for the selected test")
|
| 476 |
+
|
| 477 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 478 |
+
|
| 479 |
+
with tab4:
|
| 480 |
+
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
|
| 481 |
+
st.subheader("📊 Distribution Analysis & Normality Tests")
|
| 482 |
+
|
| 483 |
+
try:
|
| 484 |
+
if numeric_cols:
|
| 485 |
+
col = st.selectbox("Select column for distribution analysis", numeric_cols, key="dist_col")
|
| 486 |
+
data = df[col].dropna()
|
| 487 |
+
|
| 488 |
+
if len(data) > 0:
|
| 489 |
+
# Multiple normality tests
|
| 490 |
+
st.markdown("### 🔍 Normality Tests")
|
| 491 |
+
|
| 492 |
+
col1, col2 = st.columns(2)
|
| 493 |
+
|
| 494 |
+
with col1:
|
| 495 |
+
# Shapiro-Wilk test
|
| 496 |
+
if len(data) <= 5000:
|
| 497 |
+
shapiro_stat, shapiro_p = stats.shapiro(data)
|
| 498 |
+
st.write("**Shapiro-Wilk Test**")
|
| 499 |
+
st.write(f"Statistic: {shapiro_stat:.4f}")
|
| 500 |
+
st.write(f"P-value: {shapiro_p:.4f}")
|
| 501 |
+
if shapiro_p < 0.05:
|
| 502 |
+
st.error("❌ Not normally distributed")
|
| 503 |
+
else:
|
| 504 |
+
st.success("✅ Normally distributed")
|
| 505 |
+
|
| 506 |
+
with col2:
|
| 507 |
+
# Kolmogorov-Smirnov test
|
| 508 |
+
ks_stat, ks_p = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
|
| 509 |
+
st.write("**Kolmogorov-Smirnov Test**")
|
| 510 |
+
st.write(f"Statistic: {ks_stat:.4f}")
|
| 511 |
+
st.write(f"P-value: {ks_p:.4f}")
|
| 512 |
+
if ks_p < 0.05:
|
| 513 |
+
st.error("❌ Not normally distributed")
|
| 514 |
+
else:
|
| 515 |
+
st.success("✅ Normally distributed")
|
| 516 |
+
|
| 517 |
+
# Anderson-Darling test
|
| 518 |
+
anderson_stat, anderson_crit, anderson_sig = stats.anderson(data, dist='norm')
|
| 519 |
+
st.write("**Anderson-Darling Test**")
|
| 520 |
+
st.write(f"Statistic: {anderson_stat:.4f}")
|
| 521 |
+
for i in range(len(anderson_crit)):
|
| 522 |
+
st.write(f"Critical value at {anderson_sig[i]}%: {anderson_crit[i]:.4f}")
|
| 523 |
+
|
| 524 |
+
# D'Agostino's K-squared test
|
| 525 |
+
skew_stat, skew_p = stats.skewtest(data)
|
| 526 |
+
kurt_stat, kurt_p = stats.kurtosistest(data)
|
| 527 |
+
|
| 528 |
+
st.write("**D'Agostino's Tests**")
|
| 529 |
+
st.write(f"Skewness test p-value: {skew_p:.4f}")
|
| 530 |
+
st.write(f"Kurtosis test p-value: {kurt_p:.4f}")
|
| 531 |
+
|
| 532 |
+
# Distribution fitting
|
| 533 |
+
st.markdown("### 📈 Distribution Fitting")
|
| 534 |
+
|
| 535 |
+
distributions = ['norm', 'expon', 'gamma', 'beta', 'lognorm', 'uniform']
|
| 536 |
+
            selected_dist = st.selectbox("Select distribution to fit", distributions)

            if selected_dist == 'norm':
                params = stats.norm.fit(data)
                pdf = stats.norm.pdf(np.sort(data), *params)
            elif selected_dist == 'expon':
                params = stats.expon.fit(data)
                pdf = stats.expon.pdf(np.sort(data), *params)
            elif selected_dist == 'gamma':
                params = stats.gamma.fit(data)
                pdf = stats.gamma.pdf(np.sort(data), *params)
            elif selected_dist == 'beta':
                # Scale data to [0, 1] for the beta distribution
                scaled_data = (data - data.min()) / (data.max() - data.min())
                scaled_data = scaled_data[(scaled_data > 0) & (scaled_data < 1)]
                if len(scaled_data) > 0:
                    params = stats.beta.fit(scaled_data)
                    pdf = stats.beta.pdf(np.sort(scaled_data), *params)
            elif selected_dist == 'lognorm':
                params = stats.lognorm.fit(data)
                pdf = stats.lognorm.pdf(np.sort(data), *params)
            elif selected_dist == 'uniform':
                params = stats.uniform.fit(data)
                pdf = stats.uniform.pdf(np.sort(data), *params)

            # Plot histogram with the fitted distribution overlaid
            fig = go.Figure()
            fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Data", opacity=0.7))

            if selected_dist != 'beta':
                # Scale the density to histogram counts: n * bin_width, with bin_width = range / 30
                fig.add_trace(go.Scatter(x=np.sort(data), y=pdf * len(data) * (data.max() - data.min()) / 30,
                                         mode='lines', name=f"Fitted {selected_dist}",
                                         line=dict(color='red', width=2)))

            fig.update_layout(title=f"Histogram with Fitted {selected_dist} Distribution")
            st.plotly_chart(fig, use_container_width=True)

            # Q-Q plot with confidence bands
            st.markdown("### 📊 Enhanced Q-Q Plot")

            # Theoretical normal quantiles at the standard plotting positions
            # (i - 0.5) / n; norm.ppf is less noisy than drawing a random sample
            n = len(data)
            probs = (np.arange(1, n + 1) - 0.5) / n
            theoretical_q = stats.norm.ppf(probs, loc=data.mean(), scale=data.std())
            data_sorted = np.sort(data)

            # Calculate confidence bands (bootstrap)
            n_bootstrap = 100
            bootstrap_lines = []
            for i in range(n_bootstrap):
                bootstrap_sample = np.random.choice(data, len(data), replace=True)
                bootstrap_sample.sort()
                bootstrap_lines.append(bootstrap_sample)

            bootstrap_lines = np.array(bootstrap_lines)
            lower_band = np.percentile(bootstrap_lines, 2.5, axis=0)
            upper_band = np.percentile(bootstrap_lines, 97.5, axis=0)

            fig = go.Figure()

            # Add confidence band
            fig.add_trace(go.Scatter(x=np.concatenate([theoretical_q, theoretical_q[::-1]]),
                                     y=np.concatenate([lower_band, upper_band[::-1]]),
                                     fill='toself', fillcolor='rgba(0,100,80,0.2)',
                                     line=dict(color='rgba(255,255,255,0)'),
                                     name='95% CI'))

            # Add data points
            fig.add_trace(go.Scatter(x=theoretical_q, y=data_sorted,
                                     mode='markers', name='Data'))

            # Add reference line
            fig.add_trace(go.Scatter(x=[data_sorted.min(), data_sorted.max()],
                                     y=[data_sorted.min(), data_sorted.max()],
                                     mode='lines', name='Reference',
                                     line=dict(color='red', dash='dash')))

            fig.update_layout(title="Enhanced Q-Q Plot with 95% Confidence Band")
            st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"❌ Error in distribution analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data points for distribution fitting")

        st.markdown('</div>', unsafe_allow_html=True)
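A quick sanity check on the PDF overlay above: a fitted density is converted to expected histogram counts by multiplying by n · bin_width (here range/30). A minimal standalone sketch of that scaling on synthetic data (nothing below is part of app.py):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
data = rng.normal(loc=5.0, scale=2.0, size=1000)

nbins = 30
bin_width = (data.max() - data.min()) / nbins

# Fit a normal and evaluate its density on a grid
params = stats.norm.fit(data)
grid = np.linspace(data.min(), data.max(), 200)
pdf = stats.norm.pdf(grid, *params)

# Density -> expected count per bin: pdf * n * bin_width
expected_counts = pdf * len(data) * bin_width
counts, _ = np.histogram(data, bins=nbins)
print(counts.max(), expected_counts.max())  # should be the same order of magnitude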

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📉 Advanced Time Series Analysis")

        try:
            if datetime_cols and numeric_cols:
                date_col = st.selectbox("Select date column", datetime_cols)
                value_col = st.selectbox("Select value column", numeric_cols, key="ts_value_adv")

                # Prepare time series data
                ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
                ts_df.set_index(date_col, inplace=True)

                if len(ts_df) >= 10:
                    # Time series decomposition
                    st.markdown("### 🔄 Time Series Decomposition")

                    from statsmodels.tsa.seasonal import seasonal_decompose

                    # Determine seasonal period
                    freq_options = {
                        'Auto-detect': None,
                        'Daily (7)': 7,
                        'Weekly (52)': 52,
                        'Monthly (12)': 12,
                        'Quarterly (4)': 4
                    }

                    selected_freq = st.selectbox("Select seasonal period", list(freq_options.keys()))
                    period = freq_options[selected_freq]

                    if period is None:
                        # Auto-detect frequency from the index
                        try:
                            freq = pd.infer_freq(ts_df.index)
                            if freq:
                                period_map = {'D': 7, 'W': 52, 'M': 12, 'Q': 4}
                                period = period_map.get(freq[0], 7)
                        except Exception:
                            period = 7

                    if len(ts_df) >= 2 * period:
                        decomposition = seasonal_decompose(ts_df[value_col], model='additive', period=period)

                        fig = make_subplots(rows=4, cols=1,
                                            subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'))

                        fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                                 mode='lines', name='Original'), row=1, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.trend,
                                                 mode='lines', name='Trend'), row=2, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.seasonal,
                                                 mode='lines', name='Seasonal'), row=3, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.resid,
                                                 mode='lines', name='Residual'), row=4, col=1)

                        fig.update_layout(height=800, title="Time Series Decomposition")
                        st.plotly_chart(fig, use_container_width=True)

                        # Stationarity tests
                        st.markdown("### 📊 Stationarity Tests")

                        col1, col2 = st.columns(2)

                        with col1:
                            # ADF test (null hypothesis: unit root, i.e. non-stationary)
                            adf_result = adfuller(ts_df[value_col].dropna())
                            st.write("**Augmented Dickey-Fuller Test**")
                            st.write(f"ADF Statistic: {adf_result[0]:.4f}")
                            st.write(f"p-value: {adf_result[1]:.4f}")
                            st.write("Critical values:")
                            for key, value in adf_result[4].items():
                                st.write(f" {key}: {value:.4f}")

                            if adf_result[1] < 0.05:
                                st.success("✅ Series is stationary")
                            else:
                                st.warning("⚠️ Series is non-stationary")

                        with col2:
                            # KPSS test (null hypothesis: stationary)
                            kpss_result = kpss(ts_df[value_col].dropna(), regression='c')
                            st.write("**KPSS Test**")
                            st.write(f"KPSS Statistic: {kpss_result[0]:.4f}")
                            st.write(f"p-value: {kpss_result[1]:.4f}")
                            st.write("Critical values:")
                            for key, value in kpss_result[3].items():
                                st.write(f" {key}: {value:.4f}")

                            if kpss_result[1] < 0.05:
                                st.warning("⚠️ Series is non-stationary")
                            else:
                                st.success("✅ Series is stationary")

                        # ACF and PACF plots
                        st.markdown("### 📈 ACF and PACF Plots")

                        lags = st.slider("Number of lags", 10, 50, 20)

                        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
                        plot_acf(ts_df[value_col].dropna(), lags=lags, ax=ax1)
                        plot_pacf(ts_df[value_col].dropna(), lags=lags, ax=ax2)
                        plt.tight_layout()
                        st.pyplot(fig)

                        # Forecasting with a simple model
                        st.markdown("### 🔮 Simple Forecasting")

                        forecast_periods = st.slider("Forecast periods", 1, 30, 10)

                        from statsmodels.tsa.holtwinters import ExponentialSmoothing

                        model = ExponentialSmoothing(ts_df[value_col],
                                                     seasonal_periods=period,
                                                     trend='add', seasonal='add')
                        fitted_model = model.fit()
                        forecast = fitted_model.forecast(forecast_periods)

                        # Plot forecast
                        fig = go.Figure()
                        fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                                 mode='lines', name='Historical'))
                        fig.add_trace(go.Scatter(x=forecast.index, y=forecast,
                                                 mode='lines+markers', name='Forecast',
                                                 line=dict(color='red')))
                        fig.update_layout(title=f"Exponential Smoothing Forecast ({forecast_periods} periods)")
                        st.plotly_chart(fig, use_container_width=True)

            else:
                st.info("ℹ️ Need both datetime and numeric columns for time series analysis")

        except Exception as e:
            st.error(f"❌ Error in time series analysis: {str(e)}")
            st.info("💡 Tip: Ensure your date column is properly formatted as datetime")

        st.markdown('</div>', unsafe_allow_html=True)
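The decomposition-plus-forecast pattern in this tab can be exercised outside Streamlit. A minimal sketch on a synthetic monthly series with a 12-period season (all names are local to this example):

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing

idx = pd.date_range("2020-01-01", periods=48, freq="MS")
rng = np.random.default_rng(1)
y = pd.Series(10 + 0.2 * np.arange(48)
              + 3 * np.sin(2 * np.pi * np.arange(48) / 12)
              + rng.normal(0, 0.5, 48), index=idx)

# Additive decomposition with a 12-month seasonal period
dec = seasonal_decompose(y, model="additive", period=12)
print(dec.trend.dropna().head())

# Holt-Winters with additive trend and seasonality, then a 6-step forecast
model = ExponentialSmoothing(y, trend="add", seasonal="add", seasonal_periods=12)
print(model.fit().forecast(6))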

    with tab6:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎲 Probability & Sampling Analysis")

        try:
            if numeric_cols:
                col = st.selectbox("Select column for probability analysis", numeric_cols, key="prob_col")
                data = df[col].dropna()

                if len(data) > 0:
                    # Probability distribution fitting
                    st.markdown("### 📊 Probability Distribution Fitting")

                    # Calculate the empirical CDF
                    sorted_data = np.sort(data)
                    ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=sorted_data, y=ecdf,
                                             mode='lines', name='Empirical CDF'))

                    # Fit theoretical distributions
                    dist_options = ['Normal', 'Exponential', 'Gamma', 'Log-normal']
                    selected_dist = st.multiselect("Select distributions to compare", dist_options, default=['Normal'])

                    colors = ['red', 'green', 'blue', 'orange']
                    for i, dist_name in enumerate(selected_dist):
                        if dist_name == 'Normal':
                            params = stats.norm.fit(data)
                            theoretical_cdf = stats.norm.cdf(sorted_data, *params)
                        elif dist_name == 'Exponential':
                            params = stats.expon.fit(data)
                            theoretical_cdf = stats.expon.cdf(sorted_data, *params)
                        elif dist_name == 'Gamma':
                            params = stats.gamma.fit(data)
                            theoretical_cdf = stats.gamma.cdf(sorted_data, *params)
                        elif dist_name == 'Log-normal':
                            params = stats.lognorm.fit(data)
                            theoretical_cdf = stats.lognorm.cdf(sorted_data, *params)

                        fig.add_trace(go.Scatter(x=sorted_data, y=theoretical_cdf,
                                                 mode='lines', name=f'{dist_name} CDF',
                                                 line=dict(color=colors[i], dash='dash')))

                    fig.update_layout(title="CDF Comparison: Empirical vs Theoretical",
                                      xaxis_title=col, yaxis_title="Cumulative Probability")
                    st.plotly_chart(fig, use_container_width=True)

                    # Goodness-of-fit tests
                    st.markdown("### 📈 Goodness of Fit Tests")

                    for dist_name in selected_dist:
                        if dist_name == 'Normal':
                            ks_stat, ks_p = stats.kstest(data, 'norm', args=stats.norm.fit(data))
                        elif dist_name == 'Exponential':
                            ks_stat, ks_p = stats.kstest(data, 'expon', args=stats.expon.fit(data))
                        elif dist_name == 'Gamma':
                            ks_stat, ks_p = stats.kstest(data, 'gamma', args=stats.gamma.fit(data))
                        elif dist_name == 'Log-normal':
                            ks_stat, ks_p = stats.kstest(data, 'lognorm', args=stats.lognorm.fit(data))

                        st.write(f"**{dist_name} Distribution**")
                        st.write(f"KS Statistic: {ks_stat:.4f}")
                        st.write(f"P-value: {ks_p:.4f}")

                        if ks_p < 0.05:
                            st.error(f"❌ Data does NOT follow {dist_name} distribution")
                        else:
                            st.success(f"✅ Data may follow {dist_name} distribution")

                    # Sampling analysis
                    st.markdown("### 🎯 Sampling Analysis")

                    sample_size = st.slider("Sample size", 10, min(500, len(data)), 100)
                    n_samples = st.slider("Number of samples", 10, 1000, 100)

                    # Bootstrap sampling
                    bootstrap_means = []
                    for i in range(n_samples):
                        sample = np.random.choice(data, sample_size, replace=True)
                        bootstrap_means.append(sample.mean())

                    bootstrap_means = np.array(bootstrap_means)

                    # Plot the sampling distribution
                    fig = make_subplots(rows=1, cols=2,
                                        subplot_titles=("Sampling Distribution of Mean",
                                                        "Confidence Intervals"))

                    fig.add_trace(go.Histogram(x=bootstrap_means, nbinsx=30,
                                               name="Sample Means"), row=1, col=1)

                    # Add confidence interval markers
                    ci_lower = np.percentile(bootstrap_means, 2.5)
                    ci_upper = np.percentile(bootstrap_means, 97.5)

                    fig.add_trace(go.Scatter(x=[ci_lower, ci_lower], y=[0, 10],
                                             mode='lines', name='95% CI Lower',
                                             line=dict(color='red', dash='dash')), row=1, col=1)
                    fig.add_trace(go.Scatter(x=[ci_upper, ci_upper], y=[0, 10],
                                             mode='lines', name='95% CI Upper',
                                             line=dict(color='red', dash='dash')), row=1, col=1)

                    # Confidence interval plot
                    for i in range(min(20, n_samples)):
                        sample_mean = bootstrap_means[i]
                        fig.add_trace(go.Scatter(x=[i, i], y=[sample_mean - data.std() / np.sqrt(sample_size),
                                                              sample_mean + data.std() / np.sqrt(sample_size)],
                                                 mode='lines', line=dict(color='blue', width=1),
                                                 showlegend=False), row=1, col=2)
                        fig.add_trace(go.Scatter(x=[i], y=[sample_mean],
                                                 mode='markers', marker=dict(color='red', size=5),
                                                 showlegend=False), row=1, col=2)

                    fig.update_layout(height=500, title="Bootstrap Sampling Analysis")
                    st.plotly_chart(fig, use_container_width=True)

                    # Sampling statistics
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Population Mean", f"{data.mean():.4f}")
                    with col2:
                        st.metric("Mean of Sample Means", f"{bootstrap_means.mean():.4f}")
                    with col3:
                        st.metric("Standard Error", f"{bootstrap_means.std():.4f}")

                    st.write(f"**95% Confidence Interval:** [{ci_lower:.4f}, {ci_upper:.4f}]")

        except Exception as e:
            st.error(f"❌ Error in probability analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data for probability analysis")

        st.markdown('</div>', unsafe_allow_html=True)
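The interval reported above is just the 2.5th and 97.5th percentiles of the bootstrap means; the same computation stripped to its core, on synthetic data:

import numpy as np

rng = np.random.default_rng(42)
data = rng.exponential(scale=3.0, size=500)

# Resample with replacement and record each sample mean
boot_means = np.array([rng.choice(data, size=len(data), replace=True).mean()
                       for _ in range(1000)])

ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])
print(f"mean={data.mean():.3f}, 95% CI=[{ci_lower:.3f}, {ci_upper:.3f}]")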

    # Export options
    st.markdown("---")
    st.markdown("### 📥 Export Statistical Report")

    try:
        report_text = f"""
STATISTICAL ANALYSIS REPORT
===========================

Dataset Information:
• Total Rows: {df.shape[0]:,}
• Total Columns: {df.shape[1]}
• Numeric Columns: {len(numeric_cols)}
• Categorical Columns: {len(categorical_cols)}
• Datetime Columns: {len(datetime_cols)}

Summary Statistics:
{df[numeric_cols].describe().to_string()}

Analysis Performed:
• Descriptive Statistics
• Correlation Analysis
• Hypothesis Testing
• Distribution Analysis
• Time Series Analysis (if applicable)
• Probability & Sampling Analysis
"""

        st.download_button(
            label="📥 Download Complete Statistical Report",
            data=report_text,
            file_name="statistical_analysis_report.txt",
            mime="text/plain",
            use_container_width=True
        )
    except Exception as e:
        st.error(f"❌ Error generating report: {str(e)}")
utils.py
ADDED
@@ -0,0 +1,208 @@
import pandas as pd
import numpy as np
import streamlit as st

def detect_column_types(df):
    """
    Detect and return column types
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()
    boolean = df.select_dtypes(include=['bool']).columns.tolist()

    return numeric, categorical, datetime, boolean

def get_basic_stats(df):
    """
    Return basic statistics about the dataset
    """
    stats = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isnull().sum().sum(),
        'missing_percentage': (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100,
        'duplicates': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
    }
    return stats

def suggest_visualizations(df):
    """
    Suggest appropriate visualizations based on data types
    """
    numeric, categorical, datetime, boolean = detect_column_types(df)

    suggestions = []

    if len(numeric) > 0:
        suggestions.append({
            'type': 'histogram',
            'description': 'Distribution of numeric columns',
            'columns': numeric[:3]
        })

    if len(categorical) > 0:
        suggestions.append({
            'type': 'bar_chart',
            'description': 'Category distributions',
            'columns': categorical[:3]
        })

    if len(numeric) >= 2:
        suggestions.append({
            'type': 'scatter_plot',
            'description': 'Relationship between numeric variables',
            'columns': numeric[:2]
        })

    if len(datetime) > 0 and len(numeric) > 0:
        suggestions.append({
            'type': 'line_chart',
            'description': 'Time series trends',
            'columns': [datetime[0], numeric[0]]
        })

    if len(numeric) > 1:
        suggestions.append({
            'type': 'correlation_heatmap',
            'description': 'Correlations between numeric variables'
        })

    return suggestions

def format_number(num):
    """
    Format large numbers with commas
    """
    if pd.isna(num):
        return "N/A"
    return f"{num:,.0f}"

def format_percentage(num):
    """
    Format as percentage
    """
    if pd.isna(num):
        return "N/A"
    return f"{num:.1f}%"

def get_data_quality_issues(df):
    """
    Identify data quality issues
    """
    issues = []

    # Check for missing values
    missing_cols = df.columns[df.isnull().any()].tolist()
    if missing_cols:
        issues.append({
            'type': 'missing_values',
            'severity': 'high' if df.isnull().sum().sum() > len(df) * 0.1 else 'medium',
            'description': f'Missing values in {len(missing_cols)} columns',
            'columns': missing_cols
        })

    # Check for duplicates
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append({
            'type': 'duplicates',
            'severity': 'medium' if duplicates > len(df) * 0.05 else 'low',
            'description': f'{duplicates} duplicate rows found',
            'count': duplicates
        })

    # Check for constant columns
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        issues.append({
            'type': 'constant_columns',
            'severity': 'low',
            'description': f'{len(constant_cols)} constant columns found',
            'columns': constant_cols
        })

    # Check for outliers in numeric columns (1.5 * IQR rule)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
        if len(outliers) > len(df) * 0.1:
            issues.append({
                'type': 'outliers',
                'severity': 'medium',
                'description': f'Significant outliers in {col}',
                'column': col,
                'outlier_count': len(outliers)
            })
            break  # Just report the first outlier issue

    return issues

def get_recommendations(df):
    """
    Generate data analysis recommendations
    """
    numeric, categorical, datetime, boolean = detect_column_types(df)

    recommendations = []

    # Missing data recommendations
    if df.isnull().sum().sum() > 0:
        recommendations.append("Consider handling missing values using imputation or removal")

    # Feature engineering suggestions
    if len(numeric) >= 2:
        recommendations.append("Create interaction features between highly correlated variables")

    if datetime:
        recommendations.append("Extract time-based features (hour, day, month, year) from datetime columns")

    # Modeling suggestions
    if len(numeric) > 5:
        recommendations.append("Consider dimensionality reduction techniques (PCA, t-SNE)")

    if df.shape[0] > 10000:
        recommendations.append("Dataset is large - consider sampling for faster exploration")

    # Visualization suggestions
    if len(numeric) > 2:
        recommendations.append("Use pair plots to visualize relationships between multiple variables")

    if len(categorical) > 1:
        recommendations.append("Create contingency tables to analyze categorical relationships")

    return recommendations

def create_sample_dataset():
    """
    Create a sample dataset for testing
    """
    np.random.seed(42)
    n_rows = 1000

    data = {
        'id': range(n_rows),
        'age': np.random.normal(40, 15, n_rows).clip(18, 90).astype(int),
        'income': np.random.normal(50000, 20000, n_rows).clip(20000, 150000).astype(int),
        'score': np.random.uniform(0, 100, n_rows).round(2),
        'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        'purchased': np.random.choice([0, 1], n_rows, p=[0.7, 0.3]),
        'signup_date': pd.date_range('2023-01-01', periods=n_rows, freq='D'),
        'satisfaction': np.random.choice([1, 2, 3, 4, 5], n_rows, p=[0.1, 0.15, 0.3, 0.25, 0.2])
    }

    # Add some missing values
    df = pd.DataFrame(data)
    mask = np.random.random(df.shape) < 0.05
    df = df.mask(mask)

    # Add some duplicates
    duplicate_rows = np.random.choice(n_rows, 10, replace=False)
    df = pd.concat([df, df.iloc[duplicate_rows]]).reset_index(drop=True)

    return df
visualization.py
ADDED
@@ -0,0 +1,435 @@
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def auto_visualizations(df):

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📊 Interactive Data Visualization</h2>
        <p style='color: gray;'>Create beautiful, interactive visualizations with just a few clicks</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    date_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Visualization type selector
    viz_type = st.selectbox(
        "🎨 Select Visualization Type",
        ["Distribution Plots", "Categorical Plots", "Relationship Plots",
         "Time Series Plots", "Statistical Plots", "Advanced Plots"]
    )

    if viz_type == "Distribution Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Distribution Plots")

        if num_cols:
            # Create tabs for different distribution plots
            dist_tab1, dist_tab2, dist_tab3 = st.tabs(["Histogram", "Box Plot", "Violin Plot"])

            with dist_tab1:
                col1, col2 = st.columns(2)
                with col1:
                    hist_col = st.selectbox("Select column", num_cols, key="hist")
                with col2:
                    bins = st.slider("Number of bins", 5, 100, 30)

                fig = px.histogram(df, x=hist_col, nbins=bins,
                                   title=f"Distribution of {hist_col}",
                                   marginal="box", opacity=0.7)
                fig.update_layout(showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            with dist_tab2:
                if cat_cols:
                    box_col = st.selectbox("Numeric column", num_cols, key="box_num")
                    box_cat = st.selectbox("Category column (optional)", ["None"] + cat_cols, key="box_cat")

                    if box_cat == "None":
                        fig = px.box(df, y=box_col, title=f"Box Plot of {box_col}")
                    else:
                        fig = px.box(df, x=box_cat, y=box_col, title=f"{box_col} by {box_cat}")

                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Add categorical columns to create grouped box plots")

            with dist_tab3:
                if cat_cols:
                    violin_col = st.selectbox("Numeric column", num_cols, key="violin_num")
                    violin_cat = st.selectbox("Category column", cat_cols, key="violin_cat")

                    fig = px.violin(df, x=violin_cat, y=violin_col,
                                    box=True, points="all",
                                    title=f"Violin Plot of {violin_col} by {violin_cat}")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for distribution plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Categorical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Categorical Plots")

        if cat_cols:
            # Create tabs for categorical plots
            cat_tab1, cat_tab2, cat_tab3 = st.tabs(["Bar Chart", "Pie Chart", "Sunburst Chart"])

            with cat_tab1:
                bar_col = st.selectbox("Select categorical column", cat_cols, key="bar")

                # Get value counts
                value_counts = df[bar_col].value_counts().reset_index()
                value_counts.columns = [bar_col, 'count']

                # Color option
                if num_cols:
                    color_by = st.selectbox("Color by (optional)", ["None"] + num_cols, key="bar_color")
                else:
                    color_by = "None"

                if color_by == "None":
                    fig = px.bar(value_counts, x=bar_col, y='count',
                                 title=f"Distribution of {bar_col}",
                                 color_discrete_sequence=['#636EFA'])
                else:
                    # Aggregate the numeric column by category
                    agg_data = df.groupby(bar_col)[color_by].mean().reset_index()
                    fig = px.bar(agg_data, x=bar_col, y=color_by,
                                 title=f"Average {color_by} by {bar_col}",
                                 color=bar_col)

                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab2:
                pie_col = st.selectbox("Select column for pie chart", cat_cols, key="pie")

                # Limit to top N categories for readability
                top_n = st.slider("Show top N categories", 3, 20, 10)
                value_counts = df[pie_col].value_counts().head(top_n)

                fig = px.pie(values=value_counts.values, names=value_counts.index,
                             title=f"Proportion of {pie_col} (Top {top_n})",
                             hole=0.3)
                fig.update_traces(textposition='inside', textinfo='percent+label')
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab3:
                if len(cat_cols) >= 2:
                    st.markdown("**Hierarchical View**")
                    path = st.multiselect("Select hierarchy (order matters)",
                                          cat_cols, default=cat_cols[:2])

                    if len(path) >= 2:
                        fig = px.sunburst(df, path=path,
                                          title="Hierarchical Distribution")
                        st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Need at least 2 categorical columns for sunburst chart")
        else:
            st.warning("No categorical columns available")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Relationship Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔄 Relationship Plots")

        if len(num_cols) >= 2:
            rel_tab1, rel_tab2, rel_tab3 = st.tabs(["Scatter Plot", "Line Plot", "Heatmap"])

            with rel_tab1:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_col = st.selectbox("X axis", num_cols, key="scatter_x")
                with col2:
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="scatter_y")
                with col3:
                    color_col = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="scatter_color")

                size_col = st.selectbox("Size by (optional)", ["None"] + num_cols, key="scatter_size")

                # Create scatter plot
                if color_col == "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")
                elif color_col != "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                                     title=f"{y_col} vs {x_col} colored by {color_col}",
                                     trendline="ols")
                elif color_col == "None" and size_col != "None":
                    fig = px.scatter(df, x=x_col, y=y_col, size=size_col,
                                     title=f"{y_col} vs {x_col} sized by {size_col}",
                                     trendline="ols")
                else:
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col, size=size_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab2:
                col1, col2 = st.columns(2)
                with col1:
                    line_x = st.selectbox("X axis (usually time)", num_cols + date_cols, key="line_x")
                with col2:
                    line_y = st.selectbox("Y axis", num_cols, key="line_y")

                line_color = st.selectbox("Color by", ["None"] + cat_cols, key="line_color")

                if line_color == "None":
                    fig = px.line(df, x=line_x, y=line_y,
                                  title=f"{line_y} over {line_x}")
                else:
                    fig = px.line(df, x=line_x, y=line_y, color=line_color,
                                  title=f"{line_y} over {line_x} by {line_color}")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab3:
                # Correlation heatmap
                corr_matrix = df[num_cols].corr()

                # Zero out the upper triangle so each pair appears once
                mask = np.triu(np.ones_like(corr_matrix), k=1)
                masked_corr = corr_matrix * (1 - mask)

                fig = px.imshow(masked_corr,
                                text_auto=True,
                                aspect="auto",
                                color_continuous_scale='RdBu_r',
                                title="Correlation Heatmap",
                                zmin=-1, zmax=1)

                st.plotly_chart(fig, use_container_width=True)

                # Show the strongest correlations
                st.markdown("**Strongest Correlations:**")
                corr_pairs = []
                for i in range(len(num_cols)):
                    for j in range(i + 1, len(num_cols)):
                        corr_pairs.append((num_cols[i], num_cols[j],
                                           corr_matrix.iloc[i, j]))

                corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

                for col1, col2, corr in corr_pairs[:5]:
                    strength = "🟢" if abs(corr) > 0.7 else "🟡" if abs(corr) > 0.3 else "🔴"
                    st.write(f"{strength} **{col1}** & **{col2}**: {corr:.3f}")
        else:
            st.warning("Need at least 2 numeric columns for relationship plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Time Series Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📅 Time Series Plots")

        if date_cols:
            ts_tab1, ts_tab2 = st.tabs(["Time Series", "Resampling"])

            with ts_tab1:
                date_col = st.selectbox("Date column", date_cols, key="ts_date")
                value_col = st.selectbox("Value column", num_cols if num_cols else [], key="ts_value")

                if num_cols and date_col:
                    # Sort by date
                    df_sorted = df.sort_values(date_col)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
                                             mode='lines+markers', name=value_col))

                    fig.update_layout(title=f"{value_col} over Time",
                                      xaxis_title="Date",
                                      yaxis_title=value_col)

                    st.plotly_chart(fig, use_container_width=True)

            with ts_tab2:
                if num_cols and date_cols:
                    date_col = st.selectbox("Select date column", date_cols, key="resample_date")
                    resample_col = st.selectbox("Select column to resample", num_cols, key="resample_col")

                    freq = st.selectbox("Resampling frequency",
                                        ["Daily", "Weekly", "Monthly", "Quarterly", "Yearly"])

                    freq_map = {
                        "Daily": "D",
                        "Weekly": "W",
                        "Monthly": "M",
                        "Quarterly": "Q",
                        "Yearly": "Y"
                    }

                    # Set the date as index
                    df_date = df.set_index(date_col)

                    # Resample to the chosen frequency (mean aggregation)
                    resampled = df_date[resample_col].resample(freq_map[freq]).mean().reset_index()

                    fig = px.line(resampled, x=date_col, y=resample_col,
                                  title=f"{resample_col} ({freq} Aggregated)")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No datetime columns found. Convert a column to datetime first.")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Statistical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📐 Statistical Plots")

        if num_cols:
            stat_tab1, stat_tab2, stat_tab3 = st.tabs(["QQ Plot", "ECDF", "Density Heatmap"])

            with stat_tab1:
                qq_col = st.selectbox("Select column for QQ plot", num_cols, key="qq")

                # Calculate quantiles (theoretical side approximated from a
                # standard-normal sample of the same size)
                data = df[qq_col].dropna()
                theoretical_quantiles = np.percentile(np.random.normal(0, 1, len(data)),
                                                      np.linspace(0, 100, len(data)))
                sample_quantiles = np.percentile(data, np.linspace(0, 100, len(data)))

                fig = go.Figure()
                fig.add_trace(go.Scatter(x=theoretical_quantiles, y=sample_quantiles,
                                         mode='markers', name='Data'))

                # Add diagonal reference line
                min_val = min(theoretical_quantiles.min(), sample_quantiles.min())
                max_val = max(theoretical_quantiles.max(), sample_quantiles.max())
                fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                                         mode='lines', name='Normal',
                                         line=dict(color='red', dash='dash')))

                fig.update_layout(title=f"QQ Plot - {qq_col}",
                                  xaxis_title="Theoretical Quantiles",
                                  yaxis_title="Sample Quantiles")

                st.plotly_chart(fig, use_container_width=True)

            with stat_tab2:
                ecdf_col = st.selectbox("Select column for ECDF", num_cols, key="ecdf")

                fig = px.ecdf(df, x=ecdf_col,
                              title=f"Empirical Cumulative Distribution - {ecdf_col}")
                st.plotly_chart(fig, use_container_width=True)

            with stat_tab3:
                if len(num_cols) >= 2:
                    x_col = st.selectbox("X axis", num_cols, key="density_x")
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="density_y")

                    fig = px.density_heatmap(df, x=x_col, y=y_col,
                                             title=f"Density Heatmap: {y_col} vs {x_col}",
                                             marginal_x="histogram",
                                             marginal_y="histogram")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for statistical plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Advanced Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🚀 Advanced Visualizations")

        adv_tab1, adv_tab2, adv_tab3 = st.tabs(["3D Scatter", "Parallel Coordinates", "Radar Chart"])

        with adv_tab1:
            if len(num_cols) >= 3:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_3d = st.selectbox("X axis", num_cols, key="3d_x")
                with col2:
                    y_3d = st.selectbox("Y axis", [c for c in num_cols if c != x_3d], key="3d_y")
                with col3:
                    z_3d = st.selectbox("Z axis", [c for c in num_cols if c not in [x_3d, y_3d]], key="3d_z")

                color_3d = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="3d_color")

                if color_3d == "None":
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d,
                                        title=f"3D Scatter: {x_3d}, {y_3d}, {z_3d}")
                else:
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d, color=color_3d,
                                        title=f"3D Scatter colored by {color_3d}")

                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("Need at least 3 numeric columns for 3D scatter plot")

        with adv_tab2:
            if num_cols:
                selected_dims = st.multiselect("Select dimensions", num_cols, default=num_cols[:4])

                if selected_dims and len(selected_dims) >= 2:
                    color_dim = st.selectbox("Color dimension", ["None"] + cat_cols + num_cols)

                    if color_dim == "None":
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      title="Parallel Coordinates Plot")
                    else:
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      color=color_dim,
                                                      title=f"Parallel Coordinates colored by {color_dim}")

                    st.plotly_chart(fig, use_container_width=True)

        with adv_tab3:
            if num_cols:
                st.markdown("**Radar Chart** (requires at least 3 numeric columns)")
                selected_radar = st.multiselect("Select metrics for radar chart",
                                                num_cols, default=num_cols[:3])

                if len(selected_radar) >= 3:
                    # Use the first row as a sample profile
                    sample = df[selected_radar].iloc[0]

                    fig = go.Figure(data=go.Scatterpolar(
                        r=sample.values,
                        theta=selected_radar,
                        fill='toself'
                    ))

                    fig.update_layout(
                        polar=dict(
                            radialaxis=dict(
                                visible=True,
                                range=[sample.min(), sample.max()]
                            )),
                        showlegend=False,
                        title="Radar Chart (First Row)"
                    )

                    st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    # Download plot data option
    st.markdown("---")
    st.markdown("### 💾 Export Options")

    col1, col2 = st.columns(2)
    with col1:
        st.info("To save any plot, hover over it and click the camera icon 📷")
    with col2:
        csv = df.to_csv(index=False)
        st.download_button(
            label="📥 Download Data as CSV",
            data=csv,
            file_name="visualization_data.csv",
            mime="text/csv",
            use_container_width=True
        )