import streamlit as st
import pandas as pd
import plotly.io as pio
import traceback
import sys
from datetime import datetime
from data_preprocessing import preprocess_data
from insights import generate_business_insights
from dataset_overview import eda_analysis # Updated import
from visualization import auto_visualizations
from ml_pipeline import run_ml_pipeline
from statistical_analysis import statistical_analysis
from data_quality import quality_report
from chatbot import data_chatbot
# Every Plotly figure in the app inherits the clean white template.
pio.templates.default = "plotly_white"

# ---------------------------------------
# PAGE CONFIG
# ---------------------------------------
# Collected in one mapping so the page settings read as a single unit.
_PAGE_CONFIG = {
    "page_title": "AI Data Analyst Pro",
    "layout": "wide",
    "page_icon": "đ",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
# ---------------------------------------
# CUSTOM ERROR HANDLER
# ---------------------------------------
class StreamlitExceptionHandler:
    """Custom exception handler for Streamlit.

    Turns a raw exception into a user-friendly markdown report with
    remediation tips tailored to the exception type; the caller renders
    the returned string (e.g. via ``st.markdown``).
    """

    # Remediation tips, tried in order; a group applies when any of its
    # names is a substring of the exception's class name.
    # NOTE: matching is against ``type(e).__name__`` (the bare class name),
    # so pandas errors are keyed as "EmptyDataError" / "ParserError".
    # The previous code compared against "pd.errors.EmptyDataError" /
    # "pd.errors.ParserError", which can never be a substring of a bare
    # class name, so those branches were dead code.
    _SOLUTION_GROUPS = [
        (("MemoryError",), """
- Your dataset might be too large. Try uploading a smaller file.
- Close other applications to free up memory.
- Consider sampling your data before uploading.
"""),
        (("KeyError", "IndexError"), """
- The requested column or index doesn't exist in your dataset.
- Check if you've selected valid columns for the operation.
- Try refreshing the page and uploading your data again.
"""),
        (("ValueError",), """
- The data values don't match the expected format.
- Check for invalid values in your dataset (e.g., text in numeric columns).
- Ensure your data types are correct for the selected operation.
"""),
        (("TypeError",), """
- There's a mismatch in data types.
- Check if you're mixing numeric and text data in operations.
- Use the preprocessing tab to convert data types appropriately.
"""),
        (("FileNotFoundError",), """
- The file couldn't be found. Please upload it again.
- Check if the file path is correct.
"""),
        (("PermissionError",), """
- Permission denied when accessing the file.
- Make sure the file isn't open in another program.
"""),
        (("EmptyDataError",), """
- The uploaded file is empty.
- Please upload a file containing data.
"""),
        (("ParserError",), """
- Couldn't parse the file. Check if it's a valid CSV or Excel file.
- Ensure the file format matches the selected file type.
"""),
    ]

    # Generic advice used when no group above matches.
    _FALLBACK_SOLUTIONS = """
- Try refreshing the page and uploading your data again.
- Check if your data format is compatible with the operation.
- If the problem persists, try with a smaller sample of your data.
"""

    @staticmethod
    def handle_exception(e, context="application"):
        """Return a user-friendly markdown report for exception *e*.

        Parameters
        ----------
        e : Exception
            The caught exception.
        context : str
            Short label for where the error occurred (shown in the heading).

        Returns
        -------
        str
            Markdown text with the error type, message, and tips.
        """
        error_type = type(e).__name__
        error_msg = str(e)
        # Create user-friendly error message
        user_message = f"""
### â An error occurred in the {context}
**Error Type:** {error_type}
**What happened:** {error_msg if error_msg else "An unexpected error occurred"}
**Possible solutions:**
"""
        # Add the first matching solution group, else the generic advice.
        for names, tips in StreamlitExceptionHandler._SOLUTION_GROUPS:
            if any(name in error_type for name in names):
                user_message += tips
                break
        else:
            user_message += StreamlitExceptionHandler._FALLBACK_SOLUTIONS
        # Heading only; the caller is expected to surface the traceback
        # (e.g. in an expander). TODO(review): confirm callers do, or
        # append traceback.format_exc() here.
        user_message += """
**Technical Details:**
"""
        return user_message
# ---------------------------------------
# SESSION STATE & PAGE CHROME
# ---------------------------------------
def _init_state(key, default):
    """Seed one session-state entry, leaving any existing value untouched."""
    if key not in st.session_state:
        st.session_state[key] = default


# Error tracking across reruns.
_init_state("error_log", [])
_init_state("last_successful_operation", None)

# ---------------------------------------
# ADVANCED CSS WITH RESPONSIVE DESIGN
# ---------------------------------------
# NOTE(review): the markdown payload here is empty — the CSS appears to
# have been stripped from this copy of the file.
st.markdown("""
""", unsafe_allow_html=True)

# ---------------------------------------
# HEADER WITH ANIMATION
# ---------------------------------------
# NOTE(review): header HTML likewise missing from this copy.
st.markdown("""
""", unsafe_allow_html=True)

# ---------------------------------------
# SESSION STATE INITIALIZATION
# ---------------------------------------
for _key, _default in (
    ("data", None),
    ("processed_data", None),
    ("uploaded_file_name", None),
    ("upload_error", None),
    ("data_loaded", False),
    ("operation_status", {}),
):
    _init_state(_key, _default)
# ---------------------------------------
# HELPER FUNCTIONS
# ---------------------------------------
def safe_dataframe_operation(func, df, *args, **kwargs):
    """Run ``func(df, *args, **kwargs)``, trapping any failure.

    Returns a ``(result, error_message)`` pair: on success the message is
    ``None`` and the operation name is recorded in session state; on
    failure the result is ``None`` and the message is the user-friendly
    report built by StreamlitExceptionHandler.
    """
    try:
        outcome = func(df, *args, **kwargs)
        # Remember the most recent operation that completed cleanly.
        st.session_state.last_successful_operation = func.__name__
        return outcome, None
    except Exception as exc:  # boundary handler: report instead of crashing
        report = StreamlitExceptionHandler.handle_exception(exc, func.__name__)
        return None, report
def validate_dataset(df):
    """Validate a DataFrame for common issues before analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to inspect.

    Returns
    -------
    list[str]
        Human-readable descriptions of every issue found; empty when the
        dataset looks fine.
    """
    issues = []
    # ``df.empty`` is true exactly when either dimension is zero, so this
    # one check replaces the three overlapping checks that used to report
    # the same condition up to three times.
    if df.empty:
        issues.append("The dataset is empty")
    # Check for memory issues
    memory_usage = df.memory_usage(deep=True).sum() / 1024**3  # GB
    if memory_usage > 1:
        issues.append(f"Large dataset detected ({memory_usage:.2f} GB). Some operations may be slow.")
    # Flag object columns whose non-null values mix Python types.
    # Nulls are dropped first: a string column with missing values would
    # otherwise be reported as a str/float (or str/NoneType) mix, which is
    # a false positive — missing data is not a type mismatch.
    for col in df.columns:
        if df[col].dtype == 'object':
            types = df[col].dropna().apply(type).unique()
            if len(types) > 1:
                issues.append(f"Column '{col}' has mixed data types: {types}")
    return issues
def show_validation_warnings(issues):
"""Render the validation issues produced by validate_dataset; renders
nothing when *issues* is empty."""
if issues:
# NOTE(review): the markdown payload below was stripped from this copy
# of the file — the opening triple quote is unterminated here, and the
# `', unsafe_allow_html=True)` line appears to be the tail of a
# different, also-truncated st.markdown call. Restore the original
# HTML before running this file.
st.markdown("""
', unsafe_allow_html=True)
# --- Upload page: file picker, validation, preview, and navigation ---
st.markdown("### đ Upload Your Dataset")
# File uploader with size limit warning
file = st.file_uploader(
"Choose a CSV or Excel file",
type=["csv", "xlsx"],
help="Maximum recommended file size: 200MB. Larger files may cause performance issues."
)
if file:
try:
# Check file size
file_size = file.size / 1024**2 # MB
if file_size > 200:
st.warning(f"â ī¸ Large file detected ({file_size:.2f} MB). Processing may be slow.")
with st.spinner("đ Loading file..."):
# Read file based on extension
if file.name.endswith("csv"):
# Try different encodings
encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
df = None
for encoding in encodings:
try:
df = pd.read_csv(file, encoding=encoding)
break
except UnicodeDecodeError:
# NOTE(review): the upload buffer is not rewound
# (file.seek(0)) before the next attempt, so each retry
# starts wherever the failed parse stopped — confirm and
# add a seek between attempts.
continue
if df is None:
st.error("â Could not read CSV file with any common encoding.")
st.stop()
elif file.name.endswith(("xlsx", "xls")):
# NOTE(review): the uploader above only accepts csv/xlsx, so
# the "xls" suffix can't arrive through the widget.
try:
df = pd.read_excel(file)
except Exception as e:
st.error(f"â Error reading Excel file: {str(e)}")
st.info("đĄ Try saving the file as CSV and uploading again.")
st.stop()
# Validate dataset
issues = validate_dataset(df)
show_validation_warnings(issues)
# NOTE(review): this gate stores the data when there are no issues OR
# when no issue mentions "Large dataset" — i.e. a large-dataset
# *warning* alone blocks loading while real issues (e.g. mixed types)
# don't. Verify this is intended; it looks inverted.
if not issues or all("Large dataset" not in issue for issue in issues):
# Store in session state
st.session_state.data = df
st.session_state.uploaded_file_name = file.name
st.session_state.data_loaded = True
st.session_state.upload_error = None
# Show success message
st.markdown("""
â
Successfully loaded: {}
đ Shape: {} rows Ã {} columns
""".format(file.name, df.shape[0], df.shape[1]), unsafe_allow_html=True)
# File statistics
st.markdown("### đ File Statistics")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Rows", f"{df.shape[0]:,}")
with col2:
st.metric("Total Columns", df.shape[1])
with col3:
# Memory shown in MB (deep=True counts object contents too).
memory = df.memory_usage(deep=True).sum() / 1024**2
st.metric("Memory Usage", f"{memory:.2f} MB")
# Data preview with scroll
st.markdown("### đī¸ Data Preview")
st.dataframe(
df.head(10),
use_container_width=True,
height=300
)
# Column info with sorting
st.markdown("### đ Column Information")
col_info = pd.DataFrame({
'Column': df.columns,
'Type': df.dtypes.astype(str),
'Non-Null Count': df.count().values,
'Null Count': df.isnull().sum().values,
'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
'Unique Values': [df[col].nunique() for col in df.columns]
})
# Sort by null count
col_info = col_info.sort_values('Null %', ascending=False)
st.dataframe(
col_info.style.background_gradient(subset=['Null %'], cmap='YlOrRd'),
use_container_width=True
)
# Quick stats
st.markdown("### đ Quick Statistics")
numeric_cols = df.select_dtypes(include=['number']).columns
if len(numeric_cols) > 0:
st.dataframe(
df[numeric_cols].describe(),
use_container_width=True
)
# Navigation buttons
st.markdown("### đ Next Steps")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("đ ī¸ Go to Preprocessing", use_container_width=True):
st.session_state.page = "đ ī¸ Preprocessing"
st.rerun()
with col2:
if st.button("đ Go to EDA", use_container_width=True):
st.session_state.page = "đ EDA"
st.rerun()
with col3:
if st.button("đ Go to Visualization", use_container_width=True):
st.session_state.page = "đ Visualization"
st.rerun()
# Specific failure modes first, generic handler last.
except pd.errors.EmptyDataError:
st.error("â The uploaded file is empty. Please upload a file with data.")
except pd.errors.ParserError as e:
st.error(f"â Error parsing file: {str(e)}")
st.info("đĄ Check if your CSV file has consistent delimiters and quoting.")
except MemoryError:
st.error("â Out of memory! The file is too large to process.")
st.info("đĄ Try uploading a smaller file or sampling your data first.")
except Exception as e:
error_msg = StreamlitExceptionHandler.handle_exception(e, "file upload")
# NOTE(review): the st.markdown f-string below is split across lines
# with single quotes — its HTML wrapper was stripped from this copy
# and the call is not valid Python as shown; restore the markup.
st.markdown(f'
{error_msg}
', unsafe_allow_html=True)
# Log error
st.session_state.error_log.append({
'timestamp': datetime.now(),
'error': str(e),
'traceback': traceback.format_exc()
})
# NOTE(review): same stripped-HTML problem as above — broken string.
st.markdown('
', unsafe_allow_html=True)
# Sample data option
with st.expander("đ Or use sample data"):
st.markdown("Don't have a dataset? Try our sample data:")
if st.button("Load Sample Dataset", use_container_width=True):
try:
# Imported lazily, only when the button is actually pressed.
from utils import create_sample_dataset
sample_df = create_sample_dataset()
st.session_state.data = sample_df
st.session_state.uploaded_file_name = "sample_dataset.csv"
st.session_state.data_loaded = True
# NOTE(review): this success string is split across two lines in
# this copy (likely a mangled emoji plus a lost line join) and is
# not valid Python as shown.
st.success("â
Sample dataset loaded successfully!")
st.rerun()
except Exception as e:
st.error(f"â Error loading sample data: {str(e)}")
# ---------------------------------------
# PREPROCESSING PAGE
# ---------------------------------------
elif current_page == "preprocess":
try:
if st.session_state.data is not None:
df = st.session_state.data
# Validate data before preprocessing
issues = validate_dataset(df)
if issues:
show_validation_warnings(issues)
# Run preprocessing with error handling
with st.spinner("đ Preprocessing data..."):
processed_df, error = safe_dataframe_operation(preprocess_data, df)
if error:
st.markdown(f'