Spaces:

entropy25
/

data-analysis-platform

Sleeping

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

f71de9c

verified ·

1 Parent(s): 59db6f8

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -343

app.py CHANGED Viewed

@@ -1,363 +1,131 @@
-"""
-Data Analysis Platform
-Copyright (c) 2025 JEAN YOUNG
-All rights reserved.
-This software is proprietary and confidential.
-Unauthorized copying, distribution, or use is prohibited.
-"""
 import streamlit as st
 import pandas as pd
-import numpy as np
-import plotly.express as px
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import warnings
-from typing import Dict, List, Any, Optional
-warnings.filterwarnings('ignore')
-# Import custom modules
-from data_handler import (
-    load_csv_with_encoding,
-    load_excel_file,
-    calculate_basic_stats,
-    calculate_missing_data,
-    calculate_correlation_matrix,
-    get_column_types,
-    clean_data
-)
-from analyzer import DataAnalyzer
-# Page configuration
-st.set_page_config(
-    page_title="Enhanced Data Analysis Platform",
-    page_icon="📊",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-# Custom CSS
-st.markdown("""
-<style>
-    .main-header {
-        font-size: 2.5rem;
-        font-weight: bold;
-        text-align: center;
-        margin-bottom: 2rem;
-        color: #1f77b4;
-    }
-    .metric-card {
-        background-color: #f0f2f6;
-        padding: 1rem;
-        border-radius: 10px;
-        border-left: 5px solid #1f77b4;
-    }
-    .success-message {
-        padding: 1rem;
-        border-radius: 5px;
-        background-color: #d4edda;
-        border: 1px solid #c3e6cb;
-        color: #155724;
-    }
-</style>
-""", unsafe_allow_html=True)
 def main():
-    st.markdown('<h1 class="main-header">📊 Data Analysis Platform</h1>', unsafe_allow_html=True)
-    # Sidebar configuration
-    st.sidebar.title("🔧 Configuration")
-    # File upload section
-    st.sidebar.subheader("📁 Data Upload")
-    uploaded_file = st.sidebar.file_uploader(
-        "Choose your data file",
-        type=['csv', 'xlsx', 'xls'],
-        help="Upload CSV or Excel files (max 100MB)"
-    )
-    # Main content area
     if uploaded_file is not None:
         try:
-            # File size check
-            file_size = len(uploaded_file.getvalue()) / (1024**2)  # MB
-            if file_size > 100:
-                st.error(f"⚠️ File too large: {file_size:.1f}MB. Maximum allowed: 100MB")
-                return
             # Load data
-            with st.spinner("📥 Loading data..."):
-                df = load_data_file(uploaded_file)
-            if df is not None and not df.empty:
-                st.success(f"✅ Data loaded successfully! Shape: {df.shape}")
-                # Initialize analyzer
-                analyzer = DataAnalyzer(df)
-                # Sidebar options
-                st.sidebar.subheader("🎯 Analysis Options")
-                analysis_steps = [
-                    "📊 Data Overview",
-                    "🔍 Data Exploration",
-                    "🧹 Data Quality Check",
-                    "🔬 Advanced Analysis",
-                    "🤖 Machine Learning",
-                    "📈 Insights & Report"
-                ]
-                selected_step = st.sidebar.selectbox(
-                    "Select Analysis Step:",
-                    analysis_steps,
-                    index=0
-                )
-                # Display selected analysis
-                display_analysis_step(analyzer, selected_step, df)
-            else:
-                st.error("❌ Failed to load data. Please check your file format.")
-        except Exception as e:
-            st.error(f"❌ Error processing file: {str(e)}")
-    else:
-        # Welcome screen
-        display_welcome_screen()
-def load_data_file(uploaded_file) -> Optional[pd.DataFrame]:
-    """Load uploaded file based on its extension"""
-    try:
-        file_extension = uploaded_file.name.split('.')[-1].lower()
-        file_content = uploaded_file.getvalue()
-        if file_extension == 'csv':
-            return load_csv_with_encoding(file_content, uploaded_file.name)
-        elif file_extension in ['xlsx', 'xls']:
-            return load_excel_file(file_content, uploaded_file.name)
-        else:
-            st.error("❌ Unsupported file format. Please upload CSV or Excel files.")
-            return None
-    except Exception as e:
-        st.error(f"❌ Error loading file: {str(e)}")
-        return None
-def display_analysis_step(analyzer: DataAnalyzer, step: str, df: pd.DataFrame):
-    """Display the selected analysis step"""
-    if step == "📊 Data Overview":
-        display_data_overview(analyzer, df)
-    elif step == "🔍 Data Exploration":
-        display_data_exploration(analyzer, df)
-    elif step == "🧹 Data Quality Check":
-        display_data_quality(analyzer, df)
-    elif step == "🔬 Advanced Analysis":
-        display_advanced_analysis(analyzer, df)
-    elif step == "🤖 Machine Learning":
-        display_machine_learning(analyzer, df)
-    elif step == "📈 Insights & Report":
-        display_insights_report(analyzer, df)
-def display_data_overview(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display data overview section"""
-    st.header("📊 Data Overview")
-    # Basic statistics
-    stats = calculate_basic_stats(df)
-    # Display metrics
-    col1, col2, col3, col4 = st.columns(4)
-    with col1:
-        st.metric("📏 Rows", f"{stats['shape'][0]:,}")
-    with col2:
-        st.metric("📋 Columns", f"{stats['shape'][1]:,}")
-    with col3:
-        st.metric("💾 Memory Usage", f"{stats['memory_usage']:.1f} MB")
-    with col4:
-        st.metric("✅ Completeness", f"{stats['completeness']:.1f}%")
-    # Data types
-    col1, col2 = st.columns([1, 1])
-    with col1:
-        st.subheader("📊 Data Types")
-        dtype_df = pd.DataFrame(list(stats['dtypes'].items()), columns=['Type', 'Count'])
-        fig = px.pie(dtype_df, values='Count', names='Type', title="Column Data Types")
-        st.plotly_chart(fig, use_container_width=True)
-    with col2:
-        st.subheader("🔍 Data Sample")
-        st.dataframe(df.head(10), use_container_width=True)
-def display_data_exploration(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display data exploration section"""
-    st.header("🔍 Data Exploration")
-    # Column selection for exploration
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
-    if numeric_cols:
-        st.subheader("📈 Numeric Data Distribution")
-        selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
-        col1, col2 = st.columns([2, 1])
-        with col1:
-            fig = px.histogram(df, x=selected_numeric, title=f"Distribution of {selected_numeric}")
-            st.plotly_chart(fig, use_container_width=True)
-        with col2:
-            st.write("**Statistics:**")
-            stats = df[selected_numeric].describe()
-            st.dataframe(stats)
-    if len(numeric_cols) >= 2:
-        st.subheader("🔗 Correlation Analysis")
-        corr_matrix = calculate_correlation_matrix(df)
-        if not corr_matrix.empty:
-            fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
-                           title="Correlation Matrix")
-            st.plotly_chart(fig, use_container_width=True)
-def display_data_quality(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display data quality check section"""
-    st.header("🧹 Data Quality Check")
-    # Missing data analysis
-    missing_df = calculate_missing_data(df)
-    if not missing_df.empty:
-        st.subheader("❓ Missing Data Analysis")
-        st.dataframe(missing_df, use_container_width=True)
-        # Missing data visualization
-        fig = px.bar(missing_df, x='Column', y='Missing %',
-                     title="Missing Data by Column",
-                     color='Severity',
-                     color_discrete_map={
-                         'Critical': '#dc3545',
-                         'High': '#fd7e14',
-                         'Medium': '#ffc107',
-                         'Low': '#28a745'
-                     })
-        st.plotly_chart(fig, use_container_width=True)
-    else:
-        st.success("✅ No missing data found!")
-    # Duplicate analysis
-    duplicates = df.duplicated().sum()
-    if duplicates > 0:
-        st.warning(f"⚠️ Found {duplicates} duplicate rows")
-    else:
-        st.success("✅ No duplicate rows found!")
-def display_advanced_analysis(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display advanced analysis section"""
-    st.header("🔬 Advanced Analysis")
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    if len(numeric_cols) >= 2:
-        st.subheader("🎯 Scatter Plot Analysis")
-        col1, col2 = st.columns(2)
-        with col1:
-            x_col = st.selectbox("Select X-axis:", numeric_cols, key="x_axis")
-        with col2:
-            y_col = st.selectbox("Select Y-axis:", numeric_cols, key="y_axis", index=1 if len(numeric_cols) > 1 else 0)
-        if x_col != y_col:
-            fig = px.scatter(df, x=x_col, y=y_col, title=f"{x_col} vs {y_col}")
-            st.plotly_chart(fig, use_container_width=True)
-def display_machine_learning(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display machine learning section"""
-    st.header("🤖 Machine Learning")
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    if len(numeric_cols) < 2:
-        st.warning("⚠️ Need at least 2 numeric columns for ML analysis")
-        return
-    st.subheader("🎯 Model Configuration")
-    target_col = st.selectbox("Select target column:", numeric_cols)
-    if st.button("🚀 Run ML Analysis"):
-        with st.spinner("🤖 Training models..."):
-            try:
-                results = analyzer.run_ml_analysis(target_col)
-                if results:
-                    st.success("✅ ML Analysis completed!")
-                    # Display results
-                    for model_name, metrics in results.items():
-                        st.subheader(f"📊 {model_name}")
-                        col1, col2 = st.columns(2)
-                        with col1:
-                            for metric, value in metrics.items():
-                                if isinstance(value, (int, float)):
-                                    st.metric(metric.replace('_', ' ').title(), f"{value:.4f}")
-                else:
-                    st.error("❌ ML analysis failed")
-            except Exception as e:
-                st.error(f"❌ Error in ML analysis: {str(e)}")
-def display_insights_report(analyzer: DataAnalyzer, df: pd.DataFrame):
-    """Display insights and report section"""
-    st.header("📈 Insights & Report")
-    # Generate comprehensive report
-    with st.spinner("📝 Generating insights..."):
-        try:
-            insights = analyzer.generate_insights()
-            for section, content in insights.items():
-                st.subheader(f"📊 {section.replace('_', ' ').title()}")
-                if isinstance(content, dict):
-                    for key, value in content.items():
-                        st.write(f"**{key.replace('_', ' ').title()}:** {value}")
-                elif isinstance(content, list):
-                    for item in content:
-                        st.write(f"• {item}")
                 else:
-                    st.write(content)
-                st.write("---")
         except Exception as e:
-            st.error(f"❌ Error generating insights: {str(e)}")
-def display_welcome_screen():
-    """Display welcome screen when no file is uploaded"""
-    st.markdown("""
-    ## 🚀 Welcome to Enhanced Data Analysis Platform
-    **Features:**
-    - 📊 **Comprehensive Data Overview** - Get instant insights about your data
-    - 🔍 **Interactive Exploration** - Visualize patterns and relationships
-    - 🧹 **Data Quality Assessment** - Identify and address data issues
-    - 🔬 **Advanced Analytics** - Perform statistical analysis
-    - 🤖 **Machine Learning** - Automated model building and evaluation
-    - 📈 **Smart Insights** - AI-generated recommendations
-    **Supported Formats:**
-    - CSV files (.csv)
-    - Excel files (.xlsx, .xls)
-    **Getting Started:**
-    1. Upload your data file using the sidebar
-    2. Select analysis steps to explore your data
-    3. Generate insights and export results
-    ---
-    *Upload a file to begin your analysis journey!*
-    """)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
+from data_handler import load_data
+from analyzer import DataAnalysisWorkflow, AIAssistant
 def main():
+    st.set_page_config(
+        page_title="Data Analysis Platform",
+        page_icon="📊",
+        layout="wide"
+    )
+    st.title("📊 Data Analysis Platform")
+    st.markdown("**Optimized workflow with caching and pagination**")
+    # Initialize session state
+    if 'current_stage' not in st.session_state:
+        st.session_state.current_stage = 1
+    if 'workflow' not in st.session_state:
+        st.session_state.workflow = None
+    if 'ai_assistant' not in st.session_state:
+        st.session_state.ai_assistant = AIAssistant()
+    # File upload
+    uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])
     if uploaded_file is not None:
         try:
             # Load data
+            df = load_data(uploaded_file)
+            st.success(f"✅ Dataset loaded! Shape: {df.shape}")
+            # Initialize workflow
+            if st.session_state.workflow is None:
+                st.session_state.workflow = DataAnalysisWorkflow(df)
+            # Progress sidebar
+            st.sidebar.header("Progress")
+            progress = st.sidebar.progress(st.session_state.current_stage / 5)
+            stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
+            for i, stage in enumerate(stages, 1):
+                if i == st.session_state.current_stage:
+                    st.sidebar.write(f"🔄 **{i}. {stage}**")
+                elif i < st.session_state.current_stage:
+                    st.sidebar.write(f"✅ {i}. {stage}")
                 else:
+                    st.sidebar.write(f"⏳ {i}. {stage}")
+            # Navigation
+            col1, col2 = st.sidebar.columns(2)
+            with col1:
+                if st.button("← Previous") and st.session_state.current_stage > 1:
+                    st.session_state.current_stage -= 1
+                    st.rerun()
+            with col2:
+                if st.button("Next →") and st.session_state.current_stage < 5:
+                    st.session_state.current_stage += 1
+                    st.rerun()
+            # Recent insights
+            st.sidebar.header("💡 Recent Insights")
+            recent_insights = st.session_state.workflow.insights[-3:]
+            for insight in recent_insights:
+                st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")
+            # Main content with AI assistant
+            main_col, ai_col = st.columns([3, 1])
+            with main_col:
+                # Execute current stage
+                if st.session_state.current_stage == 1:
+                    st.session_state.workflow.stage_1_overview()
+                elif st.session_state.current_stage == 2:
+                    st.session_state.workflow.stage_2_exploration()
+                elif st.session_state.current_stage == 3:
+                    st.session_state.workflow.stage_3_cleaning()
+                elif st.session_state.current_stage == 4:
+                    st.session_state.workflow.stage_4_analysis()
+                elif st.session_state.current_stage == 5:
+                    st.session_state.workflow.stage_5_summary()
+            with ai_col:
+                st.subheader("🤖 AI Assistant")
+                # AI model selection
+                available_models = st.session_state.ai_assistant.get_available_models()
+                if available_models:
+                    selected_model = st.selectbox("AI Model:", available_models)
+                    if st.button("Get AI Insights"):
+                        if st.session_state.workflow.insights:
+                            with st.spinner("Analyzing with AI..."):
+                                ai_analysis = st.session_state.ai_assistant.analyze_insights(
+                                    df, st.session_state.workflow.insights, selected_model
+                                )
+                                st.write("**AI Analysis:**")
+                                st.write(ai_analysis)
+                        else:
+                            st.warning("Complete some analysis stages first.")
+                else:
+                    st.warning("No AI models available.")
+                    st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")
+                # Quick insights
+                st.subheader("📊 Quick Stats")
+                if st.session_state.workflow.insights:
+                    st.metric("Total Insights", len(st.session_state.workflow.insights))
+                    st.metric("Current Stage", f"{st.session_state.current_stage}/5")
+                    # Latest insight
+                    if st.session_state.workflow.insights:
+                        latest = st.session_state.workflow.insights[-1]
+                        st.info(f"**Latest:** {latest['insight']}")
+                # Data quality indicator
+                quality_score = 100
+                if st.session_state.workflow.stats['missing_values'] > 0:
+                    quality_score -= 30
+                if st.session_state.workflow.stats['duplicates'] > 0:
+                    quality_score -= 20
+                st.metric("Data Quality", f"{quality_score}%")
         except Exception as e:
+            st.error(f"Error: {str(e)}")
+            st.info("Please check your file format and try again.")
 if __name__ == "__main__":
     main()