Spaces:

entropy25
/

data-analysis-platform

Build error

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

1348d26

verified ·

1 Parent(s): 86805f4

Update app.py

Browse files

Files changed (1) hide show

app.py +516 -104

app.py CHANGED Viewed

@@ -1,131 +1,543 @@
 import streamlit as st
 import pandas as pd
-from data_handler import load_data
 from analyzer import DataAnalysisWorkflow, AIAssistant
-def main():
     st.set_page_config(
         page_title="Data Analysis Platform",
         page_icon="📊",
-        layout="wide"
     )
     st.title("📊 Data Analysis Platform")
-    st.markdown("**Optimized workflow with caching and pagination**")
-    # Initialize session state
-    if 'current_stage' not in st.session_state:
-        st.session_state.current_stage = 1
-    if 'workflow' not in st.session_state:
-        st.session_state.workflow = None
-    if 'ai_assistant' not in st.session_state:
         st.session_state.ai_assistant = AIAssistant()
-    # File upload
-    uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])
-    if uploaded_file is not None:
-        try:
-            # Load data
-            df = load_data(uploaded_file)
-            st.success(f"✅ Dataset loaded! Shape: {df.shape}")
-            # Initialize workflow
-            if st.session_state.workflow is None:
-                st.session_state.workflow = DataAnalysisWorkflow(df)
-            # Progress sidebar
-            st.sidebar.header("Progress")
-            progress = st.sidebar.progress(st.session_state.current_stage / 5)
-            stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
-            for i, stage in enumerate(stages, 1):
-                if i == st.session_state.current_stage:
-                    st.sidebar.write(f"🔄 **{i}. {stage}**")
-                elif i < st.session_state.current_stage:
-                    st.sidebar.write(f"✅ {i}. {stage}")
                 else:
-                    st.sidebar.write(f"⏳ {i}. {stage}")
-            # Navigation
-            col1, col2 = st.sidebar.columns(2)
-            with col1:
-                if st.button("← Previous") and st.session_state.current_stage > 1:
-                    st.session_state.current_stage -= 1
-                    st.rerun()
-            with col2:
-                if st.button("Next →") and st.session_state.current_stage < 5:
-                    st.session_state.current_stage += 1
-                    st.rerun()
-            # Recent insights
-            st.sidebar.header("💡 Recent Insights")
-            recent_insights = st.session_state.workflow.insights[-3:]
-            for insight in recent_insights:
-                st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")
-            # Main content with AI assistant
             main_col, ai_col = st.columns([3, 1])
             with main_col:
-                # Execute current stage
-                if st.session_state.current_stage == 1:
-                    st.session_state.workflow.stage_1_overview()
-                elif st.session_state.current_stage == 2:
-                    st.session_state.workflow.stage_2_exploration()
-                elif st.session_state.current_stage == 3:
-                    st.session_state.workflow.stage_3_cleaning()
-                elif st.session_state.current_stage == 4:
-                    st.session_state.workflow.stage_4_analysis()
-                elif st.session_state.current_stage == 5:
-                    st.session_state.workflow.stage_5_summary()
             with ai_col:
-                st.subheader("🤖 AI Assistant")
-                # AI model selection
-                available_models = st.session_state.ai_assistant.get_available_models()
-                if available_models:
-                    selected_model = st.selectbox("AI Model:", available_models)
-                    if st.button("Get AI Insights"):
-                        if st.session_state.workflow.insights:
-                            with st.spinner("Analyzing with AI..."):
-                                ai_analysis = st.session_state.ai_assistant.analyze_insights(
-                                    df, st.session_state.workflow.insights, selected_model
-                                )
-                                st.write("**AI Analysis:**")
-                                st.write(ai_analysis)
-                        else:
-                            st.warning("Complete some analysis stages first.")
-                else:
-                    st.warning("No AI models available.")
-                    st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")
-                # Quick insights
-                st.subheader("📊 Quick Stats")
-                if st.session_state.workflow.insights:
-                    st.metric("Total Insights", len(st.session_state.workflow.insights))
-                    st.metric("Current Stage", f"{st.session_state.current_stage}/5")
-                    # Latest insight
-                    if st.session_state.workflow.insights:
-                        latest = st.session_state.workflow.insights[-1]
-                        st.info(f"**Latest:** {latest['insight']}")
-                # Data quality indicator
-                quality_score = 100
-                if st.session_state.workflow.stats['missing_values'] > 0:
-                    quality_score -= 30
-                if st.session_state.workflow.stats['duplicates'] > 0:
-                    quality_score -= 20
-                st.metric("Data Quality", f"{quality_score}%")
-        except Exception as e:
-            st.error(f"Error: {str(e)}")
-            st.info("Please check your file format and try again.")
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
+import logging
+from data_handler import load_data, validate_dataframe
 from analyzer import DataAnalysisWorkflow, AIAssistant
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads.

    Keys that are already present are left untouched; only missing keys
    receive their default value.
    """
    session = st.session_state
    initial_values = (
        ('current_stage', 1),
        ('workflow', None),
        ('ai_assistant', None),
        ('show_help', False),
        ('analysis_complete', False),
        ('error_log', []),  # fresh list on every call, never shared
    )
    for name, initial in initial_values:
        if name in session:
            continue
        session[name] = initial
def display_header():
    """Configure the page, draw the title, and show headline metrics.

    The four-metric row (rows, columns, insight count, progress) is only
    rendered once ``st.session_state.workflow`` has been set.
    """
    st.set_page_config(
        page_title="Data Analysis Platform",
        page_icon="📊",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    st.title("📊 Data Analysis Platform")
    st.markdown("**Professional data analysis workflow with AI assistance**")

    # Nothing more to show until a workflow exists
    if st.session_state.workflow is None:
        return

    rows_slot, cols_slot, insights_slot, progress_slot = st.columns(4)
    with rows_slot:
        n_rows = st.session_state.workflow.df.shape[0]
        st.metric("📁 Rows", f"{n_rows:,}")
    with cols_slot:
        n_cols = st.session_state.workflow.df.shape[1]
        st.metric("📋 Columns", f"{n_cols:,}")
    with insights_slot:
        st.metric("🔍 Insights", len(st.session_state.workflow.insights))
    with progress_slot:
        # Progress is the current stage expressed as a share of the 5 stages
        percent_done = (st.session_state.current_stage / 5) * 100
        st.metric("📈 Progress", f"{percent_done:.0f}%")
def display_sidebar():
    """Render the sidebar: progress, stage list, navigation, insights, help toggle, error log.

    Reads and mutates ``st.session_state.current_stage`` and
    ``st.session_state.show_help``. Every state change made here is followed
    by ``st.rerun()`` so that widgets drawn earlier in the script run reflect
    the new value immediately.
    """
    st.sidebar.header("🗺️ Analysis Progress")
    # Progress bar: current stage as a fraction of the five stages
    progress_value = st.session_state.current_stage / 5
    st.sidebar.progress(progress_value)
    # Stage metadata drives both the stage list and the quick-jump labels
    stages = [
        {"name": "Data Overview", "icon": "📊", "desc": "Basic statistics and quality"},
        {"name": "Exploration", "icon": "🔍", "desc": "Patterns and distributions"},
        {"name": "Quality Check", "icon": "🧹", "desc": "Cleaning and validation"},
        {"name": "Analysis", "icon": "🔬", "desc": "Advanced insights"},
        {"name": "Summary", "icon": "📈", "desc": "Results and export"}
    ]
    st.sidebar.markdown("### 📋 Analysis Stages")
    for i, stage in enumerate(stages, 1):
        if i == st.session_state.current_stage:
            st.sidebar.markdown(f"🔄 **{i}. {stage['name']}**")
            st.sidebar.caption(f"   {stage['desc']}")
        elif i < st.session_state.current_stage:
            st.sidebar.markdown(f"✅ {i}. {stage['name']}")
        else:
            st.sidebar.markdown(f"⏳ {i}. {stage['name']}")
    # Navigation buttons (disabled at the first / last stage)
    st.sidebar.markdown("### 🧭 Navigation")
    col1, col2 = st.sidebar.columns(2)
    with col1:
        if st.button("⬅️ Previous",
                     disabled=st.session_state.current_stage <= 1,
                     help="Go to previous analysis stage"):
            st.session_state.current_stage -= 1
            st.rerun()
    with col2:
        if st.button("➡️ Next",
                     disabled=st.session_state.current_stage >= 5,
                     help="Go to next analysis stage"):
            st.session_state.current_stage += 1
            st.rerun()
    # Quick stage jumper
    st.sidebar.markdown("### 🚀 Quick Jump")
    target_stage = st.sidebar.selectbox(
        "Jump to stage:",
        options=list(range(1, 6)),
        index=st.session_state.current_stage - 1,
        format_func=lambda x: f"{x}. {stages[x-1]['name']}"
    )
    # The jump button is only offered when the selection differs from the
    # stage currently shown
    if target_stage != st.session_state.current_stage:
        if st.sidebar.button("🎯 Jump to Stage"):
            st.session_state.current_stage = target_stage
            st.rerun()
    # Recent insights panel (last three entries)
    if st.session_state.workflow and st.session_state.workflow.insights:
        st.sidebar.markdown("### 💡 Latest Insights")
        recent_insights = st.session_state.workflow.insights[-3:]
        for insight in recent_insights:
            icon = {"success": "✅", "warning": "⚠️", "error": "❌"}.get(insight.get('type'), "ℹ️")
            with st.sidebar.expander(f"{icon} Stage {insight['stage']}", expanded=False):
                st.write(insight['insight'])
    # Help and settings
    st.sidebar.markdown("---")
    if st.sidebar.button("❓ Toggle Help", help="Show/hide help information"):
        st.session_state.show_help = not st.session_state.show_help
        # main() renders the help panel before the sidebar, so without a
        # rerun the toggle would only become visible on the next interaction.
        st.rerun()
    # Error log
    if st.session_state.error_log:
        with st.sidebar.expander("⚠️ Error Log", expanded=False):
            for error in st.session_state.error_log[-5:]:  # Show last 5 errors
                st.error(error)
def _compute_quality_score(missing_values, duplicates, n_rows, n_cols):
    """Return a 0-100 data-quality score, or ``None`` for a frame with no cells.

    Each percent of missing cells costs 2 points and each percent of
    duplicate rows costs 3 points; the result is floored at 0.
    """
    if n_rows == 0 or n_cols == 0:
        # No cells to score; the percentages below would divide by zero.
        return None
    missing_pct = (missing_values / (n_rows * n_cols)) * 100
    duplicate_pct = (duplicates / n_rows) * 100
    return max(0, 100 - (missing_pct * 2) - (duplicate_pct * 3))


def display_ai_assistant():
    """Render the AI assistant panel and the quick-stats block.

    Lazily creates ``st.session_state.ai_assistant``, lets the user pick one
    of the models it reports as available, and on request runs
    ``analyze_insights`` over the current workflow. Failures are shown to
    the user, appended to ``st.session_state.error_log`` and logged.
    Below that, a quality score derived from ``workflow.stats`` is shown
    when a workflow exists.
    """
    st.subheader("🤖 AI Assistant")
    if st.session_state.ai_assistant is None:
        st.session_state.ai_assistant = AIAssistant()
    available_models = st.session_state.ai_assistant.get_available_models()
    if available_models:
        selected_model = st.selectbox("AI Model:", available_models,
                                      help="Choose your preferred AI model for analysis")
        # AI analysis button with loading state
        if st.button("🧠 Get AI Insights", type="primary"):
            if st.session_state.workflow and st.session_state.workflow.insights:
                with st.spinner("🔮 AI is analyzing your data..."):
                    try:
                        ai_analysis = st.session_state.ai_assistant.analyze_insights(
                            st.session_state.workflow.df,
                            st.session_state.workflow.insights,
                            selected_model
                        )
                        # NOTE(review): failure is detected by the substring
                        # "Error" in the returned text — presumably the
                        # assistant's convention; confirm against analyzer,
                        # since a valid analysis mentioning "Error" is
                        # rejected too.
                        if ai_analysis and "Error" not in ai_analysis:
                            st.markdown("### 🎯 AI Analysis Results")
                            st.markdown(ai_analysis)
                            # Record the successful run as a workflow insight
                            st.session_state.workflow.add_insight("AI analysis completed",
                                                                  st.session_state.current_stage, "success")
                        else:
                            st.error(ai_analysis or "Failed to get AI analysis")
                    except Exception as e:
                        error_msg = f"AI analysis failed: {str(e)}"
                        st.error(error_msg)
                        st.session_state.error_log.append(error_msg)
                        # exception() keeps the traceback in the server log
                        logger.exception(error_msg)
            else:
                st.warning("⚠️ Complete some analysis stages first to get AI insights")
        # AI model status
        st.markdown("### 📊 AI Status")
        for model in available_models:
            st.success(f"✅ {model} Ready")
    else:
        st.warning("⚠️ No AI models available")
        with st.expander("🔧 Setup AI Models", expanded=False):
            st.markdown("""
            **To enable AI features, add API keys to your environment:**
            ```bash
            # For Google Gemini
            export GOOGLE_API_KEY="your_gemini_key"
            # For OpenAI GPT
            export OPENAI_API_KEY="your_openai_key"
            ```
            **Or create a `.env` file:**
            ```
            GOOGLE_API_KEY=your_gemini_key
            OPENAI_API_KEY=your_openai_key
            ```
            """)
    # Quick insights panel
    if st.session_state.workflow:
        st.markdown("### ⚡ Quick Stats")
        workflow = st.session_state.workflow
        # Data quality indicator
        quality_score = _compute_quality_score(
            workflow.stats['missing_values'],
            workflow.stats['duplicates'],
            len(workflow.df),
            len(workflow.df.columns),
        )
        if quality_score is None:
            st.info("ℹ️ Dataset is empty — quality score unavailable")
        elif quality_score >= 90:
            st.success(f"🌟 Excellent Quality ({quality_score:.0f}%)")
        elif quality_score >= 70:
            st.info(f"👍 Good Quality ({quality_score:.0f}%)")
        else:
            st.warning(f"⚠️ Needs Improvement ({quality_score:.0f}%)")
        # Stage completion indicators
        st.metric("Current Stage", f"{st.session_state.current_stage}/5")
        st.metric("Operations", len(workflow.cleaning_history))
def _show_data_preview(df):
    """Draw the collapsible preview: head rows, dtype counts and basic stats."""
    with st.expander("👀 Quick Data Preview", expanded=False):
        st.dataframe(df.head(), use_container_width=True)
        # Basic info
        col1, col2 = st.columns(2)
        with col1:
            st.write("**Column Types:**")
            dtype_summary = df.dtypes.value_counts()
            for dtype, count in dtype_summary.items():
                st.write(f"• {dtype}: {count} columns")
        with col2:
            st.write("**Quick Stats:**")
            st.write(f"• Missing values: {df.isnull().sum().sum():,}")
            st.write(f"• Duplicate rows: {df.duplicated().sum():,}")
            st.write(f"• Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")


def handle_file_upload():
    """Show the uploader, load and validate the file, and set up the workflow.

    Returns:
        ``True`` when a file is present and passed ``validate_dataframe``;
        ``False`` when no file is uploaded, loading returned ``None``,
        validation failed, or an exception was raised (the exception is
        displayed, appended to ``st.session_state.error_log`` and logged).

    The workflow is (re)created, and the stage reset to 1, only when the
    uploaded file differs from the one the current workflow was built from.
    Streamlit reruns this function on every interaction while the file
    stays in the uploader; rebuilding unconditionally would undo all stage
    navigation and discard the workflow's accumulated state.
    """
    st.markdown("### 📁 Upload Your Dataset")
    # File upload with help
    uploaded_file = st.file_uploader(
        "Choose your data file",
        type=['csv', 'xlsx', 'xls'],
        help="Supported formats: CSV, Excel (.xlsx, .xls). Maximum recommended size: 200MB"
    )
    if uploaded_file is None:
        return False

    # File information
    size_bytes = len(uploaded_file.getvalue())
    file_size = size_bytes / 1024**2
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("📁 File Name", uploaded_file.name)
    with col2:
        st.metric("📊 File Size", f"{file_size:.1f} MB")
    with col3:
        file_type = uploaded_file.name.split('.')[-1].upper()
        st.metric("📋 Format", file_type)

    # Identity of this upload. ``file_id`` is used when the installed
    # Streamlit exposes it on the uploaded file (assumption — falls back to
    # None); name and byte size cover the rest.
    file_key = (getattr(uploaded_file, 'file_id', None), uploaded_file.name, size_bytes)

    # Load data with progress
    with st.spinner("🔄 Loading and validating your data..."):
        try:
            df = load_data(uploaded_file)
            if df is None:
                st.error("❌ Failed to load data. Please check file format and try again.")
                return False

            # Validate data
            is_valid, validation_issues = validate_dataframe(df)
            if not is_valid:
                st.error("❌ **Data validation failed:**")
                for issue in validation_issues:
                    st.write(f"• {issue}")
                st.session_state.error_log.extend(validation_issues)
                return False

            st.success(f"✅ **Dataset loaded successfully!** Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")
            _show_data_preview(df)

            # Initialize workflow only for a new upload
            is_new_upload = (
                st.session_state.workflow is None
                or 'loaded_file_key' not in st.session_state
                or st.session_state['loaded_file_key'] != file_key
            )
            if is_new_upload:
                st.session_state.workflow = DataAnalysisWorkflow(df)
                st.session_state.current_stage = 1
                st.session_state.analysis_complete = False
                st.session_state['loaded_file_key'] = file_key
            return True
        except Exception as e:
            error_msg = f"Error processing file: {str(e)}"
            st.error(f"❌ {error_msg}")
            st.session_state.error_log.append(error_msg)
            # exception() keeps the traceback in the server log
            logger.exception(error_msg)
            return False
def display_help_section():
    """Show the help panel for the current stage when help is toggled on.

    Does nothing when ``st.session_state.show_help`` is false or when the
    current stage has no help entry.

    The help bodies are written as indented triple-quoted blocks, so they are
    dedented before rendering and separated from the title by a blank line.
    Left indented beneath an unindented title, Markdown would treat the whole
    body as a code block. Each body also carries a blank line before its
    second heading so the heading is not absorbed into the preceding bullet.
    """
    import textwrap

    if not st.session_state.show_help:
        return

    help_content = {
        1: {
            "title": "📊 Data Overview Help",
            "content": """
                **What you'll see:**
                - Basic dataset statistics (rows, columns, memory usage)
                - Data quality score and grade
                - Column type classification and cardinality analysis
                - Missing values and duplicates detection

                **What to look for:**
                - Quality score below 80 indicates data issues
                - Constant columns that can be removed
                - High memory usage that can be optimized
                - Missing value patterns
                """
        },
        2: {
            "title": "🔍 Exploration Help",
            "content": """
                **What you'll analyze:**
                - Distribution of numeric variables
                - Frequency of categorical variables
                - Relationships between variables

                **Key insights to find:**
                - Skewed distributions that need transformation
                - High cardinality categories
                - Strong correlations between variables
                - Imbalanced categorical data
                """
        },
        3: {
            "title": "🧹 Data Cleaning Help",
            "content": """
                **Available operations:**
                - Missing value treatment (fill, drop, impute)
                - Duplicate row removal
                - Outlier detection and treatment
                - Data type corrections

                **Best practices:**
                - Preview operations before applying
                - Keep track of all changes made
                - Use domain knowledge for cleaning decisions
                - Test different approaches
                """
        },
        4: {
            "title": "🔬 Advanced Analysis Help",
            "content": """
                **Advanced features:**
                - Statistical correlation testing
                - Group comparisons and ANOVA
                - Distribution analysis and normality testing

                **What to look for:**
                - Statistically significant relationships
                - Group differences in key metrics
                - Non-normal distributions
                - Interaction effects
                """
        },
        5: {
            "title": "📈 Summary Help",
            "content": """
                **Final deliverables:**
                - Comprehensive analysis report
                - Cleaned dataset export
                - Reproducible Python code
                - Executive summary

                **Export options:**
                - Multiple report formats (Markdown, HTML, Text)
                - Various data formats (CSV, Excel, Parquet)
                - Ready-to-use Python scripts
                """
        }
    }

    current_help = help_content.get(st.session_state.current_stage)
    if not current_help:
        return

    body = textwrap.dedent(current_help['content']).strip()
    st.info(f"**{current_help['title']}**\n\n{body}")
def execute_analysis_stage():
    """Run the workflow method that belongs to the current stage.

    Stage numbers 1-5 map onto the workflow's ``stage_N_*`` methods; any
    other value runs nothing. The first time stage 5 finishes,
    ``analysis_complete`` is set and balloons are shown. Any exception is
    displayed, appended to ``st.session_state.error_log`` and logged, and a
    fallback warning is rendered.
    """
    stage_methods = {
        1: "stage_1_overview",
        2: "stage_2_exploration",
        3: "stage_3_cleaning",
        4: "stage_4_analysis",
        5: "stage_5_summary",
    }
    try:
        active_workflow = st.session_state.workflow
        active_stage = st.session_state.current_stage

        method_name = stage_methods.get(active_stage)
        if method_name is not None:
            # Resolve lazily so only the current stage's method is touched
            getattr(active_workflow, method_name)()

        if active_stage == 5 and not st.session_state.analysis_complete:
            st.session_state.analysis_complete = True
            st.balloons()  # Celebrate only once per analysis
    except Exception as e:
        error_msg = f"Error in stage {st.session_state.current_stage}: {e!s}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)
        logger.error(error_msg)
        # Fallback UI
        st.warning("⚠️ There was an issue with this analysis stage. Please try refreshing or contact support.")
def display_footer():
    """Draw the three-column footer below a horizontal rule.

    Each column gets a bold heading followed by four bullet lines, each
    emitted as its own ``st.markdown`` call.
    """
    st.markdown("---")
    footer_sections = (
        ("**📊 Platform Features:**", (
            "• 5-stage analysis workflow",
            "• AI-powered insights",
            "• Interactive visualizations",
            "• Multiple export formats",
        )),
        ("**🔧 Supported Formats:**", (
            "• CSV files (any encoding)",
            "• Excel files (.xlsx, .xls)",
            "• Large datasets (up to 200MB)",
            "• Mixed data types",
        )),
        ("**💡 Tips for Best Results:**", (
            "• Ensure clean column headers",
            "• Include data dictionary if available",
            "• Review quality score recommendations",
            "• Use AI insights for deeper analysis",
        )),
    )
    for column, (heading, bullets) in zip(st.columns(3), footer_sections):
        with column:
            st.markdown(heading)
            for bullet in bullets:
                st.markdown(bullet)
def main():
    """Application entry point: build the page and choose which view to show.

    With a loaded dataset and a workflow in session state, the analysis
    stage, AI panel and sidebar are rendered; with no dataset, the landing
    page is shown. The footer is drawn in either case. Any exception is
    reported, appended to the error log, logged as critical, and followed by
    restart / view-log recovery buttons.
    """
    try:
        # Session defaults first: the header already reads session state
        initialize_session_state()
        display_header()
        # Renders nothing unless help has been toggled on
        display_help_section()
        # Uploader; truthy only when a file loaded and validated
        data_loaded = handle_file_upload()
        if data_loaded and st.session_state.workflow is not None:
            # 3:1 split between the analysis area and the AI panel
            analysis_area, assistant_area = st.columns([3, 1])
            with analysis_area:
                execute_analysis_stage()
            with assistant_area:
                display_ai_assistant()
            # Sidebar navigation is drawn after the main content
            display_sidebar()
            if st.session_state.analysis_complete:
                st.success("🎉 **Analysis Complete!** Your comprehensive data analysis is ready.")
        elif not data_loaded:
            # Landing page
            st.markdown("### 🚀 Welcome to the Data Analysis Platform")
            intro_col, stages_col = st.columns(2)
            with intro_col:
                st.markdown("""
                **🎯 What this platform does:**
                - **Automated Data Quality Assessment** - Get instant quality scores and recommendations
                - **Interactive Exploration** - Visualize distributions, correlations, and patterns
                - **Smart Data Cleaning** - Handle missing values, duplicates, and outliers
                - **AI-Powered Insights** - Get business recommendations from your data
                - **Professional Reports** - Export analysis in multiple formats
                """)
            with stages_col:
                st.markdown("""
                **📋 5-Stage Analysis Workflow:**
                1. **📊 Data Overview** - Quality assessment and structure analysis
                2. **🔍 Exploration** - Distribution and pattern discovery
                3. **🧹 Quality Check** - Data cleaning and validation
                4. **🔬 Analysis** - Advanced statistical analysis
                5. **📈 Summary** - Results compilation and export
                """)
            # Static table of accepted formats
            st.markdown("### 📝 Supported Data Formats")
            supported_formats = pd.DataFrame({
                'Format': ['CSV', 'Excel (.xlsx)', 'Excel (.xls)'],
                'Max Size': ['200MB', '200MB', '100MB'],
                'Encoding': ['Auto-detect', 'UTF-8', 'UTF-8'],
                'Features': ['All features', 'All features', 'Basic features']
            })
            st.dataframe(supported_formats, use_container_width=True, hide_index=True)
        display_footer()
    except Exception as e:
        # Last-resort handler for anything the renderers did not catch
        error_msg = f"Critical application error: {e!s}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)
        logger.critical(error_msg)
        st.markdown("### 🔧 Recovery Options")
        restart_col, log_col = st.columns(2)
        with restart_col:
            if st.button("🔄 Restart Analysis"):
                # Wipe every session key, then start a clean run
                for key in list(st.session_state.keys()):
                    del st.session_state[key]
                st.rerun()
        with log_col:
            if st.button("📋 View Error Log"):
                st.write("**Recent Errors:**")
                for error in st.session_state.error_log[-10:]:
                    st.code(error)
 if __name__ == "__main__":
     main()