entropy25 committed on
Commit
2fad68d
·
verified ·
1 Parent(s): c50f214

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -366
app.py CHANGED
@@ -1,395 +1,131 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import logging
4
- from data_handler import load_data, validate_dataframe
5
  from analyzer import DataAnalysisWorkflow, AIAssistant
6
 
7
- # Configure logging
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
def initialize_session_state():
    """Ensure every session-state key the app relies on exists.

    Existing values are never overwritten, so this is safe to call on
    every Streamlit rerun.
    """
    for name, default in (
        ('current_stage', 1),
        ('workflow', None),
        ('ai_assistant', None),
        ('show_help', False),
        ('analysis_complete', False),
        ('error_log', []),
    ):
        if name not in st.session_state:
            st.session_state[name] = default
25
-
26
def display_header():
    """Render page config, the title banner, and — once a workflow
    exists — a four-column strip of headline metrics."""
    st.set_page_config(
        page_title="Data Analysis Platform",
        page_icon="📊",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.title("📊 Data Analysis Platform")
    st.markdown("**Professional data analysis workflow with AI assistance**")

    # Headline metrics only make sense after data has been loaded.
    wf = st.session_state.workflow
    if wf is not None:
        rows_col, cols_col, insights_col, progress_col = st.columns(4)
        with rows_col:
            st.metric("📁 Rows", f"{wf.df.shape[0]:,}")
        with cols_col:
            st.metric("📋 Columns", f"{wf.df.shape[1]:,}")
        with insights_col:
            st.metric("🔍 Insights", len(wf.insights))
        with progress_col:
            pct_done = (st.session_state.current_stage / 5) * 100
            st.metric("📈 Progress", f"{pct_done:.0f}%")
50
-
51
def display_sidebar():
    """Sidebar: progress bar, per-stage status list, Previous/Next
    navigation, and the (last five) logged errors."""
    st.sidebar.header("🗺️ Analysis Progress")

    current = st.session_state.current_stage
    st.sidebar.progress(current / 5)

    stage_list = [
        {"name": "Data Overview", "icon": "📊"},
        {"name": "Exploration", "icon": "🔍"},
        {"name": "Quality Check", "icon": "🧹"},
        {"name": "Analysis", "icon": "🔬"},
        {"name": "Summary", "icon": "📈"}
    ]

    st.sidebar.markdown("### 📋 Analysis Stages")

    # Mark the active stage, completed stages, and pending stages.
    for idx, info in enumerate(stage_list, 1):
        label = info['name']
        if idx == current:
            st.sidebar.markdown(f"🔄 **{idx}. {label}**")
        elif idx < current:
            st.sidebar.markdown(f"✅ {idx}. {label}")
        else:
            st.sidebar.markdown(f"⏳ {idx}. {label}")

    # Navigation buttons
    st.sidebar.markdown("### 🧭 Navigation")
    prev_col, next_col = st.sidebar.columns(2)

    with prev_col:
        if st.button("⬅️ Previous", disabled=current <= 1):
            st.session_state.current_stage -= 1
            st.rerun()

    with next_col:
        if st.button("➡️ Next", disabled=current >= 5):
            st.session_state.current_stage += 1
            st.rerun()

    # Error log (most recent five entries only)
    if st.session_state.error_log:
        with st.sidebar.expander("⚠️ Error Log", expanded=False):
            for entry in st.session_state.error_log[-5:]:
                st.error(entry)
95
-
96
def display_ai_assistant():
    """AI assistant panel.

    Lazily constructs the shared AIAssistant, lists its available models,
    and on demand runs an AI analysis over the workflow's insights.
    Failures are shown to the user and appended to the session error log.
    """
    st.subheader("🤖 AI Assistant")

    if st.session_state.ai_assistant is None:
        st.session_state.ai_assistant = AIAssistant()

    try:
        available_models = st.session_state.ai_assistant.get_available_models()
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt. Treat any lookup failure as
        # "no models available".
        available_models = []

    if available_models:
        selected_model = st.selectbox("AI Model:", available_models)

        if st.button("🧠 Get AI Insights", type="primary"):
            # Require both a loaded workflow and at least one insight.
            if st.session_state.workflow and st.session_state.workflow.insights:
                with st.spinner("🔮 AI is analyzing your data..."):
                    try:
                        ai_analysis = st.session_state.ai_assistant.analyze_insights(
                            st.session_state.workflow.df,
                            st.session_state.workflow.insights,
                            selected_model
                        )

                        # analyze_insights reports failures as a string
                        # containing "Error" rather than raising.
                        if ai_analysis and "Error" not in ai_analysis:
                            st.markdown("### 🎯 AI Analysis Results")
                            st.markdown(ai_analysis)
                            st.session_state.workflow.add_insight("AI analysis completed",
                                st.session_state.current_stage, "success")
                        else:
                            st.error("Failed to get AI analysis")

                    except Exception as e:
                        error_msg = f"AI analysis failed: {str(e)}"
                        st.error(error_msg)
                        st.session_state.error_log.append(error_msg)
            else:
                st.warning("⚠️ Complete some analysis stages first")
    else:
        st.warning("⚠️ No AI models available")
137
-
138
def handle_file_upload():
    """File upload with validation and preview.

    Returns True when a dataset is loaded and valid, False otherwise.
    Sets st.session_state.workflow / current_stage / analysis_complete
    when a *new* file is accepted.
    """
    st.markdown("### 📁 Upload Your Dataset")

    uploaded_file = st.file_uploader(
        "Choose your data file",
        type=['csv', 'xlsx', 'xls'],
        help="Supported formats: CSV, Excel (.xlsx, .xls)"
    )

    if uploaded_file is not None:
        file_size = len(uploaded_file.getvalue()) / 1024**2

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("📁 File Name", uploaded_file.name)
        with col2:
            st.metric("📊 File Size", f"{file_size:.1f} MB")
        with col3:
            file_type = uploaded_file.name.split('.')[-1].upper()
            st.metric("📋 Format", file_type)

        with st.spinner("🔄 Loading data..."):
            try:
                df = load_data(uploaded_file)

                if df is not None:
                    is_valid, validation_issues = validate_dataframe(df)

                    if is_valid:
                        st.success(f"✅ **Dataset loaded!** Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

                        with st.expander("👀 Data Preview", expanded=False):
                            st.dataframe(df.head(), use_container_width=True)

                        # BUG FIX: this function runs on *every* Streamlit
                        # rerun while a file stays selected, and previously
                        # rebuilt the workflow and reset current_stage = 1
                        # each time — undoing the sidebar's Next/Previous
                        # navigation. Only (re)initialize when a different
                        # file is uploaded.
                        if st.session_state.get('_loaded_file') != uploaded_file.name:
                            st.session_state.workflow = DataAnalysisWorkflow(df)
                            st.session_state.current_stage = 1
                            st.session_state.analysis_complete = False
                            st.session_state['_loaded_file'] = uploaded_file.name

                        return True

                    else:
                        st.error("❌ **Data validation failed:**")
                        for issue in validation_issues:
                            st.write(f"• {issue}")
                        return False
                else:
                    st.error(" Failed to load data")
                    return False

            except Exception as e:
                error_msg = f"Error loading file: {str(e)}"
                st.error(f"❌ {error_msg}")
                st.session_state.error_log.append(error_msg)
                return False

    # No file selected yet.
    return False
195
-
196
def safe_execute_stage():
    """Render the UI for the current analysis stage (1-5).

    Reads the workflow and stage number from st.session_state and draws
    the corresponding Streamlit widgets. Any exception is caught at the
    bottom, shown to the user, appended to the session error log, and
    logged via the module-level logger — the app never crashes mid-stage.
    """
    try:
        workflow = st.session_state.workflow
        stage = st.session_state.current_stage

        if stage == 1:
            # Stage 1: Data Overview with safe execution
            st.subheader("📊 Data Overview")

            # Basic info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Rows", f"{len(workflow.df):,}")
            with col2:
                st.metric("Columns", f"{len(workflow.df.columns):,}")
            with col3:
                # deep=True includes object-dtype payloads in the estimate
                memory_mb = workflow.df.memory_usage(deep=True).sum() / 1024**2
                st.metric("Memory", f"{memory_mb:.1f} MB")

            # Data types
            st.markdown("### Column Information")
            dtype_counts = workflow.df.dtypes.value_counts()
            for dtype, count in dtype_counts.items():
                st.write(f"• **{dtype}**: {count} columns")

            # Missing values
            missing_total = workflow.df.isnull().sum().sum()
            duplicates = workflow.df.duplicated().sum()

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Missing Values", f"{missing_total:,}")
            with col2:
                st.metric("Duplicates", f"{duplicates:,}")

            # Quality score calculation (fixed)
            # Heuristic: start at 100, penalize missing cells x2 and
            # duplicate rows x3 (both as percentages), floor at 0.
            total_cells = len(workflow.df) * len(workflow.df.columns)
            missing_pct = (missing_total / total_cells) * 100 if total_cells > 0 else 0
            duplicate_pct = (duplicates / len(workflow.df)) * 100 if len(workflow.df) > 0 else 0

            quality_score = max(0, 100 - (missing_pct * 2) - (duplicate_pct * 3))

            if quality_score >= 90:
                st.success(f"🌟 Excellent Data Quality: {quality_score:.0f}%")
            elif quality_score >= 70:
                st.info(f"👍 Good Data Quality: {quality_score:.0f}%")
            else:
                st.warning(f"⚠️ Data Quality Needs Improvement: {quality_score:.0f}%")

            # Add insight to workflow
            workflow.add_insight(f"Data overview completed. Quality score: {quality_score:.0f}%",
                stage, "success")

        elif stage == 2:
            # Stage 2: Data Exploration
            st.subheader("🔍 Data Exploration")

            numeric_cols = workflow.df.select_dtypes(include=['number']).columns
            categorical_cols = workflow.df.select_dtypes(include=['object', 'category']).columns

            if len(numeric_cols) > 0:
                st.markdown("### Numeric Columns")
                st.dataframe(workflow.df[numeric_cols].describe(), use_container_width=True)

            if len(categorical_cols) > 0:
                st.markdown("### Categorical Columns")
                for col in categorical_cols[:5]:  # Show first 5 categorical columns
                    unique_count = workflow.df[col].nunique()
                    st.write(f"**{col}**: {unique_count} unique values")

            workflow.add_insight("Data exploration completed", stage, "success")

        elif stage == 3:
            # Stage 3: Data Cleaning
            st.subheader("🧹 Data Quality Check")

            # Missing values by column
            missing_by_col = workflow.df.isnull().sum()
            missing_cols = missing_by_col[missing_by_col > 0]

            if len(missing_cols) > 0:
                st.markdown("### Missing Values by Column")
                for col, count in missing_cols.items():
                    pct = (count / len(workflow.df)) * 100
                    st.write(f"• **{col}**: {count} missing ({pct:.1f}%)")
            else:
                st.success("✅ No missing values found")

            # Duplicates
            duplicates = workflow.df.duplicated().sum()
            if duplicates > 0:
                st.warning(f"⚠️ Found {duplicates} duplicate rows")
            else:
                st.success("✅ No duplicate rows found")

            workflow.add_insight("Quality check completed", stage, "success")

        elif stage == 4:
            # Stage 4: Advanced Analysis
            st.subheader("🔬 Advanced Analysis")

            numeric_cols = workflow.df.select_dtypes(include=['number']).columns

            # Correlation needs at least two numeric columns; otherwise the
            # stage renders nothing but still records the insight below.
            if len(numeric_cols) >= 2:
                st.markdown("### Correlation Matrix")
                corr_matrix = workflow.df[numeric_cols].corr()
                st.dataframe(corr_matrix, use_container_width=True)

            workflow.add_insight("Advanced analysis completed", stage, "success")

        elif stage == 5:
            # Stage 5: Summary
            st.subheader("📈 Analysis Summary")

            st.markdown("### Analysis Complete!")
            st.write(f"• Dataset: {len(workflow.df):,} rows × {len(workflow.df.columns):,} columns")
            st.write(f"• Insights generated: {len(workflow.insights)}")
            st.write(f"• Analysis stages completed: {st.session_state.current_stage}")

            # Export options
            # NOTE(review): the download_button is nested inside a regular
            # button's click handler, so it only appears for the one rerun
            # after "📊 Download CSV" is pressed — confirm this is intended.
            st.markdown("### Export Options")
            if st.button("📊 Download CSV"):
                csv = workflow.df.to_csv(index=False)
                st.download_button(
                    label="Download CSV",
                    data=csv,
                    file_name="analyzed_data.csv",
                    mime="text/csv"
                )

            # Celebrate only the first time the summary stage is reached.
            if not st.session_state.analysis_complete:
                st.session_state.analysis_complete = True
                st.balloons()

            workflow.add_insight("Analysis summary completed", stage, "success")

        # Initialize stats if not exists
        if not hasattr(workflow, 'stats'):
            workflow.stats = {
                'missing_values': workflow.df.isnull().sum().sum(),
                'duplicates': workflow.df.duplicated().sum()
            }

    except Exception as e:
        error_msg = f"Error in stage {st.session_state.current_stage}: {str(e)}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)
        logger.error(error_msg)

        # Show fallback content
        st.warning("⚠️ Analysis stage encountered an issue. Please try refreshing.")
348
-
349
def main():
    """Main application.

    Orchestrates the page: session-state setup, header, file upload,
    then — once data is loaded — the staged analysis (left column), the
    AI assistant (right column), and the progress sidebar. A catch-all
    handler reports any unexpected error; a Restart button clears the
    whole session.
    """
    try:
        initialize_session_state()
        display_header()

        data_loaded = handle_file_upload()

        if data_loaded and st.session_state.workflow is not None:
            # 3:1 split — staged analysis on the left, AI panel on the right.
            main_col, ai_col = st.columns([3, 1])

            with main_col:
                safe_execute_stage()

            with ai_col:
                display_ai_assistant()

            display_sidebar()

            if st.session_state.analysis_complete:
                st.success("🎉 **Analysis Complete!**")

        elif not data_loaded:
            # Landing copy shown before any file is uploaded.
            st.markdown("### 🚀 Welcome to Data Analysis Platform")
            st.markdown("""
            **Features:**
            - 5-stage analysis workflow
            - AI-powered insights
            - Data quality assessment
            - Interactive visualizations
            - Export capabilities

            **Supported Formats:** CSV, Excel (.xlsx, .xls)
            """)

    except Exception as e:
        # Top-level boundary: surface the error and keep it in the session log.
        error_msg = f"Application error: {str(e)}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)

    # Restart sits after the try/except so it renders even when the app
    # body raised; clearing session state forces a fresh start on rerun.
    if st.button("🔄 Restart"):
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.rerun()


if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from data_handler import load_data
 
4
  from analyzer import DataAnalysisWorkflow, AIAssistant
5
 
6
def main():
    """Single-page Streamlit app: upload a dataset, step through a
    five-stage analysis workflow, and query an AI assistant about the
    generated insights. All state lives in st.session_state so it
    survives Streamlit's rerun-on-interaction model.
    """
    st.set_page_config(
        page_title="Data Analysis Platform",
        page_icon="📊",
        layout="wide"
    )

    st.title("📊 Data Analysis Platform")
    st.markdown("**Optimized workflow with caching and pagination**")

    # Initialize session state (persists across reruns)
    if 'current_stage' not in st.session_state:
        st.session_state.current_stage = 1
    if 'workflow' not in st.session_state:
        st.session_state.workflow = None
    if 'ai_assistant' not in st.session_state:
        st.session_state.ai_assistant = AIAssistant()

    # File upload
    uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])

    if uploaded_file is not None:
        try:
            # Load data (runs every rerun while a file stays selected)
            df = load_data(uploaded_file)
            st.success(f"✅ Dataset loaded! Shape: {df.shape}")

            # Initialize workflow.
            # BUG FIX: checking only `is None` kept a stale workflow (and
            # its old insights) when the user uploaded a *different* file;
            # rebuild when the file name changes and restart the stages.
            if (st.session_state.workflow is None
                    or st.session_state.get('_loaded_file') != uploaded_file.name):
                st.session_state.workflow = DataAnalysisWorkflow(df)
                st.session_state['_loaded_file'] = uploaded_file.name
                st.session_state.current_stage = 1

            # Progress sidebar (return value of st.sidebar.progress unused)
            st.sidebar.header("Progress")
            st.sidebar.progress(st.session_state.current_stage / 5)

            stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
            for i, stage in enumerate(stages, 1):
                if i == st.session_state.current_stage:
                    st.sidebar.write(f"🔄 **{i}. {stage}**")
                elif i < st.session_state.current_stage:
                    st.sidebar.write(f"✅ {i}. {stage}")
                else:
                    st.sidebar.write(f" {i}. {stage}")

            # Navigation (each click triggers a rerun so the new stage renders)
            col1, col2 = st.sidebar.columns(2)
            with col1:
                if st.button(" Previous") and st.session_state.current_stage > 1:
                    st.session_state.current_stage -= 1
                    st.rerun()
            with col2:
                if st.button("Next ") and st.session_state.current_stage < 5:
                    st.session_state.current_stage += 1
                    st.rerun()

            # Recent insights (last three recorded by the workflow)
            st.sidebar.header("💡 Recent Insights")
            recent_insights = st.session_state.workflow.insights[-3:]
            for insight in recent_insights:
                st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")

            # Main content with AI assistant
            main_col, ai_col = st.columns([3, 1])

            with main_col:
                # Execute current stage via a dispatch table
                stage_runners = {
                    1: st.session_state.workflow.stage_1_overview,
                    2: st.session_state.workflow.stage_2_exploration,
                    3: st.session_state.workflow.stage_3_cleaning,
                    4: st.session_state.workflow.stage_4_analysis,
                    5: st.session_state.workflow.stage_5_summary,
                }
                runner = stage_runners.get(st.session_state.current_stage)
                if runner is not None:
                    runner()

            with ai_col:
                st.subheader("🤖 AI Assistant")

                # AI model selection
                available_models = st.session_state.ai_assistant.get_available_models()

                if available_models:
                    selected_model = st.selectbox("AI Model:", available_models)

                    if st.button("Get AI Insights"):
                        if st.session_state.workflow.insights:
                            with st.spinner("Analyzing with AI..."):
                                ai_analysis = st.session_state.ai_assistant.analyze_insights(
                                    df, st.session_state.workflow.insights, selected_model
                                )
                                st.write("**AI Analysis:**")
                                st.write(ai_analysis)
                        else:
                            st.warning("Complete some analysis stages first.")
                else:
                    st.warning("No AI models available.")
                    st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")

                # Quick insights
                st.subheader("📊 Quick Stats")
                if st.session_state.workflow.insights:
                    st.metric("Total Insights", len(st.session_state.workflow.insights))
                    st.metric("Current Stage", f"{st.session_state.current_stage}/5")

                # Latest insight
                if st.session_state.workflow.insights:
                    latest = st.session_state.workflow.insights[-1]
                    st.info(f"**Latest:** {latest['insight']}")

                # Data quality indicator.
                # BUG FIX: `workflow.stats` may not exist before any stage
                # populates it; indexing it raised AttributeError and tripped
                # the catch-all handler below. Read defensively instead.
                stats = getattr(st.session_state.workflow, 'stats', None) or {}
                quality_score = 100
                if stats.get('missing_values', 0) > 0:
                    quality_score -= 30
                if stats.get('duplicates', 0) > 0:
                    quality_score -= 20

                st.metric("Data Quality", f"{quality_score}%")

        except Exception as e:
            # Boundary handler: surface load/analysis failures to the user
            st.error(f"Error: {str(e)}")
            st.info("Please check your file format and try again.")


if __name__ == "__main__":
    main()