Spaces:

entropy25
/

data-analysis-platform

Sleeping

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

78b8458

verified ·

1 Parent(s): 6a3c971

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -317

app.py CHANGED Viewed

@@ -1,44 +1,17 @@
 import streamlit as st
 import pandas as pd
-import os
 from data_handler import load_data
 from analyzer import DataAnalysisWorkflow, AIAssistant
-# HuggingFace specific configurations
-def configure_for_hf():
-    """Configure app for HuggingFace Spaces deployment"""
     st.set_page_config(
-        page_title="AI Data Quality Inspector",
-        page_icon="🔍",
-        layout="wide",
-        initial_sidebar_state="expanded"
     )
-    # Add custom CSS for better mobile experience
-    st.markdown("""
-    <style>
-    .main > div {
-        padding-top: 2rem;
-    }
-    .stMetric {
-        background-color: #f0f2f6;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        border: 1px solid #e6e9ef;
-    }
-    </style>
-    """, unsafe_allow_html=True)
-def main():
-    configure_for_hf()
-    # Header with improved styling
-    st.title("🔍 AI Data Quality Inspector")
-    st.markdown("""
-    **Upload → Inspect → Repair → Download** | *Transform messy data into clean datasets in 3 minutes*
-    🎯 **Features**: Visual quality scoring • AI repair suggestions • Interactive problem detection • One-click fixes
-    """)
     # Initialize session state
     if 'current_stage' not in st.session_state:
@@ -48,309 +21,111 @@ def main():
     if 'ai_assistant' not in st.session_state:
         st.session_state.ai_assistant = AIAssistant()
-    # File upload with enhanced UI
-    st.markdown("### 📁 Upload Dataset")
-    uploaded_file = st.file_uploader(
-        "Choose CSV or Excel file",
-        type=['csv', 'xlsx', 'xls'],
-        help="Supports files up to 200MB. CSV files with UTF-8, Latin-1, or CP1252 encoding."
-    )
-    # Sample data option for demo
-    col1, col2 = st.columns([3, 1])
-    with col2:
-        if st.button("🎮 Try Sample Data", use_container_width=True):
-            # Create sample problematic dataset
-            import numpy as np
-            np.random.seed(42)
-            sample_data = {
-                'customer_id': range(1, 1001),
-                'age': np.random.normal(35, 10, 1000),
-                'income': np.random.normal(50000, 15000, 1000),
-                'score': np.random.normal(75, 20, 1000),
-                'category': np.random.choice(['Premium', 'Standard', 'Basic', None], 1000, p=[0.3, 0.4, 0.2, 0.1]),
-                'region': np.random.choice(['North', 'South', 'East', 'West'], 1000)
-            }
-            df = pd.DataFrame(sample_data)
-            # Inject quality issues for demonstration
-            missing_indices = np.random.choice(df.index, 150, replace=False)
-            df.loc[missing_indices, 'income'] = None
-            outlier_indices = np.random.choice(df.index, 50, replace=False)
-            df.loc[outlier_indices, 'age'] = np.random.uniform(100, 150, 50)
-            df = pd.concat([df, df.head(25)])  # Add duplicates
-            st.session_state.sample_data = df
-            st.success("✅ Sample data loaded! Continue below to analyze.")
-    # Handle file upload or sample data
-    df = None
     if uploaded_file is not None:
         try:
-            with st.spinner("🔄 Loading and analyzing dataset..."):
-                df = load_data(uploaded_file)
-                st.success(f"✅ Dataset loaded successfully! Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
-        except Exception as e:
-            st.error(f"❌ Error loading file: {str(e)}")
-            st.info("Please check your file format. Supported: CSV (UTF-8, Latin-1), Excel (.xlsx, .xls)")
-            return
-    elif 'sample_data' in st.session_state:
-        df = st.session_state.sample_data
-        st.info("📊 Using sample dataset for demonstration")
-    if df is not None:
-        # Initialize or update workflow
-        if (st.session_state.workflow is None or
-            st.session_state.workflow.df.shape != df.shape):
-            st.session_state.workflow = DataAnalysisWorkflow(df)
-        workflow = st.session_state.workflow
-        # Enhanced sidebar with progress
-        with st.sidebar:
-            st.header("🎯 Analysis Progress")
-            # Progress indicator - Fixed to prevent values > 1.0
-            progress_value = min(st.session_state.current_stage / 5, 1.0)
-            st.progress(progress_value)
-            st.write(f"Stage {st.session_state.current_stage} of 5")
-            # Stage navigation
-            stages = [
-                ("📊", "Data Overview", "Get instant quality insights"),
-                ("🔍", "Exploration", "Discover patterns and relationships"),
-                ("🧹", "Quality Check", "Detect and fix data issues"),
-                ("🔬", "Analysis", "Advanced statistical analysis"),
-                ("📈", "Summary", "Export results and reports")
-            ]
-            st.markdown("### 📋 Stages")
-            for i, (icon, name, desc) in enumerate(stages, 1):
                 if i == st.session_state.current_stage:
-                    st.markdown(f"**{icon} {i}. {name}** 🔄")
-                    st.caption(desc)
                 elif i < st.session_state.current_stage:
-                    st.markdown(f"✅ {icon} {i}. {name}")
                 else:
-                    st.markdown(f"⏳ {icon} {i}. {name}")
-            # Navigation buttons
-            col1, col2 = st.columns(2)
             with col1:
-                if st.button("← Previous", disabled=st.session_state.current_stage <= 1):
                     st.session_state.current_stage -= 1
                     st.rerun()
             with col2:
-                if st.button("Next →", disabled=st.session_state.current_stage >= 5):
                     st.session_state.current_stage += 1
                     st.rerun()
-            # Quick insights panel
-            if workflow.insights:
-                st.markdown("### 💡 Latest Insights")
-                recent_insights = workflow.insights[-3:]
-                for insight in recent_insights:
-                    with st.expander(f"Stage {insight['stage']}", expanded=False):
-                        st.write(insight['insight'])
-            # Quick stats
-            st.markdown("### 📊 Quick Stats")
-            st.metric("Data Quality", "Calculating..." if not workflow.insights else "Good")
-            st.metric("Issues Found", len([i for i in workflow.insights if 'issue' in i['insight'].lower()]))
-            st.metric("Memory Usage", f"{workflow.stats['memory_usage']:.1f} MB")
-        # Main content area with enhanced layout
-        st.markdown("---")
-        # Stage execution with improved styling
-        if st.session_state.current_stage == 1:
-            st.markdown("## 📊 Data Overview & Quality Assessment")
-            workflow.stage_1_overview()
-        elif st.session_state.current_stage == 2:
-            st.markdown("## 🔍 Exploratory Data Analysis")
-            workflow.stage_2_exploration()
-        elif st.session_state.current_stage == 3:
-            st.markdown("## 🧹 Data Quality Check & Repair")
-            workflow.stage_3_cleaning()
-        elif st.session_state.current_stage == 4:
-            st.markdown("## 🔬 Advanced Statistical Analysis")
-            workflow.stage_4_analysis()
-        elif st.session_state.current_stage == 5:
-            st.markdown("## 📈 Summary & Export")
-            workflow.stage_5_summary()
-        # AI Assistant panel (enhanced for HF)
-        with st.expander("🤖 AI Assistant", expanded=False):
-            st.markdown("### AI-Powered Data Insights")
-            # Show available AI features (mock for HF deployment)
-            st.info("💡 **AI Features Available:**\n- Automated quality scoring\n- Smart repair suggestions\n- Business impact analysis\n- Pattern recognition")
-            if st.button("🧠 Generate AI Analysis"):
-                if workflow.insights:
-                    with st.spinner("🤖 AI analyzing your data..."):
-                        # Simulate AI analysis with built-in intelligence
-                        ai_insights = generate_builtin_ai_analysis(workflow.df, workflow.insights)
-                        st.markdown("**🎯 AI Analysis Results:**")
-                        for category, insight in ai_insights.items():
-                            with st.expander(f"📋 {category}", expanded=True):
-                                st.write(insight)
                 else:
-                    st.warning("Complete some analysis stages first to get AI insights.")
-    else:
-        # Enhanced landing page
-        st.markdown("""
-        ## 🚀 Welcome to AI Data Quality Inspector
-        Transform your messy datasets into analysis-ready data in just **3 minutes**!
-        ### ✨ What You Get:
-        """)
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.markdown("""
-            **🔍 Instant Detection**
-            - Visual quality scoring
-            - Missing value heatmaps
-            - Outlier identification
-            - Duplicate detection
-            """)
-        with col2:
-            st.markdown("""
-            **🤖 AI Guidance**
-            - Smart repair suggestions
-            - Business impact analysis
-            - Confidence scoring
-            - One-click fixes
-            """)
-        with col3:
-            st.markdown("""
-            **📊 Professional Results**
-            - Clean datasets
-            - Quality reports
-            - Visual comparisons
-            - Export options
-            """)
-        st.markdown("""
-        ### 🎯 Perfect For:
-        - **Business Analysts**: Validate data before reporting
-        - **Data Engineers**: Pre-import quality checks
-        - **Operations Teams**: Non-technical data assessment
-        ### 🚀 Get Started:
-        1. Upload your CSV or Excel file above
-        2. Navigate through the 5-stage analysis workflow
-        3. Apply AI-suggested repairs with one click
-        4. Download your cleaned dataset and quality report
-        """)
-def generate_builtin_ai_analysis(df: pd.DataFrame, insights: list) -> dict:
-    """Generate AI-style analysis without external APIs"""
-    analysis = {}
-    # Data Quality Assessment
-    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
-    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
-    if missing_pct > 10:
-        analysis["🚨 Data Completeness"] = f"""
-        **Issue**: {missing_pct:.1f}% of your data is missing, which could significantly impact analysis reliability.
-        **Business Impact**: Missing data can lead to biased insights and incorrect business decisions.
-        **Recommendation**: Focus on columns with >20% missing values - consider external data sources or business process improvements.
-        """
-    elif missing_pct > 0:
-        analysis["✅ Data Completeness"] = f"""
-        **Status**: Only {missing_pct:.1f}% missing data - within acceptable limits.
-        **Recommendation**: Apply median/mode filling for remaining gaps before analysis.
-        """
-    # Outlier Analysis
-    numeric_cols = df.select_dtypes(include=['number']).columns
-    total_outliers = 0
-    for col in numeric_cols:
-        Q1 = df[col].quantile(0.25)
-        Q3 = df[col].quantile(0.75)
-        IQR = Q3 - Q1
-        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
-        total_outliers += len(outliers)
-    if total_outliers > len(df) * 0.05:  # More than 5% outliers
-        analysis["⚠️ Statistical Anomalies"] = f"""
-        **Issue**: {total_outliers} outliers detected ({(total_outliers/len(df)*100):.1f}% of data).
-        **Possible Causes**: Data entry errors, system glitches, or genuine extreme values.
-        **Recommendation**: Investigate business context before removing. Consider capping instead of deletion.
-        """
-    # Data Distribution Analysis
-    if len(numeric_cols) > 1:
-        corr_matrix = df[numeric_cols].corr()
-        max_corr = corr_matrix.abs().unstack().sort_values(ascending=False).iloc[1]  # Exclude self-correlation
-        if max_corr > 0.8:
-            analysis["🔗 Strong Correlations"] = f"""
-            **Finding**: Strong correlation detected (r={max_corr:.3f}) between variables.
-            **Opportunity**: This suggests predictable relationships in your data - valuable for forecasting and modeling.
-            **Next Steps**: Use correlated variables for predictive analysis or data validation.
-            """
-    # Memory and Performance
-    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
-    if memory_mb > 100:
-        analysis["🔧 Performance Optimization"] = f"""
-        **Status**: Dataset uses {memory_mb:.1f}MB memory.
-        **Optimization**: Convert categorical columns to 'category' dtype to reduce memory usage by up to 50%.
-        **Benefit**: Faster processing and lower resource consumption.
-        """
-    # Business Insights
-    categorical_cols = df.select_dtypes(include=['object']).columns
-    if len(categorical_cols) > 0:
-        high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() > len(df) * 0.5]
-        if high_cardinality_cols:
-            analysis["📈 Business Intelligence"] = f"""
-            **Finding**: High-cardinality columns detected: {', '.join(high_cardinality_cols)}.
-            **Insight**: These might be customer IDs or transaction codes - valuable for tracking but not for grouping analysis.
-            **Strategy**: Use for joins and lookups, avoid in statistical summaries.
-            """
-    if not analysis:
-        analysis["🎉 Excellent Data Quality"] = """
-        **Status**: Your dataset shows excellent quality metrics across all dimensions.
-        **Ready for**: Advanced analytics, machine learning, and business intelligence applications.
-        **Next Steps**: Proceed with confidence to your analytical objectives.
-        """
-    return analysis
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
 from data_handler import load_data
 from analyzer import DataAnalysisWorkflow, AIAssistant
+def main():
     st.set_page_config(
+        page_title="Data Analysis Platform",
+        page_icon="📊",
+        layout="wide"
     )
+    st.title("📊 Data Analysis Platform")
+    st.markdown("**Optimized workflow with caching and pagination**")
     # Initialize session state
     if 'current_stage' not in st.session_state:
     if 'ai_assistant' not in st.session_state:
         st.session_state.ai_assistant = AIAssistant()
+    # File upload
+    uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])
     if uploaded_file is not None:
         try:
+            # Load data
+            df = load_data(uploaded_file)
+            st.success(f"✅ Dataset loaded! Shape: {df.shape}")
+            # Initialize workflow
+            if st.session_state.workflow is None:
+                st.session_state.workflow = DataAnalysisWorkflow(df)
+            # Progress sidebar
+            st.sidebar.header("Progress")
+            progress = st.sidebar.progress(st.session_state.current_stage / 5)
+            stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
+            for i, stage in enumerate(stages, 1):
                 if i == st.session_state.current_stage:
+                    st.sidebar.write(f"🔄 **{i}. {stage}**")
                 elif i < st.session_state.current_stage:
+                    st.sidebar.write(f"✅ {i}. {stage}")
                 else:
+                    st.sidebar.write(f"⏳ {i}. {stage}")
+            # Navigation
+            col1, col2 = st.sidebar.columns(2)
             with col1:
+                if st.button("← Previous") and st.session_state.current_stage > 1:
                     st.session_state.current_stage -= 1
                     st.rerun()
             with col2:
+                if st.button("Next →") and st.session_state.current_stage < 5:
                     st.session_state.current_stage += 1
                     st.rerun()
+            # Recent insights
+            st.sidebar.header("💡 Recent Insights")
+            recent_insights = st.session_state.workflow.insights[-3:]
+            for insight in recent_insights:
+                st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")
+            # Main content with AI assistant
+            main_col, ai_col = st.columns([3, 1])
+            with main_col:
+                # Execute current stage
+                if st.session_state.current_stage == 1:
+                    st.session_state.workflow.stage_1_overview()
+                elif st.session_state.current_stage == 2:
+                    st.session_state.workflow.stage_2_exploration()
+                elif st.session_state.current_stage == 3:
+                    st.session_state.workflow.stage_3_cleaning()
+                elif st.session_state.current_stage == 4:
+                    st.session_state.workflow.stage_4_analysis()
+                elif st.session_state.current_stage == 5:
+                    st.session_state.workflow.stage_5_summary()
+            with ai_col:
+                st.subheader("🤖 AI Assistant")
+                # AI model selection
+                available_models = st.session_state.ai_assistant.get_available_models()
+                if available_models:
+                    selected_model = st.selectbox("AI Model:", available_models)
+                    if st.button("Get AI Insights"):
+                        if st.session_state.workflow.insights:
+                            with st.spinner("Analyzing with AI..."):
+                                ai_analysis = st.session_state.ai_assistant.analyze_insights(
+                                    df, st.session_state.workflow.insights, selected_model
+                                )
+                                st.write("**AI Analysis:**")
+                                st.write(ai_analysis)
+                        else:
+                            st.warning("Complete some analysis stages first.")
                 else:
+                    st.warning("No AI models available.")
+                    st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")
+                # Quick insights
+                st.subheader("📊 Quick Stats")
+                if st.session_state.workflow.insights:
+                    st.metric("Total Insights", len(st.session_state.workflow.insights))
+                    st.metric("Current Stage", f"{st.session_state.current_stage}/5")
+                    # Latest insight
+                    if st.session_state.workflow.insights:
+                        latest = st.session_state.workflow.insights[-1]
+                        st.info(f"**Latest:** {latest['insight']}")
+                # Data quality indicator
+                quality_score = 100
+                if st.session_state.workflow.stats['missing_values'] > 0:
+                    quality_score -= 30
+                if st.session_state.workflow.stats['duplicates'] > 0:
+                    quality_score -= 20
+                st.metric("Data Quality", f"{quality_score}%")
+        except Exception as e:
+            st.error(f"Error: {str(e)}")
+            st.info("Please check your file format and try again.")
 if __name__ == "__main__":
     main()