Spaces:

entropy25
/

data-analysis-platform

Build error

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

ed4ea1f

verified ·

1 Parent(s): 2fad68d

Update data_handler.py

Browse files

Files changed (1) hide show

data_handler.py +196 -529

data_handler.py CHANGED Viewed

@@ -2,101 +2,40 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import warnings
-from typing import Dict, List, Any, Tuple, Optional
 from scipy import stats
-import logging
 warnings.filterwarnings('ignore')
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Enhanced error handling decorator
-def handle_errors(func):
-    """Decorator for consistent error handling"""
-    def wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except Exception as e:
-            logger.error(f"Error in {func.__name__}: {str(e)}")
-            st.error(f"Error in {func.__name__}: {str(e)}")
-            return None
-    return wrapper
 @st.cache_data
-@handle_errors
-def load_csv_with_encoding(file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
-    """Load CSV with automatic encoding detection and enhanced error handling"""
     try:
-        import chardet
-        detected = chardet.detect(file_content)
-        encoding = detected.get('encoding', 'utf-8')
         from io import BytesIO
-        df = pd.read_csv(BytesIO(file_content), encoding=encoding)
-        # Validate loaded data
-        if df.empty:
-            raise ValueError("The uploaded file is empty")
-        if df.shape[1] == 1 and df.columns[0].count(',') > 0:
-            # Might be semicolon separated
-            df = pd.read_csv(BytesIO(file_content), encoding=encoding, sep=';')
-        logger.info(f"Successfully loaded CSV: {df.shape}")
-        return df
-    except Exception as e:
-        # Try alternative encodings
         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
         for enc in encodings:
             try:
-                df = pd.read_csv(BytesIO(file_content), encoding=enc)
-                if not df.empty:
-                    logger.info(f"Loaded CSV with encoding {enc}: {df.shape}")
-                    return df
             except:
                 continue
-        raise Exception(f"Cannot read file with any standard encoding. Original error: {str(e)}")
 @st.cache_data
-@handle_errors
-def load_excel_file(file_content: bytes) -> Optional[pd.DataFrame]:
-    """Load Excel file with enhanced error handling"""
     from io import BytesIO
-    try:
-        # Try loading first sheet
-        df = pd.read_excel(BytesIO(file_content))
-        if df.empty:
-            raise ValueError("The Excel file is empty")
-        logger.info(f"Successfully loaded Excel: {df.shape}")
-        return df
-    except Exception as e:
-        # Try with different engines
-        for engine in ['openpyxl', 'xlrd']:
-            try:
-                df = pd.read_excel(BytesIO(file_content), engine=engine)
-                if not df.empty:
-                    logger.info(f"Loaded Excel with engine {engine}: {df.shape}")
-                    return df
-            except:
-                continue
-        raise Exception(f"Cannot read Excel file. Error: {str(e)}")
 @st.cache_data
-@handle_errors
 def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate basic statistics with error handling"""
-    if df is None or df.empty:
-        return {}
     dtype_counts = df.dtypes.value_counts()
     dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
@@ -109,522 +48,250 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
     }
 @st.cache_data
-@handle_errors
 def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate column cardinality analysis with improved categorization"""
-    if df is None or df.empty:
-        return pd.DataFrame()
     cardinality_data = []
     for col in df.columns:
-        try:
-            unique_count = df[col].nunique()
-            total_count = len(df)
-            unique_ratio = unique_count / total_count if total_count > 0 else 0
-            # Enhanced type classification
-            if unique_count == 1:
-                col_type = "Constant"
-            elif unique_count == total_count:
-                col_type = "Unique Identifier"
-            elif unique_ratio < 0.01:
-                col_type = "Very Low Cardinality"
-            elif unique_ratio < 0.05:
-                col_type = "Low Cardinality"
-            elif unique_ratio < 0.5:
-                col_type = "Medium Cardinality"
-            else:
-                col_type = "High Cardinality"
-            cardinality_data.append({
-                'Column': col,
-                'Unique Count': unique_count,
-                'Total Count': total_count,
-                'Unique Ratio': round(unique_ratio, 4),
-                'Type': col_type,
-                'Data Type': str(df[col].dtype)
-            })
-        except Exception as e:
-            logger.warning(f"Error processing column {col}: {str(e)}")
-            continue
     return pd.DataFrame(cardinality_data)
 @st.cache_data
-@handle_errors
 def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate memory optimization suggestions with validation"""
-    if df is None or df.empty:
-        return {'suggestions': [], 'current_memory_mb': 0, 'potential_savings_mb': 0, 'potential_savings_pct': 0}
     suggestions = []
     current_memory = df.memory_usage(deep=True).sum() / 1024**2
     potential_savings = 0
     for col in df.columns:
-        try:
-            if df[col].dtype == 'object' and not df[col].isnull().all():
-                unique_ratio = df[col].nunique() / len(df)
-                if unique_ratio < 0.5:  # Less than 50% unique values
-                    # Calculate potential savings
-                    test_series = df[col].dropna().head(1000)  # Sample for estimation
-                    if len(test_series) > 0:
-                        category_memory = test_series.astype('category').memory_usage(deep=True)
-                        object_memory = test_series.memory_usage(deep=True)
-                        savings_ratio = (object_memory - category_memory) / object_memory
-                        if savings_ratio > 0.1:  # More than 10% savings
-                            estimated_savings = (df[col].memory_usage(deep=True) * savings_ratio) / 1024**2
-                            suggestions.append({
-                                'Column': col,
-                                'Current Type': 'object',
-                                'Suggested Type': 'category',
-                                'Estimated Savings (MB)': round(estimated_savings, 2),
-                                'Unique Ratio': round(unique_ratio, 3)
-                            })
-                            potential_savings += estimated_savings
-            # Check for int64 that could be int32
-            elif df[col].dtype == 'int64':
-                if df[col].min() >= -2147483648 and df[col].max() <= 2147483647:
-                    savings = df[col].memory_usage(deep=True) * 0.5 / 1024**2
                     suggestions.append({
-                        'Column': col,
-                        'Current Type': 'int64',
-                        'Suggested Type': 'int32',
-                        'Estimated Savings (MB)': round(savings, 2),
-                        'Unique Ratio': 'N/A'
                     })
                     potential_savings += savings
-        except Exception as e:
-            logger.warning(f"Error analyzing memory for column {col}: {str(e)}")
-            continue
     return {
         'suggestions': suggestions,
-        'current_memory_mb': round(current_memory, 2),
-        'potential_savings_mb': round(potential_savings, 2),
-        'potential_savings_pct': round((potential_savings / current_memory) * 100, 1) if current_memory > 0 else 0
     }
 @st.cache_data
-@handle_errors
 def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate missing data analysis with enhanced insights"""
-    if df is None or df.empty:
-        return pd.DataFrame()
     missing_data = df.isnull().sum()
     if missing_data.sum() > 0:
         missing_df = pd.DataFrame({
             'Column': missing_data.index,
             'Missing Count': missing_data.values,
-            'Missing %': round((missing_data.values / len(df)) * 100, 2),
-            'Data Type': [str(df[col].dtype) for col in missing_data.index]
         })
-        result = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
-        return result.reset_index(drop=True)
     return pd.DataFrame()
 @st.cache_data
-@handle_errors
 def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate correlation matrix with validation"""
-    if df is None or df.empty:
-        return pd.DataFrame()
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    if len(numeric_cols) < 2:
-        return pd.DataFrame()
-    # Remove columns with all NaN or constant values
-    valid_cols = []
-    for col in numeric_cols:
-        if not df[col].isnull().all() and df[col].nunique() > 1:
-            valid_cols.append(col)
-    if len(valid_cols) < 2:
-        return pd.DataFrame()
-    return df[valid_cols].corr()
 @st.cache_data
-@handle_errors
 def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
-    """Enhanced column type detection"""
-    if df is None or df.empty:
-        return {'numeric': [], 'categorical': [], 'datetime': [], 'boolean': []}
-    result = {
         'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
-        'categorical': df.select_dtypes(include=['object', 'category']).columns.tolist(),
-        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist(),
-        'boolean': df.select_dtypes(include=['bool']).columns.tolist()
     }
-    # Auto-detect potential datetime columns in object type
-    potential_datetime = []
-    for col in result['categorical']:
-        if df[col].dtype == 'object':
-            sample = df[col].dropna().head(100)
-            if len(sample) > 0:
-                try:
-                    pd.to_datetime(sample.iloc[0])
-                    potential_datetime.append(col)
-                except:
-                    pass
-    if potential_datetime:
-        result['potential_datetime'] = potential_datetime
-    return result
 @st.cache_data
-@handle_errors
-def calculate_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.DataFrame:
-    """Enhanced outlier detection with multiple methods"""
-    if df is None or df.empty or column not in df.columns:
-        return pd.DataFrame()
-    if not pd.api.types.is_numeric_dtype(df[column]):
-        return pd.DataFrame()
     series = df[column].dropna()
-    if len(series) == 0:
-        return pd.DataFrame()
-    if method == 'iqr':
-        Q1 = series.quantile(0.25)
-        Q3 = series.quantile(0.75)
-        IQR = Q3 - Q1
-        if IQR == 0:  # All values are the same
-            return pd.DataFrame()
-        lower_bound = Q1 - 1.5 * IQR
-        upper_bound = Q3 + 1.5 * IQR
-        outlier_mask = (df[column] < lower_bound) | (df[column] > upper_bound)
-    elif method == 'zscore':
-        z_scores = np.abs(stats.zscore(series))
-        outlier_indices = series.index[z_scores > 3]
-        outlier_mask = df.index.isin(outlier_indices)
-    else:  # percentile
-        lower_bound = series.quantile(0.01)
-        upper_bound = series.quantile(0.99)
-        outlier_mask = (df[column] < lower_bound) | (df[column] > upper_bound)
-    return df[outlier_mask]
 @st.cache_data
-@handle_errors
 def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
-    """Detect columns with mixed data types and provide detailed analysis"""
-    if df is None or df.empty:
-        return []
     mixed_type_issues = []
     for col in df.select_dtypes(include=['object']).columns:
-        try:
-            # Skip if all values are null
-            if df[col].isnull().all():
-                continue
-            # Try numeric conversion
-            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
-            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
-            if new_nulls > 0 and new_nulls < len(df[col]) * 0.9:  # Not too many conversion failures
-                # Find problematic values
-                original_not_null = df[col].notnull()
-                converted_null = numeric_conversion.isnull()
-                problematic_mask = original_not_null & converted_null
-                if problematic_mask.sum() > 0:
-                    sample_problems = df[col][problematic_mask].value_counts().head(5)
-                    mixed_type_issues.append({
-                        'column': col,
-                        'problematic_values': int(new_nulls),
-                        'total_values': int(len(df[col])),
-                        'percentage': round((new_nulls / len(df[col])) * 100, 2),
-                        'sample_issues': sample_problems.to_dict()
-                    })
-        except Exception as e:
-            logger.warning(f"Error analyzing mixed types for column {col}: {str(e)}")
-            continue
     return mixed_type_issues
 @st.cache_data
-@handle_errors
-def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> Optional[pd.Series]:
-    """Get value counts with validation"""
-    if df is None or df.empty or column not in df.columns:
-        return pd.Series()
     return df[column].value_counts().head(top_n)
 @st.cache_data
-@handle_errors
-def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> Optional[pd.DataFrame]:
-    """Calculate group statistics with validation"""
-    if df is None or df.empty or group_col not in df.columns or metric_col not in df.columns:
-        return pd.DataFrame()
-    if not pd.api.types.is_numeric_dtype(df[metric_col]):
-        return pd.DataFrame()
-    # Limit to top groups for performance
-    top_groups = df[group_col].value_counts().head(20).index
-    filtered_df = df[df[group_col].isin(top_groups)]
-    stats_df = filtered_df.groupby(group_col)[metric_col].agg([
-        'count', 'mean', 'median', 'std', 'min', 'max'
-    ]).round(3)
-    return stats_df.reset_index()
 @st.cache_data
-@handle_errors
 def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
-    """Enhanced data quality scoring with detailed feedback"""
-    if df is None or df.empty:
-        return {'score': 0, 'issues': ['Dataset is empty'], 'grade': 'F'}
     score = 100
     issues = []
-    recommendations = []
-    try:
-        # Missing values assessment
-        total_cells = len(df) * len(df.columns)
-        missing_count = df.isnull().sum().sum()
-        missing_pct = (missing_count / total_cells) * 100 if total_cells > 0 else 0
-        if missing_pct > 0:
-            penalty = min(25, missing_pct * 2)
-            score -= penalty
-            issues.append(f"Missing values: {missing_pct:.1f}% of total data")
-            if missing_pct < 5:
-                recommendations.append("Low missing data - consider simple imputation")
-            elif missing_pct < 20:
-                recommendations.append("Moderate missing data - analyze patterns before imputation")
-            else:
-                recommendations.append("High missing data - investigate data collection process")
-        # Duplicates assessment
-        duplicate_count = df.duplicated().sum()
-        duplicate_pct = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0
-        if duplicate_pct > 0:
-            penalty = min(20, duplicate_pct * 3)
-            score -= penalty
-            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
-            recommendations.append("Remove or investigate duplicate records")
-        # Constant columns
-        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
-        if constant_cols:
-            penalty = min(15, len(constant_cols) * 2)
-            score -= penalty
-            issues.append(f"Constant columns: {len(constant_cols)} columns have no variation")
-            recommendations.append("Consider removing constant columns")
-        # Mixed types
-        mixed_types = detect_mixed_types(df)
-        if mixed_types:
-            penalty = min(15, len(mixed_types) * 3)
-            score -= penalty
-            issues.append(f"Mixed data types: {len(mixed_types)} columns need type conversion")
-            recommendations.append("Fix data type inconsistencies")
-        # Data size assessment
-        if len(df) < 10:
-            score -= 10
-            issues.append("Very small dataset - statistical power may be limited")
-        # Grade assignment
-        if score >= 95:
-            grade = 'A+'
-        elif score >= 90:
-            grade = 'A'
-        elif score >= 85:
-            grade = 'B+'
-        elif score >= 80:
-            grade = 'B'
-        elif score >= 75:
-            grade = 'C+'
-        elif score >= 70:
-            grade = 'C'
-        elif score >= 60:
-            grade = 'D'
-        else:
-            grade = 'F'
-        return {
-            'score': max(0, round(score, 1)),
-            'issues': issues,
-            'recommendations': recommendations,
-            'grade': grade
-        }
-    except Exception as e:
-        logger.error(f"Error calculating data quality: {str(e)}")
-        return {
-            'score': 0,
-            'issues': [f"Error calculating quality score: {str(e)}"],
-            'recommendations': ['Please check your data format'],
-            'grade': 'Error'
-        }
-def load_data(uploaded_file) -> Optional[pd.DataFrame]:
-    """Unified data loading with comprehensive error handling"""
-    if uploaded_file is None:
-        return None
-    try:
-        file_content = uploaded_file.read()
-        uploaded_file.seek(0)
-        file_size_mb = len(file_content) / 1024**2
-        # File size warning
-        if file_size_mb > 100:
-            st.warning(f"⚠️ Large file detected ({file_size_mb:.1f} MB). Processing may take longer.")
-        # Load based on file extension
-        if uploaded_file.name.lower().endswith('.csv'):
-            df = load_csv_with_encoding(file_content, uploaded_file.name)
-        elif uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
-            df = load_excel_file(file_content)
-        else:
-            raise ValueError(f"Unsupported file format: {uploaded_file.name}")
-        if df is None:
-            raise ValueError("Failed to load data from file")
-        # Additional validations
-        if df.empty:
-            raise ValueError("The uploaded file contains no data")
-        if len(df.columns) == 0:
-            raise ValueError("No columns found in the dataset")
-        # Clean column names
-        df.columns = df.columns.astype(str).str.strip()
-        # Remove completely empty rows/columns
-        df = df.dropna(how='all').dropna(axis=1, how='all')
-        if df.empty:
-            raise ValueError("No valid data remaining after cleaning empty rows/columns")
-        logger.info(f"Successfully loaded and validated data: {df.shape}")
-        return df
-    except Exception as e:
-        error_msg = f"Failed to load data: {str(e)}"
-        logger.error(error_msg)
-        st.error(error_msg)
-        st.info("💡 **Tips for successful upload:**\n"
-                "- Ensure file is not corrupted\n"
-                "- Check file encoding (UTF-8 recommended)\n"
-                "- Verify file has proper headers\n"
-                "- File size should be under 200MB for optimal performance")
-        return None
-@handle_errors
-def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, List[str]]:
-    """Validate dataframe for analysis readiness"""
-    if df is None:
-        return False, ["No dataframe provided"]
-    issues = []
-    if df.empty:
-        issues.append("Dataset is empty")
-    if len(df.columns) == 0:
-        issues.append("No columns found")
-    if len(df) < 2:
-        issues.append("Insufficient data for analysis (minimum 2 rows required)")
-    # Check for problematic column names
-    problematic_cols = [col for col in df.columns if not isinstance(col, str) or col.strip() == '']
-    if problematic_cols:
-        issues.append(f"Problematic column names detected: {len(problematic_cols)} columns")
-    return len(issues) == 0, issues
-@handle_errors
-def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Optional[pd.DataFrame]:
-    """Apply data cleaning operations with validation and rollback capability"""
-    if df is None or df.empty:
-        return df
     cleaned_df = df.copy()
-    applied_operations = []
-    try:
-        for operation in operations:
-            operation_type = operation.get('type')
-            column = operation.get('column')
-            # Validate operation
-            if operation_type == 'fill_missing' and column in cleaned_df.columns:
-                method = operation.get('method', 'mean')
-                if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
-                    fill_value = cleaned_df[column].mean()
-                elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
-                    fill_value = cleaned_df[column].median()
-                elif method == 'mode':
-                    mode_values = cleaned_df[column].mode()
-                    fill_value = mode_values.iloc[0] if not mode_values.empty else 'Unknown'
-                elif method == 'drop':
-                    original_len = len(cleaned_df)
-                    cleaned_df = cleaned_df.dropna(subset=[column])
-                    applied_operations.append(f"Dropped {original_len - len(cleaned_df)} rows with missing {column}")
-                    continue
-                else:
-                    fill_value = operation.get('value', 0)
-                original_missing = cleaned_df[column].isnull().sum()
-                cleaned_df[column] = cleaned_df[column].fillna(fill_value)
-                applied_operations.append(f"Filled {original_missing} missing values in {column} using {method}")
-            elif operation_type == 'remove_duplicates':
-                original_len = len(cleaned_df)
-                cleaned_df = cleaned_df.drop_duplicates()
-                removed = original_len - len(cleaned_df)
-                applied_operations.append(f"Removed {removed} duplicate rows")
-            elif operation_type == 'remove_outliers' and column in cleaned_df.columns:
-                original_len = len(cleaned_df)
-                outliers = calculate_outliers(cleaned_df, column)
-                if outliers is not None and not outliers.empty:
-                    cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
-                    removed = original_len - len(cleaned_df)
-                    applied_operations.append(f"Removed {removed} outliers from {column}")
-        logger.info(f"Applied {len(applied_operations)} cleaning operations")
-        return cleaned_df
-    except Exception as e:
-        error_msg = f"Error during data cleaning: {str(e)}"
-        logger.error(error_msg)
-        st.error(error_msg)
-        return df  # Return original data if cleaning fails

 import pandas as pd
 import numpy as np
 import warnings
+from typing import Dict, List, Any, Tuple
 from scipy import stats
 warnings.filterwarnings('ignore')
+# All cached data processing functions
 @st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Parse CSV bytes into a DataFrame, trying several text encodings.

    The encoding guessed by ``chardet`` is tried first when that package is
    importable and returns a guess; a fixed list of common encodings follows.
    The first encoding that parses successfully wins.

    Args:
        file_content: Raw bytes of the CSV file.
        filename: Name of the source file. Not used by the parser; kept so
            the signature stays unchanged for callers.

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If no candidate encoding yields a parseable CSV. The last
            underlying error is attached as ``__cause__``.
    """
    from io import BytesIO

    candidates = []
    try:
        # chardet is only a hint. Keeping the import and the detection inside
        # the try means a missing package or a failed detection degrades to
        # the fixed fallback list instead of aborting the whole load.
        import chardet
        guess = chardet.detect(file_content).get('encoding')
        if guess:
            candidates.append(guess)
    except Exception:
        pass

    for enc in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
        if enc not in candidates:
            candidates.append(enc)

    last_error = None
    for enc in candidates:
        try:
            return pd.read_csv(BytesIO(file_content), encoding=enc)
        except Exception as exc:
            # Best effort: remember why this encoding failed, try the next.
            last_error = exc
    raise Exception("Cannot read file with any encoding") from last_error
 @st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Read an in-memory Excel workbook and return it as a DataFrame.

    The bytes are wrapped in an in-memory stream and handed to
    ``pandas.read_excel`` with its default arguments.
    """
    from io import BytesIO

    workbook_stream = BytesIO(file_content)
    return pd.read_excel(workbook_stream)
 @st.cache_data
 def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate basic statistics - cached"""
     dtype_counts = df.dtypes.value_counts()
     dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
     }
 @st.cache_data
 def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate column cardinality analysis - cached"""
     cardinality_data = []
     for col in df.columns:
+        unique_count = df[col].nunique()
+        unique_ratio = unique_count / len(df)
+        # Determine column type based on cardinality
+        if unique_count == 1:
+            col_type = "Constant"
+        elif unique_count == len(df):
+            col_type = "Unique Identifier"
+        elif unique_ratio < 0.05:
+            col_type = "Low Cardinality"
+        elif unique_ratio < 0.5:
+            col_type = "Medium Cardinality"
+        else:
+            col_type = "High Cardinality"
+        cardinality_data.append({
+            'Column': col,
+            'Unique Count': unique_count,
+            'Unique Ratio': unique_ratio,
+            'Type': col_type,
+            'Data Type': str(df[col].dtype)
+        })
     return pd.DataFrame(cardinality_data)
 @st.cache_data
 def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate memory optimization suggestions - cached"""
     suggestions = []
     current_memory = df.memory_usage(deep=True).sum() / 1024**2
     potential_savings = 0
     for col in df.columns:
+        if df[col].dtype == 'object':
+            unique_ratio = df[col].nunique() / len(df)
+            if unique_ratio < 0.5:  # Less than 50% unique values
+                # Estimate category memory usage
+                category_memory = df[col].astype('category').memory_usage(deep=True)
+                object_memory = df[col].memory_usage(deep=True)
+                savings = (object_memory - category_memory) / 1024**2
+                if savings > 0.1:  # More than 0.1MB savings
                     suggestions.append({
+                        'column': col,
+                        'current_type': 'object',
+                        'suggested_type': 'category',
+                        'savings_mb': savings
                     })
                     potential_savings += savings
     return {
         'suggestions': suggestions,
+        'current_memory_mb': current_memory,
+        'potential_savings_mb': potential_savings,
+        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
     }
 @st.cache_data
 def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate missing data analysis - cached"""
     missing_data = df.isnull().sum()
     if missing_data.sum() > 0:
         missing_df = pd.DataFrame({
             'Column': missing_data.index,
             'Missing Count': missing_data.values,
+            'Missing %': (missing_data.values / len(df)) * 100
         })
+        return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
     return pd.DataFrame()
 @st.cache_data
 def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate correlation matrix - cached"""
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
 @st.cache_data
 def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
+    """Get column types - cached"""
+    return {
         'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
+        'categorical': df.select_dtypes(include=['object']).columns.tolist(),
+        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
     }
 @st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Compute descriptive statistics for one column, ignoring nulls.

    The returned dict holds ``mean``, ``median``, ``std``, ``skewness``,
    ``kurtosis``, ``min``, ``max`` and the 25th/75th percentiles
    (``q25``/``q75``), in that order.
    """
    series = df[column].dropna()

    summary = {}
    summary['mean'] = series.mean()
    summary['median'] = series.median()
    summary['std'] = series.std()
    summary['skewness'] = series.skew()
    summary['kurtosis'] = series.kurtosis()
    summary['min'] = series.min()
    summary['max'] = series.max()
    summary['q25'] = series.quantile(0.25)
    summary['q75'] = series.quantile(0.75)
    return summary
+@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return df[too_low | too_high]
 @st.cache_data
 def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
+    """Detect columns with mixed data types - cached"""
     mixed_type_issues = []
     for col in df.select_dtypes(include=['object']).columns:
+        # Try to convert to numeric
+        numeric_conversion = pd.to_numeric(df[col], errors='coerce')
+        new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
+        if new_nulls > 0:
+            mixed_type_issues.append({
+                'column': col,
+                'problematic_values': new_nulls,
+                'total_values': len(df[col]),
+                'percentage': (new_nulls / len(df[col])) * 100
+            })
     return mixed_type_issues
 @st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Return the ``top_n`` most frequent values of ``column`` with their counts."""
    frequencies = df[column].value_counts()
    return frequencies.head(top_n)
 @st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Cross-tabulate two columns: rows are ``col1`` values, columns are ``col2`` values."""
    row_values = df[col1]
    column_values = df[col2]
    return pd.crosstab(row_values, column_values)
+@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Aggregate ``metric_col`` per ``group_col`` value.

    The result is indexed by group and has the columns ``mean``, ``median``,
    ``std`` and ``count``.
    """
    aggregations = ['mean', 'median', 'std', 'count']
    per_group = df.groupby(group_col)[metric_col]
    return per_group.agg(aggregations)
 @st.cache_data
 def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate overall data quality score - cached"""
     score = 100
     issues = []
+    # Missing values penalty
+    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
+    if missing_pct > 0:
+        penalty = min(30, missing_pct * 2)  # Max 30 points penalty
+        score -= penalty
+        issues.append(f"Missing values: {missing_pct:.1f}%")
+    # Duplicates penalty
+    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
+    if duplicate_pct > 0:
+        penalty = min(20, duplicate_pct * 4)  # Max 20 points penalty
+        score -= penalty
+        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
+    # Constant columns penalty
+    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+    if constant_cols:
+        penalty = min(10, len(constant_cols) * 2)
+        score -= penalty
+        issues.append(f"Constant columns: {len(constant_cols)}")
+    # Mixed types penalty
+    mixed_types = detect_mixed_types(df)
+    if mixed_types:
+        penalty = min(10, len(mixed_types) * 3)
+        score -= penalty
+        issues.append(f"Mixed type columns: {len(mixed_types)}")
+    return {
+        'score': max(0, score),
+        'issues': issues,
+        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
+    }
def load_data(uploaded_file):
    """Load an uploaded file into a DataFrame, choosing the parser by extension.

    Args:
        uploaded_file: File-like object providing ``read()``, ``seek()`` and
            a ``name`` attribute (presumably a Streamlit upload -- confirm
            against the caller).

    Returns:
        The DataFrame produced by ``load_csv_with_encoding`` for names ending
        in ``.csv`` (any letter case), otherwise by ``load_excel_file``.
    """
    file_content = uploaded_file.read()
    # Rewind so the same upload object can be read again by other code.
    uploaded_file.seek(0)

    # Compare case-insensitively: "DATA.CSV" is still a CSV file and must not
    # be handed to the Excel parser.
    if uploaded_file.name.lower().endswith('.csv'):
        return load_csv_with_encoding(file_content, uploaded_file.name)
    return load_excel_file(file_content)
def _iqr_bounds(series: pd.Series) -> Tuple[Any, Any]:
    """Return the (lower, upper) fences Q1 - 1.5*IQR and Q3 + 1.5*IQR of ``series``."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations to a copy of ``df``.

    Each operation is a dict with a ``type`` key:

    * ``fill_missing`` -- needs ``column`` and ``method`` (``mean``,
      ``median``, ``mode`` or ``drop``); ``mode`` falls back to ``0`` when the
      column has no mode, ``drop`` removes the rows where the column is null.
    * ``remove_duplicates`` -- drops fully duplicated rows.
    * ``remove_outliers`` -- needs ``column``; drops rows outside the IQR
      fences. Rows whose value is missing are kept.
    * ``cap_outliers`` -- needs ``column``; clips values to the IQR fences.
    * ``convert_type`` -- needs ``column`` and ``target_type``; only
      ``category`` is handled.

    Unrecognised types or methods are ignored.

    Args:
        df: Source frame; it is not modified.
        operations: Operations to apply, in order.

    Returns:
        The cleaned copy.
    """
    cleaned_df = df.copy()

    for operation in operations:
        op_type = operation['type']
        column = operation.get('column')

        if op_type == 'fill_missing':
            method = operation['method']
            if method == 'mean':
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].mean())
            elif method == 'median':
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].median())
            elif method == 'mode':
                modes = cleaned_df[column].mode()
                fill_value = modes.iloc[0] if not modes.empty else 0
                cleaned_df[column] = cleaned_df[column].fillna(fill_value)
            elif method == 'drop':
                cleaned_df = cleaned_df.dropna(subset=[column])

        elif op_type == 'remove_duplicates':
            cleaned_df = cleaned_df.drop_duplicates()

        elif op_type == 'remove_outliers':
            lower_bound, upper_bound = _iqr_bounds(cleaned_df[column])
            values = cleaned_df[column]
            # Select the outliers and negate, rather than selecting the
            # in-range rows: NaN compares False both ways, so a positive
            # in-range test would also discard every row with a missing value.
            is_outlier = (values < lower_bound) | (values > upper_bound)
            cleaned_df = cleaned_df[~is_outlier]

        elif op_type == 'cap_outliers':
            lower_bound, upper_bound = _iqr_bounds(cleaned_df[column])
            cleaned_df[column] = cleaned_df[column].clip(lower_bound, upper_bound)

        elif op_type == 'convert_type':
            if operation['target_type'] == 'category':
                cleaned_df[column] = cleaned_df[column].astype('category')

    return cleaned_df