entropy25 committed on
Commit
1cc5290
·
verified ·
1 Parent(s): ee51cad

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +529 -196
data_handler.py CHANGED
@@ -2,40 +2,101 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
- from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
- # All cached data processing functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  @st.cache_data
11
- def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
12
- """Load CSV with automatic encoding detection - cached"""
13
- import chardet
14
-
15
- detected = chardet.detect(file_content)
16
- encoding = detected['encoding']
17
-
18
  try:
 
 
 
 
19
  from io import BytesIO
20
- return pd.read_csv(BytesIO(file_content), encoding=encoding)
21
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
23
  for enc in encodings:
24
  try:
25
- return pd.read_csv(BytesIO(file_content), encoding=enc)
 
 
 
26
  except:
27
  continue
28
- raise Exception("Cannot read file with any encoding")
 
29
 
30
  @st.cache_data
31
- def load_excel_file(file_content: bytes) -> pd.DataFrame:
32
- """Load Excel file - cached"""
 
33
  from io import BytesIO
34
- return pd.read_excel(BytesIO(file_content))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  @st.cache_data
 
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
- """Calculate basic statistics - cached"""
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
@@ -48,250 +109,522 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
48
  }
49
 
50
  @st.cache_data
 
51
  def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
52
- """Calculate column cardinality analysis - cached"""
 
 
 
53
  cardinality_data = []
54
 
55
  for col in df.columns:
56
- unique_count = df[col].nunique()
57
- unique_ratio = unique_count / len(df)
58
-
59
- # Determine column type based on cardinality
60
- if unique_count == 1:
61
- col_type = "Constant"
62
- elif unique_count == len(df):
63
- col_type = "Unique Identifier"
64
- elif unique_ratio < 0.05:
65
- col_type = "Low Cardinality"
66
- elif unique_ratio < 0.5:
67
- col_type = "Medium Cardinality"
68
- else:
69
- col_type = "High Cardinality"
70
-
71
- cardinality_data.append({
72
- 'Column': col,
73
- 'Unique Count': unique_count,
74
- 'Unique Ratio': unique_ratio,
75
- 'Type': col_type,
76
- 'Data Type': str(df[col].dtype)
77
- })
 
 
 
 
 
 
 
 
78
 
79
  return pd.DataFrame(cardinality_data)
80
 
81
  @st.cache_data
 
82
  def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
83
- """Calculate memory optimization suggestions - cached"""
 
 
 
84
  suggestions = []
85
  current_memory = df.memory_usage(deep=True).sum() / 1024**2
86
  potential_savings = 0
87
 
88
  for col in df.columns:
89
- if df[col].dtype == 'object':
90
- unique_ratio = df[col].nunique() / len(df)
91
- if unique_ratio < 0.5: # Less than 50% unique values
92
- # Estimate category memory usage
93
- category_memory = df[col].astype('category').memory_usage(deep=True)
94
- object_memory = df[col].memory_usage(deep=True)
95
- savings = (object_memory - category_memory) / 1024**2
96
 
97
- if savings > 0.1: # More than 0.1MB savings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  suggestions.append({
99
- 'column': col,
100
- 'current_type': 'object',
101
- 'suggested_type': 'category',
102
- 'savings_mb': savings
 
103
  })
104
  potential_savings += savings
 
 
 
 
105
 
106
  return {
107
  'suggestions': suggestions,
108
- 'current_memory_mb': current_memory,
109
- 'potential_savings_mb': potential_savings,
110
- 'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
111
  }
112
 
113
  @st.cache_data
 
114
  def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
115
- """Calculate missing data analysis - cached"""
 
 
 
116
  missing_data = df.isnull().sum()
 
117
  if missing_data.sum() > 0:
118
  missing_df = pd.DataFrame({
119
  'Column': missing_data.index,
120
  'Missing Count': missing_data.values,
121
- 'Missing %': (missing_data.values / len(df)) * 100
 
122
  })
123
- return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
 
 
 
124
  return pd.DataFrame()
125
 
126
  @st.cache_data
 
127
  def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
128
- """Calculate correlation matrix - cached"""
 
 
 
129
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
130
- return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  @st.cache_data
 
133
  def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
134
- """Get column types - cached"""
135
- return {
 
 
 
136
  'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
137
- 'categorical': df.select_dtypes(include=['object']).columns.tolist(),
138
- 'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
 
139
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  @st.cache_data
142
- def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
143
- """Calculate enhanced numeric statistics - cached"""
 
 
 
 
 
 
 
144
  series = df[column].dropna()
145
- return {
146
- 'mean': series.mean(),
147
- 'median': series.median(),
148
- 'std': series.std(),
149
- 'skewness': series.skew(),
150
- 'kurtosis': series.kurtosis(),
151
- 'min': series.min(),
152
- 'max': series.max(),
153
- 'q25': series.quantile(0.25),
154
- 'q75': series.quantile(0.75)
155
- }
156
-
157
- @st.cache_data
158
- def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
159
- """Calculate outliers using IQR method - cached"""
160
- Q1 = df[column].quantile(0.25)
161
- Q3 = df[column].quantile(0.75)
162
- IQR = Q3 - Q1
163
- lower_bound = Q1 - 1.5 * IQR
164
- upper_bound = Q3 + 1.5 * IQR
165
-
166
- return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
 
 
 
 
167
 
168
  @st.cache_data
 
169
  def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
170
- """Detect columns with mixed data types - cached"""
 
 
 
171
  mixed_type_issues = []
172
 
173
  for col in df.select_dtypes(include=['object']).columns:
174
- # Try to convert to numeric
175
- numeric_conversion = pd.to_numeric(df[col], errors='coerce')
176
- new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
177
-
178
- if new_nulls > 0:
179
- mixed_type_issues.append({
180
- 'column': col,
181
- 'problematic_values': new_nulls,
182
- 'total_values': len(df[col]),
183
- 'percentage': (new_nulls / len(df[col])) * 100
184
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  return mixed_type_issues
187
 
188
  @st.cache_data
189
- def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
190
- """Get value counts for categorical column - cached"""
 
 
 
 
191
  return df[column].value_counts().head(top_n)
192
 
193
  @st.cache_data
194
- def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
195
- """Calculate crosstab between two categorical columns - cached"""
196
- return pd.crosstab(df[col1], df[col2])
197
-
198
- @st.cache_data
199
- def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
200
- """Calculate group statistics - cached"""
201
- return df.groupby(group_col)[metric_col].agg(['mean', 'median', 'std', 'count'])
 
 
 
 
 
 
 
 
 
 
202
 
203
  @st.cache_data
 
204
  def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
205
- """Calculate overall data quality score - cached"""
 
 
 
206
  score = 100
207
  issues = []
 
208
 
209
- # Missing values penalty
210
- missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
211
- if missing_pct > 0:
212
- penalty = min(30, missing_pct * 2) # Max 30 points penalty
213
- score -= penalty
214
- issues.append(f"Missing values: {missing_pct:.1f}%")
215
-
216
- # Duplicates penalty
217
- duplicate_pct = (df.duplicated().sum() / len(df)) * 100
218
- if duplicate_pct > 0:
219
- penalty = min(20, duplicate_pct * 4) # Max 20 points penalty
220
- score -= penalty
221
- issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
222
-
223
- # Constant columns penalty
224
- constant_cols = [col for col in df.columns if df[col].nunique() == 1]
225
- if constant_cols:
226
- penalty = min(10, len(constant_cols) * 2)
227
- score -= penalty
228
- issues.append(f"Constant columns: {len(constant_cols)}")
229
-
230
- # Mixed types penalty
231
- mixed_types = detect_mixed_types(df)
232
- if mixed_types:
233
- penalty = min(10, len(mixed_types) * 3)
234
- score -= penalty
235
- issues.append(f"Mixed type columns: {len(mixed_types)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- return {
238
- 'score': max(0, score),
239
- 'issues': issues,
240
- 'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
241
- }
 
 
 
242
 
243
- def load_data(uploaded_file):
244
- """Unified data loading function"""
245
- file_content = uploaded_file.read()
246
- uploaded_file.seek(0)
247
-
248
- if uploaded_file.name.endswith('.csv'):
249
- return load_csv_with_encoding(file_content, uploaded_file.name)
250
- else:
251
- return load_excel_file(file_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
254
- """Apply data cleaning operations"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  cleaned_df = df.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- for operation in operations:
258
- if operation['type'] == 'fill_missing':
259
- if operation['method'] == 'mean':
260
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
261
- cleaned_df[operation['column']].mean())
262
- elif operation['method'] == 'median':
263
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
264
- cleaned_df[operation['column']].median())
265
- elif operation['method'] == 'mode':
266
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
267
- cleaned_df[operation['column']].mode().iloc[0] if not cleaned_df[operation['column']].mode().empty else 0)
268
- elif operation['method'] == 'drop':
269
- cleaned_df = cleaned_df.dropna(subset=[operation['column']])
270
-
271
- elif operation['type'] == 'remove_duplicates':
272
- cleaned_df = cleaned_df.drop_duplicates()
273
-
274
- elif operation['type'] == 'remove_outliers':
275
- Q1 = cleaned_df[operation['column']].quantile(0.25)
276
- Q3 = cleaned_df[operation['column']].quantile(0.75)
277
- IQR = Q3 - Q1
278
- lower_bound = Q1 - 1.5 * IQR
279
- upper_bound = Q3 + 1.5 * IQR
280
- cleaned_df = cleaned_df[
281
- (cleaned_df[operation['column']] >= lower_bound) &
282
- (cleaned_df[operation['column']] <= upper_bound)
283
- ]
284
-
285
- elif operation['type'] == 'cap_outliers':
286
- Q1 = cleaned_df[operation['column']].quantile(0.25)
287
- Q3 = cleaned_df[operation['column']].quantile(0.75)
288
- IQR = Q3 - Q1
289
- lower_bound = Q1 - 1.5 * IQR
290
- upper_bound = Q3 + 1.5 * IQR
291
- cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower_bound, upper_bound)
292
-
293
- elif operation['type'] == 'convert_type':
294
- if operation['target_type'] == 'category':
295
- cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')
296
-
297
- return cleaned_df
 
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
+ from typing import Dict, List, Any, Tuple, Optional
6
  from scipy import stats
7
+ import logging
8
+
9
  warnings.filterwarnings('ignore')
10
 
11
+ # Configure logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
# Enhanced error handling decorator
def handle_errors(func):
    """Decorator that logs and surfaces exceptions instead of propagating them.

    A wrapped callable returns None when an exception escapes, so callers
    must tolerate a None result. functools.wraps preserves the wrapped
    function's __name__/__doc__ — without it every cached function reported
    errors as "wrapper", and st.cache_data keying is less reliable.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Log for operators and surface in the Streamlit UI for users.
            logger.error(f"Error in {func.__name__}: {str(e)}")
            st.error(f"Error in {func.__name__}: {str(e)}")
            return None
    return wrapper
26
+
27
  @st.cache_data
28
+ @handle_errors
29
def load_csv_with_encoding(file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
    """Load CSV with automatic encoding detection and enhanced error handling.

    Tries chardet-based detection first, then falls back to a fixed list of
    common encodings. Raises when no strategy succeeds (the handle_errors
    decorator converts that into a None return for callers).
    """
    # Hoisted: the except-branch below also needs BytesIO. Previously the
    # import sat inside the try, so a failure before it (e.g. chardet not
    # installed) made the fallback loop die on a NameError instead.
    from io import BytesIO

    try:
        import chardet
        detected = chardet.detect(file_content)
        # chardet can report {'encoding': None}; dict.get's default does not
        # cover an explicit None value, so coalesce it here.
        encoding = detected.get('encoding') or 'utf-8'

        df = pd.read_csv(BytesIO(file_content), encoding=encoding)

        # Validate loaded data
        if df.empty:
            raise ValueError("The uploaded file is empty")

        # A single parsed column whose header still contains ';' strongly
        # suggests a semicolon-separated file (a ',' could not survive the
        # comma parse, so the previous check on ',' never fired).
        if df.shape[1] == 1 and df.columns[0].count(';') > 0:
            # Might be semicolon separated
            df = pd.read_csv(BytesIO(file_content), encoding=encoding, sep=';')

        logger.info(f"Successfully loaded CSV: {df.shape}")
        return df

    except Exception as e:
        # Try alternative encodings before giving up entirely.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for enc in encodings:
            try:
                df = pd.read_csv(BytesIO(file_content), encoding=enc)
                if not df.empty:
                    logger.info(f"Loaded CSV with encoding {enc}: {df.shape}")
                    return df
            except Exception:
                continue

        raise Exception(f"Cannot read file with any standard encoding. Original error: {str(e)}")
63
 
64
  @st.cache_data
65
+ @handle_errors
66
def load_excel_file(file_content: bytes) -> Optional[pd.DataFrame]:
    """Load Excel file with enhanced error handling"""
    from io import BytesIO

    try:
        # First attempt: pandas' default engine, first sheet only.
        frame = pd.read_excel(BytesIO(file_content))

        if frame.empty:
            raise ValueError("The Excel file is empty")

        logger.info(f"Successfully loaded Excel: {frame.shape}")
        return frame

    except Exception as e:
        # Second attempt: walk through explicit engines before giving up.
        for engine in ('openpyxl', 'xlrd'):
            try:
                frame = pd.read_excel(BytesIO(file_content), engine=engine)
                if not frame.empty:
                    logger.info(f"Loaded Excel with engine {engine}: {frame.shape}")
                    return frame
            except:
                continue

        raise Exception(f"Cannot read Excel file. Error: {str(e)}")
92
 
93
  @st.cache_data
94
+ @handle_errors
95
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
96
+ """Calculate basic statistics with error handling"""
97
+ if df is None or df.empty:
98
+ return {}
99
+
100
  dtype_counts = df.dtypes.value_counts()
101
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
102
 
 
109
  }
110
 
111
  @st.cache_data
112
+ @handle_errors
113
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate column cardinality analysis with improved categorization"""
    if df is None or df.empty:
        return pd.DataFrame()

    records = []

    for name in df.columns:
        try:
            n_unique = df[name].nunique()
            n_total = len(df)
            ratio = n_unique / n_total if n_total > 0 else 0

            # Classify by how distinct the column's values are.
            if n_unique == 1:
                label = "Constant"
            elif n_unique == n_total:
                label = "Unique Identifier"
            elif ratio < 0.01:
                label = "Very Low Cardinality"
            elif ratio < 0.05:
                label = "Low Cardinality"
            elif ratio < 0.5:
                label = "Medium Cardinality"
            else:
                label = "High Cardinality"

            records.append({
                'Column': name,
                'Unique Count': n_unique,
                'Total Count': n_total,
                'Unique Ratio': round(ratio, 4),
                'Type': label,
                'Data Type': str(df[name].dtype)
            })
        except Exception as e:
            # A single bad column should not sink the whole report.
            logger.warning(f"Error processing column {name}: {str(e)}")
            continue

    return pd.DataFrame(records)
153
 
154
  @st.cache_data
155
+ @handle_errors
156
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate memory optimization suggestions with validation.

    Scans each column and suggests cheaper dtypes: object -> category for
    repetitive text, int64 -> int32 when the value range fits. Returns a dict
    with 'suggestions' (list of per-column dicts), 'current_memory_mb',
    'potential_savings_mb' and 'potential_savings_pct'.
    """
    if df is None or df.empty:
        return {'suggestions': [], 'current_memory_mb': 0, 'potential_savings_mb': 0, 'potential_savings_pct': 0}

    suggestions = []
    # deep=True accounts for the actual Python-object payload of object columns.
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0

    for col in df.columns:
        try:
            if df[col].dtype == 'object' and not df[col].isnull().all():
                unique_ratio = df[col].nunique() / len(df)

                if unique_ratio < 0.5:  # Less than 50% unique values
                    # Calculate potential savings
                    # Estimate from a sample rather than converting the full
                    # column, then scale the ratio up to the whole column.
                    test_series = df[col].dropna().head(1000)  # Sample for estimation
                    if len(test_series) > 0:
                        category_memory = test_series.astype('category').memory_usage(deep=True)
                        object_memory = test_series.memory_usage(deep=True)
                        savings_ratio = (object_memory - category_memory) / object_memory

                        if savings_ratio > 0.1:  # More than 10% savings
                            estimated_savings = (df[col].memory_usage(deep=True) * savings_ratio) / 1024**2
                            suggestions.append({
                                'Column': col,
                                'Current Type': 'object',
                                'Suggested Type': 'category',
                                'Estimated Savings (MB)': round(estimated_savings, 2),
                                'Unique Ratio': round(unique_ratio, 3)
                            })
                            potential_savings += estimated_savings

            # Check for int64 that could be int32
            elif df[col].dtype == 'int64':
                # int32 range check; halving the footprint is exact for the
                # data buffer (8 -> 4 bytes per value).
                if df[col].min() >= -2147483648 and df[col].max() <= 2147483647:
                    savings = df[col].memory_usage(deep=True) * 0.5 / 1024**2
                    suggestions.append({
                        'Column': col,
                        'Current Type': 'int64',
                        'Suggested Type': 'int32',
                        'Estimated Savings (MB)': round(savings, 2),
                        'Unique Ratio': 'N/A'
                    })
                    potential_savings += savings

        except Exception as e:
            # Skip columns that fail analysis instead of aborting the report.
            logger.warning(f"Error analyzing memory for column {col}: {str(e)}")
            continue

    return {
        'suggestions': suggestions,
        'current_memory_mb': round(current_memory, 2),
        'potential_savings_mb': round(potential_savings, 2),
        'potential_savings_pct': round((potential_savings / current_memory) * 100, 1) if current_memory > 0 else 0
    }
212
 
213
  @st.cache_data
214
+ @handle_errors
215
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate missing data analysis with enhanced insights.

    Returns a DataFrame (Column, Missing Count, Missing %, Data Type) for the
    columns that have at least one null, sorted by Missing % descending, or an
    empty DataFrame when nothing is missing / input is unusable.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    missing_data = df.isnull().sum()

    if missing_data.sum() > 0:
        missing_df = pd.DataFrame({
            'Column': missing_data.index,
            'Missing Count': missing_data.values,
            # ndarray.round(2): the builtin round() does not accept an
            # ndarray (no __round__) and raised a TypeError here.
            'Missing %': ((missing_data.values / len(df)) * 100).round(2),
            'Data Type': [str(df[col].dtype) for col in missing_data.index]
        })

        result = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
        return result.reset_index(drop=True)

    return pd.DataFrame()
234
 
235
  @st.cache_data
236
+ @handle_errors
237
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate correlation matrix with validation"""
    if df is None or df.empty:
        return pd.DataFrame()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) < 2:
        return pd.DataFrame()

    # Drop columns that cannot correlate: all-NaN or constant values.
    usable = [
        c for c in numeric_cols
        if not df[c].isnull().all() and df[c].nunique() > 1
    ]

    if len(usable) < 2:
        return pd.DataFrame()

    return df[usable].corr()
257
 
258
  @st.cache_data
259
+ @handle_errors
260
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Enhanced column type detection"""
    if df is None or df.empty:
        return {'numeric': [], 'categorical': [], 'datetime': [], 'boolean': []}

    buckets = {
        'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
        'categorical': df.select_dtypes(include=['object', 'category']).columns.tolist(),
        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist(),
        'boolean': df.select_dtypes(include=['bool']).columns.tolist()
    }

    # Heuristic: an object column whose first non-null value parses as a
    # date is flagged as a candidate for datetime conversion.
    date_candidates = []
    for name in buckets['categorical']:
        if df[name].dtype != 'object':
            continue
        sample = df[name].dropna().head(100)
        if len(sample) == 0:
            continue
        try:
            pd.to_datetime(sample.iloc[0])
            date_candidates.append(name)
        except:
            pass

    if date_candidates:
        buckets['potential_datetime'] = date_candidates

    return buckets
288
 
289
  @st.cache_data
290
+ @handle_errors
291
def calculate_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.DataFrame:
    """Enhanced outlier detection with multiple methods"""
    # Guard clauses: unusable frame, unknown column, or non-numeric data.
    if df is None or df.empty or column not in df.columns:
        return pd.DataFrame()
    if not pd.api.types.is_numeric_dtype(df[column]):
        return pd.DataFrame()

    values = df[column].dropna()
    if len(values) == 0:
        return pd.DataFrame()

    if method == 'zscore':
        # |z| > 3 marks an outlier.
        flagged = values.index[np.abs(stats.zscore(values)) > 3]
        mask = df.index.isin(flagged)
    elif method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        if spread == 0:  # degenerate distribution: nothing can be an outlier
            return pd.DataFrame()
        low = q1 - 1.5 * spread
        high = q3 + 1.5 * spread
        mask = (df[column] < low) | (df[column] > high)
    else:  # percentile: outside the 1st-99th percentile band
        low = values.quantile(0.01)
        high = values.quantile(0.99)
        mask = (df[column] < low) | (df[column] > high)

    return df[mask]
326
 
327
  @st.cache_data
328
+ @handle_errors
329
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Detect columns with mixed data types and provide detailed analysis.

    An object column is "mixed" when some (but not nearly all) of its values
    fail numeric conversion — i.e. it looks like a numeric column polluted by
    stray text. Returns one dict per offending column with counts, percentage
    and up to five sample offending values.
    """
    if df is None or df.empty:
        return []

    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        try:
            # Skip if all values are null
            if df[col].isnull().all():
                continue

            # Try numeric conversion
            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
            # New nulls = values that existed but did not convert.
            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()

            # The 90% ceiling excludes genuinely textual columns where almost
            # everything fails conversion — those are not "mixed".
            if new_nulls > 0 and new_nulls < len(df[col]) * 0.9:  # Not too many conversion failures
                # Find problematic values
                original_not_null = df[col].notnull()
                converted_null = numeric_conversion.isnull()
                problematic_mask = original_not_null & converted_null

                if problematic_mask.sum() > 0:
                    # Keep only the five most frequent offenders as examples.
                    sample_problems = df[col][problematic_mask].value_counts().head(5)

                    mixed_type_issues.append({
                        'column': col,
                        'problematic_values': int(new_nulls),
                        'total_values': int(len(df[col])),
                        'percentage': round((new_nulls / len(df[col])) * 100, 2),
                        'sample_issues': sample_problems.to_dict()
                    })

        except Exception as e:
            # One broken column should not abort the scan.
            logger.warning(f"Error analyzing mixed types for column {col}: {str(e)}")
            continue

    return mixed_type_issues
368
 
369
  @st.cache_data
370
+ @handle_errors
371
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> Optional[pd.Series]:
    """Return the top_n most frequent values of a column.

    Parameters: df (may be None/empty), column (unknown names yield an empty
    Series), top_n (number of most-common values kept).

    The empty-return dtype is pinned explicitly: a bare pd.Series() is
    deprecated and its default dtype changed across pandas versions.
    """
    if df is None or df.empty or column not in df.columns:
        return pd.Series(dtype=object)

    return df[column].value_counts().head(top_n)
377
 
378
  @st.cache_data
379
+ @handle_errors
380
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> Optional[pd.DataFrame]:
    """Calculate group statistics with validation"""
    # Both columns must exist and the metric must be numeric.
    if df is None or df.empty or group_col not in df.columns or metric_col not in df.columns:
        return pd.DataFrame()
    if not pd.api.types.is_numeric_dtype(df[metric_col]):
        return pd.DataFrame()

    # Restrict to the 20 most frequent groups to keep the output manageable.
    keep = df[group_col].value_counts().head(20).index
    subset = df[df[group_col].isin(keep)]

    summary = (
        subset.groupby(group_col)[metric_col]
        .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
        .round(3)
    )

    return summary.reset_index()
397
 
398
  @st.cache_data
399
+ @handle_errors
400
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Enhanced data quality scoring with detailed feedback.

    Starts from 100 and subtracts capped penalties for missing values (max
    25), duplicate rows (max 20), constant columns (max 15), mixed-type
    columns (max 15) and tiny datasets (10). Returns score, issue strings,
    recommendation strings and a letter grade.
    """
    if df is None or df.empty:
        return {'score': 0, 'issues': ['Dataset is empty'], 'grade': 'F'}

    score = 100
    issues = []
    recommendations = []

    try:
        # Missing values assessment
        total_cells = len(df) * len(df.columns)
        missing_count = df.isnull().sum().sum()
        missing_pct = (missing_count / total_cells) * 100 if total_cells > 0 else 0

        if missing_pct > 0:
            # 2 points per percent missing, capped at 25.
            penalty = min(25, missing_pct * 2)
            score -= penalty
            issues.append(f"Missing values: {missing_pct:.1f}% of total data")

            if missing_pct < 5:
                recommendations.append("Low missing data - consider simple imputation")
            elif missing_pct < 20:
                recommendations.append("Moderate missing data - analyze patterns before imputation")
            else:
                recommendations.append("High missing data - investigate data collection process")

        # Duplicates assessment
        duplicate_count = df.duplicated().sum()
        duplicate_pct = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0

        if duplicate_pct > 0:
            penalty = min(20, duplicate_pct * 3)
            score -= penalty
            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
            recommendations.append("Remove or investigate duplicate records")

        # Constant columns (nunique <= 1 also catches all-NaN columns)
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            penalty = min(15, len(constant_cols) * 2)
            score -= penalty
            issues.append(f"Constant columns: {len(constant_cols)} columns have no variation")
            recommendations.append("Consider removing constant columns")

        # Mixed types (delegates to the sibling detect_mixed_types helper)
        mixed_types = detect_mixed_types(df)
        if mixed_types:
            penalty = min(15, len(mixed_types) * 3)
            score -= penalty
            issues.append(f"Mixed data types: {len(mixed_types)} columns need type conversion")
            recommendations.append("Fix data type inconsistencies")

        # Data size assessment
        if len(df) < 10:
            score -= 10
            issues.append("Very small dataset - statistical power may be limited")

        # Grade assignment (simple banded mapping from the final score)
        if score >= 95:
            grade = 'A+'
        elif score >= 90:
            grade = 'A'
        elif score >= 85:
            grade = 'B+'
        elif score >= 80:
            grade = 'B'
        elif score >= 75:
            grade = 'C+'
        elif score >= 70:
            grade = 'C'
        elif score >= 60:
            grade = 'D'
        else:
            grade = 'F'

        return {
            'score': max(0, round(score, 1)),
            'issues': issues,
            'recommendations': recommendations,
            'grade': grade
        }

    except Exception as e:
        # Degrade gracefully: report the failure as the quality result.
        logger.error(f"Error calculating data quality: {str(e)}")
        return {
            'score': 0,
            'issues': [f"Error calculating quality score: {str(e)}"],
            'recommendations': ['Please check your data format'],
            'grade': 'Error'
        }
491
 
492
def load_data(uploaded_file) -> Optional[pd.DataFrame]:
    """Unified data loading with comprehensive error handling.

    Accepts a Streamlit uploaded-file object (needs .read(), .seek(), .name),
    dispatches to the CSV/Excel loaders by extension, then validates and
    lightly cleans the result. Returns None (after showing an error in the
    UI) on any failure.
    """
    if uploaded_file is None:
        return None

    try:
        file_content = uploaded_file.read()
        # Rewind so other consumers of the upload can read it again.
        uploaded_file.seek(0)

        file_size_mb = len(file_content) / 1024**2

        # File size warning
        if file_size_mb > 100:
            st.warning(f"⚠️ Large file detected ({file_size_mb:.1f} MB). Processing may take longer.")

        # Load based on file extension
        if uploaded_file.name.lower().endswith('.csv'):
            df = load_csv_with_encoding(file_content, uploaded_file.name)
        elif uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
            df = load_excel_file(file_content)
        else:
            raise ValueError(f"Unsupported file format: {uploaded_file.name}")

        # Loaders are wrapped in handle_errors and return None on failure.
        if df is None:
            raise ValueError("Failed to load data from file")

        # Additional validations
        if df.empty:
            raise ValueError("The uploaded file contains no data")

        if len(df.columns) == 0:
            raise ValueError("No columns found in the dataset")

        # Clean column names (force string labels, strip stray whitespace)
        df.columns = df.columns.astype(str).str.strip()

        # Remove completely empty rows/columns
        df = df.dropna(how='all').dropna(axis=1, how='all')

        if df.empty:
            raise ValueError("No valid data remaining after cleaning empty rows/columns")

        logger.info(f"Successfully loaded and validated data: {df.shape}")
        return df

    except Exception as e:
        error_msg = f"Failed to load data: {str(e)}"
        logger.error(error_msg)
        st.error(error_msg)
        st.info("💡 **Tips for successful upload:**\n"
                "- Ensure file is not corrupted\n"
                "- Check file encoding (UTF-8 recommended)\n"
                "- Verify file has proper headers\n"
                "- File size should be under 200MB for optimal performance")
        return None
547
 
548
+ @handle_errors
549
def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, List[str]]:
    """Validate dataframe for analysis readiness"""
    if df is None:
        return False, ["No dataframe provided"]

    problems: List[str] = []

    if df.empty:
        problems.append("Dataset is empty")
    if len(df.columns) == 0:
        problems.append("No columns found")
    if len(df) < 2:
        problems.append("Insufficient data for analysis (minimum 2 rows required)")

    # Column labels must be non-blank strings.
    bad_names = [c for c in df.columns if not isinstance(c, str) or c.strip() == '']
    if bad_names:
        problems.append(f"Problematic column names detected: {len(bad_names)} columns")

    return len(problems) == 0, problems
571
+
572
+ @handle_errors
573
def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Optional[pd.DataFrame]:
    """Apply data cleaning operations with validation and rollback capability.

    operations is a list of dicts, each with a 'type' key:
      - 'fill_missing': needs 'column' and 'method' (mean/median/mode/drop,
        anything else uses the literal 'value' entry, default 0)
      - 'remove_duplicates': drops exact duplicate rows
      - 'remove_outliers': needs 'column'; delegates to calculate_outliers
    Operations apply sequentially on a copy; on any error the ORIGINAL df is
    returned unchanged (the rollback).
    """
    if df is None or df.empty:
        return df

    # Work on a copy so failure can roll back to the caller's frame.
    cleaned_df = df.copy()
    applied_operations = []

    try:
        for operation in operations:
            operation_type = operation.get('type')
            column = operation.get('column')

            # Validate operation
            if operation_type == 'fill_missing' and column in cleaned_df.columns:
                method = operation.get('method', 'mean')

                # mean/median silently fall through to the literal-value
                # branch when the column is not numeric.
                if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
                    fill_value = cleaned_df[column].mean()
                elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
                    fill_value = cleaned_df[column].median()
                elif method == 'mode':
                    mode_values = cleaned_df[column].mode()
                    fill_value = mode_values.iloc[0] if not mode_values.empty else 'Unknown'
                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[column])
                    applied_operations.append(f"Dropped {original_len - len(cleaned_df)} rows with missing {column}")
                    continue
                else:
                    fill_value = operation.get('value', 0)

                original_missing = cleaned_df[column].isnull().sum()
                cleaned_df[column] = cleaned_df[column].fillna(fill_value)
                applied_operations.append(f"Filled {original_missing} missing values in {column} using {method}")

            elif operation_type == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                applied_operations.append(f"Removed {removed} duplicate rows")

            elif operation_type == 'remove_outliers' and column in cleaned_df.columns:
                original_len = len(cleaned_df)
                # Default IQR method of the sibling helper.
                outliers = calculate_outliers(cleaned_df, column)
                if outliers is not None and not outliers.empty:
                    # Drop by index membership so prior row removals are safe.
                    cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
                    removed = original_len - len(cleaned_df)
                    applied_operations.append(f"Removed {removed} outliers from {column}")

        logger.info(f"Applied {len(applied_operations)} cleaning operations")
        return cleaned_df

    except Exception as e:
        error_msg = f"Error during data cleaning: {str(e)}"
        logger.error(error_msg)
        st.error(error_msg)
        return df  # Return original data if cleaning fails