entropy25 committed on
Commit
ee51cad
·
verified ·
1 Parent(s): 78b8458

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +202 -673
data_handler.py CHANGED
@@ -4,53 +4,38 @@ import numpy as np
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
7
- import chardet
8
- from io import BytesIO
9
  warnings.filterwarnings('ignore')
10
 
11
- # HuggingFace optimized data processing functions with enhanced caching
12
-
13
@st.cache_data(show_spinner=False)
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load a CSV from raw bytes, auto-detecting the text encoding.

    Tries chardet's guess first (sampling the first 10KB for speed), then a
    short list of common fallbacks.

    Args:
        file_content: Raw bytes of the uploaded CSV file.
        filename: Original file name, used only in the error message.

    Returns:
        Parsed DataFrame.

    Raises:
        Exception: If no candidate encoding can parse the content.
    """
    # Detect encoding from a 10KB sample; fall back to utf-8 on low
    # confidence, a None guess, or any chardet failure.
    try:
        detected = chardet.detect(file_content[:10000])
        # BUG FIX: chardet can return encoding=None; the old code would then
        # pass None to read_csv. Treat that like a low-confidence result.
        if detected['encoding'] and detected['confidence'] > 0.7:
            encoding = detected['encoding']
        else:
            encoding = 'utf-8'
    except Exception:
        encoding = 'utf-8'

    # Detected encoding first, then common fallbacks — deduplicated while
    # preserving order so utf-8 is not attempted twice.
    encodings_to_try = list(dict.fromkeys(
        [encoding, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    ))

    for enc in encodings_to_try:
        try:
            return pd.read_csv(BytesIO(file_content), encoding=enc)
        except Exception:
            continue

    # BUG FIX: report the actual file name instead of the literal '(unknown)'
    # (the filename parameter was previously accepted but never used).
    raise Exception(f"Cannot read CSV file '{filename}' with any supported encoding")
34
 
35
@st.cache_data(show_spinner=False)
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an uploaded Excel workbook from its raw bytes.

    Wraps any parser failure in a plain Exception with a readable message.
    """
    try:
        buffer = BytesIO(file_content)
        return pd.read_excel(buffer)
    except Exception as exc:
        raise Exception(f"Cannot read Excel file: {str(exc)}")
42
 
43
- @st.cache_data(show_spinner=False)
44
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
45
- """Calculate basic statistics with performance optimization"""
46
-
47
- # Optimize for large datasets
48
- if len(df) > 100000:
49
- sample_df = df.sample(n=50000, random_state=42)
50
- st.info("📊 Using statistical sample for large dataset analysis")
51
- else:
52
- sample_df = df
53
-
54
  dtype_counts = df.dtypes.value_counts()
55
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
56
 
@@ -59,710 +44,254 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
59
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
60
  'missing_values': int(df.isnull().sum().sum()),
61
  'dtypes': dtype_dict,
62
- 'duplicates': int(df.duplicated().sum()),
63
- 'sample_used': len(sample_df) != len(df)
64
- }
65
-
66
@st.cache_data(show_spinner=False)
def calculate_enhanced_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score data quality (0-100) with a letter grade, issues and advice.

    Penalties: missing values (max -30), duplicates (max -25), IQR outliers
    (max -20), mixed-type columns (max -15), constant columns (max -10).

    Returns:
        Dict with 'score', 'grade', 'color', 'issues', 'recommendations',
        'critical_issues', plus the underlying percentages and column lists.
    """
    score = 100
    issues = []
    recommendations = []
    critical_issues = []

    n_rows = len(df)
    total_cells = n_rows * len(df.columns)

    # Missing values analysis (max -30 points)
    # BUG FIX: guard the division so a zero-row/zero-column frame cannot raise.
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
    if missing_pct > 0:
        score -= min(30, missing_pct * 1.5)
        issues.append(f"Missing values: {missing_pct:.1f}%")

        if missing_pct > 20:
            critical_issues.append("High missing value rate")
            recommendations.append("🚨 Critical: Review data collection processes")
        elif missing_pct > 5:
            recommendations.append("🔧 Apply intelligent filling strategies")
        else:
            recommendations.append("✅ Missing values within acceptable limits")

    # Duplicates analysis (max -25 points)
    # BUG FIX: same zero-row guard.
    duplicate_pct = (df.duplicated().sum() / n_rows) * 100 if n_rows else 0.0
    if duplicate_pct > 0:
        score -= min(25, duplicate_pct * 3)
        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

        if duplicate_pct > 5:
            critical_issues.append("High duplication rate")
            recommendations.append("🚨 Investigate data collection pipeline")
        else:
            recommendations.append("🗑️ Remove duplicates before analysis")

    # Outliers analysis (max -20 points), 1.5*IQR fences per numeric column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    total_outliers = 0
    problematic_cols = []

    for col in numeric_cols:
        try:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1

            if IQR > 0:  # Avoid degenerate (constant) columns
                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
                outlier_pct = (len(outliers) / n_rows) * 100
                total_outliers += len(outliers)

                if outlier_pct > 5:
                    problematic_cols.append(col)
        except Exception:  # narrowed from a bare except
            continue

    if total_outliers > 0:
        outlier_overall_pct = (total_outliers / n_rows) * 100
        score -= min(20, outlier_overall_pct * 2)
        issues.append(f"Statistical outliers: {outlier_overall_pct:.1f}%")

        if problematic_cols:
            recommendations.append(f"📊 Investigate outliers in: {', '.join(problematic_cols[:3])}")

    # Type consistency analysis (max -15 points)
    mixed_type_issues = detect_mixed_types(df)
    if mixed_type_issues:
        score -= min(15, len(mixed_type_issues) * 5)
        issues.append(f"Type inconsistencies: {len(mixed_type_issues)} columns")
        recommendations.append("🔧 Standardize data types")

    # Constant columns analysis (max -10 points)
    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 3)
        issues.append(f"Constant columns: {len(constant_cols)}")
        recommendations.append("🗑️ Remove uninformative columns")

    # Grade assignment (color is the UI hex for the grade badge)
    if score >= 90:
        grade, color = "A", "#22c55e"
    elif score >= 80:
        grade, color = "B", "#3b82f6"
    elif score >= 70:
        grade, color = "C", "#f59e0b"
    elif score >= 60:
        grade, color = "D", "#f97316"
    else:
        grade, color = "F", "#ef4444"

    return {
        'score': max(0, score),
        'grade': grade,
        'color': color,
        'issues': issues,
        'recommendations': recommendations,
        'critical_issues': critical_issues,
        'missing_pct': missing_pct,
        'duplicate_pct': duplicate_pct,
        'outlier_pct': (total_outliers / n_rows) * 100 if n_rows > 0 else 0,
        'constant_cols': constant_cols,
        'mixed_type_cols': len(mixed_type_issues)
    }
174
 
175
@st.cache_data(show_spinner=False)
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column cardinality profile with business-value hints.

    For each column: unique count/ratio, missing %, a cardinality class,
    a business-value note, and a memory note for object columns that
    would shrink meaningfully as 'category'.
    """
    n_rows = len(df)
    records = []

    for name in df.columns:
        n_unique = df[name].nunique()
        ratio = n_unique / n_rows if n_rows > 0 else 0
        n_missing = df[name].isnull().sum()
        pct_missing = (n_missing / n_rows) * 100 if n_rows > 0 else 0

        # Classify by uniqueness; order matters (most specific first).
        if n_unique == 1:
            kind, value = "Constant", "None - Consider removal"
        elif n_unique == n_rows - n_missing:
            kind, value = "Unique Identifier", "High - Key for joins"
        elif ratio < 0.01:
            kind, value = "Very Low Cardinality", "Medium - Good for flags"
        elif ratio < 0.05:
            kind, value = "Low Cardinality", "High - Perfect for grouping"
        elif ratio < 0.5:
            kind, value = "Medium Cardinality", "Medium - Use for segmentation"
        else:
            kind, value = "High Cardinality", "Low - Avoid in group analysis"

        # Estimate savings from an object -> category conversion.
        note = "Optimized"
        if df[name].dtype == 'object' and ratio < 0.5:
            as_category = df[name].astype('category').memory_usage(deep=True)
            as_object = df[name].memory_usage(deep=True)
            saved_mb = (as_object - as_category) / 1024**2
            if saved_mb > 0.1:
                note = f"Save {saved_mb:.1f}MB with category type"

        records.append({
            'Column': name,
            'Unique Count': n_unique,
            'Unique Ratio': ratio,
            'Missing %': pct_missing,
            'Type': kind,
            'Business Value': value,
            'Data Type': str(df[name].dtype),
            'Memory Note': note
        })

    return pd.DataFrame(records)
228
 
229
@st.cache_data(show_spinner=False)
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing values per column with severity and fill advice.

    Returns an empty DataFrame when nothing is missing; otherwise one row
    per affected column, sorted by 'Missing %' descending.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        return pd.DataFrame()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100,
        'Data Type': [str(df[c].dtype) for c in null_counts.index]
    })

    def _severity(pct):
        # Severity buckets mirror the quality-score thresholds.
        if pct > 50:
            return "🚨 Critical"
        if pct > 20:
            return "⚠️ High"
        if pct > 5:
            return "🔸 Medium"
        return "🔹 Low"

    report['Severity'] = report['Missing %'].apply(_severity)

    def _suggestion(row):
        # Simple heuristic: drop very sparse columns; otherwise fill
        # numerics with median, text with mode.
        if row['Missing %'] > 50:
            return "Drop column - too many missing values"
        if 'int' in row['Data Type'] or 'float' in row['Data Type']:
            return "Fill with median (robust to outliers)"
        if 'object' in row['Data Type']:
            return "Fill with mode (most frequent value)"
        return "Manual review recommended"

    report['AI Suggestion'] = report.apply(_suggestion, axis=1)

    return report[report['Missing Count'] > 0].sort_values('Missing %', ascending=False)
275
 
276
@st.cache_data(show_spinner=False)
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlation matrix over numeric columns.

    Samples 25k rows (seeded) when the frame exceeds 50k rows to keep the
    computation fast; returns an empty frame with fewer than two numeric
    columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) <= 1:
        return pd.DataFrame()

    subset = df[numeric_cols]
    if len(df) > 50000:
        subset = subset.sample(n=25000, random_state=42)
    return subset.corr()
292
 
293
@st.cache_data(show_spinner=False)
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Bucket column names by broad dtype family.

    Returns keys 'numeric', 'categorical', 'datetime' and 'boolean', each
    mapping to a list of matching column names (possibly empty).
    """
    buckets = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
        'boolean': ['bool'],
    }
    return {
        label: df.select_dtypes(include=kinds).columns.tolist()
        for label, kinds in buckets.items()
    }
303
 
304
@st.cache_data(show_spinner=False)
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return the rows of *df* falling outside the 1.5*IQR fences of *column*.

    The returned copy gains two annotation columns: 'outlier_type'
    ('extreme_high'/'extreme_low') and 'severity' (distance from the median
    in standard deviations). Returns an empty frame for constant columns
    or on failure (after a Streamlit warning).
    """
    try:
        q_low = df[column].quantile(0.25)
        q_high = df[column].quantile(0.75)
        spread = q_high - q_low

        if spread == 0:  # constant column: no meaningful fences
            return pd.DataFrame()

        fence_low = q_low - 1.5 * spread
        fence_high = q_high + 1.5 * spread

        mask = (df[column] < fence_low) | (df[column] > fence_high)
        flagged = df[mask]

        if flagged.empty:
            return flagged

        flagged = flagged.copy()
        flagged['outlier_type'] = flagged[column].apply(
            lambda v: 'extreme_high' if v > fence_high else 'extreme_low'
        )
        center = df[column].median()
        scale = df[column].std()
        flagged['severity'] = flagged[column].apply(
            lambda v: abs(v - center) / scale if scale > 0 else 0
        )
        return flagged

    except Exception as e:
        st.warning(f"Could not calculate outliers for '{column}': {str(e)}")
        return pd.DataFrame()
336
 
337
@st.cache_data(show_spinner=False)
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns that are numeric except for a few stray values.

    A column is flagged when coercing it with pd.to_numeric introduces new
    nulls (i.e. some non-null values are not parseable as numbers).

    Returns:
        List of dicts with 'column', 'problematic_values' (count),
        'total_values', 'percentage', 'examples' (up to 5 offending values)
        and a conversion 'suggestion'.
    """
    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        try:
            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()

            if new_nulls > 0:
                # PERF FIX: reuse the conversion computed above instead of
                # calling pd.to_numeric a second time for the mask.
                problematic_mask = numeric_conversion.isnull() & df[col].notnull()
                problematic_values = df.loc[problematic_mask, col].unique()[:5]

                mixed_type_issues.append({
                    'column': col,
                    'problematic_values': new_nulls,
                    'total_values': len(df[col]),
                    'percentage': (new_nulls / len(df[col])) * 100,
                    'examples': problematic_values.tolist(),
                    # Suggest conversion only when <10% of values would be lost.
                    'suggestion': 'Convert to numeric with error handling' if new_nulls < len(df[col]) * 0.1 else 'Keep as text'
                })
        except Exception:  # narrowed from a bare except
            continue

    return mixed_type_issues
366
 
367
@st.cache_data(show_spinner=False)
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest dtype changes (object->category, int64 downcasts) that save memory.

    Only suggestions saving more than 0.1MB are reported.

    Returns:
        Dict with 'suggestions' (per-column detail dicts),
        'current_memory_mb', 'potential_savings_mb',
        'potential_savings_pct' and 'optimization_available'.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0

    for col in df.columns:
        col_memory = df[col].memory_usage(deep=True) / 1024**2

        if df[col].dtype == 'object':
            unique_ratio = df[col].nunique() / len(df)

            # Category pays off only when values repeat a lot.
            if unique_ratio < 0.5:
                try:
                    category_memory = df[col].astype('category').memory_usage(deep=True) / 1024**2
                    savings = col_memory - category_memory

                    if savings > 0.1:  # only surface meaningful wins
                        suggestions.append({
                            'column': col,
                            'current_type': 'object',
                            'suggested_type': 'category',
                            'current_memory_mb': col_memory,
                            'optimized_memory_mb': category_memory,
                            'savings_mb': savings,
                            'savings_pct': (savings / col_memory) * 100
                        })
                        potential_savings += savings
                except Exception:  # narrowed from a bare except
                    continue

        elif df[col].dtype == 'int64':
            col_min = df[col].min()
            col_max = df[col].max()

            # BUG FIX: the unsigned bounds used strict '<', which wrongly
            # rejected values equal to the exact type maxima (255, 65535,
            # 4294967295). Use '<=' to match np.iinfo limits, consistent
            # with the signed branch below.
            if col_min >= 0:  # Unsigned integers
                if col_max <= 255:
                    new_type = 'uint8'
                elif col_max <= 65535:
                    new_type = 'uint16'
                elif col_max <= 4294967295:
                    new_type = 'uint32'
                else:
                    new_type = 'int64'
            else:  # Signed integers
                if col_min >= -128 and col_max <= 127:
                    new_type = 'int8'
                elif col_min >= -32768 and col_max <= 32767:
                    new_type = 'int16'
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    new_type = 'int32'
                else:
                    new_type = 'int64'

            if new_type != 'int64':
                try:
                    optimized_memory = df[col].astype(new_type).memory_usage(deep=True) / 1024**2
                    savings = col_memory - optimized_memory

                    if savings > 0.1:
                        suggestions.append({
                            'column': col,
                            'current_type': 'int64',
                            'suggested_type': new_type,
                            'current_memory_mb': col_memory,
                            'optimized_memory_mb': optimized_memory,
                            'savings_mb': savings,
                            'savings_pct': (savings / col_memory) * 100
                        })
                        potential_savings += savings
                except Exception:
                    continue

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
        'optimization_available': len(suggestions) > 0
    }
451
-
452
@st.cache_data(show_spinner=False)
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Top-N value counts for *column*; empty Series if the column is unusable.

    Args:
        df: Source frame.
        column: Column name to count.
        top_n: Number of most frequent values to return (default 10).
    """
    try:
        # DEAD CODE FIX: the old implementation also computed a percentage
        # series that was never used or returned; it has been removed.
        return df[column].value_counts().head(top_n)
    except Exception:  # narrowed from a bare except; keep best-effort return
        return pd.Series()
465
 
466
@st.cache_data(show_spinner=False)
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Aggregate *metric_col* per *group_col* with dispersion extras.

    Adds 'cv' (coefficient of variation) and 'range' to the standard
    count/mean/median/std/min/max aggregates, sorted by mean descending.
    Shows a Streamlit error and returns an empty frame on failure.
    """
    try:
        summary = (
            df.groupby(group_col)[metric_col]
            .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
            .round(3)
        )

        # Dispersion context: relative spread and absolute span per group.
        summary['cv'] = (summary['std'] / summary['mean']).round(3)
        summary['range'] = summary['max'] - summary['min']

        # Highest-mean groups first for quicker insight.
        return summary.sort_values('mean', ascending=False)

    except Exception as e:
        st.error(f"Error calculating group statistics: {str(e)}")
        return pd.DataFrame()
488
 
489
@st.cache_data(show_spinner=False)
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Backward compatibility wrapper.

    Older callers used this name; it delegates to
    calculate_enhanced_quality_score and returns its dict unchanged.
    """
    return calculate_enhanced_quality_score(df)
493
-
494
def load_data(uploaded_file) -> pd.DataFrame:
    """Load an uploaded CSV/Excel file into a DataFrame with validation.

    Enforces a 200MB upload cap, dispatches by extension, validates the
    result, and warns about very large frames. On any problem a Streamlit
    error is shown and None is returned.
    """
    if uploaded_file is None:
        return None

    try:
        # HuggingFace Spaces has practical upload limits; refuse oversize files.
        size_mb = len(uploaded_file.getvalue()) / 1024**2
        if size_mb > 200:
            st.error(f"File too large ({size_mb:.1f}MB). Please upload files under 200MB.")
            return None

        raw_bytes = uploaded_file.read()
        uploaded_file.seek(0)  # rewind so the caller can re-read if needed

        # Dispatch on extension.
        name = uploaded_file.name
        if name.endswith('.csv'):
            df = load_csv_with_encoding(raw_bytes, name)
        elif name.endswith(('.xlsx', '.xls')):
            df = load_excel_file(raw_bytes)
        else:
            st.error("Unsupported file format. Please upload CSV or Excel files.")
            return None

        # Basic sanity checks before handing the frame to the app.
        if df.empty:
            st.error("The uploaded file appears to be empty.")
            return None
        if len(df.columns) == 0:
            st.error("No columns detected in the file.")
            return None

        if len(df) > 100000:
            st.warning(f"⚡ Large dataset detected ({len(df):,} rows). Some operations will use sampling for performance.")

        return df

    except Exception as e:
        st.error(f"Error loading file: {str(e)}")
        st.info("💡 **Troubleshooting Tips:**\n- Ensure CSV files are properly formatted\n- Check for special characters in Excel files\n- Try saving Excel as CSV first")
        return None
540
-
541
def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Tuple[pd.DataFrame, List[str]]:
    """Apply a sequence of cleaning operations and log what was done.

    Supported operation dicts (keyed by 'type'):
      - fill_missing: {'column', 'method': 'mean'|'median'|'mode'|'drop'}
      - remove_duplicates
      - remove_outliers / cap_outliers: {'column'} (1.5*IQR fences)
      - convert_type: {'column', 'target_type': 'category'|'numeric'}
      - drop_column: {'column'}

    Each failed operation is recorded in the log instead of aborting.

    Returns:
        (cleaned copy of df, list of human-readable log messages)
    """
    cleaned_df = df.copy()
    operation_log = []

    for operation in operations:
        try:
            if operation['type'] == 'fill_missing':
                col = operation['column']
                method = operation['method']

                # GENERALIZATION: the old check compared dtype against the
                # literal names ['int64', 'float64'], silently skipping
                # int32/float32/etc. columns. is_numeric_dtype accepts any
                # numeric dtype and remains true for the original two.
                if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[col]):
                    fill_value = cleaned_df[col].mean()
                    cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                    operation_log.append(f"Filled missing values in '{col}' with mean ({fill_value:.2f})")

                elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[col]):
                    fill_value = cleaned_df[col].median()
                    cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                    operation_log.append(f"Filled missing values in '{col}' with median ({fill_value:.2f})")

                elif method == 'mode':
                    mode_values = cleaned_df[col].mode()
                    if not mode_values.empty:
                        fill_value = mode_values.iloc[0]
                        cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                        operation_log.append(f"Filled missing values in '{col}' with mode ('{fill_value}')")

                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[col])
                    removed = original_len - len(cleaned_df)
                    operation_log.append(f"Dropped {removed} rows with missing values in '{col}'")

            elif operation['type'] == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                if removed > 0:
                    operation_log.append(f"Removed {removed} duplicate rows")

            elif operation['type'] == 'remove_outliers':
                col = operation['column']
                Q1 = cleaned_df[col].quantile(0.25)
                Q3 = cleaned_df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = cleaned_df[(cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)]
                cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
                operation_log.append(f"Removed {len(outliers)} outliers from '{col}'")

            elif operation['type'] == 'cap_outliers':
                col = operation['column']
                Q1 = cleaned_df[col].quantile(0.25)
                Q3 = cleaned_df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                original_outliers = len(cleaned_df[(cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)])
                cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
                operation_log.append(f"Capped {original_outliers} outliers in '{col}' to statistical bounds")

            elif operation['type'] == 'convert_type':
                col = operation['column']
                target_type = operation['target_type']

                if target_type == 'category':
                    cleaned_df[col] = cleaned_df[col].astype('category')
                    operation_log.append(f"Converted '{col}' to category type")
                elif target_type == 'numeric':
                    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')
                    operation_log.append(f"Converted '{col}' to numeric type")

            elif operation['type'] == 'drop_column':
                col = operation['column']
                cleaned_df = cleaned_df.drop(columns=[col])
                operation_log.append(f"Dropped column '{col}'")

        except Exception as e:
            # Best-effort: record the failure and continue with the rest.
            operation_log.append(f"Failed to apply {operation['type']}: {str(e)}")

    return cleaned_df, operation_log
627
-
628
- # HuggingFace specific optimizations
629
-
630
- def optimize_dataframe_for_hf(df: pd.DataFrame) -> pd.DataFrame:
631
- """Apply HuggingFace specific optimizations"""
632
-
633
- optimized_df = df.copy()
634
-
635
- # Convert high-cardinality object columns to category
636
- for col in optimized_df.select_dtypes(include=['object']).columns:
637
- if optimized_df[col].nunique() / len(optimized_df) < 0.5:
638
- try:
639
- optimized_df[col] = optimized_df[col].astype('category')
640
- except:
641
- continue
642
 
643
- # Downcast numeric types for memory efficiency
644
- for col in optimized_df.select_dtypes(include=['int64']).columns:
645
- try:
646
- optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='integer')
647
- except:
648
- continue
649
 
650
- for col in optimized_df.select_dtypes(include=['float64']).columns:
651
- try:
652
- optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='float')
653
- except:
654
- continue
 
655
 
656
- return optimized_df
657
-
658
@st.cache_data(show_spinner=False)
def generate_sample_data() -> pd.DataFrame:
    """Generate sample dataset for demonstration.

    Builds a 1000-row synthetic customer table (seeded with 42 for
    reproducibility), then deliberately injects quality problems --
    missing values, outliers, duplicates, negative balances and mixed
    types -- so the app's quality checks have realistic issues to find.
    NOTE: the injected 'PENDING' strings make 'credit_score' an object
    column, and the duplicate concat grows the frame past 1000 rows.
    """

    np.random.seed(42)
    n_samples = 1000

    # Create realistic business dataset
    data = {
        'customer_id': [f"CUST_{i:06d}" for i in range(1, n_samples + 1)],
        'age': np.random.normal(35, 12, n_samples),
        'annual_income': np.random.lognormal(10.5, 0.5, n_samples),
        'credit_score': np.random.normal(650, 100, n_samples),
        'account_balance': np.random.normal(5000, 3000, n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n_samples),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_samples, p=[0.2, 0.5, 0.3]),
        'is_active': np.random.choice([True, False], n_samples, p=[0.8, 0.2]),
        'signup_date': pd.date_range('2020-01-01', periods=n_samples, freq='D')[:n_samples]
    }

    df = pd.DataFrame(data)

    # Inject realistic quality issues for demonstration

    # 1. Missing values in income (realistic - some customers don't disclose)
    missing_income_idx = np.random.choice(df.index, size=int(n_samples * 0.15), replace=False)
    df.loc[missing_income_idx, 'annual_income'] = np.nan

    # 2. Missing values in credit score (realistic - new customers)
    missing_credit_idx = np.random.choice(df.index, size=int(n_samples * 0.08), replace=False)
    df.loc[missing_credit_idx, 'credit_score'] = np.nan

    # 3. Outliers in age (data entry errors)
    outlier_age_idx = np.random.choice(df.index, size=25, replace=False)
    df.loc[outlier_age_idx, 'age'] = np.random.uniform(150, 999, 25)  # Obvious errors

    # 4. Outliers in income (legitimate high earners + errors)
    outlier_income_idx = np.random.choice(df.index, size=30, replace=False)
    df.loc[outlier_income_idx, 'annual_income'] = np.random.uniform(500000, 2000000, 30)

    # 5. Negative account balances (overdrafts - realistic)
    negative_balance_idx = np.random.choice(df.index, size=50, replace=False)
    df.loc[negative_balance_idx, 'account_balance'] = np.random.uniform(-5000, -100, 50)

    # 6. Duplicate records (system errors)
    duplicate_records = df.sample(n=35).copy()
    df = pd.concat([df, duplicate_records], ignore_index=True)

    # 7. Mixed types in a column (add some text to numeric column)
    mixed_type_idx = np.random.choice(df.index, size=15, replace=False)
    df.loc[mixed_type_idx, 'credit_score'] = 'PENDING'

    return df
711
-
712
- # Additional utility functions for HuggingFace deployment
713
 
714
def check_dataset_compatibility(df: pd.DataFrame) -> Dict[str, Any]:
    """Check a frame against HuggingFace processing limits.

    Flags >1M rows, >500MB in-memory size, and >100 columns, returning
    boolean status flags plus warning/recommendation strings.
    """
    report = {
        'size_ok': True,
        'memory_ok': True,
        'columns_ok': True,
        'warnings': [],
        'recommendations': []
    }

    # Row-count check (1M row ceiling for interactive work).
    if len(df) > 1000000:
        report['size_ok'] = False
        report['warnings'].append(f"Large dataset: {len(df):,} rows")
        report['recommendations'].append("Consider sampling for interactive analysis")

    # In-memory footprint check (500MB ceiling).
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    if memory_mb > 500:
        report['memory_ok'] = False
        report['warnings'].append(f"High memory usage: {memory_mb:.1f}MB")
        report['recommendations'].append("Apply memory optimization techniques")

    # Column-count check (wide frames slow the UI down).
    if len(df.columns) > 100:
        report['columns_ok'] = False
        report['warnings'].append(f"Many columns: {len(df.columns)}")
        report['recommendations'].append("Focus analysis on key business columns")

    return report
745
 
746
def get_smart_sample(df: pd.DataFrame, target_size: int = 10000) -> pd.DataFrame:
    """Sample about *target_size* rows while preserving data characteristics.

    Small frames are returned unchanged. When a categorical column exists,
    the sample is stratified proportionally on the first one; otherwise (or
    on any stratification failure) a seeded random sample is taken.
    """
    total = len(df)
    if total <= target_size:
        return df

    # Prefer stratified sampling on the first categorical column.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        strata = cat_cols[0]
        try:
            stratified = df.groupby(strata, group_keys=False).apply(
                lambda x: x.sample(min(len(x), max(1, int(target_size * len(x) / len(df)))))
            )
            return stratified.reset_index(drop=True)
        except:
            pass  # fall through to plain random sampling

    # Seeded random sample as the fallback.
    return df.sample(n=target_size, random_state=42).reset_index(drop=True)
 
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
+ # All cached data processing functions
10
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load CSV with automatic encoding detection - cached.

    Args:
        file_content: Raw bytes of the uploaded CSV.
        filename: Original file name (kept for interface compatibility).

    Raises:
        Exception: If no candidate encoding can parse the content.
    """
    import chardet
    from io import BytesIO

    # PERF FIX: detect from a 10KB sample instead of the whole upload;
    # chardet is O(n) in input size and large files made this very slow.
    detected = chardet.detect(file_content[:10000])
    # BUG FIX: chardet may return encoding=None; fall back to utf-8 rather
    # than passing None to read_csv.
    encoding = detected['encoding'] or 'utf-8'

    try:
        return pd.read_csv(BytesIO(file_content), encoding=encoding)
    except Exception:  # narrowed from a bare except
        for enc in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
            try:
                return pd.read_csv(BytesIO(file_content), encoding=enc)
            except Exception:
                continue
        raise Exception("Cannot read file with any encoding")
 
 
 
 
 
29
 
30
@st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an uploaded Excel workbook from its raw bytes (cached)."""
    from io import BytesIO
    buffer = BytesIO(file_content)
    return pd.read_excel(buffer)
 
 
35
 
36
+ @st.cache_data
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
+ """Calculate basic statistics - cached"""
 
 
 
 
 
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
 
44
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
45
  'missing_values': int(df.isnull().sum().sum()),
46
  'dtypes': dtype_dict,
47
+ 'duplicates': int(df.duplicated().sum())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  }
49
 
50
@st.cache_data
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate column cardinality analysis - cached.

    Classifies each column as Constant / Unique Identifier / Low /
    Medium / High cardinality from its unique-value ratio.
    """
    cardinality_data = []
    n_rows = len(df)

    for col in df.columns:
        unique_count = df[col].nunique()
        # BUG FIX: guard against ZeroDivisionError when the frame has
        # columns but zero rows.
        unique_ratio = unique_count / n_rows if n_rows else 0.0

        # Determine column type based on cardinality
        if unique_count == 1:
            col_type = "Constant"
        elif unique_count == n_rows and n_rows > 0:
            col_type = "Unique Identifier"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
        else:
            col_type = "High Cardinality"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Type': col_type,
            'Data Type': str(df[col].dtype)
        })

    return pd.DataFrame(cardinality_data)
80
 
81
@st.cache_data
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate memory optimization suggestions - cached.

    Suggests object->category conversions for columns with <50% unique
    values when the change would save more than 0.1MB.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0
    n_rows = len(df)

    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        # BUG FIX: guard against ZeroDivisionError on a zero-row frame
        # (ratio of 1.0 skips the column, matching "no savings possible").
        unique_ratio = df[col].nunique() / n_rows if n_rows else 1.0
        if unique_ratio >= 0.5:  # category only pays off for repetitive data
            continue

        category_memory = df[col].astype('category').memory_usage(deep=True)
        object_memory = df[col].memory_usage(deep=True)
        savings = (object_memory - category_memory) / 1024**2

        if savings > 0.1:  # More than 0.1MB savings
            suggestions.append({
                'column': col,
                'current_type': 'object',
                'suggested_type': 'category',
                'savings_mb': savings
            })
            potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
    }
112
+
113
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column - cached.

    Returns a DataFrame (Column, Missing Count, Missing %) restricted to
    columns with at least one null, sorted by percentage descending, or an
    empty DataFrame when nothing is missing at all.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        # Early exit: no missing data anywhere.
        return pd.DataFrame()

    summary = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100
    })
    summary = summary[summary['Missing Count'] > 0]
    return summary.sort_values('Missing %', ascending=False)
125
 
126
@st.cache_data
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlation matrix over the numeric columns - cached.

    Returns an empty DataFrame when fewer than two numeric columns exist.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) <= 1:
        return pd.DataFrame()
    return df[numeric_cols].corr()
 
 
 
 
 
 
 
 
 
 
131
 
132
@st.cache_data
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Group column names by broad dtype family - cached."""
    families = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
    }
    return {
        name: df.select_dtypes(include=kinds).columns.tolist()
        for name, kinds in families.items()
    }
140
+
141
@st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Descriptive statistics for one numeric column, ignoring NaNs - cached."""
    values = df[column].dropna()
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    return {
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'skewness': values.skew(),
        'kurtosis': values.kurtosis(),
        'min': values.min(),
        'max': values.max(),
        'q25': lower_quartile,
        'q75': upper_quartile,
    }
156
 
157
@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Rows whose `column` value falls outside the 1.5*IQR fences - cached."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Keep only the rows strictly outside the Tukey fences.
    outlier_mask = (df[column] < low) | (df[column] > high)
    return df[outlier_mask]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
@st.cache_data
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns containing values that fail numeric coercion - cached.

    For each object column, coerces to numeric; values that become NaN
    only after coercion are non-numeric strings. Columns with at least
    one such value are reported with counts and a percentage.
    """
    issues = []
    for col in df.select_dtypes(include=['object']).columns:
        coerced = pd.to_numeric(df[col], errors='coerce')
        # Nulls introduced by coercion (beyond pre-existing ones) mark
        # values that could not be parsed as numbers.
        newly_null = coerced.isnull().sum() - df[col].isnull().sum()
        if newly_null > 0:
            total = len(df[col])
            issues.append({
                'column': col,
                'problematic_values': newly_null,
                'total_values': total,
                'percentage': (newly_null / total) * 100
            })
    return issues
187
 
188
@st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Top `top_n` most frequent values of a categorical column - cached."""
    counts = df[column].value_counts()
    return counts.head(top_n)
 
 
 
 
 
 
 
 
 
192
 
193
@st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Contingency table between two categorical columns - cached."""
    rows, cols = df[col1], df[col2]
    return pd.crosstab(rows, cols)
197
+
198
@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Per-group mean/median/std/count of `metric_col` - cached."""
    grouped = df.groupby(group_col)[metric_col]
    return grouped.agg(['mean', 'median', 'std', 'count'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate an overall 0-100 data-quality score with a letter grade - cached.

    Penalties: missing values (max 30), duplicate rows (max 20),
    constant columns (max 10), mixed-type object columns (max 10).
    Returns {'score', 'issues', 'grade'}.
    """
    score = 100
    issues = []
    total_cells = len(df) * len(df.columns)

    # Missing values penalty — guard total_cells == 0 so an empty frame
    # does not raise ZeroDivisionError.
    if total_cells:
        missing_pct = (df.isnull().sum().sum() / total_cells) * 100
        if missing_pct > 0:
            score -= min(30, missing_pct * 2)  # Max 30 points penalty
            issues.append(f"Missing values: {missing_pct:.1f}%")

    # Duplicates penalty — guard len(df) == 0 for the same reason.
    if len(df):
        duplicate_pct = (df.duplicated().sum() / len(df)) * 100
        if duplicate_pct > 0:
            score -= min(20, duplicate_pct * 4)  # Max 20 points penalty
            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

    # Constant columns penalty
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 2)
        issues.append(f"Constant columns: {len(constant_cols)}")

    # Mixed types penalty
    mixed_types = detect_mixed_types(df)
    if mixed_types:
        score -= min(10, len(mixed_types) * 3)
        issues.append(f"Mixed type columns: {len(mixed_types)}")

    return {
        'score': max(0, score),
        'issues': issues,
        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
def load_data(uploaded_file):
    """Unified data loading: dispatch an uploaded file to the CSV or Excel loader.

    Reads the raw bytes once, then rewinds the buffer so the caller can
    reuse the file object. The extension check is case-insensitive so
    'DATA.CSV' is not mis-routed to the Excel loader; anything that is
    not .csv is treated as Excel.
    """
    file_content = uploaded_file.read()
    uploaded_file.seek(0)  # rewind so the upload can be read again later

    if uploaded_file.name.lower().endswith('.csv'):
        return load_csv_with_encoding(file_content, uploaded_file.name)
    return load_excel_file(file_content)
 
 
 
252
 
253
def _iqr_bounds(series: pd.Series) -> Tuple[float, float]:
    """Return the (lower, upper) Tukey fences (1.5 * IQR) for a numeric series."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _fill_missing(df: pd.DataFrame, column: str, method: str) -> pd.DataFrame:
    """Fill (or drop) missing values in one column of `df` using `method`."""
    if method == 'mean':
        df[column] = df[column].fillna(df[column].mean())
    elif method == 'median':
        df[column] = df[column].fillna(df[column].median())
    elif method == 'mode':
        modes = df[column].mode()
        # Fall back to 0 when the column has no mode (e.g. all values missing).
        df[column] = df[column].fillna(modes.iloc[0] if not modes.empty else 0)
    elif method == 'drop':
        df = df.dropna(subset=[column])
    return df


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations to a copy of `df`.

    Each operation is a dict keyed by 'type':
      - 'fill_missing':      {'column', 'method': 'mean'|'median'|'mode'|'drop'}
      - 'remove_duplicates': drop fully duplicated rows
      - 'remove_outliers':   {'column'} drop rows outside the 1.5*IQR fences
      - 'cap_outliers':      {'column'} clip values to the 1.5*IQR fences
      - 'convert_type':      {'column', 'target_type': 'category'}

    The input DataFrame is never mutated; a cleaned copy is returned.
    """
    cleaned_df = df.copy()

    for operation in operations:
        op_type = operation['type']

        if op_type == 'fill_missing':
            cleaned_df = _fill_missing(cleaned_df, operation['column'], operation['method'])

        elif op_type == 'remove_duplicates':
            cleaned_df = cleaned_df.drop_duplicates()

        elif op_type == 'remove_outliers':
            lower, upper = _iqr_bounds(cleaned_df[operation['column']])
            values = cleaned_df[operation['column']]
            cleaned_df = cleaned_df[(values >= lower) & (values <= upper)]

        elif op_type == 'cap_outliers':
            lower, upper = _iqr_bounds(cleaned_df[operation['column']])
            cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower, upper)

        elif op_type == 'convert_type':
            if operation['target_type'] == 'category':
                cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')

    return cleaned_df